org.apache.nutch.net.protocols.HttpDateFormat Java Examples

The following examples show how to use org.apache.nutch.net.protocols.HttpDateFormat. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FileResponse.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Renders the directory {@code f} as an HTML listing and stores it as this
 * response's content, then sets the Content-Length, Content-Type and
 * Last-Modified headers and a 200 status code.
 *
 * @param f the directory to list
 * @throws IOException if the entries of {@code f} cannot be listed
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

  String path = f.toString();

  // File.listFiles() signals failure by returning null rather than by
  // throwing; report that as an IOException instead of letting list2html
  // fail with a NullPointerException.
  java.io.File[] entries = f.listFiles();
  if (entries == null) {
    throw new IOException("cannot list directory: " + path);
  }

  // the "../" entry is requested only when crawlParents is set and the
  // listed path is not the root "/"
  boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
  this.content = list2html(entries, path, includeDotDot);

  // set headers
  headers.set(Response.CONTENT_LENGTH,
      Integer.toString(this.content.length));
  headers.set(Response.CONTENT_TYPE, "text/html");
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  // response code
  this.code = 200; // http OK
}
 
Example #2
Source File: FileResponse.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Get the directory listing as an http response: the listing is rendered to
 * HTML and stored as this response's content, the Content-Length,
 * Content-Type and Last-Modified headers are set, and the code becomes 200.
 *
 * @param f the directory to list
 * @throws IOException if the entries of {@code f} cannot be listed
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

  String path = f.toString();

  // File.listFiles() returns null instead of throwing when listing fails;
  // surface that as an IOException rather than a NullPointerException
  // raised later inside list2html.
  java.io.File[] entries = f.listFiles();
  if (entries == null) {
    throw new IOException("cannot list directory: " + path);
  }

  // the "../" entry is requested only when crawlParents is set and the
  // listed path is not the root "/"
  boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
  this.content = list2html(entries, path, includeDotDot);

  // set headers
  headers.set(Response.CONTENT_LENGTH,
      Integer.toString(this.content.length));
  headers.set(Response.CONTENT_TYPE, "text/html");
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  // response code
  this.code = 200; // http OK
}
 
Example #3
Source File: FtpResponse.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Renders an FTP directory listing as a small HTML index page.
 *
 * Directories and regular files each get one line with a link, the
 * formatted timestamp and (for files) the size; other entry kinds are
 * left out. The result is encoded with the platform default charset.
 *
 * @param list          entries of the directory; each element is cast to FTPFile
 * @param path          path shown in the page title and heading
 * @param includeDotDot whether to emit a leading "../" link
 * @return the HTML page as bytes
 */
private byte[] list2html(List list, String path, boolean includeDotDot) {

  StringBuffer html = new StringBuffer("<html><head>");
  html.append("<title>Index of ").append(path).append("</title></head>\n");
  html.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    html.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  for (Object element : list) {
    FTPFile entry = (FTPFile) element;
    String name = entry.getName();
    String time = HttpDateFormat.toString(entry.getTimestamp());

    if (entry.isDirectory()) {
      // some ftp servers LIST "." and ".." as well; those are not emitted
      boolean selfOrParent = name.equals(".") || name.equals("..");
      if (!selfOrParent) {
        html.append("<a href='").append(name).append("/'>")
            .append(name).append("/</a>\t");
        html.append(time).append("\t-\n");
      }
      continue;
    }

    if (entry.isFile()) {
      html.append("<a href='").append(name).append("'>")
          .append(name).append("</a>\t");
      html.append(time).append("\t").append(entry.getSize()).append("\n");
    }
    // symbolic links and unknown entry types are ignored
  }

  html.append("</pre></body></html>\n");

  return html.toString().getBytes();
}
 
Example #4
Source File: FileResponse.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Renders a local directory listing as a small HTML index page.
 *
 * Directories and regular files each get one line with a link, the
 * formatted last-modified time and (for files) the length; anything else
 * is left out. Entries appear in the order given, without sorting. The
 * result is encoded with the platform default charset.
 *
 * @param list          entries of the directory
 * @param path          path shown in the page title and heading
 * @param includeDotDot whether to emit a leading "../" link
 * @return the HTML page as bytes
 */
private byte[] list2html(java.io.File[] list, String path,
    boolean includeDotDot) {

  StringBuffer page = new StringBuffer("<html><head>");
  page.append("<title>Index of ").append(path).append("</title></head>\n");
  page.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    page.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  for (java.io.File entry : list) {
    String name = entry.getName();
    String time = HttpDateFormat.toString(entry.lastModified());

    if (entry.isDirectory()) {
      // no explicit "." / ".." filtering here, unlike the FTP variant
      page.append("<a href='").append(name).append("/'>")
          .append(name).append("/</a>\t");
      page.append(time).append("\t-\n");
    } else if (entry.isFile()) {
      page.append("<a href='").append(name).append("'>")
          .append(name).append("</a>\t");
      page.append(time).append("\t").append(entry.length()).append("\n");
    }
    // entries that are neither directory nor regular file are ignored
  }

  page.append("</pre></body></html>\n");

  return page.toString().getBytes();
}
 
Example #5
Source File: FtpResponse.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Renders an FTP directory listing as a small HTML index page.
 *
 * Directories and regular files each get one line with a link, the
 * formatted timestamp and (for files) the size; other entry kinds are
 * left out. The result is encoded with the platform default charset.
 *
 * @param list          entries of the directory
 * @param path          path shown in the page title and heading
 * @param includeDotDot whether to emit a leading "../" link
 * @return the HTML page as bytes
 */
private byte[] list2html(List<FTPFile> list, String path, boolean includeDotDot) {

  StringBuffer html = new StringBuffer("<html><head>");
  html.append("<title>Index of ").append(path).append("</title></head>\n");
  html.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    html.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  for (FTPFile entry : list) {
    String name = entry.getName();
    String time = HttpDateFormat.toString(entry.getTimestamp());

    if (entry.isDirectory()) {
      // some ftp servers LIST "." and ".." as well; those are not emitted
      boolean selfOrParent = name.equals(".") || name.equals("..");
      if (!selfOrParent) {
        html.append("<a href='").append(name).append("/'>")
            .append(name).append("/</a>\t");
        html.append(time).append("\t-\n");
      }
      continue;
    }

    if (entry.isFile()) {
      html.append("<a href='").append(name).append("'>")
          .append(name).append("</a>\t");
      html.append(time).append("\t").append(entry.getSize()).append("\n");
    }
    // symbolic links and unknown entry types are ignored
  }

  html.append("</pre></body></html>\n");

  return html.toString().getBytes();
}
 
Example #6
Source File: FileResponse.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Generate an html page from a directory listing.
 *
 * Directories and regular files each get one line with a link, the
 * formatted last-modified time and (for files) the length; anything else
 * is left out. Entries appear in the order given, without sorting. The
 * result is encoded with the platform default charset.
 *
 * @param list          entries of the directory
 * @param path          path shown in the page title and heading
 * @param includeDotDot whether to emit a leading "../" link
 * @return the html page as bytes
 */
private byte[] list2html(java.io.File[] list, String path,
    boolean includeDotDot) {

  StringBuffer page = new StringBuffer("<html><head>");
  page.append("<title>Index of ").append(path).append("</title></head>\n");
  page.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    page.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  for (java.io.File entry : list) {
    String name = entry.getName();
    String time = HttpDateFormat.toString(entry.lastModified());

    if (entry.isDirectory()) {
      // no explicit "." / ".." filtering here, unlike the FTP variant
      page.append("<a href='").append(name).append("/'>")
          .append(name).append("/</a>\t");
      page.append(time).append("\t-\n");
    } else if (entry.isFile()) {
      page.append("<a href='").append(name).append("'>")
          .append(name).append("</a>\t");
      page.append(time).append("\t").append(entry.length()).append("\n");
    }
    // entries that are neither directory nor regular file are ignored
  }

  page.append("</pre></body></html>\n");

  return page.toString().getBytes();
}
 
Example #7
Source File: FileResponse.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the response for a {@code file:} URL by inspecting the local file
 * it names.
 *
 * The resulting {@code code} is 404 when the file does not exist, 401 when
 * it is not readable, 300 (with a Location header) when the path differs
 * from its canonical form, 304 when the file is not newer than the datum's
 * modified time, 500 when it is neither a directory nor a regular file, and
 * otherwise whatever the directory/file handlers set.
 *
 * @param url   the URL being fetched; its protocol must be "file"
 * @param datum crawl datum supplying the previously seen modified time
 * @param file  the owning protocol instance
 * @param conf  configuration, also passed to MimeUtil
 * @throws FileException if {@code url} is not a file URL, or from the file handler
 * @throws IOException   on I/O failure while resolving or reading the file
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;

  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol()))
    throw new FileException("Not a file url:" + url);

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // compare the two strings by value; != would only compare object identity
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ex) {
    // deliberately ignored: the path is then used undecoded. UTF-8 is a
    // charset every Java platform is required to support.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Going through toURI() before toURL() escapes characters that are
    // illegal in URLs.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }

  // not modified since the time recorded in the datum
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}
 
Example #8
Source File: FileResponse.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the regular file {@code f} into this response's content and sets the
 * Content-Length, Last-Modified and Content-Type headers and a 200 status.
 *
 * At most {@code file.maxContentLength} bytes are kept when that limit is
 * non-negative. Content-Length still reports the full on-disk size.
 *
 * @param f the file to read
 * @throws FileException if the file is larger than Integer.MAX_VALUE bytes
 * @throws IOException   if reading or closing the file fails
 */
private void getFileAsHttpResponse(java.io.File f) throws FileException,
    IOException {

  // ignore file of size larger than
  // Integer.MAX_VALUE = 2^31-1 = 2147483647
  long size = f.length();
  if (size > Integer.MAX_VALUE) {
    throw new FileException("file is too large, size: " + size);
  }

  // capture content, truncated to the configured maximum if there is one
  int len = (int) size;

  if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
    len = this.file.maxContentLength;

  this.content = new byte[len];

  int offset = 0;
  java.io.InputStream is = new java.io.FileInputStream(f);
  try {
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
  } finally {
    // release the file handle even when read() throws
    is.close();
  }

  if (offset < len) { // keep whatever already have, but issue a warning
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("not enough bytes read from file: " + f.getPath());
    }
  }

  // set headers
  headers.set(Response.CONTENT_LENGTH, Long.toString(size));
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  String mimeType = MIME.getMimeType(f);

  // an undetected type is sent as an empty Content-Type value
  headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

  // response code
  this.code = 200; // http OK
}
 
Example #9
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a date string into a time value.
 *
 * The string is first handed to HttpDateFormat; if that raises a
 * ParseException, a list of alternative patterns is tried through
 * DateUtils.parseDate. When nothing matches, a warning mentioning
 * {@code url} is logged.
 *
 * @param date the date string to parse
 * @param url  used only in the warning message
 * @return the parsed time, or -1 if the date could not be parsed
 */
private long getTime(String date, String url) {
  try {
    return HttpDateFormat.toLong(date);
  } catch (ParseException notHttpDate) {
    // fall through and try the alternative formats below
  }

  String[] alternativeFormats = {
      "EEE MMM dd HH:mm:ss yyyy",
      "EEE MMM dd HH:mm:ss yyyy zzz",
      "EEE MMM dd HH:mm:ss zzz yyyy",
      "EEE, MMM dd HH:mm:ss yyyy zzz",
      "EEE, dd MMM yyyy HH:mm:ss zzz",
      "EEE,dd MMM yyyy HH:mm:ss zzz",
      "EEE, dd MMM yyyy HH:mm:sszzz",
      "EEE, dd MMM yyyy HH:mm:ss",
      "EEE, dd-MMM-yy HH:mm:ss zzz",
      "yyyy/MM/dd HH:mm:ss.SSS zzz",
      "yyyy/MM/dd HH:mm:ss.SSS",
      "yyyy/MM/dd HH:mm:ss zzz",
      "yyyy/MM/dd",
      "yyyy.MM.dd HH:mm:ss",
      "yyyy-MM-dd HH:mm",
      "MMM dd yyyy HH:mm:ss. zzz",
      "MMM dd yyyy HH:mm:ss zzz",
      "dd.MM.yyyy HH:mm:ss zzz",
      "dd MM yyyy HH:mm:ss zzz",
      "dd.MM.yyyy; HH:mm:ss",
      "dd.MM.yyyy HH:mm:ss",
      "dd.MM.yyyy zzz",
      "yyyy-MM-dd'T'HH:mm:ss'Z'"
  };

  try {
    return DateUtils.parseDate(date, alternativeFormats).getTime();
  } catch (Exception e2) {
    if (LOG.isWarnEnabled()) {
      LOG.warn(url + ": can't parse erroneous date: " + date);
    }
  }
  return -1;
}
 
Example #10
Source File: FileResponse.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Default public constructor. Builds the response for a {@code file:} URL
 * by inspecting the local file it names.
 *
 * The resulting {@code code} is 404 when the file does not exist, 401 when
 * it is not readable, 300 (with a Location header) when the path differs
 * from its canonical form, 304 when the file is not newer than the datum's
 * modified time, 500 when it is neither a directory nor a regular file, and
 * otherwise whatever the directory/file handlers set.
 *
 * @param url   the URL being fetched; its protocol must be "file"
 * @param datum crawl datum supplying the previously seen modified time
 * @param file  the owning protocol instance
 * @param conf  configuration, also passed to MimeUtil
 * @throws FileException if {@code url} is not a file URL, or from the file handler
 * @throws IOException   on I/O failure while resolving or reading the file
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;

  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol()))
    throw new FileException("Not a file url:" + url);

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // compare the two strings by value; != would only compare object identity
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ex) {
    // deliberately ignored: the path is then used undecoded. UTF-8 is a
    // charset every Java platform is required to support.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Going through toURI() before toURL() escapes characters that are
    // illegal in URLs.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }

  // not modified since the time recorded in the datum
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}
 
Example #11
Source File: FileResponse.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the regular file {@code f} into this response's content and sets the
 * Content-Length, Last-Modified and Content-Type headers and a 200 status.
 *
 * At most {@code file.maxContentLength} bytes are kept when that limit is
 * non-negative. Content-Length still reports the full on-disk size.
 *
 * @param f the file to read
 * @throws FileException if the file is larger than Integer.MAX_VALUE bytes
 * @throws IOException   if reading or closing the file fails
 */
private void getFileAsHttpResponse(java.io.File f) throws FileException,
    IOException {

  // ignore file of size larger than
  // Integer.MAX_VALUE = 2^31-1 = 2147483647
  long size = f.length();
  if (size > Integer.MAX_VALUE) {
    throw new FileException("file is too large, size: " + size);
  }

  // capture content, truncated to the configured maximum if there is one
  int len = (int) size;

  if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
    len = this.file.maxContentLength;

  this.content = new byte[len];

  int offset = 0;
  java.io.InputStream is = new java.io.FileInputStream(f);
  try {
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
  } finally {
    // release the file handle even when read() throws
    is.close();
  }

  if (offset < len) { // keep whatever already have, but issue a warning
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("not enough bytes read from file: " + f.getPath());
    }
  }

  // set headers
  headers.set(Response.CONTENT_LENGTH, Long.toString(size));
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  String mimeType = tika.detect(f);

  // an undetected type is sent as an empty Content-Type value
  headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

  // response code
  this.code = 200; // http OK
}
 
Example #12
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a date string into a time value.
 *
 * The string is first handed to HttpDateFormat; if that raises a
 * ParseException, a list of alternative patterns is tried through
 * DateUtils.parseDate. When nothing matches, a warning mentioning
 * {@code url} is logged.
 *
 * @param date the date string to parse
 * @param url  used only in the warning message
 * @return the parsed time, or -1 if the date could not be parsed
 */
private long getTime(String date, String url) {
  try {
    return HttpDateFormat.toLong(date);
  } catch (ParseException notHttpDate) {
    // fall through and try the alternative formats below
  }

  String[] alternativeFormats = {
      "EEE MMM dd HH:mm:ss yyyy",
      "EEE MMM dd HH:mm:ss yyyy zzz",
      "EEE MMM dd HH:mm:ss zzz yyyy",
      "EEE, MMM dd HH:mm:ss yyyy zzz",
      "EEE, dd MMM yyyy HH:mm:ss zzz",
      "EEE,dd MMM yyyy HH:mm:ss zzz",
      "EEE, dd MMM yyyy HH:mm:sszzz",
      "EEE, dd MMM yyyy HH:mm:ss",
      "EEE, dd-MMM-yy HH:mm:ss zzz",
      "yyyy/MM/dd HH:mm:ss.SSS zzz",
      "yyyy/MM/dd HH:mm:ss.SSS",
      "yyyy/MM/dd HH:mm:ss zzz",
      "yyyy/MM/dd",
      "yyyy.MM.dd HH:mm:ss",
      "yyyy-MM-dd HH:mm",
      "MMM dd yyyy HH:mm:ss. zzz",
      "MMM dd yyyy HH:mm:ss zzz",
      "dd.MM.yyyy HH:mm:ss zzz",
      "dd MM yyyy HH:mm:ss zzz",
      "dd.MM.yyyy; HH:mm:ss",
      "dd.MM.yyyy HH:mm:ss",
      "dd.MM.yyyy zzz",
      "yyyy-MM-dd'T'HH:mm:ss'Z'"
  };

  try {
    return DateUtils.parseDate(date, alternativeFormats).getTime();
  } catch (Exception e2) {
    if (LOG.isWarnEnabled()) {
      LOG.warn(url + ": can't parse erroneous date: " + date);
    }
  }
  return -1;
}