org.apache.nutch.net.protocols.HttpDateFormat Java Exaples

Source File: FileResponse.java From anthelion with Apache License 2.0

6 votes

/**
 * Fills this response for a directory: the content becomes an HTML index
 * page produced by {@code list2html}, and the Content-Length, Content-Type
 * and Last-Modified headers plus a 200 status code are set.
 *
 * @param f the directory to list
 * @throws IOException as declared by the method signature
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

  String path = f.toString();

  // Offer a "../" link only when parent crawling is enabled and the path is
  // not the literal root "/".
  boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
  this.content = list2html(f.listFiles(), path, includeDotDot);

  // set headers
  headers.set(Response.CONTENT_LENGTH,
      Integer.toString(this.content.length));
  headers.set(Response.CONTENT_TYPE, "text/html");
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  // response code
  this.code = 200; // http OK
}

Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Fills this response for a directory: the content becomes an HTML index
 * page produced by {@code list2html}, and the Content-Length, Content-Type
 * and Last-Modified headers plus a 200 status code are set.
 *
 * @param f the directory to list
 * @throws IOException as declared by the method signature
 */
private void getDirAsHttpResponse(java.io.File f) throws IOException {

  String path = f.toString();

  // Offer a "../" link only when parent crawling is enabled and the path is
  // not the literal root "/".
  boolean includeDotDot = this.file.crawlParents && !"/".equals(path);
  this.content = list2html(f.listFiles(), path, includeDotDot);

  // set headers
  headers.set(Response.CONTENT_LENGTH,
      Integer.toString(this.content.length));
  headers.set(Response.CONTENT_TYPE, "text/html");
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  // response code
  this.code = 200; // http OK
}

Source File: FtpResponse.java From anthelion with Apache License 2.0

5 votes

/**
 * Renders an FTP directory listing as a minimal HTML index page.
 * Directories get a trailing "/" and a "-" in place of the size; regular
 * files show their size; every other entry type is left out.
 *
 * @param list          the directory entries; each element is cast to FTPFile
 * @param path          the path shown in the page title and heading
 * @param includeDotDot whether to emit a link to the parent directory
 * @return the page as bytes in the platform default charset
 */
private byte[] list2html(List list, String path, boolean includeDotDot) {

  StringBuilder html = new StringBuilder("<html><head>");
  html.append("<title>Index of ").append(path).append("</title></head>\n");
  html.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    html.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  for (Object entry : list) {
    FTPFile f = (FTPFile) entry;
    String name = f.getName();
    String time = HttpDateFormat.toString(f.getTimestamp());

    if (f.isDirectory()) {
      // some ftp servers LIST "." and "..", we skip them here
      boolean selfOrParent = name.equals(".") || name.equals("..");
      if (!selfOrParent) {
        html.append("<a href='").append(name).append("/").append("'>")
            .append(name).append("/</a>\t");
        html.append(time).append("\t-\n");
      }
    } else if (f.isFile()) {
      html.append("<a href='").append(name).append("'>")
          .append(name).append("</a>\t");
      html.append(time).append("\t").append(f.getSize()).append("\n");
    }
    // anything else (symbolic links, unknown entries) is ignored
  }

  html.append("</pre></body></html>\n");

  return html.toString().getBytes();
}

Source File: FileResponse.java From anthelion with Apache License 2.0

5 votes

/**
 * Renders a local directory listing as a minimal HTML index page.
 * Directories get a trailing "/" and a "-" in place of the size; regular
 * files show their length; every other entry type is left out.
 *
 * @param list          the directory entries; may be {@code null}, which is
 *                      what {@code File.listFiles()} returns on an I/O error
 *                      or for a non-directory, and is rendered as an empty
 *                      listing
 * @param path          the path shown in the page title and heading
 * @param includeDotDot whether to emit a link to the parent directory
 * @return the page as bytes in the platform default charset
 */
private byte[] list2html(java.io.File[] list, String path,
    boolean includeDotDot) {

  // Local, single-threaded buffer: no need for a synchronized StringBuffer.
  StringBuilder x = new StringBuilder("<html><head>");
  x.append("<title>Index of ").append(path).append("</title></head>\n");
  x.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    x.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  // fix me: we might want to sort list here! but not now.

  // Guard against a null array so an unreadable directory yields an empty
  // index instead of a NullPointerException.
  if (list != null) {
    for (java.io.File f : list) {
      String name = f.getName();
      String time = HttpDateFormat.toString(f.lastModified());
      if (f.isDirectory()) {
        // listFiles() does not return the directory itself or its parent,
        // so "." and ".." need no special handling here.
        x.append("<a href='").append(name).append("/'>")
            .append(name).append("/</a>\t");
        x.append(time).append("\t-\n");
      } else if (f.isFile()) {
        x.append("<a href='").append(name).append("'>")
            .append(name).append("</a>\t");
        x.append(time).append("\t").append(f.length()).append("\n");
      }
      // ignore any other kind of entry
    }
  }

  x.append("</pre></body></html>\n");

  return x.toString().getBytes();
}

Source File: FtpResponse.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Renders an FTP directory listing as a minimal HTML index page.
 * Directories get a trailing "/" and a "-" in place of the size; regular
 * files show their size; every other entry type is left out.
 *
 * @param list          the directory entries
 * @param path          the path shown in the page title and heading
 * @param includeDotDot whether to emit a link to the parent directory
 * @return the page as bytes in the platform default charset
 */
private byte[] list2html(List<FTPFile> list, String path, boolean includeDotDot) {

  StringBuilder html = new StringBuilder("<html><head>");
  html.append("<title>Index of ").append(path).append("</title></head>\n");
  html.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    html.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  for (FTPFile f : list) {
    String name = f.getName();
    String time = HttpDateFormat.toString(f.getTimestamp());

    if (f.isDirectory()) {
      // some ftp servers LIST "." and "..", we skip them here
      boolean selfOrParent = name.equals(".") || name.equals("..");
      if (!selfOrParent) {
        html.append("<a href='").append(name).append("/").append("'>")
            .append(name).append("/</a>\t");
        html.append(time).append("\t-\n");
      }
    } else if (f.isFile()) {
      html.append("<a href='").append(name).append("'>")
          .append(name).append("</a>\t");
      html.append(time).append("\t").append(f.getSize()).append("\n");
    }
    // anything else (symbolic links, unknown entries) is ignored
  }

  html.append("</pre></body></html>\n");

  return html.toString().getBytes();
}

Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Renders a local directory listing as a minimal HTML index page.
 * Directories get a trailing "/" and a "-" in place of the size; regular
 * files show their length; every other entry type is left out.
 *
 * @param list          the directory entries; may be {@code null}, which is
 *                      what {@code File.listFiles()} returns on an I/O error
 *                      or for a non-directory, and is rendered as an empty
 *                      listing
 * @param path          the path shown in the page title and heading
 * @param includeDotDot whether to emit a link to the parent directory
 * @return the page as bytes in the platform default charset
 */
private byte[] list2html(java.io.File[] list, String path,
    boolean includeDotDot) {

  // Local, single-threaded buffer: no need for a synchronized StringBuffer.
  StringBuilder x = new StringBuilder("<html><head>");
  x.append("<title>Index of ").append(path).append("</title></head>\n");
  x.append("<body><h1>Index of ").append(path).append("</h1><pre>\n");

  if (includeDotDot) {
    x.append("<a href='../'>../</a>\t-\t-\t-\n");
  }

  // fix me: we might want to sort list here! but not now.

  // Guard against a null array so an unreadable directory yields an empty
  // index instead of a NullPointerException.
  if (list != null) {
    for (java.io.File f : list) {
      String name = f.getName();
      String time = HttpDateFormat.toString(f.lastModified());
      if (f.isDirectory()) {
        // listFiles() does not return the directory itself or its parent,
        // so "." and ".." need no special handling here.
        x.append("<a href='").append(name).append("/'>")
            .append(name).append("/</a>\t");
        x.append(time).append("\t-\n");
      } else if (f.isFile()) {
        x.append("<a href='").append(name).append("'>")
            .append(name).append("</a>\t");
        x.append(time).append("\t").append(f.length()).append("\n");
      }
      // ignore any other kind of entry
    }
  }

  x.append("</pre></body></html>\n");

  return x.toString().getBytes();
}

Source File: FileResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Builds a response for a {@code file:} URL by inspecting the local file
 * system and setting the status code, headers and content accordingly:
 * 404 if the path does not exist, 401 if it is not readable, 300 with a
 * Location header if the path differs from its canonical form, 304 if the
 * file is not newer than the datum's modified time, 200 (via the dir/file
 * helpers) for directories and regular files, and 500 for anything else.
 *
 * @param url   the URL to fetch; must use the "file" protocol
 * @param datum crawl datum whose modified time is compared with the file's
 * @param file  the owning protocol instance
 * @param conf  configuration, also passed to {@code MimeUtil}
 * @throws FileException if {@code url} is not a file URL
 * @throws IOException   if resolving the canonical path or reading fails
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;

  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol()))
    throw new FileException("Not a file url:" + url);

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // Compare by value: '!=' on Strings only tests reference identity, so the
  // warning could fire (or not) regardless of the actual contents.
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ignored) {
    // UTF-8 is a charset every Java platform must support, so this should
    // not happen; if it does, continue with the undecoded path.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Going through File.toURI() before toURL() escapes characters that
    // are illegal in URLs.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }

  // Not modified since the time recorded in the datum.
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}

Source File: FileResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Fills this response for a regular file: reads up to
 * {@code file.maxContentLength} bytes (all of it when that limit is
 * negative) into the content and sets the Content-Length, Last-Modified
 * and Content-Type headers plus a 200 status code. Content-Length carries
 * the full file size even when the stored content is truncated.
 *
 * @param f the file to read
 * @throws FileException if the file is larger than Integer.MAX_VALUE bytes
 * @throws IOException   if opening, reading or closing the file fails
 */
private void getFileAsHttpResponse(java.io.File f) throws FileException,
    IOException {

  // ignore file of size larger than
  // Integer.MAX_VALUE = 2^31-1 = 2147483647
  long size = f.length();
  if (size > Integer.MAX_VALUE) {
    throw new FileException("file is too large, size: " + size);
    // or we can do this?
    // this.code = 400; // http Bad request
    // return;
  }

  // capture content
  int len = (int) size;

  if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
    len = this.file.maxContentLength;

  this.content = new byte[len];

  java.io.InputStream is = new java.io.FileInputStream(f);
  try {
    int offset = 0;
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
    if (offset < len) { // keep whatever already have, but issue a warning
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("not enough bytes read from file: " + f.getPath());
      }
    }
  } finally {
    // Always release the file handle, even when read() throws.
    is.close();
  }

  // set headers
  headers.set(Response.CONTENT_LENGTH, Long.toString(size));
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  String mimeType = MIME.getMimeType(f);

  headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

  // response code
  this.code = 200; // http OK
}

Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0

4 votes

/**
 * Converts a date string to a time value. The string is first handed to
 * {@code HttpDateFormat.toLong}; if that reports a ParseException, a list
 * of alternative date patterns is tried through {@code DateUtils.parseDate}.
 *
 * @param date the date string to parse
 * @param url  the URL the date belongs to; used only in the warning message
 * @return the parsed time, or -1 if no format matched
 */
private long getTime(String date, String url) {
  try {
    return HttpDateFormat.toLong(date);
  } catch (ParseException e) {
    // not a standard HTTP date; fall through to the alternative formats
  }

  final String[] fallbackFormats = {
      "EEE MMM dd HH:mm:ss yyyy",
      "EEE MMM dd HH:mm:ss yyyy zzz",
      "EEE MMM dd HH:mm:ss zzz yyyy",
      "EEE, MMM dd HH:mm:ss yyyy zzz",
      "EEE, dd MMM yyyy HH:mm:ss zzz",
      "EEE,dd MMM yyyy HH:mm:ss zzz",
      "EEE, dd MMM yyyy HH:mm:sszzz",
      "EEE, dd MMM yyyy HH:mm:ss",
      "EEE, dd-MMM-yy HH:mm:ss zzz",
      "yyyy/MM/dd HH:mm:ss.SSS zzz",
      "yyyy/MM/dd HH:mm:ss.SSS",
      "yyyy/MM/dd HH:mm:ss zzz",
      "yyyy/MM/dd",
      "yyyy.MM.dd HH:mm:ss",
      "yyyy-MM-dd HH:mm",
      "MMM dd yyyy HH:mm:ss. zzz",
      "MMM dd yyyy HH:mm:ss zzz",
      "dd.MM.yyyy HH:mm:ss zzz",
      "dd MM yyyy HH:mm:ss zzz",
      "dd.MM.yyyy; HH:mm:ss",
      "dd.MM.yyyy HH:mm:ss",
      "dd.MM.yyyy zzz",
      "yyyy-MM-dd'T'HH:mm:ss'Z'"
  };

  try {
    Date parsed = DateUtils.parseDate(date, fallbackFormats);
    return parsed.getTime();
  } catch (Exception e2) {
    // none of the alternative formats matched either
    if (LOG.isWarnEnabled()) {
      LOG.warn(url + ": can't parse erroneous date: " + date);
    }
    return -1;
  }
}

Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Builds a response for a {@code file:} URL by inspecting the local file
 * system and setting the status code, headers and content accordingly:
 * 404 if the path does not exist, 401 if it is not readable, 300 with a
 * Location header if the path differs from its canonical form, 304 if the
 * file is not newer than the datum's modified time, 200 (via the dir/file
 * helpers) for directories and regular files, and 500 for anything else.
 *
 * @param url   the URL to fetch; must use the "file" protocol
 * @param datum crawl datum whose modified time is compared with the file's
 * @param file  the owning protocol instance
 * @param conf  configuration, also passed to {@code MimeUtil}
 * @throws FileException if {@code url} is not a file URL
 * @throws IOException   if resolving the canonical path or reading fails
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;

  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol()))
    throw new FileException("Not a file url:" + url);

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // Compare by value: '!=' on Strings only tests reference identity, so the
  // warning could fire (or not) regardless of the actual contents.
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ignored) {
    // UTF-8 is a charset every Java platform must support, so this should
    // not happen; if it does, continue with the undecoded path.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Going through File.toURI() before toURL() escapes characters that
    // are illegal in URLs.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }

  // Not modified since the time recorded in the datum.
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}

Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Fills this response for a regular file: reads up to
 * {@code file.maxContentLength} bytes (all of it when that limit is
 * negative) into the content and sets the Content-Length, Last-Modified
 * and Content-Type headers plus a 200 status code. Content-Length carries
 * the full file size even when the stored content is truncated.
 *
 * @param f the file to read
 * @throws FileException if the file is larger than Integer.MAX_VALUE bytes
 * @throws IOException   if opening, reading or closing the file fails
 */
private void getFileAsHttpResponse(java.io.File f) throws FileException,
    IOException {

  // ignore file of size larger than
  // Integer.MAX_VALUE = 2^31-1 = 2147483647
  long size = f.length();
  if (size > Integer.MAX_VALUE) {
    throw new FileException("file is too large, size: " + size);
    // or we can do this?
    // this.code = 400; // http Bad request
    // return;
  }

  // capture content
  int len = (int) size;

  if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength)
    len = this.file.maxContentLength;

  this.content = new byte[len];

  java.io.InputStream is = new java.io.FileInputStream(f);
  try {
    int offset = 0;
    int n = 0;
    while (offset < len
        && (n = is.read(this.content, offset, len - offset)) >= 0) {
      offset += n;
    }
    if (offset < len) { // keep whatever already have, but issue a warning
      if (File.LOG.isWarnEnabled()) {
        File.LOG.warn("not enough bytes read from file: " + f.getPath());
      }
    }
  } finally {
    // Always release the file handle, even when read() throws.
    is.close();
  }

  // set headers
  headers.set(Response.CONTENT_LENGTH, Long.toString(size));
  headers.set(Response.LAST_MODIFIED,
      HttpDateFormat.toString(f.lastModified()));

  String mimeType = tika.detect(f);

  headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");

  // response code
  this.code = 200; // http OK
}

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Converts a date string to a time value. The string is first handed to
 * {@code HttpDateFormat.toLong}; if that reports a ParseException, a list
 * of alternative date patterns is tried through {@code DateUtils.parseDate}.
 *
 * @param date the date string to parse
 * @param url  the URL the date belongs to; used only in the warning message
 * @return the parsed time, or -1 if no format matched
 */
private long getTime(String date, String url) {
  try {
    return HttpDateFormat.toLong(date);
  } catch (ParseException e) {
    // not a standard HTTP date; fall through to the alternative formats
  }

  final String[] fallbackFormats = {
      "EEE MMM dd HH:mm:ss yyyy",
      "EEE MMM dd HH:mm:ss yyyy zzz",
      "EEE MMM dd HH:mm:ss zzz yyyy",
      "EEE, MMM dd HH:mm:ss yyyy zzz",
      "EEE, dd MMM yyyy HH:mm:ss zzz",
      "EEE,dd MMM yyyy HH:mm:ss zzz",
      "EEE, dd MMM yyyy HH:mm:sszzz",
      "EEE, dd MMM yyyy HH:mm:ss",
      "EEE, dd-MMM-yy HH:mm:ss zzz",
      "yyyy/MM/dd HH:mm:ss.SSS zzz",
      "yyyy/MM/dd HH:mm:ss.SSS",
      "yyyy/MM/dd HH:mm:ss zzz",
      "yyyy/MM/dd",
      "yyyy.MM.dd HH:mm:ss",
      "yyyy-MM-dd HH:mm",
      "MMM dd yyyy HH:mm:ss. zzz",
      "MMM dd yyyy HH:mm:ss zzz",
      "dd.MM.yyyy HH:mm:ss zzz",
      "dd MM yyyy HH:mm:ss zzz",
      "dd.MM.yyyy; HH:mm:ss",
      "dd.MM.yyyy HH:mm:ss",
      "dd.MM.yyyy zzz",
      "yyyy-MM-dd'T'HH:mm:ss'Z'"
  };

  try {
    Date parsed = DateUtils.parseDate(date, fallbackFormats);
    return parsed.getTime();
  } catch (Exception e2) {
    // none of the alternative formats matched either
    if (LOG.isWarnEnabled()) {
      LOG.warn(url + ": can't parse erroneous date: " + date);
    }
    return -1;
  }
}

org.apache.nutch.net.protocols.HttpDateFormat Java Examples