Java Code Examples for org.apache.nutch.crawl.CrawlDatum#getModifiedTime()

The following examples show how to use org.apache.nutch.crawl.CrawlDatum#getModifiedTime(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds time-related fields to the document.
 *
 * <p>If the parse metadata carries a Last-Modified value and it yields a
 * usable timestamp, that timestamp is added as the "lastModified" field and
 * is also used for the "date" field. Otherwise the "date" field falls back to
 * the modified time stored in the CrawlDatum and, when that is not positive,
 * to the CrawlDatum's fetch time.
 *
 * @param doc   document that receives the fields
 * @param data  parse data whose metadata is read for Last-Modified
 * @param url   URL of the page; only passed through to getTime
 * @param datum crawl datum supplying the fallback times
 * @return the same {@code doc} instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long time = -1;                               // -1 marks "no usable time yet"

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                   // try to parse last-modified
    time = getTime(lastModified, url);
    // NOTE(review): assumes getTime reports an unparsable value as -1, the
    // same sentinel tested below -- confirm against getTime. Only index the
    // field when a real timestamp came back, so a bogus Date(-1) is not stored.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {                             // header missing or unusable
    time = datum.getModifiedTime();             // use value in CrawlDatum
    if (time <= 0) {                            // if also unset
      time = datum.getFetchTime();              // use the time of the fetch
    }
  }

  doc.add("date", new Date(time));
  return doc;
}

Example 2

Source File: FileResponse.java From anthelion with Apache License 2.0

4 votes

/**
 * Builds a response for a {@code file:} URL, setting an HTTP-like status code.
 *
 * <p>Outcomes, in the order they are checked:
 * <ul>
 *   <li>404 when the decoded path does not exist;</li>
 *   <li>401 when it exists but cannot be read;</li>
 *   <li>300 with a Location header when the path differs from its canonical
 *       form (e.g. symbolic link or relative path);</li>
 *   <li>304 with a Last-Modified header when the file's last-modified time is
 *       not newer than {@code datum.getModifiedTime()};</li>
 *   <li>otherwise the directory or file handler is invoked, or 500 is set
 *       when the path is neither a directory nor a regular file.</li>
 * </ul>
 *
 * @param url   URL to fetch; its protocol must be "file"
 * @param datum crawl datum whose modified time drives the 304 check
 * @param file  protocol instance stored on this response
 * @param conf  configuration stored on this response and passed to MimeUtil
 * @throws FileException if {@code url} does not use the "file" protocol
 * @throws IOException   if resolving the canonical file or a handler fails
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;
  
  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol())) {
    throw new FileException("Not a file url:" + url);
  }

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // Compare contents, not references: '!=' on Strings only tests identity.
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ignored) {
    // UTF-8 is a charset every Java platform must support, so this cannot
    // happen; the undecoded path is kept if it somehow does.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Go through toURI() first so characters that are illegal in URLs are
    // escaped before the value is placed in the Location header.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }

  // Not modified since the time recorded in the crawl datum.
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}

Example 3

Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0

4 votes

/**
 * Builds a response for a {@code file:} URL, setting an HTTP-like status code.
 *
 * <p>Outcomes, in the order they are checked:
 * <ul>
 *   <li>404 when the decoded path does not exist;</li>
 *   <li>401 when it exists but cannot be read;</li>
 *   <li>300 with a Location header when the path differs from its canonical
 *       form (e.g. symbolic link or relative path);</li>
 *   <li>304 with a Last-Modified header when the file's last-modified time is
 *       not newer than {@code datum.getModifiedTime()};</li>
 *   <li>otherwise the directory or file handler is invoked, or 500 is set
 *       when the path is neither a directory nor a regular file.</li>
 * </ul>
 *
 * @param url   URL to fetch; its protocol must be "file"
 * @param datum crawl datum whose modified time drives the 304 check
 * @param file  protocol instance stored on this response
 * @param conf  configuration stored on this response and passed to MimeUtil
 * @throws FileException if {@code url} does not use the "file" protocol
 * @throws IOException   if resolving the canonical file or a handler fails
 */
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
  throws FileException, IOException {

  this.orig = url.toString();
  this.base = url.toString();
  this.file = file;
  this.conf = conf;
  
  MIME = new MimeUtil(conf);
  tika = new Tika();

  if (!"file".equals(url.getProtocol())) {
    throw new FileException("Not a file url:" + url);
  }

  if (File.LOG.isTraceEnabled()) {
    File.LOG.trace("fetching " + url);
  }

  // Compare contents, not references: '!=' on Strings only tests identity.
  if (!url.getPath().equals(url.getFile())) {
    if (File.LOG.isWarnEnabled()) {
      File.LOG.warn("url.getPath() != url.getFile(): " + url);
    }
  }

  String path = "".equals(url.getPath()) ? "/" : url.getPath();

  try {
    // specify the encoding via the config later?
    path = java.net.URLDecoder.decode(path, "UTF-8");
  } catch (UnsupportedEncodingException ignored) {
    // UTF-8 is a charset every Java platform must support, so this cannot
    // happen; the undecoded path is kept if it somehow does.
  }

  this.content = null;

  // url.toURI() is only in j2se 1.5.0
  //java.io.File f = new java.io.File(url.toURI());
  java.io.File f = new java.io.File(path);

  if (!f.exists()) {
    this.code = 404;  // http Not Found
    return;
  }

  if (!f.canRead()) {
    this.code = 401;  // http Unauthorized
    return;
  }

  // symbolic link or relative path on unix
  // fix me: what's the consequence on windows platform
  // where case is insensitive
  if (!f.equals(f.getCanonicalFile())) {
    // Go through toURI() first so characters that are illegal in URLs are
    // escaped before the value is placed in the Location header.
    headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());

    this.code = 300;  // http redirect
    return;
  }

  // Not modified since the time recorded in the crawl datum.
  if (f.lastModified() <= datum.getModifiedTime()) {
    this.code = 304;
    this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified()));
    return;
  }

  if (f.isDirectory()) {
    getDirAsHttpResponse(f);
  } else if (f.isFile()) {
    getFileAsHttpResponse(f);
  } else {
    this.code = 500; // http Internal Server Error
  }

}