Java Code Examples for org.apache.nutch.crawl.CrawlDatum#getModifiedTime()
The following examples show how to use
org.apache.nutch.crawl.CrawlDatum#getModifiedTime().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) { long time = -1; String lastModified = data.getMeta(Metadata.LAST_MODIFIED); if (lastModified != null) { // try parse last-modified time = getTime(lastModified,url); // use as time // store as string doc.add("lastModified", new Date(time)); } if (time == -1) { // if no last-modified specified in HTTP header time = datum.getModifiedTime(); // use value in CrawlDatum if (time <= 0) { // if also unset time = datum.getFetchTime(); // use time the fetch took place (fetchTime of fetchDatum) } } // un-stored, indexed and un-tokenized doc.add("date", new Date(time)); return doc; }
Example 2
Source File: FileResponse.java From anthelion with Apache License 2.0 | 4 votes |
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; MIME = new MimeUtil(conf); tika = new Tika(); if (!"file".equals(url.getProtocol())) throw new FileException("Not a file url:" + url); if (File.LOG.isTraceEnabled()) { File.LOG.trace("fetching " + url); } if (url.getPath() != url.getFile()) { if (File.LOG.isWarnEnabled()) { File.LOG.warn("url.getPath() != url.getFile(): " + url); } } String path = "".equals(url.getPath()) ? "/" : url.getPath(); try { // specify the encoding via the config later? path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } try { this.content = null; // url.toURI() is only in j2se 1.5.0 //java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { this.code = 404; // http Not Found return; } if (!f.canRead()) { this.code = 401; // http Unauthorized return; } // symbolic link or relative path on unix // fix me: what's the consequence on windows platform // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers //hdrs.put("Location", f.getCanonicalFile().toURI()); // // we want to automatically escape characters that are illegal in URLs. // It is recommended that new code convert an abstract pathname into a URL // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. 
headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); return; } if (f.isDirectory()) { getDirAsHttpResponse(f); } else if (f.isFile()) { getFileAsHttpResponse(f); } else { this.code = 500; // http Internal Server Error return; } } catch (IOException e) { throw e; } }
Example 3
Source File: FileResponse.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
/** * Default public constructor * @param url * @param datum * @param file * @param conf * @throws FileException * @throws IOException */ public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) throws FileException, IOException { this.orig = url.toString(); this.base = url.toString(); this.file = file; this.conf = conf; MIME = new MimeUtil(conf); tika = new Tika(); if (!"file".equals(url.getProtocol())) throw new FileException("Not a file url:" + url); if (File.LOG.isTraceEnabled()) { File.LOG.trace("fetching " + url); } if (url.getPath() != url.getFile()) { if (File.LOG.isWarnEnabled()) { File.LOG.warn("url.getPath() != url.getFile(): " + url); } } String path = "".equals(url.getPath()) ? "/" : url.getPath(); try { // specify the encoding via the config later? path = java.net.URLDecoder.decode(path, "UTF-8"); } catch (UnsupportedEncodingException ex) { } try { this.content = null; // url.toURI() is only in j2se 1.5.0 //java.io.File f = new java.io.File(url.toURI()); java.io.File f = new java.io.File(path); if (!f.exists()) { this.code = 404; // http Not Found return; } if (!f.canRead()) { this.code = 401; // http Unauthorized return; } // symbolic link or relative path on unix // fix me: what's the consequence on windows platform // where case is insensitive if (!f.equals(f.getCanonicalFile())) { // set headers //hdrs.put("Location", f.getCanonicalFile().toURI()); // // we want to automatically escape characters that are illegal in URLs. // It is recommended that new code convert an abstract pathname into a URL // by first converting it into a URI, via the toURI method, and then // converting the URI into a URL via the URI.toURL method. 
headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString()); this.code = 300; // http redirect return; } if (f.lastModified() <= datum.getModifiedTime()) { this.code = 304; this.headers.set("Last-Modified", HttpDateFormat.toString(f.lastModified())); return; } if (f.isDirectory()) { getDirAsHttpResponse(f); } else if (f.isFile()) { getFileAsHttpResponse(f); } else { this.code = 500; // http Internal Server Error return; } } catch (IOException e) { throw e; } }