Java Code Examples for org.apache.nutch.crawl.CrawlDatum#getFetchTime()
The following examples show how to use org.apache.nutch.crawl.CrawlDatum#getFetchTime().
Each example is taken from an open-source project; the source file, originating project, and license are noted above the snippet.
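getFetchTime() returns the time at which a CrawlDatum is next scheduled to be fetched, as epoch milliseconds. As a minimal sketch of the basic read-and-compare usage (the constructor arguments and values here are illustrative assumptions, not taken from the projects below):

import java.util.Date;
import org.apache.nutch.crawl.CrawlDatum;

public class FetchTimeSketch {
  public static void main(String[] args) {
    // Hypothetical datum: unfetched entry with a 30-day fetch interval (in seconds).
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 3600 * 24 * 30);
    datum.setFetchTime(System.currentTimeMillis());

    long fetchTime = datum.getFetchTime(); // epoch milliseconds
    if (fetchTime <= System.currentTimeMillis()) {
      System.out.println("Due now or earlier: " + new Date(fetchTime));
    } else {
      System.out.println("Scheduled for: " + new Date(fetchTime));
    }
  }
}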
Example 1
Source File: MoreIndexingFilter.java from anthelion with Apache License 2.0
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                 // try parse last-modified
    time = getTime(lastModified, url);        // use as time
    // store as string
    doc.add("lastModified", new Date(time));
  }

  if (time == -1) {                           // if no last-modified
    time = datum.getFetchTime();              // use fetch time
  }

  // un-stored, indexed and un-tokenized
  doc.add("date", new Date(time));

  return doc;
}
Example 2
Source File: MoreIndexingFilter.java from nutch-htmlunit with Apache License 2.0
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                 // try parse last-modified
    time = getTime(lastModified, url);        // use as time
    // store as string
    doc.add("lastModified", new Date(time));
  }

  if (time == -1) {                           // if no last-modified specified in HTTP header
    time = datum.getModifiedTime();           // use value in CrawlDatum
    if (time <= 0) {                          // if also unset
      time = datum.getFetchTime();            // use time the fetch took place (fetchTime of fetchDatum)
    }
  }

  // un-stored, indexed and un-tokenized
  doc.add("date", new Date(time));

  return doc;
}
Example 3
Source File: MimeAdaptiveFetchSchedule.java from anthelion with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new MimeAdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

  // Set a default MIME-type to test with
  org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
  x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  p.setMetaData(x);

  p.setFetchTime(0);
  LOG.info(p.toString());

  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta)
        + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"
        + "\t missed " + miss);
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
          p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
          changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta)
          + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
Example 4
Source File: AbstractFetchSchedule.java from anthelion with Apache License 2.0
/**
 * This method provides information whether the page is suitable for
 * selection in the current fetchlist. NOTE: a true return value does not
 * guarantee that the page will be fetched, it just allows it to be
 * included in the further selection process based on scores. The default
 * implementation checks <code>fetchTime</code>: if it is higher than
 * <code>curTime</code> it returns false, and true otherwise. It will also
 * check that fetchTime is not too remote (more than <code>maxInterval</code>),
 * in which case it lowers the interval and returns true.
 *
 * @param url URL of the page.
 * @param datum datum instance.
 * @param curTime reference time (usually set to the time when the
 *        fetchlist generation process was started).
 * @return true, if the page should be considered for inclusion in the
 *         current fetchlist, otherwise false.
 */
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
  // pages are never truly GONE - we have to check them from time to time.
  // pages with too long fetchInterval are adjusted so that they fit within
  // maximum fetchInterval (segment retention period).
  if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
    if (datum.getFetchInterval() > maxInterval) {
      datum.setFetchInterval(maxInterval * 0.9f);
    }
    datum.setFetchTime(curTime);
  }
  if (datum.getFetchTime() > curTime) {
    return false; // not time yet
  }
  return true;
}
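For orientation, here is a minimal sketch of driving shouldFetch() directly, showing how the decision hinges on getFetchTime() relative to the reference time. The driver class and the chosen values are illustrative assumptions; DefaultFetchSchedule, CrawlDatum, and NutchConfiguration are the standard Nutch classes.

import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.DefaultFetchSchedule;
import org.apache.nutch.crawl.FetchSchedule;
import org.apache.nutch.util.NutchConfiguration;

public class ShouldFetchSketch {
  public static void main(String[] args) {
    // DefaultFetchSchedule inherits shouldFetch() from AbstractFetchSchedule.
    FetchSchedule schedule = new DefaultFetchSchedule();
    schedule.setConf(NutchConfiguration.create());

    // Hypothetical datum: 30-day interval, already due for fetching.
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 3600 * 24 * 30);
    datum.setFetchTime(System.currentTimeMillis() - 1000L); // due one second ago

    long curTime = System.currentTimeMillis();
    boolean due = schedule.shouldFetch(new Text("http://www.example.com/"), datum, curTime);
    System.out.println("due = " + due); // true here: fetchTime <= curTime
  }
}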
Example 5
Source File: AdaptiveFetchSchedule.java from anthelion with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new AdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  p.setFetchTime(0);
  LOG.info(p.toString());

  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta)
        + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"
        + "\t missed " + miss);
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
          p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
          changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta)
          + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
Example 6
Source File: MimeAdaptiveFetchSchedule.java from nutch-htmlunit with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new MimeAdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);

  // Set a default MIME-type to test with
  org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
  x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  p.setMetaData(x);

  p.setFetchTime(0);
  LOG.info(p.toString());

  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta)
        + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"
        + "\t missed " + miss);
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
          p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
          changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta)
          + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
Example 7
Source File: AbstractFetchSchedule.java from nutch-htmlunit with Apache License 2.0
/**
 * This method provides information whether the page is suitable for
 * selection in the current fetchlist. NOTE: a true return value does not
 * guarantee that the page will be fetched, it just allows it to be
 * included in the further selection process based on scores. The default
 * implementation checks <code>fetchTime</code>: if it is higher than
 * <code>curTime</code> it returns false, and true otherwise. It will also
 * check that fetchTime is not too remote (more than <code>maxInterval</code>),
 * in which case it lowers the interval and returns true.
 *
 * @param url URL of the page.
 * @param datum datum instance.
 * @param curTime reference time (usually set to the time when the
 *        fetchlist generation process was started).
 * @return true, if the page should be considered for inclusion in the
 *         current fetchlist, otherwise false.
 */
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
  // pages are never truly GONE - we have to check them from time to time.
  // pages with too long fetchInterval are adjusted so that they fit within
  // maximum fetchInterval (segment retention period).
  if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
    if (datum.getFetchInterval() > maxInterval) {
      datum.setFetchInterval(maxInterval * 0.9f);
    }
    datum.setFetchTime(curTime);
  }
  if (datum.getFetchTime() > curTime) {
    return false; // not time yet
  }
  return true;
}
Example 8
Source File: AdaptiveFetchSchedule.java from nutch-htmlunit with Apache License 2.0
public static void main(String[] args) throws Exception {
  FetchSchedule fs = new AdaptiveFetchSchedule();
  fs.setConf(NutchConfiguration.create());
  // we start the time at 0, for simplicity
  long curTime = 0;
  long delta = 1000L * 3600L * 24L; // 1 day
  // we trigger the update of the page every 30 days
  long update = 1000L * 3600L * 24L * 30L; // 30 days
  boolean changed = true;
  long lastModified = 0;
  int miss = 0;
  int totalMiss = 0;
  int maxMiss = 0;
  int fetchCnt = 0;
  int changeCnt = 0;

  // initial fetchInterval is 30 days
  CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  p.setFetchTime(0);
  LOG.info(p.toString());

  // let's move the timeline a couple of deltas
  for (int i = 0; i < 10000; i++) {
    if (lastModified + update < curTime) {
      //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
      changed = true;
      changeCnt++;
      lastModified = curTime;
    }
    LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta)
        + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"
        + "\t missed " + miss);
    if (p.getFetchTime() <= curTime) {
      fetchCnt++;
      fs.setFetchSchedule(new Text("http://www.example.com"), p,
          p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
          changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
      LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta)
          + "\tinterval " + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
      if (!changed) miss++;
      if (miss > maxMiss) maxMiss = miss;
      changed = false;
      totalMiss += miss;
      miss = 0;
    }
    if (changed) miss++;
    curTime += delta;
  }
  LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
  LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
}
Example 9
Source File: AbstractFetchSchedule.java from anthelion with Apache License 2.0
/**
 * This method returns the last fetch time of the CrawlDatum, derived from
 * the next scheduled fetch time minus the fetch interval.
 *
 * @param datum the CrawlDatum whose last fetch time is calculated.
 * @return the date as a long (epoch milliseconds).
 */
public long calculateLastFetchTime(CrawlDatum datum) {
  return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
}
Example 10
Source File: AbstractFetchSchedule.java from nutch-htmlunit with Apache License 2.0
/**
 * This method returns the last fetch time of the CrawlDatum, derived from
 * the next scheduled fetch time minus the fetch interval.
 *
 * @param datum the CrawlDatum whose last fetch time is calculated.
 * @return the date as a long (epoch milliseconds).
 */
public long calculateLastFetchTime(CrawlDatum datum) {
  return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
}
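Examples 9 and 10 recover the previous fetch time by subtracting the fetch interval (stored in seconds) from getFetchTime() (stored in milliseconds). A small illustrative sketch of the same arithmetic on a hand-built datum (the values are made up):

import java.util.Date;
import org.apache.nutch.crawl.CrawlDatum;

public class LastFetchTimeSketch {
  public static void main(String[] args) {
    // Hypothetical datum: fetched successfully, 7-day interval, next fetch a week from now.
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 3600 * 24 * 7);
    datum.setFetchTime(System.currentTimeMillis() + 7L * 24 * 3600 * 1000);

    // Same formula as calculateLastFetchTime(): interval seconds -> milliseconds.
    long lastFetch = datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
    System.out.println("approximate last fetch: " + new Date(lastFetch)); // roughly now
  }
}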