Java Code Examples for org.apache.nutch.crawl.CrawlDatum#getStatus()
The following examples show how to use
org.apache.nutch.crawl.CrawlDatum#getStatus().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DeduplicationJob.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
@Override public void map(Text key, CrawlDatum value, OutputCollector<BytesWritable, CrawlDatum> output, Reporter reporter) throws IOException { if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED || value.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { // || value.getStatus() ==CrawlDatum.STATUS_DB_GONE){ byte[] signature = value.getSignature(); if (signature == null) return; BytesWritable sig = new BytesWritable(signature); // add the URL as a temporary MD value.getMetaData().put(urlKey, key); // reduce on the signature output.collect(sig, value); } }
Example 2
Source File: DeduplicationJob.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { boolean duplicateSet = false; while (values.hasNext()) { CrawlDatum val = values.next(); if (val.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { duplicate.set(val); duplicateSet = true; } else { old.set(val); } } // keep the duplicate if there is one if (duplicateSet) { output.collect(key, duplicate); return; } // no duplicate? keep old one then output.collect(key, old); }
Example 3
Source File: DomainStatistics.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Counts one crawl entry, both by fetch state and (for fetched or
 * not-modified entries) by the URL part selected through {@code mode}:
 * host, domain, domain suffix or top-level domain.
 *
 * <p>Deriving the URL part is best effort: if the URL cannot be parsed or
 * the part cannot be determined, the failure is logged and only the
 * fetched counter/record is written for this entry. Failures while writing
 * to the context are not suppressed; they propagate to the caller.
 *
 * @param urlText the URL of the entry
 * @param datum   the crawl entry whose status decides how it is counted
 * @param context receives the counters and the (key, 1) records
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(Text urlText, CrawlDatum datum, Context context)
    throws IOException, InterruptedException {
  if (datum.getStatus() != CrawlDatum.STATUS_DB_FETCHED
      && datum.getStatus() != CrawlDatum.STATUS_DB_NOTMODIFIED) {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
    return;
  }

  String out = null;
  try {
    URL url = new URL(urlText.toString());
    switch (mode) {
      case MODE_HOST:
        out = url.getHost();
        break;
      case MODE_DOMAIN:
        out = URLUtil.getDomainName(url);
        break;
      case MODE_SUFFIX:
        out = URLUtil.getDomainSuffix(url).getDomain();
        break;
      case MODE_TLD:
        out = URLUtil.getTopLevelDomainName(url);
        break;
      default:
        // Unknown mode: there is nothing to derive, out stays null.
        break;
    }
    if (out != null && out.trim().equals("")) {
      LOG.info("url : " + url);
      context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
    }
  } catch (Exception ex) {
    // Only the derivation of the key is best effort; report why it failed
    // instead of hiding it, and fall through to the fetched bookkeeping.
    LOG.warn("Could not derive statistics key for url : " + urlText, ex);
  }

  // Written outside the try block so that I/O failures and interruption
  // reach the caller instead of being swallowed.
  if (out != null) {
    context.write(new Text(out), new LongWritable(1));
  }

  context.getCounter(MyCounter.FETCHED).increment(1);
  context.write(FETCHED_TEXT, new LongWritable(1));
}
Example 4
Source File: SolrClean.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Emits the URL of every entry whose status is {@code STATUS_DB_GONE},
 * using the shared {@code OUT} key; all other entries produce no output.
 *
 * @param key      the URL of the entry
 * @param value    the entry whose status is inspected
 * @param output   receives the ({@code OUT}, URL) pair
 * @param reporter unused
 * @throws IOException if collecting the output fails
 */
@Override
public void map(Text key, CrawlDatum value,
    OutputCollector<ByteWritable, Text> output, Reporter reporter)
    throws IOException {
  final boolean gone = value.getStatus() == CrawlDatum.STATUS_DB_GONE;
  if (!gone) {
    return;
  }
  output.collect(OUT, key);
}
Example 5
Source File: DomainStatistics.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Counts one crawl entry, both by fetch state and (for fetched or
 * not-modified entries) by the URL part selected through {@code mode}:
 * host, domain, domain suffix or top-level domain.
 *
 * <p>Deriving the URL part is best effort: if the URL cannot be parsed or
 * the part cannot be determined, the failure is logged and only the
 * fetched counter/record is written for this entry. Failures while writing
 * to the context are not suppressed; they propagate to the caller.
 *
 * @param urlText the URL of the entry
 * @param datum   the crawl entry whose status decides how it is counted
 * @param context receives the counters and the (key, 1) records
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(Text urlText, CrawlDatum datum, Context context)
    throws IOException, InterruptedException {
  if (datum.getStatus() != CrawlDatum.STATUS_DB_FETCHED
      && datum.getStatus() != CrawlDatum.STATUS_DB_NOTMODIFIED) {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
    return;
  }

  String out = null;
  try {
    URL url = new URL(urlText.toString());
    switch (mode) {
      case MODE_HOST:
        out = url.getHost();
        break;
      case MODE_DOMAIN:
        out = URLUtil.getDomainName(url);
        break;
      case MODE_SUFFIX:
        out = URLUtil.getDomainSuffix(url).getDomain();
        break;
      case MODE_TLD:
        out = URLUtil.getTopLevelDomainName(url);
        break;
      default:
        // Unknown mode: there is nothing to derive, out stays null.
        break;
    }
    if (out != null && out.trim().equals("")) {
      LOG.info("url : " + url);
      context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
    }
  } catch (Exception ex) {
    // Only the derivation of the key is best effort; report why it failed
    // instead of hiding it, and fall through to the fetched bookkeeping.
    LOG.warn("Could not derive statistics key for url : " + urlText, ex);
  }

  // Written outside the try block so that I/O failures and interruption
  // reach the caller instead of being swallowed.
  if (out != null) {
    context.write(new Text(out), new LongWritable(1));
  }

  context.getCounter(MyCounter.FETCHED).increment(1);
  context.write(FETCHED_TEXT, new LongWritable(1));
}
Example 6
Source File: CleaningJob.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Emits the URL of every entry whose status is {@code STATUS_DB_GONE} or
 * {@code STATUS_DB_DUPLICATE}, using the shared {@code OUT} key; all other
 * entries produce no output.
 *
 * @param key      the URL of the entry
 * @param value    the entry whose status is inspected
 * @param output   receives the ({@code OUT}, URL) pair
 * @param reporter unused
 * @throws IOException if collecting the output fails
 */
@Override
public void map(Text key, CrawlDatum value,
    OutputCollector<ByteWritable, Text> output, Reporter reporter)
    throws IOException {
  final boolean removable =
      value.getStatus() == CrawlDatum.STATUS_DB_GONE
          || value.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE;
  if (!removable) {
    return;
  }
  output.collect(OUT, key);
}
Example 7
Source File: TestSegmentMergerCrawlDatums.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** * Checks the merged segment and removes the stuff again. * * @param the test directory * @param the merged segment * @return the final status */ protected byte checkMergedSegment(Path testDir, Path mergedSegment) throws Exception { // Get a MapFile reader for the <Text,CrawlDatum> pairs MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path(mergedSegment, CrawlDatum.FETCH_DIR_NAME), conf); Text key = new Text(); CrawlDatum value = new CrawlDatum(); byte finalStatus = 0x0; for (MapFile.Reader reader : readers) { while (reader.next(key, value)) { LOG.info("Reading status for: " + key.toString() + " > " + CrawlDatum.getStatusName(value.getStatus())); // Only consider fetch status if (CrawlDatum.hasFetchStatus(value) && key.toString().equals("http://nutch.apache.org/")) { finalStatus = value.getStatus(); } } // Close the reader again reader.close(); } // Remove the test directory again fs.delete(testDir, true); LOG.info("Final fetch status for: http://nutch.apache.org/ > " + CrawlDatum.getStatusName(finalStatus)); // Return the final status return finalStatus; }