Java Code Examples for org.apache.nutch.crawl.CrawlDatum#STATUS_DB_FETCHED
The following examples show how to use org.apache.nutch.crawl.CrawlDatum#STATUS_DB_FETCHED.
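All of the usages below follow the same basic pattern: read a CrawlDatum from the CrawlDb and compare its status against STATUS_DB_FETCHED (usually together with STATUS_DB_NOTMODIFIED) to decide whether the URL's content has already been fetched. Here is a minimal sketch of that check, assuming nothing beyond the CrawlDatum API itself; the class and method names below are illustrative, not part of Nutch.

import org.apache.nutch.crawl.CrawlDatum;

// Illustrative helper, not part of Nutch.
public class CrawlDbStatusCheck {

  /** True when the CrawlDb entry's content has been fetched at least once. */
  public static boolean hasBeenFetched(CrawlDatum datum) {
    byte status = datum.getStatus();
    return status == CrawlDatum.STATUS_DB_FETCHED
        || status == CrawlDatum.STATUS_DB_NOTMODIFIED;
  }
}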
Example 1
Source File: DeduplicationJob.java from nutch-htmlunit (Apache License 2.0)
@Override
public void map(Text key, CrawlDatum value,
    OutputCollector<BytesWritable, CrawlDatum> output, Reporter reporter)
    throws IOException {
  if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || value.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
    // || value.getStatus() == CrawlDatum.STATUS_DB_GONE) {
    byte[] signature = value.getSignature();
    if (signature == null) return;
    BytesWritable sig = new BytesWritable(signature);
    // add the URL as a temporary MD
    value.getMetaData().put(urlKey, key);
    // reduce on the signature
    output.collect(sig, value);
  }
}
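The mapper above keys every fetched datum by its content signature, so exact duplicates arrive in the same reduce call. The real DeduplicationJob reducer chooses a "best" datum per signature using score, fetch time and URL length; the sketch below condenses that to a score-only comparison and is not the project's exact code. It assumes the same old mapred API and the same urlKey metadata field used by the mapper.

// Simplified sketch of a reducer for the (signature, datum) pairs emitted above:
// keep the highest-scoring datum per signature, flag the others as duplicates.
public void reduce(BytesWritable signature, Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {
  CrawlDatum best = null;
  while (values.hasNext()) {
    CrawlDatum current = new CrawlDatum();
    current.set(values.next()); // copy: Hadoop reuses the value instance
    if (best == null) {
      best = current;
    } else if (current.getScore() > best.getScore()) {
      writeAsDuplicate(best, output);
      best = current;
    } else {
      writeAsDuplicate(current, output);
    }
  }
}

// Hypothetical helper: recover the URL stashed in the metadata by the mapper
// and emit the datum with its status changed to STATUS_DB_DUPLICATE.
private void writeAsDuplicate(CrawlDatum datum,
    OutputCollector<Text, CrawlDatum> output) throws IOException {
  Text url = (Text) datum.getMetaData().remove(urlKey);
  datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
  output.collect(url, datum);
}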
Example 2
Source File: DomainStatistics.java from anthelion (Apache License 2.0)
public void map(Text urlText, CrawlDatum datum, Context context)
    throws IOException, InterruptedException {
  if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
    try {
      URL url = new URL(urlText.toString());
      String out = null;
      switch (mode) {
        case MODE_HOST:
          out = url.getHost();
          break;
        case MODE_DOMAIN:
          out = URLUtil.getDomainName(url);
          break;
        case MODE_SUFFIX:
          out = URLUtil.getDomainSuffix(url).getDomain();
          break;
        case MODE_TLD:
          out = URLUtil.getTopLevelDomainName(url);
          break;
      }
      if (out.trim().equals("")) {
        LOG.info("url : " + url);
        context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
      }
      context.write(new Text(out), new LongWritable(1));
    } catch (Exception ex) {
      // malformed or unparsable URL: ignored, but still counted as fetched below
    }
    context.getCounter(MyCounter.FETCHED).increment(1);
    context.write(FETCHED_TEXT, new LongWritable(1));
  } else {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
  }
}
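Each matching record contributes a count of 1 per host/domain/suffix/TLD key, plus the FETCHED_TEXT/NOT_FETCHED_TEXT marker keys, so the aggregation step is a plain word-count style summation. The reducer below is an illustrative summing reducer in the same new-API style, not necessarily the exact reducer class shipped with DomainStatistics.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the LongWritable counts emitted by the mapper for each key.
public class DomainCountReducer
    extends Reducer<Text, LongWritable, Text, LongWritable> {

  @Override
  public void reduce(Text key, Iterable<LongWritable> values, Context context)
      throws IOException, InterruptedException {
    long total = 0;
    for (LongWritable count : values) {
      total += count.get();
    }
    context.write(key, new LongWritable(total));
  }
}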
Example 3
Source File: DomainStatistics.java from nutch-htmlunit (Apache License 2.0)
public void map(Text urlText, CrawlDatum datum, Context context)
    throws IOException, InterruptedException {
  if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
    try {
      URL url = new URL(urlText.toString());
      String out = null;
      switch (mode) {
        case MODE_HOST:
          out = url.getHost();
          break;
        case MODE_DOMAIN:
          out = URLUtil.getDomainName(url);
          break;
        case MODE_SUFFIX:
          out = URLUtil.getDomainSuffix(url).getDomain();
          break;
        case MODE_TLD:
          out = URLUtil.getTopLevelDomainName(url);
          break;
      }
      if (out.trim().equals("")) {
        LOG.info("url : " + url);
        context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
      }
      context.write(new Text(out), new LongWritable(1));
    } catch (Exception ex) {
      // malformed or unparsable URL: ignored, but still counted as fetched below
    }
    context.getCounter(MyCounter.FETCHED).increment(1);
    context.write(FETCHED_TEXT, new LongWritable(1));
  } else {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
  }
}
Example 4
Source File: ArcSegmentCreator.java from anthelion (Apache License 2.0)
/**
 * <p>Runs the Map job to translate an arc record into output for Nutch
 * segments.</p>
 *
 * @param key The arc record header.
 * @param bytes The arc record raw content bytes.
 * @param output The output collector.
 * @param reporter The progress reporter.
 */
public void map(Text key, BytesWritable bytes,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {
  String[] headers = key.toString().split("\\s+");
  String urlStr = headers[0];
  String version = headers[2];
  String contentType = headers[3];

  // arcs start with a file description. for now we ignore this as it is not
  // a content record
  if (urlStr.startsWith("filedesc://")) {
    LOG.info("Ignoring file header: " + urlStr);
    return;
  }
  LOG.info("Processing: " + urlStr);

  // get the raw bytes from the arc file, create a new crawldatum
  Text url = new Text();
  CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval, 1.0f);
  String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

  // normalize and filter the urls
  try {
    urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
    urlStr = urlFilters.filter(urlStr); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    urlStr = null;
  }

  // if still a good url then process
  if (urlStr != null) {
    url.set(urlStr);
    try {
      // set the protocol status to success and the crawl status to success
      // create the content from the normalized url and the raw bytes from
      // the arc file, TODO: currently this doesn't handle text of error
      // pages (i.e. 404, etc.). We assume we won't get those.
      ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
      Content content = new Content(urlStr, urlStr, bytes.getBytes(),
          contentType, new Metadata(), getConf());

      // set the url version into the metadata
      content.getMetadata().set(URL_VERSION, version);
      ParseStatus pstatus = null;
      pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
      reporter.progress();
    } catch (Throwable t) { // unexpected exception
      logError(url, t);
      output(output, segmentName, url, datum, null, null,
          CrawlDatum.STATUS_FETCH_RETRY);
    }
  }
}
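For the purposes of this page, the relevant line in both ArcSegmentCreator variants is the three-argument CrawlDatum constructor, which takes the status, the re-fetch interval in seconds, and an initial score. A standalone sketch of that constructor follows; the class name, main method and 30-day interval are illustrative only (the real job reads its interval from configuration).

import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumConstructorDemo {
  public static void main(String[] args) {
    int fetchIntervalSecs = 30 * 24 * 60 * 60; // example value: 30 days
    CrawlDatum datum =
        new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, fetchIntervalSecs, 1.0f);
    // prints the symbolic name of the status, e.g. db_fetched
    System.out.println(CrawlDatum.getStatusName(datum.getStatus()));
  }
}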
Example 5
Source File: ArcSegmentCreator.java from nutch-htmlunit (Apache License 2.0)
/**
 * <p>Runs the Map job to translate an arc record into output for Nutch
 * segments.</p>
 *
 * @param key The arc record header.
 * @param bytes The arc record raw content bytes.
 * @param output The output collector.
 * @param reporter The progress reporter.
 */
public void map(Text key, BytesWritable bytes,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {
  String[] headers = key.toString().split("\\s+");
  String urlStr = headers[0];
  String version = headers[2];
  String contentType = headers[3];

  // arcs start with a file description. for now we ignore this as it is not
  // a content record
  if (urlStr.startsWith("filedesc://")) {
    LOG.info("Ignoring file header: " + urlStr);
    return;
  }
  LOG.info("Processing: " + urlStr);

  // get the raw bytes from the arc file, create a new crawldatum
  Text url = new Text();
  CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval, 1.0f);
  String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

  // normalize and filter the urls
  try {
    urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
    urlStr = urlFilters.filter(urlStr); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    urlStr = null;
  }

  // if still a good url then process
  if (urlStr != null) {
    url.set(urlStr);
    try {
      // set the protocol status to success and the crawl status to success
      // create the content from the normalized url and the raw bytes from
      // the arc file, TODO: currently this doesn't handle text of error
      // pages (i.e. 404, etc.). We assume we won't get those.
      ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
      Content content = new Content(urlStr, urlStr, bytes.getBytes(),
          contentType, new Metadata(), getConf());

      // set the url version into the metadata
      content.getMetadata().set(URL_VERSION, version);
      ParseStatus pstatus = null;
      pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
      reporter.progress();
    } catch (Throwable t) { // unexpected exception
      logError(url, t);
      output(output, segmentName, url, datum, null, null,
          CrawlDatum.STATUS_FETCH_RETRY);
    }
  }
}