Java Code Examples for org.apache.nutch.crawl.CrawlDatum#STATUS_DB_FETCHED
The following examples show how to use org.apache.nutch.crawl.CrawlDatum#STATUS_DB_FETCHED.
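All of the usages below follow the same basic pattern: read a CrawlDatum from the CrawlDb and compare its status against STATUS_DB_FETCHED (usually together with STATUS_DB_NOTMODIFIED) to decide whether the URL's content has already been fetched. Here is a minimal sketch of that check, assuming nothing beyond the CrawlDatum API itself; the class and method names below are illustrative, not part of Nutch.

import org.apache.nutch.crawl.CrawlDatum;

// Illustrative helper, not part of Nutch.
public class CrawlDbStatusCheck {

  /** True when the CrawlDb entry's content has been fetched at least once. */
  public static boolean hasBeenFetched(CrawlDatum datum) {
    byte status = datum.getStatus();
    return status == CrawlDatum.STATUS_DB_FETCHED
        || status == CrawlDatum.STATUS_DB_NOTMODIFIED;
  }
}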
Example 1
Source File: DeduplicationJob.java from nutch-htmlunit (Apache License 2.0)
@Override
public void map(Text key, CrawlDatum value,
    OutputCollector<BytesWritable, CrawlDatum> output, Reporter reporter)
    throws IOException {
  if (value.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || value.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
    // || value.getStatus() == CrawlDatum.STATUS_DB_GONE) {
    byte[] signature = value.getSignature();
    if (signature == null) return;
    BytesWritable sig = new BytesWritable(signature);
    // add the URL as a temporary MD
    value.getMetaData().put(urlKey, key);
    // reduce on the signature
    output.collect(sig, value);
  }
}
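The mapper above keys every fetched datum by its content signature, so exact duplicates arrive in the same reduce call. The real DeduplicationJob reducer chooses a "best" datum per signature using score, fetch time and URL length; the sketch below condenses that to a score-only comparison and is not the project's exact code. It assumes the same old mapred API and the same urlKey metadata field used by the mapper.

// Simplified sketch of a reducer for the (signature, datum) pairs emitted above:
// keep the highest-scoring datum per signature, flag the others as duplicates.
public void reduce(BytesWritable signature, Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
    throws IOException {
  CrawlDatum best = null;
  while (values.hasNext()) {
    CrawlDatum current = new CrawlDatum();
    current.set(values.next()); // copy: Hadoop reuses the value instance
    if (best == null) {
      best = current;
    } else if (current.getScore() > best.getScore()) {
      writeAsDuplicate(best, output);
      best = current;
    } else {
      writeAsDuplicate(current, output);
    }
  }
}

// Hypothetical helper: recover the URL stashed in the metadata by the mapper
// and emit the datum with its status changed to STATUS_DB_DUPLICATE.
private void writeAsDuplicate(CrawlDatum datum,
    OutputCollector<Text, CrawlDatum> output) throws IOException {
  Text url = (Text) datum.getMetaData().remove(urlKey);
  datum.setStatus(CrawlDatum.STATUS_DB_DUPLICATE);
  output.collect(url, datum);
}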
Example 2
Source File: DomainStatistics.java from anthelion (Apache License 2.0)
public void map(Text urlText, CrawlDatum datum, Context context)
    throws IOException, InterruptedException {
  if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
    try {
      URL url = new URL(urlText.toString());
      String out = null;
      switch (mode) {
        case MODE_HOST:
          out = url.getHost();
          break;
        case MODE_DOMAIN:
          out = URLUtil.getDomainName(url);
          break;
        case MODE_SUFFIX:
          out = URLUtil.getDomainSuffix(url).getDomain();
          break;
        case MODE_TLD:
          out = URLUtil.getTopLevelDomainName(url);
          break;
      }
      if (out.trim().equals("")) {
        LOG.info("url : " + url);
        context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
      }
      context.write(new Text(out), new LongWritable(1));
    } catch (Exception ex) {
      // malformed or unparsable URL: ignored, but still counted as fetched below
    }
    context.getCounter(MyCounter.FETCHED).increment(1);
    context.write(FETCHED_TEXT, new LongWritable(1));
  } else {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
  }
}
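Each matching record contributes a count of 1 per host/domain/suffix/TLD key, plus the FETCHED_TEXT/NOT_FETCHED_TEXT marker keys, so the aggregation step is a plain word-count style summation. The reducer below is an illustrative summing reducer in the same new-API style, not necessarily the exact reducer class shipped with DomainStatistics.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums the LongWritable counts emitted by the mapper for each key.
public class DomainCountReducer
    extends Reducer<Text, LongWritable, Text, LongWritable> {

  @Override
  public void reduce(Text key, Iterable<LongWritable> values, Context context)
      throws IOException, InterruptedException {
    long total = 0;
    for (LongWritable count : values) {
      total += count.get();
    }
    context.write(key, new LongWritable(total));
  }
}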
Example 3
Source File: DomainStatistics.java from nutch-htmlunit (Apache License 2.0)
public void map(Text urlText, CrawlDatum datum, Context context)
    throws IOException, InterruptedException {
  if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
      || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
    try {
      URL url = new URL(urlText.toString());
      String out = null;
      switch (mode) {
        case MODE_HOST:
          out = url.getHost();
          break;
        case MODE_DOMAIN:
          out = URLUtil.getDomainName(url);
          break;
        case MODE_SUFFIX:
          out = URLUtil.getDomainSuffix(url).getDomain();
          break;
        case MODE_TLD:
          out = URLUtil.getTopLevelDomainName(url);
          break;
      }
      if (out.trim().equals("")) {
        LOG.info("url : " + url);
        context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
      }
      context.write(new Text(out), new LongWritable(1));
    } catch (Exception ex) {
      // malformed or unparsable URL: ignored, but still counted as fetched below
    }
    context.getCounter(MyCounter.FETCHED).increment(1);
    context.write(FETCHED_TEXT, new LongWritable(1));
  } else {
    context.getCounter(MyCounter.NOT_FETCHED).increment(1);
    context.write(NOT_FETCHED_TEXT, new LongWritable(1));
  }
}
Example 4
Source File: ArcSegmentCreator.java from anthelion (Apache License 2.0)
/**
 * <p>Runs the Map job to translate an arc record into output for Nutch
 * segments.</p>
 *
 * @param key The arc record header.
 * @param bytes The arc record raw content bytes.
 * @param output The output collector.
 * @param reporter The progress reporter.
 */
public void map(Text key, BytesWritable bytes,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {
  String[] headers = key.toString().split("\\s+");
  String urlStr = headers[0];
  String version = headers[2];
  String contentType = headers[3];

  // arcs start with a file description. for now we ignore this as it is not
  // a content record
  if (urlStr.startsWith("filedesc://")) {
    LOG.info("Ignoring file header: " + urlStr);
    return;
  }
  LOG.info("Processing: " + urlStr);

  // get the raw bytes from the arc file, create a new crawldatum
  Text url = new Text();
  CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval, 1.0f);
  String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

  // normalize and filter the urls
  try {
    urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
    urlStr = urlFilters.filter(urlStr); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    urlStr = null;
  }

  // if still a good url then process
  if (urlStr != null) {
    url.set(urlStr);
    try {
      // set the protocol status to success and the crawl status to success
      // create the content from the normalized url and the raw bytes from
      // the arc file, TODO: currently this doesn't handle text of error
      // pages (i.e. 404, etc.). We assume we won't get those.
      ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
      Content content = new Content(urlStr, urlStr, bytes.getBytes(),
          contentType, new Metadata(), getConf());

      // set the url version into the metadata
      content.getMetadata().set(URL_VERSION, version);
      ParseStatus pstatus = null;
      pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
      reporter.progress();
    } catch (Throwable t) { // unexpected exception
      logError(url, t);
      output(output, segmentName, url, datum, null, null,
          CrawlDatum.STATUS_FETCH_RETRY);
    }
  }
}
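For the purposes of this page, the relevant line in both ArcSegmentCreator variants is the three-argument CrawlDatum constructor, which takes the status, the re-fetch interval in seconds, and an initial score. A standalone sketch of that constructor follows; the class name, main method and 30-day interval are illustrative only (the real job reads its interval from configuration).

import org.apache.nutch.crawl.CrawlDatum;

public class CrawlDatumConstructorDemo {
  public static void main(String[] args) {
    int fetchIntervalSecs = 30 * 24 * 60 * 60; // example value: 30 days
    CrawlDatum datum =
        new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, fetchIntervalSecs, 1.0f);
    // prints the symbolic name of the status, e.g. db_fetched
    System.out.println(CrawlDatum.getStatusName(datum.getStatus()));
  }
}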
Example 5
Source File: ArcSegmentCreator.java from nutch-htmlunit (Apache License 2.0)
/**
 * <p>Runs the Map job to translate an arc record into output for Nutch
 * segments.</p>
 *
 * @param key The arc record header.
 * @param bytes The arc record raw content bytes.
 * @param output The output collector.
 * @param reporter The progress reporter.
 */
public void map(Text key, BytesWritable bytes,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {
  String[] headers = key.toString().split("\\s+");
  String urlStr = headers[0];
  String version = headers[2];
  String contentType = headers[3];

  // arcs start with a file description. for now we ignore this as it is not
  // a content record
  if (urlStr.startsWith("filedesc://")) {
    LOG.info("Ignoring file header: " + urlStr);
    return;
  }
  LOG.info("Processing: " + urlStr);

  // get the raw bytes from the arc file, create a new crawldatum
  Text url = new Text();
  CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval, 1.0f);
  String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

  // normalize and filter the urls
  try {
    urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
    urlStr = urlFilters.filter(urlStr); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    urlStr = null;
  }

  // if still a good url then process
  if (urlStr != null) {
    url.set(urlStr);
    try {
      // set the protocol status to success and the crawl status to success
      // create the content from the normalized url and the raw bytes from
      // the arc file, TODO: currently this doesn't handle text of error
      // pages (i.e. 404, etc.). We assume we won't get those.
      ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
      Content content = new Content(urlStr, urlStr, bytes.getBytes(),
          contentType, new Metadata(), getConf());

      // set the url version into the metadata
      content.getMetadata().set(URL_VERSION, version);
      ParseStatus pstatus = null;
      pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
      reporter.progress();
    } catch (Throwable t) { // unexpected exception
      logError(url, t);
      output(output, segmentName, url, datum, null, null,
          CrawlDatum.STATUS_FETCH_RETRY);
    }
  }
}