org.apache.nutch.indexer.NutchDocument Java Examples

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds the {@code lastModified} and {@code date} fields to the document.
 *
 * <p>If the parse metadata has a {@code Metadata.LAST_MODIFIED} value, it is
 * converted through {@code getTime} and stored as {@code lastModified}. The
 * {@code date} field reuses that timestamp unless it is {@code -1}; in that
 * case the datum's modified time is used, or its fetch time when the modified
 * time is not positive.
 *
 * @param doc   document receiving the fields
 * @param data  parse data whose metadata is consulted
 * @param url   URL of the page, handed to {@code getTime}
 * @param datum crawl datum supplying the fallback timestamps
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long timestamp = -1;

  final String lastModifiedValue = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModifiedValue != null) {
    timestamp = getTime(lastModifiedValue, url);
    // NOTE(review): if getTime reports failure as -1, a Date(-1) is still
    // stored in "lastModified" here -- confirm whether that is intended.
    doc.add("lastModified", new Date(timestamp));
  }

  if (timestamp == -1) {
    // nothing usable from the metadata: fall back to the CrawlDatum
    final long modifiedTime = datum.getModifiedTime();
    timestamp = (modifiedTime > 0) ? modifiedTime : datum.getFetchTime();
  }

  doc.add("date", new Date(timestamp));
  return doc;
}

Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds a {@code lang} field to the document.
 *
 * <p>The value is taken from the parse metadata ({@code Metadata.LANGUAGE});
 * when that is absent, from the content metadata
 * ({@code Response.CONTENT_LANGUAGE}); and when neither yields a non-empty
 * value, the literal {@code "unknown"} is used.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded during parsing, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise check whether the HTTP header tells us the language
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  final boolean undetermined = (language == null) || language.isEmpty();
  doc.add("lang", undetermined ? "unknown" : language);

  return doc;
}

Source File: TestAnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * With {@code anchorIndexingFilter.deduplicate} enabled, three inlinks that
 * carry only two distinct anchor texts must produce two {@code anchor} values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration dedupConf = NutchConfiguration.create();
  dedupConf.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(dedupConf);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // { source url, anchor text } -- the last two share the same anchor text
  String[][] links = {
    { "http://test1.com/", "text1" },
    { "http://test2.com/", "text2" },
    { "http://test3.com/", "text2" },
  };
  Inlinks incoming = new Inlinks();
  for (String[] link : links) {
    incoming.add(new Inlink(link[0], link[1]));
  }

  try {
    anchorFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}

Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds a {@code lang} field to the document.
 *
 * <p>Lookup order: parse metadata ({@code Metadata.LANGUAGE}), then content
 * metadata ({@code Response.CONTENT_LANGUAGE}). A missing or empty result is
 * replaced by the literal {@code "unknown"}.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language found while parsing, possibly put there by HTMLLanguageParser
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // fall back to what the HTTP header tells us
  if (detected == null) {
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  if (detected != null && !detected.isEmpty()) {
    doc.add("lang", detected);
  } else {
    doc.add("lang", "unknown");
  }

  return doc;
}

Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds the {@code lastModified} and {@code date} fields to the document.
 *
 * <p>A {@code Metadata.LAST_MODIFIED} value in the parse metadata is converted
 * through {@code getTime} and stored as {@code lastModified}. The {@code date}
 * field reuses that timestamp, or the datum's fetch time when the timestamp is
 * still {@code -1}.
 *
 * @param doc   document receiving the fields
 * @param data  parse data whose metadata is consulted
 * @param url   URL of the page, handed to {@code getTime}
 * @param datum crawl datum supplying the fallback fetch time
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModifiedValue = data.getMeta(Metadata.LAST_MODIFIED);

  long timestamp = -1;
  if (lastModifiedValue != null) {
    timestamp = getTime(lastModifiedValue, url);
    // NOTE(review): if getTime reports failure as -1, a Date(-1) is still
    // stored in "lastModified" here -- confirm whether that is intended.
    doc.add("lastModified", new Date(timestamp));
  }

  // no usable last-modified value: use the fetch time instead
  final long effective = (timestamp == -1) ? datum.getFetchTime() : timestamp;
  doc.add("date", new Date(effective));

  return doc;
}

Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * With {@code moreIndexingFilter.indexMimeTypeParts} disabled, the
 * {@code type} field must hold exactly one value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration noPartsConf = NutchConfiguration.create();
  noPartsConf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(noPartsConf);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    moreFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}

Source File: URLMetaIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * CrawlDatum metadata into the NutchDocument. A tag that has no entry in the
 * datum's metadata is skipped.
 *
 * @return {@code doc} itself; returned untouched when it is {@code null} or
 *         when no tags are configured
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	if (doc == null || urlMetaTags == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}

Source File: TestAnchorIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Verifies anchor de-duplication: three inlinks with two distinct anchor
 * texts must leave two values in the {@code anchor} field when
 * {@code anchorIndexingFilter.deduplicate} is on.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter indexingFilter = new AnchorIndexingFilter();
  indexingFilter.setConf(configuration);
  assertNotNull(indexingFilter);

  NutchDocument indexed = new NutchDocument();
  ParseImpl parseResult = new ParseImpl("foo bar", new ParseData());

  // "text2" appears twice on purpose
  String[] sources = { "http://test1.com/", "http://test2.com/", "http://test3.com/" };
  String[] anchors = { "text1", "text2", "text2" };
  Inlinks inbound = new Inlinks();
  for (int i = 0; i < sources.length; i++) {
    inbound.add(new Inlink(sources[i], anchors[i]));
  }

  try {
    indexingFilter.filter(indexed, parseResult, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inbound);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(indexed);
  assertTrue("test if there is an anchor at all", indexed.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, indexed.getField("anchor").getValues().size());
}

Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds one feature per component of a license URL's path.
 *
 * <p>Urls are of the form "http://creativecommons.org/licenses/xx-xx/xx/xx",
 * where "xx" names a license feature. The path is split at slashes and dashes,
 * the first token ("licenses") is discarded, and every remaining token is
 * passed to {@code addFeature}. A malformed URL is logged at warn level and
 * otherwise ignored.
 *
 * @param doc       document receiving the features
 * @param urlString license URL to decompose
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    final String path = new URL(urlString).getPath();

    // break the path at slashes and dashes
    final StringTokenizer tokens = new StringTokenizer(path, "/-");

    for (int position = 0; tokens.hasMoreTokens(); position++) {
      final String token = tokens.nextToken();
      if (position == 0) {
        continue;                               // throw away "licenses"
      }
      addFeature(doc, token);
    }
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
    }
  }
}

Source File: TLDScoringFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boosts of the document's {@code tld} values.
 *
 * <p>Each value of the {@code tld} field is looked up in {@code tldEntries};
 * the boosts of the entries found are multiplied together. Values without an
 * entry, or a document without a {@code tld} field, leave the multiplier at 1.
 *
 * @param doc       document whose {@code tld} field is read
 * @param initScore score to scale
 * @return {@code initScore} multiplied by the accumulated boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float multiplier = 1.0f;

  final NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore * multiplier;
  }

  for (Object value : tldField.getValues()) {
    final DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}

Source File: CCIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds the features represented by a license URL.
 *
 * <p>Urls are of the form "http://creativecommons.org/licenses/xx-xx/xx/xx",
 * where "xx" names a license feature. The URL path is tokenized at slashes and
 * dashes; the leading "licenses" token is dropped and each later token is
 * handed to {@code addFeature}. If the URL cannot be parsed, a warning is
 * logged and nothing is added.
 *
 * @param doc       document receiving the features
 * @param urlString license URL to decompose
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    final URL licenseUrl = new URL(urlString);
    final StringTokenizer segments = new StringTokenizer(licenseUrl.getPath(), "/-");

    boolean leadingTokenPending = true;         // first token is "licenses"
    while (segments.hasMoreTokens()) {
      final String segment = segments.nextToken();
      if (leadingTokenPending) {
        leadingTokenPending = false;
      } else {
        addFeature(doc, segment);
      }
    }
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
    }
  }
}

Source File: TLDScoringFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Computes the indexer score as {@code initScore} times the product of the
 * boosts found in {@code tldEntries} for the document's {@code tld} values.
 * A missing {@code tld} field, or values with no matching entry, contribute
 * no boost.
 *
 * @param doc       document whose {@code tld} field is read
 * @param initScore score to scale
 * @return the boosted score
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  final NutchField tldValues = doc.getField("tld");
  float product = 1.0f;

  if (tldValues != null) {
    for (Object tldValue : tldValues.getValues()) {
      final DomainSuffix match = tldEntries.get(tldValue.toString());
      if (match != null) {
        product *= match.getBoost();
      }
    }
  }

  return initScore * product;
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Checks that disabling {@code moreIndexingFilter.indexMimeTypeParts} leaves
 * a single {@code type} value, {@code text/html}, on the document.
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(configuration);
  assertNotNull(indexingFilter);

  NutchDocument indexed = new NutchDocument();
  ParseImpl parseResult = new ParseImpl("foo bar", new ParseData());

  try {
    indexingFilter.filter(indexed, parseResult, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(indexed);
  assertTrue(indexed.getFieldNames().contains("type"));
  assertEquals(1, indexed.getField("type").getValues().size());
  assertEquals("text/html", indexed.getFieldValue("type"));
}

Source File: URLMetaIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Takes the metatags listed in the "urlmeta.tags" property and looks for
 * them inside the CrawlDatum metadata. Each tag found there is added to the
 * NutchDocument under the tag's own name; tags that are absent are ignored.
 *
 * @return {@code doc} itself, unchanged when it is {@code null} or when no
 *         tags are configured
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	final boolean nothingToDo = (urlMetaTags == null) || (doc == null);
	if (!nothingToDo) {
		for (String tag : urlMetaTags) {
			Text stored = (Text) datum.getMetaData().get(new Text(tag));
			if (stored != null) {
				doc.add(tag, stored.toString());
			}
		}
	}

	return doc;
}

Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Test that an empty {@code index.static} setting leaves the document
 * without any field.
 *
 * @throws Exception
 */
public void testEmptyIndexStatic() throws Exception {

  assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument emptyDoc = new NutchDocument();

  try {
    filter.filter(emptyDoc, parse, url, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(emptyDoc);
  boolean noFields = emptyDoc.getFieldNames().isEmpty();
  assertTrue("tests if no field is set for empty index.static", noFields);
}

Source File: StaticFieldIndexer.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * The {@link StaticFieldIndexer} filter object which adds fields as per
 * configuration setting. See {@code index.static} in nutch-default.xml.
 *
 * <p>When static fields are enabled, every configured value is added to the
 * document under its field name; otherwise the document is returned as is.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse  The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> staticField : this.fields.entrySet()) {
    final String name = staticField.getKey();
    for (String value : staticField.getValue()) {
      doc.add(name, value);
    }
  }
  return doc;
}

Source File: ElasticIndexWriter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Queues the document as an index request in the current bulk and flushes the
 * bulk once the document count or the accumulated text length reaches its
 * limit.
 *
 * <p>The request id is the document's {@code url} field; the type comes from
 * the document meta entry {@code type} and defaults to {@code "doc"}.
 * Multi-valued fields are indexed with all of their values, single-valued
 * fields with the bare value.
 *
 * @param doc document to index
 * @throws IOException declared by the writer contract
 */
@Override
public void write(NutchDocument doc) throws IOException {
  String id = (String)doc.getFieldValue("url");
  String type = doc.getDocumentMeta().get("type");
  if (type == null) type = "doc";
  IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id);

  Map<String, Object> source = new HashMap<String, Object>();

  // Loop through all fields of this doc
  for (String fieldName : doc.getFieldNames()) {
    if (doc.getField(fieldName).getValues().size() > 1) {
      // Multi-valued field: index the whole value list. The previous code put
      // getFieldValue(fieldName) here as well, i.e. the same single value as
      // the else-branch, so the additional values never reached the index.
      source.put(fieldName, doc.getField(fieldName).getValues());
      // Loop through the values to keep track of the size of this document
      for (Object value : doc.getField(fieldName).getValues()) {
        bulkLength += value.toString().length();
      }
    } else {
      Object value = doc.getFieldValue(fieldName);
      source.put(fieldName, value);
      bulkLength += value.toString().length();
    }
  }
  request.setSource(source);

  // Add this indexing request to a bulk request
  bulk.add(request);
  indexedDocs++;
  bulkDocs++;

  if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
    LOG.info("Processing bulk request [docs = " + bulkDocs + ", length = "
            + bulkLength + ", total docs = " + indexedDocs
            + ", last doc in bulk = '" + id + "']");
    // Flush the bulk of indexing requests
    createNewBulk = true;
    commit();
  }
}

Source File: RelTagIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds one {@code tag} field value per Rel-Tag found in the parse metadata
 * under {@code RelTagParser.REL_TAG}.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Rel-Tags, if any, were possibly put there by RelTagParser
  final String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}

Source File: TLDIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds a {@code tld} field holding the domain of the URL's domain suffix, as
 * resolved by {@code URLUtil.getDomainSuffix}.
 *
 * <p>Any exception raised while resolving the suffix is logged at warn level
 * and the document is returned without the field.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {

  try {
    final URL pageUrl = new URL(urlText.toString());
    // NOTE(review): presumably getDomainSuffix can return null for an unknown
    // suffix, which would surface here as a logged NullPointerException -- confirm.
    final DomainSuffix suffix = URLUtil.getDomainSuffix(pageUrl);
    doc.add("tld", suffix.getDomain());
  } catch (Exception ex) {
    LOG.warn(ex.toString());
  }

  return doc;
}

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs the document through this filter's helpers in order: {@code addTime},
 * {@code addLength}, {@code addType} and {@code resetTitle}.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}

Source File: S2jhIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the {@code sku} and {@code price} parse-metadata values to the
 * document.
 *
 * @return {@code doc} with both fields added, or {@code null} when the
 *         {@code sku} metadata value is blank
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    final ParseData meta = parse.getData();
    final String sku = meta.getMeta("sku");

    if (!StringUtils.isBlank(sku)) {
        doc.add("sku", sku);
        doc.add("price", meta.getMeta("price"));
        return doc;
    }

    // no sku: hand back null instead of the document
    return null;
}

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the trimmed {@code Response.CONTENT_LENGTH} metadata value as the
 * {@code contentLength} field, unless it is missing or empty after trimming.
 *
 * @param doc  document receiving the field
 * @param data parse data whose metadata is consulted
 * @param url  URL of the page (not used here)
 * @return the same {@code doc} instance
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  final String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010 ContentLength not trimmed
  final String length = rawLength.trim();
  if (length.length() > 0) {
    doc.add("contentLength", length);
  }
  return doc;
}

Source File: TestBasicIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs {@code BasicIndexingFilter} with a title limit of 10, a content limit
 * of 20 and domain indexing enabled, then checks the resulting title, domain,
 * host, url, content and tstamp field values.
 */
public void testBasicIndexingFilter() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setInt("indexer.max.title.length", 10);
  configuration.setBoolean("indexer.add.domain", true);
  configuration.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter basicFilter = new BasicIndexingFilter();
  basicFilter.setConf(configuration);
  assertNotNull(basicFilter);

  NutchDocument indexed = new NutchDocument();

  // parse result: title, one outlink and a language entry in the metadata
  String pageTitle = "The Foo Page";
  Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  ParseData pageData = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, parseMeta);
  ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", pageData);

  CrawlDatum fetched = new CrawlDatum();
  fetched.setFetchTime(100L);

  Inlinks incoming = new Inlinks();

  try {
    basicFilter.filter(indexed, parsed, new Text("http://nutch.apache.org/index.html"), fetched, incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(indexed);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
    indexed.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org",
    indexed.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org",
    indexed.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
    indexed.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo",
    indexed.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L),
    (Date)indexed.getField("tstamp").getValues().get(0));
}

Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Test that valid field:value pairs from {@code index.static} are added to
 * the document: the configured string must yield exactly three fields
 * (field1, field2 and field4) with their expected values.
 *
 * @throws Exception
 */
public void testNormalScenario() throws Exception {

  final String staticFields =
      "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ";
  conf.set("index.static", staticFields);
  assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument indexed = new NutchDocument();

  try {
    filter.filter(indexed, parse, url, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(indexed);
  assertFalse("test if doc is not empty", indexed.getFieldNames().isEmpty());
  assertEquals("test if doc has 3 fields", 3, indexed.getFieldNames().size());

  boolean hasVal1 = indexed.getField("field1").getValues().contains("val1");
  assertTrue("test if doc has field1", hasVal1);
  boolean hasVal2 = indexed.getField("field2").getValues().contains("val2");
  assertTrue("test if doc has field2", hasVal2);
  boolean hasVal4 = indexed.getField("field4").getValues().contains("val4");
  assertTrue("test if doc has field4", hasVal4);
}

Source File: AbstractIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Logs the invocation and delegates to {@code filterInternal}, unless the
 * document is {@code null}, in which case it is skipped.
 *
 * @return the result of {@code filterInternal}, or {@code null} when
 *         {@code doc} is {@code null}
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
        throws IndexingException {
    LOG.debug("Invoking  indexer {} for url: {}", this.getClass().getName(), url);

    if (doc != null) {
        return filterInternal(doc, parse, url, datum, inlinks);
    }

    LOG.debug("Skipped as NutchDocument doc is null");
    return doc;
}

Source File: IndexWriters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Passes the document to {@code update} on every index writer, in array
 * order. An {@link IOException} from any writer propagates immediately, so
 * the writers after it are not called.
 *
 * @param doc document to update
 * @throws IOException if one of the writers fails
 */
public void update(NutchDocument doc) throws IOException {
	int position = 0;
	while (position < this.indexWriters.length) {
		this.indexWriters[position].update(doc);
		position++;
	}
}

Source File: IndexWriters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Passes the document to {@code write} on every index writer, in array
 * order. An {@link IOException} from any writer propagates immediately, so
 * the writers after it are not called.
 *
 * @param doc document to write
 * @throws IOException if one of the writers fails
 */
public void write(NutchDocument doc) throws IOException {
	int position = 0;
	while (position < this.indexWriters.length) {
		this.indexWriters[position].write(doc);
		position++;
	}
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Checks that a {@code Response.CONTENT_DISPOSITION} value of
 * {@code filename=filename.ext} makes the filter report
 * {@code filename.ext} as the document title.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);

  NutchDocument input = new NutchDocument();
  ParseData pageData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parsed = new ParseImpl("text", pageData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(input, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", result.getFieldValue("title"));
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Filters an empty document whose metadata carries {@code source} as
 * {@code Response.CONTENT_TYPE} and asserts that the resulting {@code type}
 * field value equals {@code expected}.
 *
 * @param conf     configuration applied to the filter
 * @param source   content type value placed in the metadata
 * @param expected expected value of the {@code type} field
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  NutchDocument input = new NutchDocument();
  ParseData pageData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parsed = new ParseImpl("text", pageData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(input, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, result.getFieldValue("type"));
}

Source File: CCIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Adds Creative Commons features found in the parse metadata to the document.
 *
 * <p>Three metadata entries are consulted, each optional: the license URL
 * (added as {@code license=<url>} plus the features derived from the URL by
 * {@code addUrlFeatures}), the license location (added as
 * {@code meta=<location>}) and the work type (passed to {@code addFeature}
 * as is).
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  final Metadata parseMeta = parse.getData().getParseMeta();

  // license: the whole URL first, then the attributes extracted from it
  final String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // license location
  final String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // work type, added without a prefix
  final String type = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (type != null) {
    addFeature(doc, type);
  }

  return doc;
}

org.apache.nutch.indexer.NutchDocument Java Examples