org.apache.nutch.crawl.Inlinks Java Examples

Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds a {@code lang} field to the document.
 *
 * The value is read from the parse metadata under {@code Metadata.LANGUAGE};
 * when that entry is absent, the content metadata entry
 * {@code Response.CONTENT_LANGUAGE} is used instead. If no non-empty value
 * results, the literal {@code "unknown"} is indexed.
 *
 * @param doc the document receiving the {@code lang} field
 * @param parse the parse whose metadata is inspected
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded in the parse metadata, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise check whether the HTTP header tells us the language
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (language == null) || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}

Source File: URLMetaIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * CrawlDatum metadata into the NutchDocument. Every tag in
 * {@code urlMetaTags} that has a value in the datum's metadata is added to
 * the document as a field named after the tag.
 *
 * The document is returned untouched when it is {@code null} or when no tags
 * are configured.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	if (doc == null || urlMetaTags == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			// the datum carries nothing for this tag
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Runs {@code MoreIndexingFilter} with
 * "moreIndexingFilter.indexMimeTypeParts" set to {@code false} and checks
 * that the document ends up with exactly one "type" value, "text/html".
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception ex) {
    // any exception raised while filtering is reported as a test failure
    ex.printStackTrace();
    fail(ex.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}

Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Runs {@code MoreIndexingFilter} with
 * "moreIndexingFilter.indexMimeTypeParts" set to {@code false} and checks
 * that the document ends up with exactly one "type" value, "text/html".
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception ex) {
    // any exception raised while filtering is reported as a test failure
    ex.printStackTrace();
    fail(ex.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}

Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds a {@code lang} field to the document.
 *
 * The value is read from the parse metadata under {@code Metadata.LANGUAGE};
 * when that entry is absent, the content metadata entry
 * {@code Response.CONTENT_LANGUAGE} is used instead. If no non-empty value
 * results, the literal {@code "unknown"} is indexed.
 *
 * @param doc the document receiving the {@code lang} field
 * @param parse the parse whose metadata is inspected
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded in the parse metadata, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise check whether the HTTP header tells us the language
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (language == null) || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}

Source File: URLMetaIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * CrawlDatum metadata into the NutchDocument. Every tag in
 * {@code urlMetaTags} that has a value in the datum's metadata is added to
 * the document as a field named after the tag.
 *
 * The document is returned untouched when it is {@code null} or when no tags
 * are configured.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	if (doc == null || urlMetaTags == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			// the datum carries nothing for this tag
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}

Source File: TestAnchorIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Feeds three inlinks, two of which share the anchor text "text2", through
 * an {@code AnchorIndexingFilter} configured with
 * "anchorIndexingFilter.deduplicate" enabled and expects two "anchor" values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // the last two inlinks carry the same anchor text
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    anchorFilter.filter(document, parsed, pageUrl, new CrawlDatum(), incoming);
  } catch (Exception ex) {
    ex.printStackTrace();
    fail(ex.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}

Source File: TestAnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Feeds three inlinks, two of which share the anchor text "text2", through
 * an {@code AnchorIndexingFilter} configured with
 * "anchorIndexingFilter.deduplicate" enabled and expects two "anchor" values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // the last two inlinks carry the same anchor text
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    anchorFilter.filter(document, parsed, pageUrl, new CrawlDatum(), incoming);
  } catch (Exception ex) {
    ex.printStackTrace();
    fail(ex.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}

Source File: TLDScoringFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boost of each value of the document's
 * "tld" field that has an entry in {@code tldEntries}.
 *
 * @param doc the document whose "tld" field is read
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the accumulated boost; the boost
 *         stays at 1.0 when the field is missing or no value has an entry
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float boost = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField == null) {
    // no "tld" field: only the neutral boost applies
    return initScore * boost;
  }

  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    boost *= suffix.getBoost();
  }

  return initScore * boost;
}

Source File: TLDScoringFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boost of each value of the document's
 * "tld" field that has an entry in {@code tldEntries}.
 *
 * @param doc the document whose "tld" field is read
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the accumulated boost; the boost
 *         stays at 1.0 when the field is missing or no value has an entry
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float boost = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField == null) {
    // no "tld" field: only the neutral boost applies
    return initScore * boost;
  }

  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    boost *= suffix.getBoost();
  }

  return initScore * boost;
}

Source File: TLDIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the domain suffix of the page URL to the document as a {@code tld}
 * field.
 *
 * This is best effort: if the URL cannot be parsed or no domain suffix is
 * resolved, a warning is logged and the document is returned without the
 * field.
 *
 * @param doc the document receiving the {@code tld} field
 * @param urlText the URL of the page being indexed
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  try {
    URL url = new URL(urlText.toString());
    DomainSuffix suffix = URLUtil.getDomainSuffix(url);

    if (suffix == null) {
      // Report the unresolved suffix explicitly rather than relying on a
      // NullPointerException being swallowed by the catch below.
      LOG.warn("No domain suffix found for url: " + urlText);
    } else {
      doc.add("tld", suffix.getDomain());
    }
  } catch (Exception ex) {
    // deliberate best effort: a bad URL must not abort indexing
    LOG.warn(ex.toString());
  }

  return doc;
}

Source File: RelTagIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds one {@code tag} field to the document for every value stored in the
 * parse metadata under {@code RelTagParser.REL_TAG}.
 *
 * @param doc the document receiving the {@code tag} fields
 * @param parse the parse whose metadata is inspected
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Rel-Tags, if any, were possibly put there by RelTagParser
  String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }

  return doc;
}

Source File: StaticFieldIndexer.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * The {@link StaticFieldIndexer} filter object which adds fields as per
 * configuration setting. See {@code index.static} in nutch-default.xml.
 *
 * When {@code addStaticFields} is set, every value of every entry in
 * {@code fields} is added to the document under the entry's key.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object (not read by this method)
 * @param url URL of the document (not read by this method)
 * @param datum The {@link CrawlDatum} entry (not read by this method)
 * @param inlinks The {@link Inlinks} (not read by this method)
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> staticField : this.fields.entrySet()) {
    for (String fieldValue : staticField.getValue()) {
      doc.add(staticField.getKey(), fieldValue);
    }
  }

  return doc;
}

Source File: AbstractIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Logs the invocation and delegates to {@code filterInternal}, unless the
 * incoming document is {@code null}, in which case {@code null} is returned
 * without delegating.
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
        throws IndexingException {
    LOG.debug("Invoking  indexer {} for url: {}", this.getClass().getName(), url);

    if (doc != null) {
        return filterInternal(doc, parse, url, datum, inlinks);
    }

    LOG.debug("Skipped as NutchDocument doc is null");
    return null;
}

Source File: S2jhDiscardIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Keeps the document only when its URL contains a match for
 * {@code keepIndexPattern}; otherwise returns {@code null}.
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {

    boolean keep = keepIndexPattern.matcher(url.toString()).find();
    if (keep) {
        return doc;
    }

    LOG.debug("Cancel index for {} as not match regex [{}]", url, keepIndexPattern);
    return null;
}

Source File: S2jhIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the "sku" and "price" parse-data meta values to the document as
 * fields of the same names. Returns {@code null} when the "sku" value is
 * blank.
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    ParseData data = parse.getData();
    String sku = data.getMeta("sku");

    if (!StringUtils.isBlank(sku)) {
        doc.add("sku", sku);
        doc.add("price", data.getMeta("price"));
        return doc;
    }

    // no usable sku
    return null;
}

Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Creates the fixture objects: a configuration, a
 * {@code StaticFieldIndexer}, and the arguments later passed to the filter.
 */
protected void setUp() throws Exception {
  // filter and its configuration
  filter = new StaticFieldIndexer();
  conf = NutchConfiguration.create();

  // arguments for the filter call
  url = new Text("http://nutch.apache.org/index.html");
  parse = new ParseImpl();
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();
}

Source File: IndexingFilters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Run all defined filters, passing the result of each one to the next.
 *
 * @return the document returned by the last filter, or {@code null} as soon
 *         as any filter returns {@code null}
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
    Inlinks inlinks) throws IndexingException {
  int index = 0;
  while (index < this.indexingFilters.length) {
    doc = this.indexingFilters[index].filter(doc, parse, url, datum, inlinks);
    if (doc == null) {
      // an indexing filter discarded the doc: skip the remaining filters
      return null;
    }
    index++;
  }

  return doc;
}

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Passes the document through the {@code addTime}, {@code addLength},
 * {@code addType} and {@code resetTitle} helpers, in that order.
 *
 * @param doc the document handed to each helper
 * @param parse the parse whose data is handed to each helper
 * @param url the page URL, passed to the helpers in string form
 * @param datum the crawl datum, passed to {@code addTime} and {@code addType}
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Supplies metadata containing a {@code Response.CONTENT_DISPOSITION} entry
 * of "filename=filename.ext" and expects {@code MoreIndexingFilter} to set
 * the document's "title" field to "filename.ext".
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);

  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(emptyDoc, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", result.getFieldValue("title"));
}

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs {@code MoreIndexingFilter} over a parse whose metadata carries the
 * given {@code Response.CONTENT_TYPE} value and asserts the resulting "type"
 * field.
 *
 * @param conf configuration applied to the filter
 * @param source the content type value placed in the metadata
 * @param expected the expected value of the document's "type" field
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(emptyDoc, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, result.getFieldValue("type"));
}

Source File: TestBasicIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs {@code BasicIndexingFilter} with a title limit of 10, a content
 * limit of 20 and domain indexing enabled, then checks the "title",
 * "domain", "host", "url", "content" and "tstamp" fields of the document.
 */
public void testBasicIndexingFilter() throws Exception {
  // configuration under test
  Configuration configuration = NutchConfiguration.create();
  configuration.setInt("indexer.max.title.length", 10);
  configuration.setBoolean("indexer.add.domain", true);
  configuration.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter basicFilter = new BasicIndexingFilter();
  basicFilter.setConf(configuration);
  assertNotNull(basicFilter);

  NutchDocument document = new NutchDocument();

  // parse input: title, one outlink, one metadata entry and the page text
  String pageTitle = "The Foo Page";
  Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  ParseData data = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, meta);
  ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", data);

  CrawlDatum datum = new CrawlDatum();
  datum.setFetchTime(100L);

  Inlinks incoming = new Inlinks();

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    basicFilter.filter(document, parsed, pageUrl, datum, incoming);
  } catch (Exception ex) {
    ex.printStackTrace();
    fail(ex.getMessage());
  }

  assertNotNull(document);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
    document.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org",
    document.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org",
    document.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
    document.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo",
    document.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L),
    (Date)document.getField("tstamp").getValues().get(0));
}

Source File: AnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * The {@link AnchorIndexingFilter} filter object which supports boolean
 * configuration settings for the deduplication of anchors.
 * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
 *
 * Every anchor of the inlinks is added to the document as an
 * {@code anchor} field. When deduplication is enabled, anchors that differ
 * only in letter case are treated as duplicates and only the first
 * occurrence is added, in its original spelling.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text; may be
 *        {@code null}, in which case no anchors are added
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  if (inlinks == null) {
    return doc;
  }

  // Case-folded anchors already added; only tracked when deduplicating.
  HashSet<String> seen = deduplicate ? new HashSet<String>() : null;

  for (String anchor : inlinks.getAnchors()) {
    // Fold case with Locale.ROOT so the dedup key does not depend on the
    // JVM default locale (e.g. Turkish maps "I" to a dotless "i").
    // HashSet.add returns false when the key was already present.
    if (seen == null || seen.add(anchor.toLowerCase(java.util.Locale.ROOT))) {
      doc.add("anchor", anchor);
    }
  }

  return doc;
}

Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Indexes Creative Commons information found in the parse metadata: the
 * license URL (plus the features derived from it by {@code addUrlFeatures}),
 * the license location and the work type. Each entry is skipped when it is
 * absent from the metadata.
 *
 * @param doc the document receiving the features
 * @param parse the parse whose metadata is inspected
 * @param url the page URL, used only in the log message
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  Metadata parseMeta = parse.getData().getParseMeta();

  // license: the whole URL as "license=xxx", then its individual attributes
  String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // license location as "meta=xxx"
  String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // work type, added without a prefix
  String type = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (type != null) {
    addFeature(doc, type);
  }

  return doc;
}

Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Test behaviour when a filter named in the filter order does not exist.
 * The order lists "NonExistingFilter" ahead of the basic indexing filter;
 * the test makes no assertions and passes as long as constructing and
 * running the filters throws nothing.
 *
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");

  String missingFilter = "NonExistingFilter";
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  configuration.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(configuration);

  NutchDocument emptyDoc = new NutchDocument();
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  filters.filter(emptyDoc, parsed, pageUrl, new CrawlDatum(), new Inlinks());
}

Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Test behaviour when the NutchDocument is null: the filter chain is
 * expected to return null.
 */
public void testNutchDocumentNullIndexingFilter() throws IndexingException{
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");

  IndexingFilters filters = new IndexingFilters(configuration);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = filters.filter(null, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertNull(result);
}

Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Test that resetting the indexing filter order does not take effect.
 *
 * A first {@code IndexingFilters} is built with only the basic indexing
 * filter in the order. The order is then extended with the metadata indexer
 * and a second {@code IndexingFilters} is built from the same configuration.
 * Both filtered documents are expected to have the same number of field
 * names.
 *
 * @throws IndexingException
 */
public void testFilterCacheIndexingFilter() throws IndexingException{
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");

  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  configuration.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);

  // first run: basic filter only
  IndexingFilters firstFilters = new IndexingFilters(configuration);
  NutchDocument firstInput = new NutchDocument();
  ParseData firstData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl firstParse = new ParseImpl("text", firstData);
  Text firstUrl = new Text("http://www.example.com/");
  NutchDocument firstDoc = firstFilters.filter(firstInput, firstParse, firstUrl,
    new CrawlDatum(), new Inlinks());

  // add another index filter
  String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";
  // set content metadata
  Metadata contentMeta = new Metadata();
  contentMeta.add("example", "data");
  // set content metadata property defined in MetadataIndexer
  configuration.set("index.content.md", "example");
  // add MetadataIndexer filter to the order
  configuration.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);

  // second run: same configuration object, extended order
  IndexingFilters secondFilters = new IndexingFilters(configuration);
  NutchDocument secondInput = new NutchDocument();
  ParseData secondData = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
  ParseImpl secondParse = new ParseImpl("text", secondData);
  Text secondUrl = new Text("http://www.example.com/");
  NutchDocument secondDoc = secondFilters.filter(secondInput, secondParse, secondUrl,
    new CrawlDatum(), new Inlinks());

  assertEquals(firstDoc.getFieldNames().size(), secondDoc.getFieldNames().size());
}

Source File: LanguageDetectionFilter.java From language-detection with Apache License 2.0

5 votes

/**
 * {@inheritDoc}
 *
 * Adds a {@code lang} field to the document. The language is taken from the
 * parse metadata ({@code Metadata.LANGUAGE}); when that entry is absent it
 * is detected from the title and text of the parse. The literal
 * {@code "unknown"} is indexed if no language results.
 *
 * @throws IndexingException if no configuration has been set, if
 *         {@code cause} records an initialization failure, or if language
 *         detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// detection input: title and text separated by a single space
		String sample = new StringBuilder()
				.append(parse.getData().getTitle())
				.append(" ")
				.append(parse.getText())
				.toString();
		try {
			Detector detector = DetectorFactory.create();
			detector.setMaxTextLength(textsize_upper_limit);
			detector.append(sample);
			language = detector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}

	doc.add("lang", language != null ? language : "unknown");
	return doc;
}

Source File: IndexingFilters.java From anthelion with Apache License 2.0

5 votes

/**
 * Run all defined filters, passing the result of each one to the next.
 *
 * @return the document returned by the last filter, or {@code null} as soon
 *         as any filter returns {@code null}
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
    Inlinks inlinks) throws IndexingException {
  int index = 0;
  while (index < this.indexingFilters.length) {
    doc = this.indexingFilters[index].filter(doc, parse, url, datum, inlinks);
    if (doc == null) {
      // an indexing filter discarded the doc: skip the remaining filters
      return null;
    }
    index++;
  }

  return doc;
}

Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Passes the document through the {@code addTime}, {@code addLength},
 * {@code addType} and {@code resetTitle} helpers, in that order.
 *
 * @param doc the document handed to each helper
 * @param parse the parse whose data is handed to each helper
 * @param url the page URL, passed to the helpers in string form
 * @param datum the crawl datum, passed to {@code addTime} and {@code addType}
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}

org.apache.nutch.crawl.Inlinks Java Examples