org.apache.nutch.crawl.Inlinks Java Examples

The following examples show how to use org.apache.nutch.crawl.Inlinks. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: LanguageIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a {@code lang} field to the document.
 * <p>
 * The language is taken from the parse metadata ({@code Metadata.LANGUAGE});
 * when that is absent, the content metadata entry
 * {@code Response.CONTENT_LANGUAGE} is used instead. A missing or empty value
 * is indexed as {@code "unknown"}.
 *
 * @param doc the document that receives the {@code lang} field
 * @param parse the parse whose parse/content metadata is consulted
 * @param url not used by this filter
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // First choice: the language found during parsing, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: whatever the HTTP header tells us
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (language == null) || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}
 
Example #2
Source File: URLMetaIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Copies configured URL metatags from the {@link CrawlDatum} metadata into
 * the {@link NutchDocument}.
 * <p>
 * For every tag name in {@code urlMetaTags}, the value stored under that name
 * in the datum's metadata (if any) is added to the document as a field with
 * the same name. If {@code urlMetaTags} or {@code doc} is {@code null} the
 * document is returned untouched.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	// Nothing to do without configured tags or without a document to fill.
	if (doc == null || urlMetaTags == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}
 
Example #3
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code MoreIndexingFilter} with
 * {@code moreIndexingFilter.indexMimeTypeParts} set to {@code false} and
 * checks that the document ends up with exactly one {@code type} value,
 * {@code "text/html"}.
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
 
Example #4
Source File: TestMoreIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code MoreIndexingFilter} with
 * {@code moreIndexingFilter.indexMimeTypeParts} set to {@code false} and
 * checks that the document ends up with exactly one {@code type} value,
 * {@code "text/html"}.
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
 
Example #5
Source File: LanguageIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a {@code lang} field to the document.
 * <p>
 * The language is taken from the parse metadata ({@code Metadata.LANGUAGE});
 * when that is absent, the content metadata entry
 * {@code Response.CONTENT_LANGUAGE} is used instead. A missing or empty value
 * is indexed as {@code "unknown"}.
 *
 * @param doc the document that receives the {@code lang} field
 * @param parse the parse whose parse/content metadata is consulted
 * @param url not used by this filter
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // First choice: the language found during parsing, possibly put there by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: whatever the HTTP header tells us
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (language == null) || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);

  return doc;
}
 
Example #6
Source File: URLMetaIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Copies configured URL metatags from the {@link CrawlDatum} metadata into
 * the {@link NutchDocument}.
 * <p>
 * For every tag name in {@code urlMetaTags}, the value stored under that name
 * in the datum's metadata (if any) is added to the document as a field with
 * the same name. If {@code urlMetaTags} or {@code doc} is {@code null} the
 * document is returned untouched.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	// Nothing to do without configured tags or without a document to fill.
	if (doc == null || urlMetaTags == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}
 
Example #7
Source File: TestAnchorIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Checks anchor deduplication in {@code AnchorIndexingFilter}.
 * <p>
 * Three inlinks are supplied, two of which carry the anchor text
 * {@code "text2"}. With {@code anchorIndexingFilter.deduplicate} enabled the
 * {@code anchor} field is expected to hold 2 values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(conf);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // {source URL, anchor text} pairs; the last two share the same anchor text.
  String[][] links = {
    { "http://test1.com/", "text1" },
    { "http://test2.com/", "text2" },
    { "http://test3.com/", "text2" },
  };
  Inlinks inlinks = new Inlinks();
  for (String[] link : links) {
    inlinks.add(new Inlink(link[0], link[1]));
  }

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    anchorFilter.filter(document, parsed, pageUrl, new CrawlDatum(), inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}
 
Example #8
Source File: TestAnchorIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Checks anchor deduplication in {@code AnchorIndexingFilter}.
 * <p>
 * Three inlinks are supplied, two of which carry the anchor text
 * {@code "text2"}. With {@code anchorIndexingFilter.deduplicate} enabled the
 * {@code anchor} field is expected to hold 2 values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(conf);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // {source URL, anchor text} pairs; the last two share the same anchor text.
  String[][] links = {
    { "http://test1.com/", "text1" },
    { "http://test2.com/", "text2" },
    { "http://test3.com/", "text2" },
  };
  Inlinks inlinks = new Inlinks();
  for (String[] link : links) {
    inlinks.add(new Inlink(link[0], link[1]));
  }

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    anchorFilter.filter(document, parsed, pageUrl, new CrawlDatum(), inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}
 
Example #9
Source File: TLDScoringFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Scales the initial indexer score by the boosts of the document's
 * {@code tld} field values.
 * <p>
 * Each value of the {@code tld} field is looked up in {@code tldEntries};
 * the boost of every entry found is multiplied into the result. Values with
 * no entry, or a document without a {@code tld} field, leave the multiplier
 * at {@code 1.0f}.
 *
 * @param doc the document whose {@code tld} field is read
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the accumulated boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField != null) {
    for (Object value : tldField.getValues()) {
      DomainSuffix suffix = tldEntries.get(value.toString());
      if (suffix == null) {
        continue; // unknown suffix: no boost to apply
      }
      multiplier *= suffix.getBoost();
    }
  }

  return initScore * multiplier;
}
 
Example #10
Source File: TLDScoringFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Scales the initial indexer score by the boosts of the document's
 * {@code tld} field values.
 * <p>
 * Each value of the {@code tld} field is looked up in {@code tldEntries};
 * the boost of every entry found is multiplied into the result. Values with
 * no entry, or a document without a {@code tld} field, leave the multiplier
 * at {@code 1.0f}.
 *
 * @param doc the document whose {@code tld} field is read
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the accumulated boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField != null) {
    for (Object value : tldField.getValues()) {
      DomainSuffix suffix = tldEntries.get(value.toString());
      if (suffix == null) {
        continue; // unknown suffix: no boost to apply
      }
      multiplier *= suffix.getBoost();
    }
  }

  return initScore * multiplier;
}
 
Example #11
Source File: TLDIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a {@code tld} field holding the domain of the URL's domain suffix.
 * <p>
 * Any exception raised while parsing the URL or resolving its suffix is
 * logged at WARN level and otherwise ignored, so the document is always
 * returned.
 *
 * @param doc the document that receives the {@code tld} field
 * @param urlText the URL of the page, as text
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {

  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    doc.add("tld", suffix.getDomain());
  } catch (Exception failure) {
    // Best effort: a failure here must not stop indexing of the document.
    LOG.warn(failure.toString());
  }

  return doc;
}
 
Example #12
Source File: RelTagIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds one {@code tag} field value per Rel-Tag found in the parse metadata.
 *
 * @param doc the document that receives the {@code tag} values
 * @param parse the parse whose metadata is read under {@code RelTagParser.REL_TAG}
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Rel-Tags, if any, were possibly put into the parse metadata by RelTagParser
  String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}
 
Example #13
Source File: StaticFieldIndexer.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * The {@link StaticFieldIndexer} filter object which adds fields as per
 * configuration setting. See {@code index.static} in nutch-default.xml.
 * <p>
 * When {@code addStaticFields} is set, every value of every entry in
 * {@code fields} is added to the document under the entry's key.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL of the document (not used here)
 * @param datum The {@link CrawlDatum} entry (not used here)
 * @param inlinks The {@link Inlinks} (not used here)
 * @return the same NutchDocument, with the static fields added
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (this.addStaticFields) {
    for (Entry<String, String[]> staticField : this.fields.entrySet()) {
      String fieldName = staticField.getKey();
      for (String fieldValue : staticField.getValue()) {
        doc.add(fieldName, fieldValue);
      }
    }
  }
  return doc;
}
 
Example #14
Source File: AbstractIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Logs the invocation and delegates to {@code filterInternal}, unless the
 * incoming document is {@code null}, in which case {@code null} is returned
 * without delegating.
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
        throws IndexingException {
    LOG.debug("Invoking  indexer {} for url: {}", this.getClass().getName(), url);

    if (doc != null) {
        return filterInternal(doc, parse, url, datum, inlinks);
    }

    LOG.debug("Skipped as NutchDocument doc is null");
    return null;
}
 
Example #15
Source File: S2jhDiscardIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Keeps the document only if its URL matches {@code keepIndexPattern}.
 *
 * @return {@code doc} when the pattern is found in the URL, otherwise
 *         {@code null}
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {

    boolean keep = keepIndexPattern.matcher(url.toString()).find();
    if (keep) {
        return doc;
    }

    LOG.debug("Cancel index for {} as not match regex [{}]", url, keepIndexPattern);
    return null;
}
 
Example #16
Source File: S2jhIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes the {@code sku} and {@code price} parse metadata values.
 *
 * @return {@code null} when the {@code sku} meta value is blank, otherwise
 *         {@code doc} with the {@code sku} and {@code price} fields added
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    ParseData data = parse.getData();
    String skuValue = data.getMeta("sku");

    // A page without a sku is not indexed at all.
    if (StringUtils.isBlank(skuValue)) {
        return null;
    }

    doc.add("sku", skuValue);
    doc.add("price", data.getMeta("price"));
    return doc;
}
 
Example #17
Source File: TestStaticFieldIndexerTest.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Creates fresh fixtures before each test: a Nutch configuration, an empty
 * parse, a sample URL, an empty crawl datum, empty inlinks and the
 * {@code StaticFieldIndexer} under test.
 */
protected void setUp() throws Exception {
  this.conf = NutchConfiguration.create();
  this.parse = new ParseImpl();
  this.url = new Text("http://nutch.apache.org/index.html");
  this.crawlDatum = new CrawlDatum();
  this.inlinks = new Inlinks();
  this.filter = new StaticFieldIndexer();
}
 
Example #18
Source File: IndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Run all defined filters, in array order, feeding each filter the document
 * returned by the previous one.
 *
 * @return the document returned by the last filter, or {@code null} as soon
 *         as any filter returns {@code null}
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
    Inlinks inlinks) throws IndexingException {
  NutchDocument current = doc;
  int index = 0;
  while (index < this.indexingFilters.length) {
    current = this.indexingFilters[index].filter(current, parse, url, datum, inlinks);
    if (current == null) {
      // an indexing filter discarded the doc: skip the remaining filters
      return null;
    }
    index++;
  }

  return current;
}
 
Example #19
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Passes the document through this filter's helper steps, in order:
 * {@code addTime}, {@code addLength}, {@code addType} and {@code resetTitle}.
 *
 * @param doc the document handed to each helper
 * @param parse the parse whose data is handed to each helper
 * @param url the page URL; passed to the helpers in string form
 * @param datum the crawl datum, used by {@code addTime} and {@code addType}
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}
 
Example #20
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a {@code Content-Disposition} style metadata value of
 * {@code "filename=filename.ext"} results in a {@code title} field value of
 * {@code "filename.ext"}.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  Metadata contentMeta = new Metadata();
  contentMeta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(new NutchDocument(), parsed, pageUrl,
      new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", result.getFieldValue("title"));
}
 
Example #21
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code MoreIndexingFilter} on a parse whose metadata carries
 * {@code source} under {@code Response.CONTENT_TYPE} and asserts that the
 * resulting {@code type} field value equals {@code expected}.
 *
 * @param conf configuration handed to the filter
 * @param source content-type value placed in the metadata
 * @param expected expected value of the {@code type} field
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata contentMeta = new Metadata();
  contentMeta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = moreFilter.filter(new NutchDocument(), parsed, pageUrl,
      new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, result.getFieldValue("type"));
}
 
Example #22
Source File: TestBasicIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Exercises {@code BasicIndexingFilter} with a title limit of 10, a content
 * limit of 20 and domain indexing enabled, then checks the resulting
 * {@code title}, {@code domain}, {@code host}, {@code url}, {@code content}
 * and {@code tstamp} field values.
 */
public void testBasicIndexingFilter() throws Exception {
  // Configuration under test: truncate title/content, add the domain field.
  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  conf.setBoolean("indexer.add.domain", true);
  conf.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter basicFilter = new BasicIndexingFilter();
  basicFilter.setConf(conf);
  assertNotNull(basicFilter);

  NutchDocument document = new NutchDocument();

  // Parse fixture.
  String pageTitle = "The Foo Page";
  Outlink[] pageOutlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata pageMeta = new Metadata();
  pageMeta.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, pageOutlinks, pageMeta);
  ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

  // Crawl datum fixture with a known fetch time.
  CrawlDatum fetched = new CrawlDatum();
  fetched.setFetchTime(100L);

  Inlinks noInlinks = new Inlinks();

  try {
    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    basicFilter.filter(document, parsed, pageUrl, fetched, noInlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
    document.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org",
    document.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org",
    document.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
    document.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo",
    document.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L),
    (Date) document.getField("tstamp").getValues().get(0));
}
 
Example #23
Source File: AnchorIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * The {@link AnchorIndexingFilter} filter object which supports boolean
 * configuration settings for the deduplication of anchors.
 * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
 * <p>
 * Every anchor text from {@code inlinks} is added to the document's
 * {@code anchor} field. When deduplication is enabled, anchors are compared
 * case-insensitively and only the first occurrence of each is added. The
 * comparison key is lower-cased with {@link java.util.Locale#ROOT} so the
 * result does not depend on the JVM's default locale.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text; may be
 *        {@code null}, in which case no anchors are added
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  String[] anchors = (inlinks != null ? inlinks.getAnchors()
    : new String[0]);

  // Lower-cased anchors already indexed; created lazily, only when deduplicating.
  HashSet<String> seen = null;

  for (String anchor : anchors) {
    if (deduplicate) {
      if (seen == null) seen = new HashSet<String>();

      // Locale.ROOT keeps the dedup key stable across default locales
      // (e.g. Turkish, where "I" does not lower-case to "i").
      String key = anchor.toLowerCase(java.util.Locale.ROOT);

      // add() returns false when the key was already present
      if (!seen.add(key)) {
        continue;
      }
    }
    doc.add("anchor", anchor);
  }

  return doc;
}
 
Example #24
Source File: CCIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes Creative Commons information found in the parse metadata.
 * <p>
 * Up to three pieces of metadata are read and, when present, turned into
 * features via {@code addFeature} / {@code addUrlFeatures}: the license URL,
 * the license location and the work type.
 *
 * @param doc the document that receives the features
 * @param parse the parse whose parse metadata is read
 * @param url the page URL, used only in the log message
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  Metadata parseMeta = parse.getData().getParseMeta();

  // index the license
  String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }

    // add the entire license as cc:license=xxx
    addFeature(doc, "license=" + license);

    // index license attributes extracted of the license url
    addUrlFeatures(doc, license);
  }

  // index the license location as cc:meta=xxx
  String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // index the work type; the metadata value is passed to addFeature as-is
  String type = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (type != null) {
    addFeature(doc, type);
  }

  return doc;
}
 
Example #25
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Test behaviour when a defined filter does not exist.
 * <p>
 * The filter order names a class {@code "NonExistingFilter"} followed by
 * {@code BasicIndexingFilter}. The test makes no assertions; it passes as
 * long as constructing and running the filters does not throw.
 *
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  String missingFilter = "NonExistingFilter";
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  filters.filter(new NutchDocument(), parsed, pageUrl, new CrawlDatum(), new Inlinks());
}
 
Example #26
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Test behaviour when the NutchDocument is null: running the filters on a
 * {@code null} document is expected to return {@code null}.
 */
public void testNutchDocumentNullIndexingFilter() throws IndexingException{
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  IndexingFilters filters = new IndexingFilters(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl parsed = new ParseImpl("text", parseData);
  Text pageUrl = new Text("http://www.example.com/");

  NutchDocument result = filters.filter(null, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertNull(result);
}
 
Example #27
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Test that resetting the index filter order does not take effect.
 * <p>
 * Filters are first built with only {@code BasicIndexingFilter} in the order
 * property. The property is then changed to add {@code MetadataIndexer} and a
 * second {@code IndexingFilters} is built from the same configuration. Both
 * runs are expected to yield the same number of field names.
 *
 * @throws IndexingException
 */
public void testFilterCacheIndexingFilter() throws IndexingException{
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);

  // First run: only the basic filter is configured.
  IndexingFilters firstFilters = new IndexingFilters(conf);
  ParseData firstData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  NutchDocument firstDoc = firstFilters.filter(new NutchDocument(), new ParseImpl("text", firstData),
    new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // add another index filter
  String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";
  // set content metadata
  Metadata contentMeta = new Metadata();
  contentMeta.add("example","data");
  // set content metadata property defined in MetadataIndexer
  conf.set("index.content.md","example");
  // add MetadataIndexer filter
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);

  // Second run: built after the order property was changed.
  IndexingFilters secondFilters = new IndexingFilters(conf);
  ParseData secondData = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
  NutchDocument secondDoc = secondFilters.filter(new NutchDocument(), new ParseImpl("text", secondData),
    new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals(firstDoc.getFieldNames().size(), secondDoc.getFieldNames().size());
}
 
Example #28
Source File: LanguageDetectionFilter.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 * <p>
 * Adds a {@code lang} field to the document. The language is taken from the
 * parse metadata ({@code Metadata.LANGUAGE}); when absent it is detected from
 * the page title and text, joined by a single space. A {@code null} result is
 * indexed as {@code "unknown"}.
 *
 * @throws IndexingException if this filter has no configuration, if
 *         initialization recorded a {@code cause}, or if language detection
 *         fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// Nothing recorded by the parser: detect from "<title> <text>".
		String sample = parse.getData().getTitle() + " " + parse.getText();
		try {
			Detector detector = DetectorFactory.create();
			detector.setMaxTextLength(textsize_upper_limit);
			detector.append(sample);
			language = detector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}

	doc.add("lang", language == null ? "unknown" : language);
	return doc;
}
 
Example #29
Source File: IndexingFilters.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Run all defined filters, in array order, feeding each filter the document
 * returned by the previous one.
 *
 * @return the document returned by the last filter, or {@code null} as soon
 *         as any filter returns {@code null}
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
    Inlinks inlinks) throws IndexingException {
  NutchDocument current = doc;
  int index = 0;
  while (index < this.indexingFilters.length) {
    current = this.indexingFilters[index].filter(current, parse, url, datum, inlinks);
    if (current == null) {
      // an indexing filter discarded the doc: skip the remaining filters
      return null;
    }
    index++;
  }

  return current;
}
 
Example #30
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Passes the document through this filter's helper steps, in order:
 * {@code addTime}, {@code addLength}, {@code addType} and {@code resetTitle}.
 *
 * @param doc the document handed to each helper
 * @param parse the parse whose data is handed to each helper
 * @param url the page URL; passed to the helpers in string form
 * @param datum the crawl datum, used by {@code addTime} and {@code addType}
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}