org.apache.nutch.parse.Parse Java Examples

Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds a {@code lang} field to the document being indexed.
 *
 * <p>The parse metadata entry {@code Metadata.LANGUAGE} (possibly put there by
 * HTMLLanguageParser) is read first. Only when it is {@code null} is the
 * {@code Response.CONTENT_LANGUAGE} content metadata consulted. A {@code null}
 * or empty result is replaced by {@code "unknown"}.
 *
 * @param doc document that receives the {@code lang} field
 * @param parse parse whose metadata is inspected
 * @param url not read by this method
 * @param datum not read by this method
 * @param inlinks not read by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Fall back to what the HTTP header tells us about the language.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = (language == null) || language.isEmpty();
  doc.add("lang", undetermined ? "unknown" : language);

  return doc;
}

Source File: RelTagParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Scans the HTML document for rel-tags and records each one in the parse
 * metadata under the {@code REL_TAG} key.
 *
 * @param content content whose URL selects the parse inside {@code parseResult}
 * @param parseResult parse result holding the parse to annotate
 * @param metaTags not read by this method
 * @param doc DOM fragment handed to the rel-tag {@code Parser}
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());

  // Collect the document's rel-tags before touching the metadata.
  Set relTags = new Parser(doc).getRelTags();
  Metadata parseMeta = parse.getData().getParseMeta();

  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }
  return parseResult;
}

Source File: TLDScoringFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boost of every top-level domain listed in
 * the document's {@code tld} field.
 *
 * <p>Values that have no entry in {@code tldEntries} leave the score
 * untouched, as does a missing {@code tld} field.
 *
 * @param url not read by this method
 * @param doc document whose {@code tld} field is inspected
 * @param dbDatum not read by this method
 * @param fetchDatum not read by this method
 * @param parse not read by this method
 * @param inlinks not read by this method
 * @param initScore score to be scaled
 * @return {@code initScore} multiplied by the product of all matching boosts
 * @throws ScoringFilterException declared by the signature; not thrown directly here
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField == null) {
    return initScore * multiplier;
  }

  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}

Source File: RelTagParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Scans the HTML document for rel-tags and records each one in the parse
 * metadata under the {@code REL_TAG} key.
 *
 * @param content content whose URL selects the parse inside {@code parseResult}
 * @param parseResult parse result holding the parse to annotate
 * @param metaTags not read by this method
 * @param doc DOM fragment handed to the rel-tag {@code Parser}
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());

  // Collect the document's rel-tags before touching the metadata.
  Set<?> relTags = new Parser(doc).getRelTags();
  Metadata parseMeta = parse.getData().getParseMeta();

  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }

  return parseResult;
}

Source File: URLMetaIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * CrawlDatum metadata into the NutchDocument. A tag is added as a document
 * field only when the CrawlDatum actually carries a value for it.
 *
 * @param doc document to enrich; returned unchanged when it is {@code null}
 *            or when no tags are configured
 * @param parse not read by this method
 * @param url not read by this method
 * @param datum crawl datum whose metadata is searched for each tag
 * @param inlinks not read by this method
 * @return the {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature; not thrown directly here
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	boolean nothingToDo = (urlMetaTags == null) || (doc == null);
	if (nothingToDo) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}

Source File: TestImageMetadata.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Parses every sample file with the {@code parse-tika} plugin and checks
 * that the reported image metadata is 121 wide and 48 high.
 *
 * @throws ProtocolException propagated from fetching the sample file
 * @throws ParseException propagated from parsing the sample file
 */
public void testIt() throws ProtocolException, ParseException {
  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // A fresh configuration is created for each sample file.
    final Configuration conf = NutchConfiguration.create();
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(location);
    final Content content =
        protocol.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    assertEquals("121", parse.getData().getMeta("width"));
    assertEquals("48", parse.getData().getMeta("height"));
  }
}

Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds a {@code lang} field to the document being indexed.
 *
 * <p>The parse metadata entry {@code Metadata.LANGUAGE} (possibly put there by
 * HTMLLanguageParser) is read first. Only when it is {@code null} is the
 * {@code Response.CONTENT_LANGUAGE} content metadata consulted. A {@code null}
 * or empty result is replaced by {@code "unknown"}.
 *
 * @param doc document that receives the {@code lang} field
 * @param parse parse whose metadata is inspected
 * @param url not read by this method
 * @param datum not read by this method
 * @param inlinks not read by this method
 * @return the same {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Fall back to what the HTTP header tells us about the language.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  if (language == null || language.isEmpty()) {
    doc.add("lang", "unknown");
  } else {
    doc.add("lang", language);
  }

  return doc;
}

Source File: JSParseFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Walks the document for additional outlinks and, when any are found, replaces
 * the parse stored for the content URL with one whose outlink array holds the
 * newly found links followed by the previously known ones.
 *
 * <p>NOTE(review): {@code walk} is assumed to append the outlinks it discovers
 * to the list passed in; confirm against its definition.
 *
 * @param content content supplying the lookup URL and the base URL
 * @param parseResult parse result to read from and, possibly, update
 * @param metaTags forwarded to {@code walk}
 * @param doc DOM fragment forwarded to {@code walk}
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());
  String baseUrl = content.getBaseUrl();

  ArrayList<Outlink> collected = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, baseUrl, collected);

  // Nothing new was found: leave the parse result untouched.
  if (collected.isEmpty()) {
    return parseResult;
  }

  Outlink[] existing = parse.getData().getOutlinks();
  String title = parse.getData().getTitle();
  collected.addAll(Arrays.asList(existing));

  ParseStatus status = parse.getData().getStatus();
  String text = parse.getText();
  Outlink[] merged = collected.toArray(new Outlink[collected.size()]);

  ParseData updatedData = new ParseData(status, title, merged,
                                        parse.getData().getContentMeta(),
                                        parse.getData().getParseMeta());

  // Replace the original parse object with the new one.
  parseResult.put(content.getUrl(), new ParseText(text), updatedData);
  return parseResult;
}

Source File: HTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Tries to find the document's language from page headers and metadata.
 *
 * <p>Sources are consulted in order and the first non-null answer wins:
 * the parse metadata (via {@code getLanguageFromMetadata}), a
 * {@code LanguageParser} built over the DOM fragment, and finally the
 * {@code Response.CONTENT_LANGUAGE} content metadata.
 *
 * @param page parse whose metadata is inspected
 * @param doc DOM fragment handed to {@code LanguageParser}
 * @return the language found, or {@code null} when no source provides one
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String fromMetadata = getLanguageFromMetadata(page.getData().getParseMeta());
    if (fromMetadata != null) {
        return fromMetadata;
    }

    String fromDocument = new LanguageParser(doc).getLanguage();
    if (fromDocument != null) {
        return fromDocument;
    }

    return page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}

Source File: TestZipParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Parses every sample file with the {@code parse-zip} plugin and checks that
 * the extracted text equals {@code expectedText}.
 *
 * @throws ProtocolException propagated from fetching the sample file
 * @throws ParseException propagated from parsing the sample file
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    final Protocol protocol = new ProtocolFactory(conf).getProtocol(location);
    final Content content =
        protocol.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-zip", content).get(content.getUrl());

    assertTrue(parse.getText().equals(expectedText));
  }
}

Source File: TestSWFParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Parses every sample file and checks that its whitespace-normalised text
 * equals the entry of {@code sampleTexts} at the same index.
 *
 * @throws ProtocolException propagated from fetching the sample file
 * @throws ParseException propagated from parsing the sample file
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    final Protocol protocol = new ProtocolFactory(conf).getProtocol(location);
    final Content content =
        protocol.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
    final Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Collapse every run of blanks, tabs and line breaks into a single space.
    final String normalised = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[idx].equals(normalised));
  }
}

Source File: TestPdfParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Parses every sample file with the {@code parse-tika} plugin and checks that
 * the extracted text contains {@code expectedText}.
 *
 * @throws ProtocolException propagated from fetching the sample file
 * @throws ParseException propagated from parsing the sample file
 */
public void testIt() throws ProtocolException, ParseException {
  String urlString;
  Protocol protocol;
  Content content;
  Parse parse;

  for (int i = 0; i < sampleFiles.length; i++) {
    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];

    Configuration conf = NutchConfiguration.create();
    protocol = new ProtocolFactory(conf).getProtocol(urlString);
    content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
    parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    // indexOf returns -1 when the text is absent and 0 when the text starts
    // the document, so "found" is ">= 0"; "> 0" would reject a valid match
    // at the very beginning of the extracted text.
    int index = parse.getText().indexOf(expectedText);
    assertTrue(index >= 0);
  }
}

Source File: URLMetaIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * CrawlDatum metadata into the NutchDocument. A tag is added as a document
 * field only when the CrawlDatum actually carries a value for it.
 *
 * @param doc document to enrich; returned unchanged when it is {@code null}
 *            or when no tags are configured
 * @param parse not read by this method
 * @param url not read by this method
 * @param datum crawl datum whose metadata is searched for each tag
 * @param inlinks not read by this method
 * @return the {@code doc} instance that was passed in
 * @throws IndexingException declared by the signature; not thrown directly here
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	if (urlMetaTags != null && doc != null) {
		for (String tagName : urlMetaTags) {
			Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
			if (tagValue != null) {
				doc.add(tagName, tagValue.toString());
			}
		}
	}

	return doc;
}

Source File: TestSWFParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Parses every sample file and checks that its whitespace-normalised text
 * equals the entry of {@code sampleTexts} at the same index.
 *
 * @throws ProtocolException propagated from fetching the sample file
 * @throws ParseException propagated from parsing the sample file
 */
public void testIt() throws ProtocolException, ParseException {
  Configuration conf = NutchConfiguration.create();

  int idx = 0;
  while (idx < sampleFiles.length) {
    String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    Protocol protocol = new ProtocolFactory(conf).getProtocol(location);
    Content content =
        protocol.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Collapse every run of blanks, tabs and line breaks into a single space.
    String normalised = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
    assertTrue(sampleTexts[idx].equals(normalised));

    idx++;
  }
}

Source File: TestCCParseFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Parses one HTML file and checks the licence metadata found in it.
 *
 * <p>The file is read fully into memory, wrapped in a {@code Content} of type
 * {@code text/html} (with {@code url} used for both URL arguments), parsed,
 * and the resulting parse metadata is compared with the expected values.
 *
 * @param file HTML file to read
 * @param url URL assigned to the content
 * @param license expected value of the {@code License-Url} entry
 * @param location expected value of the {@code License-Location} entry
 * @param type expected value of the {@code Work-Type} entry
 * @throws Exception if reading or parsing the file fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    // Release the file handle even when the read fails part-way through.
    in.close();
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();

  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}

Source File: TLDScoringFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boost of every top-level domain listed in
 * the document's {@code tld} field.
 *
 * <p>Values that have no entry in {@code tldEntries} leave the score
 * untouched, as does a missing {@code tld} field.
 *
 * @param url not read by this method
 * @param doc document whose {@code tld} field is inspected
 * @param dbDatum not read by this method
 * @param fetchDatum not read by this method
 * @param parse not read by this method
 * @param inlinks not read by this method
 * @param initScore score to be scaled
 * @return {@code initScore} multiplied by the product of all matching boosts
 * @throws ScoringFilterException declared by the signature; not thrown directly here
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float factor = 1.0f;
  NutchField tldField = doc.getField("tld");

  if (tldField != null) {
    for (Object tldValue : tldField.getValues()) {
      DomainSuffix suffix = tldEntries.get(tldValue.toString());
      if (suffix != null) {
        factor = factor * suffix.getBoost();
      }
    }
  }

  return initScore * factor;
}

Source File: JSParseFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Walks the document for additional outlinks and, when any are found, replaces
 * the parse stored for the content URL with one whose outlink array holds the
 * newly found links followed by the previously known ones.
 *
 * <p>NOTE(review): {@code walk} is assumed to append the outlinks it discovers
 * to the list passed in; confirm against its definition.
 *
 * @param content content supplying the lookup URL and the base URL
 * @param parseResult parse result to read from and, possibly, update
 * @param metaTags forwarded to {@code walk}
 * @param doc DOM fragment forwarded to {@code walk}
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());
  String baseUrl = content.getBaseUrl();

  ArrayList collected = new ArrayList();
  walk(doc, parse, metaTags, baseUrl, collected);

  // Nothing new was found: leave the parse result untouched.
  if (collected.size() <= 0) {
    return parseResult;
  }

  Outlink[] existing = parse.getData().getOutlinks();
  String title = parse.getData().getTitle();
  collected.addAll(Arrays.asList(existing));

  ParseStatus status = parse.getData().getStatus();
  String text = parse.getText();
  Outlink[] merged = (Outlink[]) collected.toArray(new Outlink[collected.size()]);

  ParseData updatedData = new ParseData(status, title, merged,
                                        parse.getData().getContentMeta(),
                                        parse.getData().getParseMeta());

  // Replace the original parse object with the new one.
  parseResult.put(content.getUrl(), new ParseText(text), updatedData);
  return parseResult;
}

Source File: HeadingsParseFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Stores the text of each configured heading in the parse metadata.
 *
 * <p>For every name in {@code headings}, the value returned by
 * {@code getElement} is trimmed and, when non-empty, stored in the parse
 * metadata under that name. Nothing is stored for headings that are missing
 * or consist solely of whitespace.
 *
 * @param content content whose URL selects the parse inside {@code parseResult}
 * @param parseResult parse result holding the parse to annotate
 * @param metaTags not read by this method
 * @param doc DOM fragment; kept in {@code this.doc} for use by {@code getElement}
 *            (NOTE(review): presumably read there; confirm against its definition)
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  this.doc = doc;

  String heading;
  Parse parse = parseResult.get(content.getUrl());

  for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
    heading = getElement(headings[i]);

    if (heading != null) {
      // Strings are immutable: trim() returns a new value that must be kept,
      // otherwise surrounding whitespace ends up in the metadata and
      // whitespace-only headings slip past the length check below.
      heading = heading.trim();

      if (heading.length() > 0) {
        parse.getData().getParseMeta().set(headings[i], heading);
      }
    }
  }

  return parseResult;
}

Source File: FeedParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Runs a command line version of this {@link Parser}.
 *
 * <p>The feed file is read fully into memory, parsed as
 * {@code application/rss+xml}, and the key, data and text of every resulting
 * parse are printed to standard output. When the argument count is wrong, a
 * usage message is printed to standard error and the JVM exits with status 1.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle whether or not the read succeeded.
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}

Source File: TestHTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Tests parsing of language identifiers from HTML: every entry of
 * {@code docs} is parsed and the {@code Metadata.LANGUAGE} parse metadata is
 * compared with the entry of {@code metalanguages} at the same index.
 * Any exception is printed to standard output and fails the test.
 */
public void testMetaHTMLParsing() {

  try {
    final ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());

    // docs and metalanguages are walked in lockstep
    int idx = 0;
    while (idx < docs.length) {
      final Content content = getContent(docs[idx]);
      final Parse parse = parseUtil.parse(content).get(content.getUrl());
      assertEquals(metalanguages[idx],
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
      idx++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }

}

Source File: TestMetatagParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Parses the sample file and checks that the {@code metatag.description} and
 * {@code metatag.keywords} parse metadata equal the expected
 * {@code description} and {@code keywords}. Any exception is printed and
 * fails the test.
 */
public void testIt() {
  final Configuration conf = NutchConfiguration.create();
  final String location = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    final Text key = new Text(location);
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(location);
    final Content content = protocol.getProtocolOutput(key, new CrawlDatum()).getContent();

    final Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    final Metadata parseMeta = parse.getData().getParseMeta();

    // Both values must survive the round trip through the parser unchanged.
    assertEquals(description, parseMeta.get("metatag.description"));
    assertEquals(keywords, parseMeta.get("metatag.keywords"));
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
}

Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Tries to find the document's language from page headers and metadata.
 *
 * <p>Sources are consulted in order and the first non-null answer wins:
 * the parse metadata (via {@code getLanguageFromMetadata}), a
 * {@code LanguageParser} built over the DOM fragment, and finally the
 * {@code Response.CONTENT_LANGUAGE} content metadata.
 *
 * @param page parse whose metadata is inspected
 * @param doc DOM fragment handed to {@code LanguageParser}
 * @return the language found, or {@code null} when no source provides one
 */
private String detectLanguage(Parse page, DocumentFragment doc) {
    String language = getLanguageFromMetadata(page.getData().getParseMeta());

    if (language == null) {
        language = new LanguageParser(doc).getLanguage();
    }

    // Last resort: the Content-Language recorded in the content metadata.
    return (language != null)
            ? language
            : page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}

Source File: FeedParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Runs a command line version of this {@link Parser}.
 *
 * <p>The feed file is read fully into memory, parsed as
 * {@code application/rss+xml}, and the key, data and text of every resulting
 * parse are printed to standard output. When the argument count is wrong, a
 * usage message is printed to standard error and the JVM exits with status 1.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle whether or not the read succeeded.
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}

Source File: S2jhIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Copies the {@code sku} and {@code price} parse metadata into the document.
 *
 * @param doc document that receives the {@code sku} and {@code price} fields
 * @param parse parse whose data supplies both values
 * @param url not read by this method
 * @param datum not read by this method
 * @param inlinks not read by this method
 * @return {@code doc}, or {@code null} when the {@code sku} value is blank
 *         (NOTE(review): presumably a null return makes the caller skip the
 *         document; confirm against the caller)
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    final ParseData data = parse.getData();
    final String skuValue = data.getMeta("sku");

    // A blank SKU means there is nothing to add.
    if (!StringUtils.isBlank(skuValue)) {
        doc.add("sku", skuValue);
        doc.add("price", data.getMeta("price"));
        return doc;
    }

    return null;
}

Source File: HTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Scans the HTML document looking at possible indications of content
 * language:
 * <ol>
 * <li>html lang attribute
 * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)</li>
 * <li>meta dc.language
 * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)</li>
 * <li>meta http-equiv (content-language)
 * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)</li>
 * </ol>
 *
 * <p>The {@code detect} and {@code identify} settings choose the strategy:
 * when exactly one of them is non-negative only that strategy runs; otherwise
 * the one with the smaller value runs first and the other serves as fallback.
 * Equal values log a warning and leave the parse untouched. A non-null result
 * is stored in the parse metadata under {@code Metadata.LANGUAGE}.
 *
 * @param content content whose URL selects the parse inside {@code parseResult}
 * @param parseResult parse result holding the parse to annotate
 * @param metaTags not read by this method
 * @param doc DOM fragment forwarded to {@code detectLanguage}
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
        HTMLMetaTags metaTags, DocumentFragment doc) {
    final Parse parse = parseResult.get(content.getUrl());
    final String lang;

    if (detect >= 0 && identify < 0) {
        lang = detectLanguage(parse, doc);
    } else if (detect < 0 && identify >= 0) {
        lang = identifyLanguage(parse);
    } else if (detect < identify) {
        String detected = detectLanguage(parse, doc);
        lang = (detected != null) ? detected : identifyLanguage(parse);
    } else if (identify < detect) {
        String identified = identifyLanguage(parse);
        lang = (identified != null) ? identified : detectLanguage(parse, doc);
    } else {
        LOG.warn("No configuration for language extraction policy is provided");
        return parseResult;
    }

    if (lang != null) {
        parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
    }

    return parseResult;
}

Source File: TestOOParser.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Parses every sample file whose name starts with {@code ootest} using the
 * {@code parse-tika} plugin and checks that some text was extracted. The
 * expected text and each extracted text are printed to standard output.
 *
 * @throws ProtocolException propagated from fetching the sample file
 * @throws ParseException propagated from parsing the sample file
 */
public void testIt() throws ProtocolException, ParseException {
  final Configuration conf = NutchConfiguration.create();
  final ProtocolFactory factory = new ProtocolFactory(conf);

  System.out.println("Expected : "+expectedText);

  for (int idx = 0; idx < sampleFiles.length; idx++) {
    final String location = "file:" + sampleDir + fileSeparator + sampleFiles[idx];

    // Only the OpenOffice samples are of interest here.
    if (!sampleFiles[idx].startsWith("ootest")) {
      continue;
    }

    final Protocol protocol = factory.getProtocol(location);
    final Content content =
        protocol.getProtocolOutput(new Text(location), new CrawlDatum()).getContent();
    final Parse parse =
        new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(content.getUrl());

    final String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();

    // Only the presence of text is checked; the ordering of the elements may
    // differ from what earlier versions of this test expected.
    assertTrue(text!=null && text.length() > 0);

    System.out.println("Found "+sampleFiles[idx]+": "+text);
  }
}

Source File: AnchorIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Adds the anchor texts of the inlinks to the document as {@code anchor}
 * fields. When the {@code deduplicate} setting is on (see
 * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml), an anchor
 * is skipped if an anchor with the same lower-cased text was already added.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text; may be {@code null},
 *                in which case no anchors are added
 * @return filtered NutchDocument
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  String[] anchors;
  if (inlinks == null) {
    anchors = new String[0];
  } else {
    anchors = inlinks.getAnchors();
  }

  // Lower-cased anchors that have already been added to the document.
  HashSet<String> seen = new HashSet<String>();

  for (String anchor : anchors) {
    if (!deduplicate) {
      doc.add("anchor", anchor);
      continue;
    }

    // add() reports false when the lower-cased anchor was already present.
    if (seen.add(anchor.toLowerCase())) {
      doc.add("anchor", anchor);
    }
  }

  return doc;
}

Source File: AnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the anchor texts of the inlinks to the document as {@code anchor}
 * fields. When the {@code deduplicate} setting is on (see
 * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml), an anchor
 * is skipped if an anchor with the same lower-cased text was already added.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text; may be {@code null},
 *                in which case no anchors are added
 * @return filtered NutchDocument
 * @throws IndexingException declared by the signature; not thrown directly here
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  final String[] anchors = (inlinks == null) ? new String[0] : inlinks.getAnchors();

  // Lower-cased anchors that have already been added to the document.
  final HashSet<String> seen = new HashSet<String>();

  for (String anchor : anchors) {
    boolean firstOccurrence = !deduplicate || seen.add(anchor.toLowerCase());
    if (firstOccurrence) {
      doc.add("anchor", anchor);
    }
  }

  return doc;
}

Source File: LanguageDetectionFilter.java From weslang with Apache License 2.0

5 votes

/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code lang} field to the document. The value comes from the
 * {@code Metadata.LANGUAGE} parse metadata when present; otherwise it is
 * detected from the title and text of the parse, and {@code "unknown"} is
 * used when detection yields {@code null}.
 *
 * @throws IndexingException if the filter has no configuration, if its
 *         initialization failed, or if language detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

	if (language == null) {
		// Detection runs over the title and the body text, separated by a blank.
		final String sample = parse.getData().getTitle() + " " + parse.getText();
		try {
			final Detector detector = DetectorFactory.create();
			detector.setMaxTextLength(textsize_upper_limit);
			detector.append(sample);
			language = detector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}

	doc.add("lang", (language == null) ? "unknown" : language);
	return doc;
}

Source File: HTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Uses statistical language identification to extract the page language.
 *
 * <p>The title and the text of the parse are joined (the text is preceded by
 * a single space), cut to {@code contentMaxlength} characters unless that
 * setting is {@code -1}, and handed to a {@code LanguageIdentifier}.
 *
 * @param parse parse supplying title and text; may be {@code null}
 * @return the identified language, or {@code null} when {@code parse} is
 *         {@code null} or when {@code onlyCertain} is set and the identifier
 *         is not reasonably certain
 */
private String identifyLanguage(Parse parse) {
    if (parse == null) {
        return null;
    }

    final StringBuilder buffer = new StringBuilder();

    final String title = parse.getData().getTitle();
    if (title != null) {
        buffer.append(title);
    }

    final String body = parse.getText();
    if (body != null) {
        buffer.append(" ").append(body);
    }

    String sample = buffer.toString();

    // Limit the amount of text handed to the identifier, if configured.
    final boolean limited = this.contentMaxlength != -1;
    if (limited && sample.length() > this.contentMaxlength) {
        sample = sample.substring(0, contentMaxlength);
    }

    final LanguageIdentifier identifier = new LanguageIdentifier(sample);

    if (onlyCertain && !identifier.isReasonablyCertain()) {
        return null;
    }
    return identifier.getLanguage();
}

org.apache.nutch.parse.Parse Java Examples