org.apache.nutch.metadata.Metadata Java Examples

The following examples show how to use org.apache.nutch.metadata.Metadata. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the "lastModified" and "date" fields to the document.
 *
 * <p>"lastModified" is written only when the parse data carries a
 * {@code Metadata.LAST_MODIFIED} entry. "date" is always written; when no
 * time was obtained from that entry it uses the CrawlDatum's modified time,
 * or its fetch time if the modified time is not positive.
 *
 * @return the same document instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModifiedHeader = data.getMeta(Metadata.LAST_MODIFIED);

  long timestamp;
  if (lastModifiedHeader == null) {
    timestamp = -1;
  } else {
    // Convert the header value and record it under its own field.
    timestamp = getTime(lastModifiedHeader, url);
    doc.add("lastModified", new Date(timestamp));
  }

  if (timestamp == -1) {
    // Nothing usable so far: consult the CrawlDatum instead.
    final long modified = datum.getModifiedTime();
    timestamp = (modified > 0) ? modified : datum.getFetchTime();
  }

  doc.add("date", new Date(timestamp));
  return doc;
}
 
Example #2
Source File: RelTagParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Collects the rel-tags found in the HTML document and adds each of them to
 * the parse metadata under {@code REL_TAG}.
 *
 * @return the {@code parseResult} that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  // Parse entry belonging to the URL of the content being filtered.
  final Parse parse = parseResult.get(content.getUrl());

  // Extract the rel-tags first, then look up the metadata they go into.
  final Iterator relTags = new Parser(doc).getRelTags().iterator();
  final Metadata parseMeta = parse.getData().getParseMeta();

  while (relTags.hasNext()) {
    final String relTag = (String) relTags.next();
    parseMeta.add(REL_TAG, relTag);
  }
  return parseResult;
}
 
Example #3
Source File: HtmlParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line entry point: reads the local file named by {@code args[0]},
 * parses it as "text/html" and prints the resulting parse data and text.
 *
 * @param args {@code args[0]} is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
  //LOG.setLevel(Level.FINE);
  String name = args[0];
  String url = "file:"+name;
  File file = new File(name);
  byte[] bytes = new byte[(int)file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  Configuration conf = NutchConfiguration.create();
  HtmlParser parser = new HtmlParser();
  parser.setConf(conf);
  Parse parse = parser.getParse(
          new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
  System.out.println("data: "+parse.getData());

  System.out.println("text: "+parse.getText());

}
 
Example #4
Source File: WdcParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line entry point: reads the local file named by {@code args[0]},
 * parses it as "text/html" and prints the parse data, the parse text and the
 * value stored under {@code META_CONTAINS_SEM}.
 *
 * @param args {@code args[0]} is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
	// LOG.setLevel(Level.FINE);
	String name = args[0];
	String url = "file:" + name;
	File file = new File(name);
	byte[] bytes = new byte[(int) file.length()];
	DataInputStream in = new DataInputStream(new FileInputStream(file));
	try {
		in.readFully(bytes);
	} finally {
		// Release the file handle even when the read fails.
		in.close();
	}
	Configuration conf = NutchConfiguration.create();
	WdcParser parser = new WdcParser();
	parser.setConf(conf);
	Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
	System.out.println("data: " + parse.getData());

	System.out.println("text: " + parse.getText());

	String contains = parse.getData().getMeta(META_CONTAINS_SEM);
	System.out.println("contains: " + contains);

}
 
Example #5
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the "lastModified" and "date" fields to the document.
 *
 * <p>"lastModified" is written only when the parse data carries a
 * {@code Metadata.LAST_MODIFIED} entry. "date" is always written; when no
 * time was obtained from that entry the CrawlDatum's fetch time is used.
 *
 * @return the same document instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModifiedHeader = data.getMeta(Metadata.LAST_MODIFIED);

  long timestamp;
  if (lastModifiedHeader == null) {
    timestamp = -1;
  } else {
    // Convert the header value and record it under its own field.
    timestamp = getTime(lastModifiedHeader, url);
    doc.add("lastModified", new Date(timestamp));
  }

  // No usable time so far: use the fetch time instead.
  if (timestamp == -1) {
    timestamp = datum.getFetchTime();
  }

  doc.add("date", new Date(timestamp));
  return doc;
}
 
Example #6
Source File: FeedParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a command line version of this {@link Parser}.
 * 
 * <p>The file is read fully into memory, parsed as "application/rss+xml",
 * and the key, data and text of every resulting parse entry are printed.
 * 
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 * 
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
 
Example #7
Source File: HtmlParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line entry point: reads the local file named by {@code args[0]},
 * parses it as "text/html" and prints the resulting parse data and text.
 *
 * @param args {@code args[0]} is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
  //LOG.setLevel(Level.FINE);
  String name = args[0];
  String url = "file:"+name;
  File file = new File(name);
  byte[] bytes = new byte[(int)file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  Configuration conf = NutchConfiguration.create();
  HtmlParser parser = new HtmlParser();
  parser.setConf(conf);
  Parse parse = parser.getParse(
          new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
  System.out.println("data: "+parse.getData());

  System.out.println("text: "+parse.getText());

}
 
Example #8
Source File: FeedParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a command line version of this {@link Parser}.
 * 
 * <p>The file is read fully into memory, parsed as "application/rss+xml",
 * and the key, data and text of every resulting parse entry are printed.
 * 
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 * 
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
 
Example #9
Source File: LanguageIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a "lang" field to the document.
 *
 * <p>The value is taken from the parse metadata entry
 * {@code Metadata.LANGUAGE}; if that is absent, from the content metadata
 * entry {@code Response.CONTENT_LANGUAGE}; and if the result is still null
 * or empty, the literal "unknown" is used.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // First choice: the language recorded in the parse metadata.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: the language announced in the content metadata.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  final boolean missing = language == null || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);
  return doc;
}
 
Example #10
Source File: TestHTMLLanguageParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every test document and checks that the language stored in the
 * parse metadata matches the expected entry of {@code metalanguages}.
 */
public void testMetaHTMLParsing() {

  try {
    final ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());

    // Walk the test documents and their expected languages in parallel.
    int index = 0;
    while (index < docs.length) {
      final Content content = getContent(docs[index]);
      final Parse parse = parseUtil.parse(content).get(content.getUrl());
      final String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[index], detected);
      index++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }

}
 
Example #11
Source File: SWFParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line entry point: reads the SWF file, parses it as
 * "application/x-shockwave-flash" and prints the parse text and parse data.
 *
 * Arguments are: 0. Name of input SWF file.
 *
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  FileInputStream in = new FileInputStream(args[0]);

  byte[] buf;
  try {
    buf = new byte[in.available()];
    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < buf.length) {
      int count = in.read(buf, filled, buf.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    // Release the file handle even when the read fails.
    in.close();
  }
  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
                                        buf, "application/x-shockwave-flash",
                                        new Metadata(),
                                        NutchConfiguration.create()));
  Parse p = parseResult.get("file:" + args[0]);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
 
Example #12
Source File: TestMetatagParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches and parses the sample file, then checks that the
 * "metatag.description" and "metatag.keywords" parse metadata entries hold
 * the expected values. Any exception fails the test.
 */
public void testIt() {
  final Configuration conf = NutchConfiguration.create();
  final String sampleUrl = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    // Fetch the sample through the protocol registered for its URL.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
    final Text key = new Text(sampleUrl);
    final Content content = protocol.getProtocolOutput(key, new CrawlDatum()).getContent();

    final ParseUtil parseUtil = new ParseUtil(conf);
    final Parse parse = parseUtil.parse(content).get(content.getUrl());

    // The parsed metatags must match the expected values.
    final Metadata parsed = parse.getData().getParseMeta();
    assertEquals(description, parsed.get("metatag.description"));
    assertEquals(keywords, parsed.get("metatag.keywords"));
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
}
 
Example #13
Source File: LanguageIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a "lang" field to the document.
 *
 * <p>The value is taken from the parse metadata entry
 * {@code Metadata.LANGUAGE}; if that is absent, from the content metadata
 * entry {@code Response.CONTENT_LANGUAGE}; and if the result is still null
 * or empty, the literal "unknown" is used.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // First choice: the language recorded in the parse metadata.
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: the language announced in the content metadata.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  final boolean missing = language == null || language.length() == 0;
  doc.add("lang", missing ? "unknown" : language);
  return doc;
}
 
Example #14
Source File: TestParseData.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a ParseData with two outlinks and two metadata entries and hands it
 * to {@code WritableTestUtils.testWritable}.
 */
public void testParseData() throws Exception {
  final Outlink[] links = {
    new Outlink("http://foo.com/", "Foo"),
    new Outlink("http://bar.com/", "Bar"),
  };

  final Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  meta.add("Charset", "UTF-8");

  final ParseData parseData =
      new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, meta);

  WritableTestUtils.testWritable(parseData, null);
}
 
Example #15
Source File: TestHTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every test document and checks that the language stored in the
 * parse metadata matches the expected entry of {@code metalanguages}.
 */
public void testMetaHTMLParsing() {

  try {
    final ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());

    // Walk the test documents and their expected languages in parallel.
    int index = 0;
    while (index < docs.length) {
      final Content content = getContent(docs[index]);
      final Parse parse = parseUtil.parse(content).get(content.getUrl());
      final String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
      assertEquals(metalanguages[index], detected);
      index++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }

}
 
Example #16
Source File: SWFParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Command-line entry point: reads the SWF file, parses it as
 * "application/x-shockwave-flash" and prints the parse text and parse data.
 *
 * Arguments are: 0. Name of input SWF file.
 *
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  FileInputStream in = new FileInputStream(args[0]);

  byte[] buf;
  try {
    buf = new byte[in.available()];
    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < buf.length) {
      int count = in.read(buf, filled, buf.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    // Close in finally so the handle is released when the read fails too.
    in.close();
  }
  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
                                        buf, "application/x-shockwave-flash",
                                        new Metadata(),
                                        NutchConfiguration.create()));
  Parse p = parseResult.get("file:" + args[0]);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
 
Example #17
Source File: TestParseData.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a ParseData with two outlinks and two metadata entries and hands it
 * to {@code WritableTestUtils.testWritable}.
 */
public void testParseData() throws Exception {
  final Outlink[] links = {
    new Outlink("http://foo.com/", "Foo"),
    new Outlink("http://bar.com/", "Bar"),
  };

  final Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  meta.add("Charset", "UTF-8");

  final ParseData parseData =
      new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, meta);

  WritableTestUtils.testWritable(parseData, null);
}
 
Example #18
Source File: HttpAuthenticationFactory.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up an {@link HttpAuthentication} for the challenge stored in the
 * given headers under {@code WWW_AUTHENTICATE}.
 *
 * <p>A challenge of exactly "NTLM" is replaced by "Basic realm=techweb"
 * before it is handed to {@code HttpBasicAuthentication.getAuthentication}.
 * Exceptions raised during the lookup are logged and result in {@code null}.
 *
 * @param header the headers to inspect; may be {@code null}
 * @return the first authentication found, or {@code null} when the headers
 *         are {@code null}, carry no challenge, or nothing matched
 */
public HttpAuthentication findAuthentication(Metadata header) {

    if (header == null) return null;

    try {
      Collection<String> challenge = new ArrayList<String>();
      // Skip an absent header value: adding null here used to trigger a
      // NullPointerException below that was then logged as an error.
      String headerValue = header.get(WWW_AUTHENTICATE);
      if (headerValue != null) {
        challenge.add(headerValue);
      }

      for(String challengeString: challenge) {
        if ("NTLM".equals(challengeString))
          challengeString="Basic realm=techweb";

        if (LOG.isTraceEnabled())
          LOG.trace("Checking challengeString=" + challengeString);

        HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
        if (auth != null) return auth;

        //TODO Add additional Authentication lookups here
      }
    } catch (Exception e) {
      LOG.error("Error: ", e);
    }
    return null;
  }
 
Example #19
Source File: Content.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a Content from the given url, base, raw bytes and metadata.
 *
 * <p>The stored content type is the result of
 * {@code getContentType(contentType, url, content)}, not necessarily the
 * {@code contentType} argument itself.
 *
 * @throws IllegalArgumentException if url, base, content or metadata is null
 */
public Content(String url, String base, byte[] content, String contentType,
    Metadata metadata, Configuration conf) {

  // Reject null arguments, checked in declaration order.
  if (url == null) {
    throw new IllegalArgumentException("null url");
  }
  if (base == null) {
    throw new IllegalArgumentException("null base");
  }
  if (content == null) {
    throw new IllegalArgumentException("null content");
  }
  if (metadata == null) {
    throw new IllegalArgumentException("null metadata");
  }

  this.metadata = metadata;
  this.content = content;
  this.base = base;
  this.url = url;

  // mimeTypes is assigned before getContentType is called.
  this.mimeTypes = new MimeUtil(conf);
  this.contentType = getContentType(contentType, url, content);
}
 
Example #20
Source File: TestCCParseFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the given file as "text/html" and checks the "License-Url",
 * "License-Location" and "Work-Type" parse metadata entries against the
 * expected values.
 *
 * @param file     the page to read and parse
 * @param url      the URL under which the content is registered
 * @param license  expected "License-Url" value
 * @param location expected "License-Location" value
 * @param type     expected "Work-Type" value
 * @throws Exception if reading or parsing fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  InputStream in = new FileInputStream(file);
  ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    // Close in finally so the handle is released when a read fails too.
    in.close();
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();

  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());
  
  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
 
Example #21
Source File: TestMetatagParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches and parses the named sample file and returns its parse metadata.
 * Any exception is printed and reported through {@code fail}.
 *
 * @param fileName name of the file inside {@code sampleDir}
 * @param conf     configuration used for the protocol and the parser
 * @return the parse metadata, or {@code null} if it was never assigned
 */
public Metadata parseMeta(String fileName, Configuration conf) {
  Metadata parsed = null;
  try {
    final String sampleUrl = "file:" + sampleDir + fileSeparator + fileName;

    // Fetch the sample through the protocol registered for its URL.
    final Protocol protocol = new ProtocolFactory(conf).getProtocol(sampleUrl);
    final Text key = new Text(sampleUrl);
    final Content content = protocol.getProtocolOutput(key, new CrawlDatum()).getContent();

    final ParseUtil parseUtil = new ParseUtil(conf);
    final Parse parse = parseUtil.parse(content).get(content.getUrl());
    parsed = parse.getData().getParseMeta();
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
  return parsed;
}
 
Example #22
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the indexing filters twice on the same Configuration object: first
 * with only BasicIndexingFilter in the filter order, then after adding
 * MetadataIndexer to the order. Asserts that both resulting documents have
 * the same number of field names.
 *
 * @throws IndexingException
 */
public void testFilterCacheIndexingFilter() throws IndexingException{
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  final String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";

  final Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);

  // First pass: only the basic filter is configured.
  final IndexingFilters basicOnly = new IndexingFilters(conf);
  final NutchDocument firstDoc = basicOnly.filter(
      new NutchDocument(),
      new ParseImpl("text",
          new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  // Metadata entry named by the "index.content.md" property set below.
  final Metadata contentMeta = new Metadata();
  contentMeta.add("example", "data");
  conf.set("index.content.md", "example");

  // Second pass: the metadata filter is appended to the order.
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);
  final IndexingFilters withMetadata = new IndexingFilters(conf);
  final NutchDocument secondDoc = withMetadata.filter(
      new NutchDocument(),
      new ParseImpl("text",
          new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta)),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals(firstDoc.getFieldNames().size(), secondDoc.getFieldNames().size());
}
 
Example #23
Source File: ParseData.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a ParseData holding the given status, title, outlinks, content
 * metadata and parse metadata. The arguments are stored as passed, without
 * copying or validation.
 */
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                 Metadata contentMeta, Metadata parseMeta) {
  // Outcome and description of the parse.
  this.status = status;
  this.title = title;
  this.outlinks = outlinks;

  // The two metadata sets are kept separately.
  this.parseMeta = parseMeta;
  this.contentMeta = contentMeta;
}
 
Example #24
Source File: ParseSegment.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Checks if the page's content is truncated by comparing the value stored
 * under {@code Response.CONTENT_LENGTH} with the number of content bytes
 * actually held.
 *
 * @param content the fetched content to inspect
 * @return If the page is truncated <code>true</code>. When it is not,
 * or when it could not be determined (no content bytes, no metadata, or a
 * missing or malformed length value), <code>false</code>.
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) return false;
  Metadata metadata = content.getMetadata();
  if (metadata == null) return false;
  
  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) lengthStr=lengthStr.trim();
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }
  // Parsed as long: a declared length above Integer.MAX_VALUE is a valid
  // number and must count as truncated, not as a malformed header.
  long inHeaderSize;
  String url = content.getUrl();
  try {
    inHeaderSize = Long.parseLong(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }
  int actualSize = contentBytes.length;
  if (inHeaderSize > actualSize) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
    return true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
  }
  return false;
}
 
Example #25
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the indexing filters with a filter order that names a class called
 * "NonExistingFilter" ahead of BasicIndexingFilter. The test makes no
 * assertion; it passes as long as nothing is thrown.
 *
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";

  final Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  final IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(
      new NutchDocument(),
      new ParseImpl("text",
          new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}
 
Example #26
Source File: CCIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds Creative Commons features to the document from three parse metadata
 * entries: the license URL, the license location and the work type. Each
 * entry is skipped when it is absent.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  final Metadata parseMeta = parse.getData().getParseMeta();

  // License URL: added whole as "license=<url>", then broken into URL features.
  final String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // License location: added as "meta=<location>".
  final String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // Work type: added as-is, without a prefix.
  final String type = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (type != null) {
    addFeature(doc, type);
  }

  return doc;
}
 
Example #27
Source File: ParseSegment.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Checks if the page's content is truncated by comparing the value stored
 * under {@code Response.CONTENT_LENGTH} with the number of content bytes
 * actually held.
 *
 * @param content the fetched content to inspect
 * @return If the page is truncated <code>true</code>. When it is not,
 * or when it could not be determined (no content bytes, no metadata, or a
 * missing or malformed length value), <code>false</code>.
 */
public static boolean isTruncated(Content content) {
  byte[] contentBytes = content.getContent();
  if (contentBytes == null) return false;
  Metadata metadata = content.getMetadata();
  if (metadata == null) return false;
  
  String lengthStr = metadata.get(Response.CONTENT_LENGTH);
  if (lengthStr != null) lengthStr=lengthStr.trim();
  if (StringUtil.isEmpty(lengthStr)) {
    return false;
  }
  // Parsed as long: a declared length above Integer.MAX_VALUE is a valid
  // number and must count as truncated, not as a malformed header.
  long inHeaderSize;
  String url = content.getUrl();
  try {
    inHeaderSize = Long.parseLong(lengthStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }
  int actualSize = contentBytes.length;
  if (inHeaderSize > actualSize) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
    return true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
  }
  return false;
}
 
Example #28
Source File: HTMLLanguageParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the first non-null value found under the keys "dc.language",
 * "content-language" and "lang", tried in that order.
 *
 * @param meta the metadata to search; may be {@code null}
 * @return the language value, or {@code null} when {@code meta} is
 *         {@code null} or none of the keys has a value
 */
private static String getLanguageFromMetadata(Metadata meta) {
    if (meta == null) {
        return null;
    }
    // Priority order: Dublin Core, meta content-language, lang attribute.
    final String[] keys = { "dc.language", "content-language", "lang" };
    for (String key : keys) {
        final String value = meta.get(key);
        if (value != null) {
            return value;
        }
    }
    return null;
}
 
Example #29
Source File: HTMLLanguageParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Determines the language of the parsed page and, when one is found, stores
 * it in the parse metadata under {@code Metadata.LANGUAGE}.
 *
 * <p>The {@code detect} and {@code identify} settings select the policy:
 * <ul>
 * <li>{@code detect} non-negative and {@code identify} negative: only
 * {@code detectLanguage} is used;</li>
 * <li>{@code detect} negative and {@code identify} non-negative: only
 * {@code identifyLanguage} is used;</li>
 * <li>otherwise the one with the smaller value runs first and the other is
 * tried when the first returns {@code null};</li>
 * <li>if none of these applies, a warning is logged and nothing is
 * stored.</li>
 * </ul>
 *
 * @return the {@code parseResult} that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
        HTMLMetaTags metaTags, DocumentFragment doc) {
    final Parse parse = parseResult.get(content.getUrl());

    final boolean detectOnly = detect >= 0 && identify < 0;
    final boolean identifyOnly = detect < 0 && identify >= 0;

    String language;
    if (detectOnly) {
        language = detectLanguage(parse, doc);
    } else if (identifyOnly) {
        language = identifyLanguage(parse);
    } else if (detect < identify) {
        // Detection first, identification as the fallback.
        language = detectLanguage(parse, doc);
        if (language == null) {
            language = identifyLanguage(parse);
        }
    } else if (identify < detect) {
        // Identification first, detection as the fallback.
        language = identifyLanguage(parse);
        if (language == null) {
            language = detectLanguage(parse, doc);
        }
    } else {
        LOG.warn("No configuration for language extraction policy is provided");
        return parseResult;
    }

    if (language != null) {
        parse.getData().getParseMeta().set(Metadata.LANGUAGE, language);
    }
    return parseResult;
}
 
Example #30
Source File: TestIndexingFilters.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the indexing filters with a filter order that names a class called
 * "NonExistingFilter" ahead of BasicIndexingFilter. The test makes no
 * assertion; it passes as long as nothing is thrown.
 *
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";

  final Configuration conf = NutchConfiguration.create();
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  final IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(
      new NutchDocument(),
      new ParseImpl("text",
          new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}