org.apache.nutch.metadata.Metadata Java Examples

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds the date-related fields to the document.
 *
 * When the parse metadata carries a Last-Modified value, it is converted with
 * getTime and added as "lastModified". The "date" field uses that same value
 * unless it is -1 (or the header is absent), in which case the CrawlDatum's
 * modified time is used, and if that is not positive, its fetch time.
 *
 * @param doc   document to add the fields to
 * @param data  parse data holding the Last-Modified metadata
 * @param url   url passed through to getTime
 * @param datum crawl datum supplying the fallback times
 * @return the same document instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModified = data.getMeta(Metadata.LAST_MODIFIED);

  long time;
  if (lastModified == null) {
    time = -1;                                  // nothing to parse
  } else {
    time = getTime(lastModified, url);
    doc.add("lastModified", new Date(time));
  }

  if (time == -1) {
    // No usable Last-Modified value: prefer the datum's modified time and
    // only fall back to the fetch time when that one is unset.
    long modified = datum.getModifiedTime();
    time = (modified <= 0) ? datum.getFetchTime() : modified;
  }

  // un-stored, indexed and un-tokenized
  doc.add("date", new Date(time));
  return doc;
}

Source File: RelTagParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Collects the rel-tags found in the HTML document and adds each of them to
 * the parse metadata of the page under REL_TAG.
 *
 * @return the parse result that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  // Parse entry of the page being filtered.
  Parse parse = parseResult.get(content.getUrl());

  // Extract the rel-tags before touching the metadata.
  Set relTags = new Parser(doc).getRelTags();
  Metadata parseMeta = parse.getData().getParseMeta();

  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }
  return parseResult;
}

Source File: HtmlParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Command-line entry point: reads the local file named by args[0], parses it
 * as "text/html" with an HtmlParser and prints the parse data and text.
 *
 * @param args args[0] is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
  String name = args[0];
  String url = "file:"+name;
  File file = new File(name);
  byte[] bytes = new byte[(int)file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    in.close();                       // release the file handle even if the read fails
  }
  Configuration conf = NutchConfiguration.create();
  HtmlParser parser = new HtmlParser();
  parser.setConf(conf);
  Parse parse = parser.getParse(
          new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
  System.out.println("data: "+parse.getData());

  System.out.println("text: "+parse.getText());

}

Source File: WdcParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Command-line entry point: reads the local file named by args[0], parses it
 * as "text/html" with a WdcParser and prints the parse data, the parse text
 * and the META_CONTAINS_SEM metadata value.
 *
 * @param args args[0] is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
	String name = args[0];
	String url = "file:" + name;
	File file = new File(name);
	byte[] bytes = new byte[(int) file.length()];
	DataInputStream in = new DataInputStream(new FileInputStream(file));
	try {
		in.readFully(bytes);
	} finally {
		in.close(); // release the file handle even if the read fails
	}
	Configuration conf = NutchConfiguration.create();
	WdcParser parser = new WdcParser();
	parser.setConf(conf);
	Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
	System.out.println("data: " + parse.getData());

	System.out.println("text: " + parse.getText());

	String contains = parse.getData().getMeta(META_CONTAINS_SEM);
	System.out.println("contains: " + contains);

}

Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds the date-related fields to the document.
 *
 * When the parse metadata carries a Last-Modified value, it is converted with
 * getTime and added as "lastModified". The "date" field uses that same value
 * unless it is -1 (or the header is absent), in which case the CrawlDatum's
 * fetch time is used.
 *
 * @param doc   document to add the fields to
 * @param data  parse data holding the Last-Modified metadata
 * @param url   url passed through to getTime
 * @param datum crawl datum supplying the fallback fetch time
 * @return the same document instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  final String lastModified = data.getMeta(Metadata.LAST_MODIFIED);

  long time;
  if (lastModified == null) {
    time = datum.getFetchTime();                // no header: use fetch time
  } else {
    time = getTime(lastModified, url);
    doc.add("lastModified", new Date(time));
    if (time == -1) {
      time = datum.getFetchTime();              // header gave -1: use fetch time
    }
  }

  // un-stored, indexed and un-tokenized
  doc.add("date", new Date(time));

  return doc;
}

Source File: FeedParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Runs a command line version of this {@link Parser}.
 *
 * Reads the whole feed file into memory, parses it as
 * "application/rss+xml" and prints the key, parse data and parse text of
 * every entry in the parse result.
 *
 * @param args
 *          A single argument (expected at args[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    in.close(); // release the file handle even if the read fails
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}

Source File: HtmlParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Command-line entry point: reads the local file named by args[0], parses it
 * as "text/html" with an HtmlParser and prints the parse data and text.
 *
 * @param args args[0] is the path of the file to parse
 * @throws Exception if the file cannot be read or parsing fails
 */
public static void main(String[] args) throws Exception {
  String name = args[0];
  String url = "file:"+name;
  File file = new File(name);
  byte[] bytes = new byte[(int)file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    in.close();                       // release the file handle even if the read fails
  }
  Configuration conf = NutchConfiguration.create();
  HtmlParser parser = new HtmlParser();
  parser.setConf(conf);
  Parse parse = parser.getParse(
          new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
  System.out.println("data: "+parse.getData());

  System.out.println("text: "+parse.getText());

}

Source File: FeedParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Runs a command line version of this {@link Parser}.
 *
 * Reads the whole feed file into memory, parses it as
 * "application/rss+xml" and prints the key, parse data and parse text of
 * every entry in the parse result.
 *
 * @param args
 *          A single argument (expected at args[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);
  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    in.close(); // release the file handle even if the read fails
  }
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
      "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}

Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds a "lang" field to the document.
 *
 * The value is taken from the parse metadata (Metadata.LANGUAGE); when that
 * is absent, from the content metadata (Response.CONTENT_LANGUAGE); and when
 * neither yields a non-empty value, the literal "unknown" is used.
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded during parsing, if any
  String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
  if (lang == null) {
    // otherwise see whether the HTTP header tells us the language
    lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (lang == null) || (lang.length() == 0);
  doc.add("lang", missing ? "unknown" : lang);

  return doc;
}

Source File: TestHTMLLanguageParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Parses every test document and checks that the language stored in the
 * parse metadata matches the expected entry of metalanguages.
 **/
public void testMetaHTMLParsing() {

  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());
    // docs[i] is expected to yield metalanguages[i]
    for (int i = 0; i < docs.length; i++) {
      Content content = getContent(docs[i]);
      Parse parse = parser.parse(content).get(content.getUrl());
      assertEquals(
          metalanguages[i],
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  } catch (Exception e) {
    // any exception is reported as a test failure
    e.printStackTrace(System.out);
    fail(e.toString());
  }

}

Source File: SWFParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Command-line entry point: reads the SWF file named by args[0], parses it
 * as "application/x-shockwave-flash" and prints the parse text and data.
 *
 * Arguments are: 0. Name of input SWF file.
 *
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  String url = "file:" + args[0];

  byte[] buf;
  FileInputStream in = new FileInputStream(args[0]);
  try {
    buf = new byte[in.available()];
    // A single read() call may return fewer bytes than requested, so keep
    // reading until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < buf.length) {
      int count = in.read(buf, filled, buf.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    in.close(); // release the file handle even if the read fails
  }

  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content(url, url,
                                        buf, "application/x-shockwave-flash",
                                        new Metadata(),
                                        NutchConfiguration.create()));
  Parse p = parseResult.get(url);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}

Source File: TestMetatagParser.java From anthelion with Apache License 2.0

6 votes

/**
 * Fetches the sample file through the protocol layer, parses it and checks
 * that the "metatag.description" and "metatag.keywords" parse metadata
 * values equal the expected description and keywords.
 */
public void testIt() {
  Configuration conf = NutchConfiguration.create();
  String urlString = "file:" + sampleDir + fileSeparator + sampleFile;

  try {
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();

    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parse = parseUtil.parse(content).get(content.getUrl());

    // the parsed metatags must match the expected values
    Metadata parseMeta = parse.getData().getParseMeta();
    assertEquals(description, parseMeta.get("metatag.description"));
    assertEquals(keywords, parseMeta.get("metatag.keywords"));
  } catch (Exception e) {
    // any exception is reported as a test failure
    e.printStackTrace();
    fail(e.toString());
  }
}

Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds a "lang" field to the document.
 *
 * The value is taken from the parse metadata (Metadata.LANGUAGE); when that
 * is absent, from the content metadata (Response.CONTENT_LANGUAGE); and when
 * neither yields a non-empty value, the literal "unknown" is used.
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded during parsing, if any
  String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
  if (lang == null) {
    // otherwise see whether the HTTP header tells us the language
    lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean missing = (lang == null) || (lang.length() == 0);
  doc.add("lang", missing ? "unknown" : lang);

  return doc;
}

Source File: TestParseData.java From anthelion with Apache License 2.0

6 votes

/**
 * Builds a ParseData with a title, two outlinks and two metadata entries and
 * hands it to WritableTestUtils.testWritable.
 */
public void testParseData() throws Exception {

    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    metaData.add("Charset", "UTF-8");

    Outlink[] outlinks = {
      new Outlink("http://foo.com/", "Foo"),
      new Outlink("http://bar.com/", "Bar")
    };

    ParseData r = new ParseData(
        ParseStatus.STATUS_SUCCESS, "The Foo Page", outlinks, metaData);

    WritableTestUtils.testWritable(r, null);
  }

Source File: TestHTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Parses every test document and checks that the language stored in the
 * parse metadata matches the expected entry of metalanguages.
 **/
public void testMetaHTMLParsing() {

  try {
    ParseUtil parser = new ParseUtil(NutchConfiguration.create());
    // docs[i] is expected to yield metalanguages[i]
    for (int i = 0; i < docs.length; i++) {
      Content content = getContent(docs[i]);
      Parse parse = parser.parse(content).get(content.getUrl());
      assertEquals(
          metalanguages[i],
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
    }
  } catch (Exception e) {
    // any exception is reported as a test failure
    e.printStackTrace(System.out);
    fail(e.toString());
  }

}

Source File: SWFParser.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Command-line entry point: reads the SWF file named by args[0], parses it
 * as "application/x-shockwave-flash" and prints the parse text and data.
 *
 * Arguments are: 0. Name of input SWF file.
 *
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  String url = "file:" + args[0];

  byte[] buf;
  FileInputStream in = new FileInputStream(args[0]);
  try {
    buf = new byte[in.available()];
    // A single read() call may return fewer bytes than requested, so keep
    // reading until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < buf.length) {
      int count = in.read(buf, filled, buf.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    in.close(); // close even when the read throws
  }

  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content(url, url,
                                        buf, "application/x-shockwave-flash",
                                        new Metadata(),
                                        NutchConfiguration.create()));
  Parse p = parseResult.get(url);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}

Source File: TestParseData.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Builds a ParseData with a title, two outlinks and two metadata entries and
 * hands it to WritableTestUtils.testWritable.
 */
public void testParseData() throws Exception {

    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    metaData.add("Charset", "UTF-8");

    Outlink[] outlinks = {
      new Outlink("http://foo.com/", "Foo"),
      new Outlink("http://bar.com/", "Bar")
    };

    ParseData r = new ParseData(
        ParseStatus.STATUS_SUCCESS, "The Foo Page", outlinks, metaData);

    WritableTestUtils.testWritable(r, null);
  }

Source File: HttpAuthenticationFactory.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Looks up an authentication handler for the WWW_AUTHENTICATE challenge found
 * in the given headers.
 *
 * @param header response headers; may be null
 * @return the handler returned by HttpBasicAuthentication for the challenge,
 *         or null when there are no headers, no challenge header, no matching
 *         handler, or an error occurred (errors are logged)
 */
public HttpAuthentication findAuthentication(Metadata header) {

    if (header == null) return null;

    try {
      String challengeString = header.get(WWW_AUTHENTICATE);

      // No challenge header present: there is nothing to authenticate
      // against. Returning here avoids a NullPointerException that would
      // otherwise be caught below and logged as an error.
      if (challengeString == null) return null;

      // An "NTLM" challenge is replaced by a fixed Basic challenge.
      if ("NTLM".equals(challengeString))
        challengeString = "Basic realm=techweb";

      if (LOG.isTraceEnabled())
        LOG.trace("Checking challengeString=" + challengeString);

      HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(challengeString, conf);
      if (auth != null) return auth;

      //TODO Add additional Authentication lookups here
    } catch (Exception e) {
      // best effort: a failing lookup must not break the caller
      LOG.error("Error: ", e);
    }
    return null;
  }

Source File: Content.java From anthelion with Apache License 2.0

6 votes

/**
 * Creates a Content instance.
 *
 * url, base, content and metadata must be non-null; the first null one found
 * (checked in that order) causes an IllegalArgumentException. The content
 * type is resolved through getContentType from the given type hint, the url
 * and the content bytes.
 *
 * @throws IllegalArgumentException if url, base, content or metadata is null
 */
public Content(String url, String base, byte[] content, String contentType,
    Metadata metadata, Configuration conf) {

  // Argument validation, in declaration order.
  if (url == null) {
    throw new IllegalArgumentException("null url");
  }
  if (base == null) {
    throw new IllegalArgumentException("null base");
  }
  if (content == null) {
    throw new IllegalArgumentException("null content");
  }
  if (metadata == null) {
    throw new IllegalArgumentException("null metadata");
  }

  this.url = url;
  this.base = base;
  this.content = content;
  this.metadata = metadata;

  // mimeTypes is assigned before the content type is resolved, as in the
  // original statement order.
  this.mimeTypes = new MimeUtil(conf);
  this.contentType = getContentType(contentType, url, content);
}

Source File: TestCCParseFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Parses the given file as "text/html" under the given url and checks the
 * "License-Url", "License-Location" and "Work-Type" parse metadata values.
 *
 * @param file     local file holding the page
 * @param url      url to associate with the page
 * @param license  expected "License-Url" value
 * @param location expected "License-Location" value
 * @param type     expected "Work-Type" value
 * @throws Exception if reading or parsing fails
 */
public void pageTest(File file, String url,
                     String license, String location, String type)
  throws Exception {

  String contentType = "text/html";
  ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    in.close();   // release the file handle even if the read fails
  }
  byte[] bytes = out.toByteArray();
  Configuration conf = NutchConfiguration.create();

  Content content =
    new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}

Source File: TestMetatagParser.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Fetches the named sample file through the protocol layer, parses it and
 * returns its parse metadata. Any exception is reported through fail().
 *
 * @param fileName name of the file inside sampleDir
 * @param conf     configuration used for the protocol and parser
 * @return the parse metadata, or null if no metadata was obtained
 */
public Metadata parseMeta(String fileName, Configuration conf) {
  try {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
    return parse.getData().getParseMeta();
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
  return null;
}

Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs the indexing filters once with only BasicIndexingFilter configured,
 * then changes the filter order on the same configuration to add
 * MetadataIndexer and runs them again. The test asserts that both resulting
 * documents have the same number of field names.
 *
 * @throws IndexingException
 */
public void testFilterCacheIndexingFilter() throws IndexingException{
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  final String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";

  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);

  // first run: basic filter only, empty content metadata
  IndexingFilters filters1 = new IndexingFilters(conf);
  NutchDocument fdoc1 = filters1.filter(
      new NutchDocument(),
      new ParseImpl("text", new ParseData(
          new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  // content metadata carrying the property that MetadataIndexer is told to index
  Metadata md = new Metadata();
  md.add("example","data");
  conf.set("index.content.md","example");

  // second run: same configuration object, MetadataIndexer appended to the order
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);
  IndexingFilters filters2 = new IndexingFilters(conf);
  NutchDocument fdoc2 = filters2.filter(
      new NutchDocument(),
      new ParseImpl("text", new ParseData(
          new ParseStatus(), "title", new Outlink[0], md)),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());
}

Source File: ParseData.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Creates a ParseData that stores the given values as-is.
 *
 * @param status      parse status
 * @param title       page title
 * @param outlinks    outlinks of the page
 * @param contentMeta content metadata
 * @param parseMeta   parse metadata
 */
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                 Metadata contentMeta, Metadata parseMeta) {
  // metadata
  this.parseMeta = parseMeta;
  this.contentMeta = contentMeta;
  // page description
  this.outlinks = outlinks;
  this.title = title;
  this.status = status;
}

Source File: ParseSegment.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Checks if the page's content is truncated, by comparing the size given in
 * the Content-Length metadata with the number of content bytes present.
 *
 * @param content the fetched content
 * @return <code>true</code> if the header size is larger than the actual
 * size. <code>false</code> when it is not, or when it could not be
 * determined (no content bytes, no metadata, empty or unparsable length).
 */
public static boolean isTruncated(Content content) {
  final byte[] body = content.getContent();
  if (body == null) {
    return false;
  }

  final Metadata headers = content.getMetadata();
  if (headers == null) {
    return false;
  }

  String declared = headers.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  final String url = content.getUrl();
  int declaredSize;
  try {
    declaredSize = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  final int receivedSize = body.length;
  final boolean truncated = declaredSize > receivedSize;
  if (truncated) {
    LOG.info(url + " skipped. Content of size " + declaredSize
        + " was truncated to " + receivedSize);
  } else if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + receivedSize + " inHeaderSize=" + declaredSize);
  }
  return truncated;
}

Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs the indexing filters with a filter order that names a class
 * ("NonExistingFilter") ahead of BasicIndexingFilter. The test contains no
 * assertions; it passes when no exception escapes.
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(
      new NutchDocument(),
      new ParseImpl("text", new ParseData(
          new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}

Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds Creative Commons features to the document from the parse metadata:
 * the license url (as "license=..." plus the features derived from the url
 * by addUrlFeatures), the license location (as "meta=...") and the work
 * type. Metadata entries that are absent are skipped.
 *
 * @return the same document instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  final Metadata metadata = parse.getData().getParseMeta();

  // license: the whole url, then the attributes extracted from it
  final String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
  if (licenseUrl != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
    }
    addFeature(doc, "license=" + licenseUrl);
    addUrlFeatures(doc, licenseUrl);
  }

  // license location
  final String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
  if (licenseLocation != null) {
    addFeature(doc, "meta=" + licenseLocation);
  }

  // work type; the value is added without any prefix
  final String workType = metadata.get(CreativeCommons.WORK_TYPE);
  if (workType != null) {
    addFeature(doc, workType);
  }

  return doc;
}

Source File: ParseSegment.java From anthelion with Apache License 2.0

5 votes

/**
 * Checks if the page's content is truncated, by comparing the size given in
 * the Content-Length metadata with the number of content bytes present.
 *
 * @param content the fetched content
 * @return <code>true</code> if the header size is larger than the actual
 * size. <code>false</code> when it is not, or when it could not be
 * determined (no content bytes, no metadata, empty or unparsable length).
 */
public static boolean isTruncated(Content content) {
  final byte[] body = content.getContent();
  if (body == null) {
    return false;
  }

  final Metadata headers = content.getMetadata();
  if (headers == null) {
    return false;
  }

  String declared = headers.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  final String url = content.getUrl();
  int declaredSize;
  try {
    declaredSize = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  final int receivedSize = body.length;
  final boolean truncated = declaredSize > receivedSize;
  if (truncated) {
    LOG.info(url + " skipped. Content of size " + declaredSize
        + " was truncated to " + receivedSize);
  } else if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + receivedSize + " inHeaderSize=" + declaredSize);
  }
  return truncated;
}

Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0

5 votes

/**
 * Returns the first non-null value among the "dc.language",
 * "content-language" and "lang" metadata entries, looked up in that order.
 *
 * @param meta metadata to inspect; may be null
 * @return the language value, or null when meta is null or none is set
 */
private static String getLanguageFromMetadata(Metadata meta) {
    if (meta == null) {
        return null;
    }
    // lookup order: dublin core, meta content-language, lang attribute
    String[] keys = { "dc.language", "content-language", "lang" };
    String lang = null;
    for (int i = 0; i < keys.length && lang == null; i++) {
        lang = meta.get(keys[i]);
    }
    return lang;
}

Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0

5 votes

/**
 * Determines the language of the page and, when one is found, stores it in
 * the parse metadata under Metadata.LANGUAGE.
 *
 * The detect and identify settings select the strategy: when only one of
 * them is non-negative, only detectLanguage or identifyLanguage is used;
 * otherwise the one with the smaller value runs first and the other serves
 * as fallback. When both values are equal a warning is logged and nothing
 * is stored.
 *
 * Indications of content language in an HTML document include:
 * <ol>
 * <li>the html lang attribute
 * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)</li>
 * <li>meta dc.language
 * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)</li>
 * <li>meta http-equiv (content-language)
 * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)</li>
 * </ol>
 *
 * @return the parse result that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
        HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());

    String language;
    if (detect >= 0 && identify < 0) {
        // detection only
        language = detectLanguage(parse, doc);
    } else if (detect < 0 && identify >= 0) {
        // identification only
        language = identifyLanguage(parse);
    } else if (detect < identify) {
        // detection first, identification as fallback
        String detected = detectLanguage(parse, doc);
        language = (detected != null) ? detected : identifyLanguage(parse);
    } else if (identify < detect) {
        // identification first, detection as fallback
        String identified = identifyLanguage(parse);
        language = (identified != null) ? identified : detectLanguage(parse, doc);
    } else {
        LOG.warn("No configuration for language extraction policy is provided");
        return parseResult;
    }

    if (language != null) {
        parse.getData().getParseMeta().set(Metadata.LANGUAGE, language);
    }
    return parseResult;
}

Source File: TestIndexingFilters.java From anthelion with Apache License 2.0

5 votes

/**
 * Runs the indexing filters with a filter order that names a class
 * ("NonExistingFilter") ahead of BasicIndexingFilter. The test contains no
 * assertions; it passes when no exception escapes.
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(
      new NutchDocument(),
      new ParseImpl("text", new ParseData(
          new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}

org.apache.nutch.metadata.Metadata Java Examples