Java Code Examples for org.apache.nutch.metadata.Metadata#add()

The following examples show how to use org.apache.nutch.metadata.Metadata#add(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestParseData.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code ParseData} holding a title, two outlinks and two metadata
 * entries, then passes it to {@code WritableTestUtils.testWritable}.
 */
public void testParseData() throws Exception {
  final String pageTitle = "The Foo Page";

  // Metadata entries added through Metadata#add.
  final Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  parseMeta.add("Charset", "UTF-8");

  final Outlink fooLink = new Outlink("http://foo.com/", "Foo");
  final Outlink barLink = new Outlink("http://bar.com/", "Bar");
  final Outlink[] links = { fooLink, barLink };

  final ParseData parseData =
      new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, parseMeta);

  WritableTestUtils.testWritable(parseData, null);
}
 
Example 2
Source File: TestParseData.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Passes a populated {@code ParseData} (title, outlinks, metadata) to
 * {@code WritableTestUtils.testWritable}.
 */
public void testParseData() throws Exception {
  Outlink[] targets = new Outlink[2];
  targets[0] = new Outlink("http://foo.com/", "Foo");
  targets[1] = new Outlink("http://bar.com/", "Bar");

  Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  meta.add("Charset", "UTF-8");

  WritableTestUtils.testWritable(
      new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", targets, meta),
      null);
}
 
Example 3
Source File: RelTagParser.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Collects the rel-tags reported by {@code Parser} for the HTML document and
 * adds each one to the parse metadata under {@code REL_TAG}.
 *
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  // Parse entry registered under this content's URL.
  final Parse parse = parseResult.get(content.getUrl());

  // Rel-tags found in the document fragment.
  final Set<?> relTags = new Parser(doc).getRelTags();

  final Metadata parseMeta = parse.getData().getParseMeta();
  for (Object relTag : relTags) {
    parseMeta.add(REL_TAG, (String) relTag);
  }

  return parseResult;
}
 
Example 4
Source File: RelTagParser.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Scans the HTML document for rel-tags and adds each one to the parse
 * metadata under {@code REL_TAG}.
 *
 * @param content     fetched content; its URL selects the parse entry
 * @param parseResult parse result whose metadata receives the rel-tags
 * @param metaTags    not used by this method
 * @param doc         document fragment scanned for rel-tags
 * @return the same {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  // get parse obj
  Parse parse = parseResult.get(content.getUrl());
  // Trying to find the document's rel-tags
  Parser parser = new Parser(doc);
  // Wildcard type instead of a raw Set: elements are only read and cast.
  Set<?> tags = parser.getRelTags();
  Metadata metadata = parse.getData().getParseMeta();
  for (Object tag : tags) {
    metadata.add(REL_TAG, (String) tag);
  }
  return parseResult;
}
 
Example 5
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Filters a document whose parse metadata carries a
 * {@code Response.CONTENT_DISPOSITION} entry and expects the file name from
 * that entry in the "title" field.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  // Assemble the filter inputs step by step.
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);
  Text url = new Text("http://www.example.com/");

  NutchDocument indexed =
      moreFilter.filter(new NutchDocument(), parse, url, new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", indexed.getFieldValue("title"));
}
 
Example 6
Source File: TestIndexingFilters.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a document filtered after the indexing filter order has been
 * changed on the configuration exposes as many field names as a document
 * filtered before the change.
 *
 * @throws IndexingException if thrown while running the filters
 */
public void testFilterCacheIndexingFilter() throws IndexingException {
  final String basicFilterClass = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  final String metadataIndexerClass = "org.apache.nutch.indexer.metadata.MetadataIndexer";

  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilterClass);

  // First pass: only the basic indexing filter is configured.
  IndexingFilters basicOnly = new IndexingFilters(conf);
  ParseData plainData =
      new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  NutchDocument before = basicOnly.filter(new NutchDocument(),
      new ParseImpl("text", plainData), new Text("http://www.example.com/"),
      new CrawlDatum(), new Inlinks());

  // Content metadata that the metadata indexer is configured to pick up.
  Metadata contentMeta = new Metadata();
  contentMeta.add("example", "data");
  conf.set("index.content.md", "example");

  // Second pass: the filter order now also names the metadata indexer.
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilterClass + " " + metadataIndexerClass);
  IndexingFilters reordered = new IndexingFilters(conf);
  ParseData dataWithMeta =
      new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
  NutchDocument after = reordered.filter(new NutchDocument(),
      new ParseImpl("text", dataWithMeta), new Text("http://www.example.com/"),
      new CrawlDatum(), new Inlinks());

  assertEquals(before.getFieldNames().size(), after.getFieldNames().size());
}
 
Example 7
Source File: FeedParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds every name/value pair held by {@code second} to {@code first}.
 */
private void mergeMetadata(Metadata first, Metadata second) {
  for (String key : second.names()) {
    final String[] entries = second.getValues(key);
    for (int i = 0; i < entries.length; i++) {
      first.add(key, entries[i]);
    }
  }
}
 
Example 8
Source File: TestBasicIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code BasicIndexingFilter} over a small sample page and checks the
 * title, domain, host, url, content and tstamp fields of the resulting
 * document. The configuration limits the title to 10 characters and the
 * content to 20, and the expected values reflect that truncation.
 *
 * @throws Exception if the filter throws; the exception propagates with its
 *         original stack trace instead of being reduced to a message
 */
public void testBasicIndexingFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setInt("indexer.max.title.length", 10);
  conf.setBoolean("indexer.add.domain", true);
  conf.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter filter = new BasicIndexingFilter();
  filter.setConf(conf);
  assertNotNull(filter);

  NutchDocument doc = new NutchDocument();

  String title = "The Foo Page";
  Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata metaData = new Metadata();
  metaData.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
  ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

  CrawlDatum crawlDatum = new CrawlDatum();
  crawlDatum.setFetchTime(100L);

  Inlinks inlinks = new Inlinks();

  // No try/catch here: the method declares "throws Exception", so a failure
  // inside the filter is reported with its full cause rather than through
  // printStackTrace() plus fail(e.getMessage()), whose message may be null.
  filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);

  assertNotNull(doc);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
    doc.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
}
 
Example 9
Source File: TestMoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Filters a document whose parse metadata carries {@code source} under
 * {@code Response.CONTENT_TYPE} and asserts that the "type" field of the
 * result equals {@code expected}.
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  ParseImpl parse = new ParseImpl("text", parseData);

  NutchDocument indexed = moreFilter.filter(new NutchDocument(), parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}
 
Example 10
Source File: FeedParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Copies all values of every name in {@code second} into {@code first}
 * through {@code Metadata#add}.
 */
private void mergeMetadata(Metadata first, Metadata second) {
  for (String fieldName : second.names()) {
    for (String fieldValue : second.getValues(fieldName)) {
      first.add(fieldName, fieldValue);
    }
  }
}
 
Example 11
Source File: TestMoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code MoreIndexingFilter} on a document whose
 * {@code Response.CONTENT_TYPE} metadata is {@code source} and checks that
 * the resulting "type" field is {@code expected}.
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  final Metadata responseMeta = new Metadata();
  responseMeta.add(Response.CONTENT_TYPE, source);

  final MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  final NutchDocument emptyDoc = new NutchDocument();
  final ParseImpl parsed = new ParseImpl(
      "text", new ParseData(new ParseStatus(), "title", new Outlink[0], responseMeta));
  final Text pageUrl = new Text("http://www.example.com/");

  final NutchDocument result =
      indexingFilter.filter(emptyDoc, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, result.getFieldValue("type"));
}
 
Example 12
Source File: TestMoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Expects the "title" field to be "filename.ext" when the parse metadata
 * holds "filename=filename.ext" under {@code Response.CONTENT_DISPOSITION}.
 */
public void testContentDispositionTitle() throws IndexingException {
  final Configuration conf = NutchConfiguration.create();

  final Metadata responseMeta = new Metadata();
  responseMeta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  final MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  final NutchDocument emptyDoc = new NutchDocument();
  final ParseImpl parsed = new ParseImpl(
      "text", new ParseData(new ParseStatus(), "title", new Outlink[0], responseMeta));
  final Text pageUrl = new Text("http://www.example.com/");

  final NutchDocument result =
      indexingFilter.filter(emptyDoc, parsed, pageUrl, new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", result.getFieldValue("title"));
}
 
Example 13
Source File: CCParseFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Walks the document and records any license URL, license location and work
 * type that the walker found in {@code metadata}.
 *
 * @throws ParseException when no license was found and the
 *         "creativecommons.exclude.unlicensed" setting is true
 */
public static void walk(Node doc, URL base, Metadata metadata, Configuration conf)
  throws ParseException {

  // Traverse the DOM tree, collecting license information.
  final Walker domWalker = new Walker(base);
  domWalker.walk(doc);

  // Choose the license source by precedence.
  final String url;
  final String where;
  if (domWalker.rdfLicense != null) {
    // First choice: subject in RDF.
    url = domWalker.rdfLicense;
    where = "rdf";
  } else if (domWalker.relLicense != null) {
    // Second choice: anchor with rel=license.
    url = domWalker.relLicense.toString();
    where = "rel";
  } else if (domWalker.anchorLicense != null) {
    // Third choice: anchor with a CC license.
    url = domWalker.anchorLicense.toString();
    where = "a";
  } else {
    // Nothing found: reject the document if so configured.
    if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
      throw new ParseException("No CC license.  Excluding.");
    }
    url = null;
    where = null;
  }

  // Record the license, if any.
  if (url != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: found "+url+" in "+where+" of "+base);
    }
    metadata.add(CreativeCommons.LICENSE_URL, url);
    metadata.add(CreativeCommons.LICENSE_LOCATION, where);
  }

  // Record the work type, if any.
  final String workType = domWalker.workType;
  if (workType != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: found "+workType+" in "+base);
    }
    metadata.add(CreativeCommons.WORK_TYPE, workType);
  }
}
 
Example 14
Source File: TestHTMLLanguageParser.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps {@code text} in a {@code Content} of type "text/html" with a matching
 * "Content-Type" metadata entry.
 */
private Content getContent(String text) {
  final String mimeType = "text/html";

  Metadata headers = new Metadata();
  headers.add("Content-Type", mimeType);

  // getBytes() without a charset argument uses the platform default charset.
  byte[] body = text.getBytes();
  return new Content(URL, BASE, body, mimeType, headers, NutchConfiguration.create());
}
 
Example 15
Source File: TestHTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a "text/html" {@code Content} for {@code URL}/{@code BASE} whose
 * body is the bytes of {@code text}.
 */
private Content getContent(String text) {
  // Encoded with the platform default charset (no charset is passed).
  final byte[] rawText = text.getBytes();

  final Metadata contentMeta = new Metadata();
  contentMeta.add("Content-Type", "text/html");

  return new Content(
      URL, BASE, rawText, "text/html", contentMeta, NutchConfiguration.create());
}
 
Example 16
Source File: CCParseFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Scans the document with a {@code Walker} and adds the license URL, the
 * place it was found, and the work type to {@code metadata} when present.
 *
 * @throws ParseException when no license was found and the
 *         "creativecommons.exclude.unlicensed" setting is true
 */
public static void walk(Node doc, URL base, Metadata metadata, Configuration conf)
  throws ParseException {

  // Scan the DOM tree for license data.
  Walker scan = new Walker(base);
  scan.walk(doc);

  // Interpret what the scan found, most preferred source first.
  String foundUrl = null;
  String foundIn = null;
  if (scan.rdfLicense != null) {
    // Preferred: subject in RDF.
    foundUrl = scan.rdfLicense;
    foundIn = "rdf";
  } else if (scan.relLicense != null) {
    // Next: anchor with rel=license.
    foundUrl = scan.relLicense.toString();
    foundIn = "rel";
  } else if (scan.anchorLicense != null) {
    // Last: anchor with a CC license.
    foundUrl = scan.anchorLicense.toString();
    foundIn = "a";
  } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
    throw new ParseException("No CC license.  Excluding.");
  }

  // Store the license details.
  if (foundUrl != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: found "+foundUrl+" in "+foundIn+" of "+base);
    }
    metadata.add(CreativeCommons.LICENSE_URL, foundUrl);
    metadata.add(CreativeCommons.LICENSE_LOCATION, foundIn);
  }

  // Store the work type.
  if (scan.workType != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: found "+scan.workType+" in "+base);
    }
    metadata.add(CreativeCommons.WORK_TYPE, scan.workType);
  }
}
 
Example 17
Source File: S2jhHtmlParseFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts the SKU from the page URL and the price from the document, adds
 * both to the parse metadata and to the crawl data list, logs the lazy-load
 * image URLs found in the product description, and saves the crawl data.
 * Returns early, without saving anything, when no SKU can be extracted.
 *
 * @return the same {@code parseResult} instance that was passed in
 */
@Override
public ParseResult filterInternal(Content content, ParseResult parseResult, HTMLMetaTags metaTags,
        DocumentFragment doc) {
    final String pageUrl = content.getUrl();
    final Parse parse = parseResult.get(new Text(pageUrl));
    final ParseData parseData = parse.getData();

    // Metadata used by the later indexing step.
    final Metadata parseMeta = parseData.getParseMeta();

    // The SKU is capture group 1 of the URL pattern; null when there is no match.
    final Matcher skuMatcher = this.skuMatchPattern.matcher(pageUrl);
    final String sku = skuMatcher.find() ? skuMatcher.group(1) : null;
    if (StringUtils.isBlank(sku)) {
        LOG.warn("SKU not parsed for url: " + pageUrl);
        return parseResult;
    }

    // Records to be stored in the database: one entry per attribute.
    final List<CrawlData> records = Lists.newArrayList();

    // SKU: database record, then parse metadata.
    records.add(new CrawlData(pageUrl, "sku").setTextValue(sku));
    parseMeta.add("sku", sku);

    // Price: database record, then parse metadata.
    final String price = getXPathValue(doc, "//SPAN[@class='tm-price']");
    records.add(new CrawlData(pageUrl, "price").setTextValue(price));
    parseMeta.add("price", price);

    LOG.info(" - SKU:{}, Parse Meta: {}", sku, parseMeta);

    // Log the lazy-load image URLs from the product description.
    final NodeList images = selectNodeList(doc,
            "//DIV[@id='description']/DIV[@class='content ke-post']//IMG[@data-ks-lazyload]");
    LOG.info("Product description content image list: ");
    for (int idx = 0; idx < images.getLength(); idx++) {
        final Node image = images.item(idx);
        final Node lazyAttr = image.getAttributes().getNamedItem("data-ks-lazyload");
        LOG.info(" - {}", lazyAttr.getTextContent());
    }

    saveCrawlData(pageUrl, records);

    return parseResult;
}