org.apache.nutch.indexer.NutchDocument Java Examples

The following examples show how to use org.apache.nutch.indexer.NutchDocument. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds the "lastModified" and "date" fields to the document.
 *
 * <p>The "date" field is taken, in order of preference, from the parsed
 * Last-Modified metadata value, the modified time of the CrawlDatum, and
 * finally the fetch time of the CrawlDatum.
 *
 * @param doc   document receiving the fields
 * @param data  parse data whose metadata may hold a Last-Modified value
 * @param url   page URL, passed to getTime together with the raw value
 * @param datum crawl datum supplying the fallback times
 * @return the same document instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                   // try parse last-modified
    time = getTime(lastModified, url);          // use as time
    // NOTE(review): assumes getTime reports an unparseable value as -1, the
    // same sentinel checked below - confirm. Without this guard a bad header
    // would be indexed as the bogus date new Date(-1).
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {                             // no usable last-modified value
    time = datum.getModifiedTime();             // use value in CrawlDatum
    if (time <= 0) {                            // if also unset
      time = datum.getFetchTime();              // use time the fetch took place
    }
  }

  doc.add("date", new Date(time));
  return doc;
}

Example #2

Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds a "lang" field to the document.
 *
 * <p>The value is read from the parse metadata first and from the content
 * metadata (Content-Language) when the parse metadata has no entry. A missing
 * or empty value is indexed as "unknown".
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded in the parse metadata, possibly by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise see whether the HTTP header tells us the language
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = language == null || language.length() == 0;
  doc.add("lang", undetermined ? "unknown" : language);

  return doc;
}

Example #3

Source File: TestAnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * With "anchorIndexingFilter.deduplicate" enabled, three inlinks carrying two
 * distinct anchor texts are expected to produce two "anchor" values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // the second and third inlink share the anchor text "text2"
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  Text pageUrl = new Text("http://nutch.apache.org/index.html");
  try {
    anchorFilter.filter(document, parsed, pageUrl, new CrawlDatum(), incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all",
      document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2,
      document.getField("anchor").getValues().size());
}

Example #4

Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds a "lang" field to the document.
 *
 * <p>The value is read from the parse metadata first and from the content
 * metadata (Content-Language) when the parse metadata has no entry. A missing
 * or empty value is indexed as "unknown".
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // language recorded in the parse metadata, possibly by HTMLLanguageParser
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // otherwise see whether the HTTP header tells us the language
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  boolean undetermined = language == null || language.length() == 0;
  doc.add("lang", undetermined ? "unknown" : language);

  return doc;
}

Example #5

Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds the "lastModified" and "date" fields to the document.
 *
 * <p>The "date" field is the parsed Last-Modified metadata value when one is
 * available, otherwise the fetch time of the CrawlDatum.
 *
 * @param doc   document receiving the fields
 * @param data  parse data whose metadata may hold a Last-Modified value
 * @param url   page URL, passed to getTime together with the raw value
 * @param datum crawl datum supplying the fallback fetch time
 * @return the same document instance that was passed in
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                   // try parse last-modified
    time = getTime(lastModified, url);          // use as time
    // NOTE(review): assumes getTime reports an unparseable value as -1, the
    // same sentinel checked below - confirm. Without this guard a bad header
    // would be indexed as the bogus date new Date(-1).
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {                             // no usable last-modified value
    time = datum.getFetchTime();                // use fetch time
  }

  doc.add("date", new Date(time));

  return doc;
}

Example #6

Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * With "moreIndexingFilter.indexMimeTypeParts" disabled, the document is
 * expected to carry exactly one "type" value, "text/html".
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());
  Text pageUrl = new Text("http://nutch.apache.org/index.html");

  try {
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}

Example #7

Source File: URLMetaIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Looks up each tag listed in the "urlmeta.tags" property inside the
 * metadata of the CrawlDatum and, when a value is present, adds it to the
 * NutchDocument under the tag's name.
 *
 * @return the document that was passed in, which may be null
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	// only copy tags when there is both a tag list and a document
	if (urlMetaTags != null && doc != null) {
		for (String tagName : urlMetaTags) {
			Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
			if (tagValue != null) {
				doc.add(tagName, tagValue.toString());
			}
		}
	}

	return doc;
}

Example #8

Source File: TestAnchorIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * With "anchorIndexingFilter.deduplicate" enabled, three inlinks carrying two
 * distinct anchor texts are expected to produce two "anchor" values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // the second and third inlink share the anchor text "text2"
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  Text pageUrl = new Text("http://nutch.apache.org/index.html");
  try {
    anchorFilter.filter(document, parsed, pageUrl, new CrawlDatum(), incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all",
      document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2,
      document.getField("anchor").getValues().size());
}

Example #9

Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Adds one feature per component of a license URL's path.
 *
 * <p>Urls are of the form "http://creativecommons.org/licenses/xx-xx/xx/xx",
 * where "xx" names a license feature. The path is split at slashes and
 * dashes, the first token is skipped and every remaining token is handed to
 * addFeature. A malformed URL is logged at warn level and otherwise ignored.
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    // split the path of the url at slashes and dashes
    StringTokenizer parts = new StringTokenizer(new URL(urlString).getPath(), "/-");

    // the leading token ("licenses") is not a feature
    if (parts.hasMoreTokens()) {
      parts.nextToken();
    }

    // everything after it is
    while (parts.hasMoreTokens()) {
      addFeature(doc, parts.nextToken());
    }
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
    }
  }
}

Example #10

Source File: TLDScoringFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boosts registered for the document's "tld"
 * values.
 *
 * <p>Each value of the "tld" field is looked up in tldEntries; the boosts of
 * the entries found are multiplied together. Without a "tld" field the score
 * is returned unchanged.
 *
 * @return initScore multiplied by the combined boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore;
  }

  float multiplier = 1.0f;
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix != null) {
      multiplier *= suffix.getBoost();
    }
  }
  return initScore * multiplier;
}

Example #11

Source File: CCIndexingFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Adds one feature per component of a license URL's path.
 *
 * <p>Urls are of the form "http://creativecommons.org/licenses/xx-xx/xx/xx",
 * where "xx" names a license feature. The path is split at slashes and
 * dashes, the first token is skipped and every remaining token is handed to
 * addFeature. A malformed URL is logged at warn level and otherwise ignored.
 */
public void addUrlFeatures(NutchDocument doc, String urlString) {
  try {
    // split the path of the url at slashes and dashes
    StringTokenizer parts = new StringTokenizer(new URL(urlString).getPath(), "/-");

    // the leading token ("licenses") is not a feature
    if (parts.hasMoreTokens()) {
      parts.nextToken();
    }

    // everything after it is
    while (parts.hasMoreTokens()) {
      addFeature(doc, parts.nextToken());
    }
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
    }
  }
}

Example #12

Source File: TLDScoringFilter.java From anthelion with Apache License 2.0

6 votes

/**
 * Scales the initial score by the boosts registered for the document's "tld"
 * values.
 *
 * <p>Each value of the "tld" field is looked up in tldEntries; the boosts of
 * the entries found are multiplied together. Without a "tld" field the score
 * is returned unchanged.
 *
 * @return initScore multiplied by the combined boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore;
  }

  float multiplier = 1.0f;
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix != null) {
      multiplier *= suffix.getBoost();
    }
  }
  return initScore * multiplier;
}

Example #13

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * With "moreIndexingFilter.indexMimeTypeParts" disabled, the document is
 * expected to carry exactly one "type" value, "text/html".
 *
 * @since NUTCH-901
 */
public void testNoParts(){
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());
  Text pageUrl = new Text("http://nutch.apache.org/index.html");

  try {
    moreFilter.filter(document, parsed, pageUrl, new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}

Example #14

Source File: URLMetaIndexingFilter.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Looks up each tag listed in the "urlmeta.tags" property inside the
 * metadata of the CrawlDatum and, when a value is present, adds it to the
 * NutchDocument under the tag's name.
 *
 * @return the document that was passed in, which may be null
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	// only copy tags when there is both a tag list and a document
	if (urlMetaTags != null && doc != null) {
		for (String tagName : urlMetaTags) {
			Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
			if (tagValue != null) {
				doc.add(tagName, tagValue.toString());
			}
		}
	}

	return doc;
}

Example #15

Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0

6 votes

/**
 * Verifies that the filter leaves the document without any field when
 * {@code index.static} is empty.
 *
 * @throws Exception if the test fails unexpectedly
 */
public void testEmptyIndexStatic() throws Exception {

  assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument document = new NutchDocument();
  try {
    filter.filter(document, parse, url, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("tests if no field is set for empty index.static",
      document.getFieldNames().isEmpty());
}

Example #16

Source File: StaticFieldIndexer.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the configured static fields to the document. See
 * {@code index.static} in nutch-default.xml.
 *
 * <p>Nothing is added unless {@code addStaticFields} is set; otherwise every
 * value of every entry in {@code fields} is added under the entry's key.
 *
 * @param doc The {@link NutchDocument} object
 * @param parse  The relevant {@link Parse} object passing through the filter
 * @param url URL to be filtered for anchor text
 * @param datum The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> staticField : this.fields.entrySet()) {
    String fieldName = staticField.getKey();
    for (String fieldValue : staticField.getValue()) {
      doc.add(fieldName, fieldValue);
    }
  }
  return doc;
}

Example #17

Source File: ElasticIndexWriter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Queues the document as an index request on the current bulk and flushes
 * the bulk once the document-count or length threshold is reached.
 *
 * <p>The "url" field value is used as the document id and the "type" entry
 * of the document metadata as the type, defaulting to "doc".
 *
 * @param doc document to index
 * @throws IOException declared by the writer contract
 */
@Override
public void write(NutchDocument doc) throws IOException {
  String id = (String)doc.getFieldValue("url");
  String type = doc.getDocumentMeta().get("type");
  if (type == null) type = "doc";
  IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id);

  Map<String, Object> source = new HashMap<String, Object>();

  // Loop through all fields of this doc
  for (String fieldName : doc.getFieldNames()) {
    if (doc.getField(fieldName).getValues().size() > 1) {
      // Multi-valued field: index the whole value list. The previous code
      // stored doc.getFieldValue(fieldName) here as well, i.e. one value only.
      source.put(fieldName, doc.getField(fieldName).getValues());
      // Loop through the values to keep track of the size of this document
      for (Object value : doc.getField(fieldName).getValues()) {
        bulkLength += value.toString().length();
      }
    } else {
      Object value = doc.getFieldValue(fieldName);
      // NOTE(review): presumably getFieldValue yields null for a field with
      // no values - confirm. Skipping it avoids a NullPointerException below.
      if (value != null) {
        source.put(fieldName, value);
        bulkLength += value.toString().length();
      }
    }
  }
  request.setSource(source);

  // Add this indexing request to a bulk request
  bulk.add(request);
  indexedDocs++;
  bulkDocs++;

  if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) {
    LOG.info("Processing bulk request [docs = " + bulkDocs + ", length = "
            + bulkLength + ", total docs = " + indexedDocs
            + ", last doc in bulk = '" + id + "']");
    // Flush the bulk of indexing requests
    createNewBulk = true;
    commit();
  }
}

Example #18

Source File: RelTagIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds one "tag" field value per Rel-Tag found in the parse metadata.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Rel-Tags, if any, were possibly put there by RelTagParser
  String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}

Example #19

Source File: TLDIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds a "tld" field holding the domain of the URL's domain suffix.
 *
 * <p>Any exception raised while resolving the suffix is logged at warn level
 * and the document is returned without the field.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {

  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    doc.add("tld", suffix.getDomain());
  } catch (Exception ex) {
    LOG.warn(ex.toString());
  }

  return doc;
}

Example #20

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs the time, length, type and title steps of this filter, in that order,
 * on the document.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final String urlString = url.toString();
  final ParseData parseData = parse.getData();

  addTime(doc, parseData, urlString, datum);
  addLength(doc, parseData, urlString);
  addType(doc, parseData, urlString, datum);
  resetTitle(doc, parseData, urlString);

  return doc;
}

Example #21

Source File: S2jhIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the "sku" and "price" parse metadata values to the document.
 *
 * @return the document, or null when the parse data has no non-blank "sku"
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    ParseData metaSource = parse.getData();
    String sku = metaSource.getMeta("sku");

    if (!StringUtils.isBlank(sku)) {
        doc.add("sku", sku);
        doc.add("price", metaSource.getMeta("price"));
        return doc;
    }

    // a blank sku drops the document
    return null;
}

Example #22

Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Adds the trimmed Content-Length metadata value as the "contentLength"
 * field, unless it is absent or empty after trimming.
 *
 * @return the same document instance that was passed in
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010 ContentLength not trimmed
  String length = rawLength.trim();
  if (!length.isEmpty()) {
    doc.add("contentLength", length);
  }
  return doc;
}

Example #23

Source File: TestBasicIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs BasicIndexingFilter with title and content length limits of 10 and 20
 * and domain indexing enabled, then checks the title, domain, host, url,
 * content and tstamp fields of the resulting document.
 */
public void testBasicIndexingFilter() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setInt("indexer.max.title.length", 10);
  configuration.setBoolean("indexer.add.domain", true);
  configuration.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter basicFilter = new BasicIndexingFilter();
  basicFilter.setConf(configuration);
  assertNotNull(basicFilter);

  // parse result: a titled page with one outlink and a language entry
  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  ParseData pageData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, parseMeta);
  ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", pageData);

  CrawlDatum fetched = new CrawlDatum();
  fetched.setFetchTime(100L);

  NutchDocument document = new NutchDocument();
  Text pageUrl = new Text("http://nutch.apache.org/index.html");

  try {
    basicFilter.filter(document, parsed, pageUrl, fetched, new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
      document.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org",
      document.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org",
      document.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
      document.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo",
      document.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L),
      (Date)document.getField("tstamp").getValues().get(0));
}

Example #24

Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Verifies that the valid field:value pairs of {@code index.static} end up in
 * the document: three fields in total, among them field1, field2 and field4
 * with the values val1, val2 and val4.
 *
 * @throws Exception if the test fails unexpectedly
 */
public void testNormalScenario() throws Exception {

  conf.set("index.static",
      "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
  assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument document = new NutchDocument();
  try {
    filter.filter(document, parse, url, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertFalse("test if doc is not empty", document.getFieldNames().isEmpty());
  assertEquals("test if doc has 3 fields", 3, document.getFieldNames().size());
  assertTrue("test if doc has field1",
      document.getField("field1").getValues().contains("val1"));
  assertTrue("test if doc has field2",
      document.getField("field2").getValues().contains("val2"));
  assertTrue("test if doc has field4",
      document.getField("field4").getValues().contains("val4"));
}

Example #25

Source File: AbstractIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Logs the invocation and delegates to filterInternal, unless the document is
 * null, in which case the null document is returned as is.
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
        throws IndexingException {
    LOG.debug("Invoking  indexer {} for url: {}", this.getClass().getName(), url);

    if (doc != null) {
        return filterInternal(doc, parse, url, datum, inlinks);
    }

    LOG.debug("Skipped as NutchDocument doc is null");
    return doc;
}

Example #26

Source File: IndexWriters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Passes the document to the update method of every configured index writer,
 * in array order. The first IOException raised stops the loop and propagates
 * to the caller.
 */
public void update(NutchDocument doc) throws IOException {
	for (int writerIndex = 0; writerIndex < this.indexWriters.length; writerIndex++) {
		this.indexWriters[writerIndex].update(doc);
	}
}

Example #27

Source File: IndexWriters.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Passes the document to the write method of every configured index writer,
 * in array order. The first IOException raised stops the loop and propagates
 * to the caller.
 */
public void write(NutchDocument doc) throws IOException {
	for (int writerIndex = 0; writerIndex < this.indexWriters.length; writerIndex++) {
		this.indexWriters[writerIndex].write(doc);
	}
}

Example #28

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * A Content-Disposition value of "filename=filename.ext" is expected to
 * produce a "title" field value of "filename.ext".
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);

  ParseData pageData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  NutchDocument indexed = moreFilter.filter(
      new NutchDocument(),
      new ParseImpl("text", pageData),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", indexed.getFieldValue("title"));
}

Example #29

Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0

5 votes

/**
 * Runs MoreIndexingFilter on a page whose Content-Type metadata is
 * {@code source} and asserts that the "type" field value equals
 * {@code expected}.
 *
 * @param conf     configuration applied to the filter
 * @param source   Content-Type value placed in the metadata
 * @param expected expected "type" field value
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  ParseData pageData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  NutchDocument indexed = moreFilter.filter(
      new NutchDocument(),
      new ParseImpl("text", pageData),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}

Example #30

Source File: CCIndexingFilter.java From anthelion with Apache License 2.0

5 votes

/**
 * Indexes the Creative Commons information found in the parse metadata: the
 * license URL together with the features derived from it, the license
 * location, and the work type.
 *
 * @return the same document instance that was passed in
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  Metadata parseMeta = parse.getData().getParseMeta();

  // license: the whole URL as license=xxx, plus the attributes extracted from it
  String license = parseMeta.get(CreativeCommons.LICENSE_URL);
  if (license != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("CC: indexing " + license + " for: " + url.toString());
    }
    addFeature(doc, "license=" + license);
    addUrlFeatures(doc, license);
  }

  // license location as meta=xxx
  String location = parseMeta.get(CreativeCommons.LICENSE_LOCATION);
  if (location != null) {
    addFeature(doc, "meta=" + location);
  }

  // work type, added as the bare value
  String type = parseMeta.get(CreativeCommons.WORK_TYPE);
  if (type != null) {
    addFeature(doc, type);
  }

  return doc;
}