Java Code Examples for org.apache.nutch.indexer.NutchDocument#add()

The following examples show how to use org.apache.nutch.indexer.NutchDocument#add(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: LanguageIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds a {@code "lang"} field to the document.
 *
 * The value is taken from the parse metadata, then from the HTTP
 * Content-Language header, and finally defaults to {@code "unknown"}.
 *
 * @param doc the document to add the field to
 * @param parse the parse whose metadata is consulted
 * @param url not used by this filter
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Prefer the language recorded in the parse metadata
  // (possibly put there by HTMLLanguageParser).
  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Otherwise use whatever the HTTP header tells us.
  if (language == null) {
    language = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  // A missing or empty value is indexed as the placeholder.
  boolean resolved = language != null && language.length() > 0;
  doc.add("lang", resolved ? language : "unknown");

  return doc;
}
 
Example 2
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the {@code "lastModified"} and {@code "date"} fields to the document.
 *
 * {@code "lastModified"} is only added when the Last-Modified metadata is
 * present and yields a usable time. {@code "date"} is always added, using
 * the first available of: the Last-Modified time, the CrawlDatum's modified
 * time, the CrawlDatum's fetch time.
 *
 * @param doc the document to add the fields to
 * @param data parse data whose metadata is read for Last-Modified
 * @param url passed through to {@code getTime} together with the header value
 * @param datum source of the fallback modified/fetch times
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                   // try parse last-modified
    time = getTime(lastModified,url);           // use as time
    // NOTE(review): assumes getTime() returns -1 when the header cannot be
    // parsed, as the fallback below implies -- confirm against getTime().
    // Without this guard a bogus Date(-1) would be indexed as lastModified.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {                             // no usable last-modified
    time = datum.getModifiedTime();             // use value in CrawlDatum
    if (time <= 0) {                            // if also unset
      time = datum.getFetchTime();              // use time the fetch took place (fetchTime of fetchDatum)
    }
  }

  doc.add("date", new Date(time));
  return doc;
}
 
Example 3
Source File: URLMetaIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up each metatag listed in the "urlmeta.tags" property inside the
 * CrawlDatum metadata and, when a value exists, adds it to the
 * NutchDocument as a field named after the tag.
 *
 * @return the same {@code doc} instance (which may be {@code null})
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf != null) {
		this.setConf(conf);
	}

	// Nothing to copy, or nothing to copy into.
	if (doc == null || urlMetaTags == null) {
		return doc;
	}

	for (String tagName : urlMetaTags) {
		Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
		if (tagValue == null) {
			continue;
		}
		doc.add(tagName, tagValue.toString());
	}

	return doc;
}
 
Example 4
Source File: LanguageIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Indexes the document language in the {@code "lang"} field.
 *
 * Lookup order: parse metadata, HTTP Content-Language header, then the
 * literal {@code "unknown"} when neither gives a non-empty value.
 *
 * @param doc the document to add the field to
 * @param parse the parse whose metadata is consulted
 * @param url not used by this filter
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // First choice: LANGUAGE in the parse metadata
  // (possibly put there by HTMLLanguageParser).
  String detected = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  // Second choice: the language the HTTP header tells us.
  if (detected == null) {
    detected = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
  }

  // Last resort: placeholder for a missing or empty value.
  boolean usable = detected != null && detected.length() != 0;
  String fieldValue = usable ? detected : "unknown";

  doc.add("lang", fieldValue);
  return doc;
}
 
Example 5
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the {@code "lastModified"} and {@code "date"} fields to the document.
 *
 * {@code "lastModified"} is only added when the Last-Modified metadata is
 * present and yields a usable time. {@code "date"} is always added, using
 * the Last-Modified time when available and the CrawlDatum's fetch time
 * otherwise.
 *
 * @param doc the document to add the fields to
 * @param data parse data whose metadata is read for Last-Modified
 * @param url passed through to {@code getTime} together with the header value
 * @param datum source of the fallback fetch time
 * @return the same {@code doc} instance
 */
private NutchDocument addTime(NutchDocument doc, ParseData data,
                         String url, CrawlDatum datum) {
  long time = -1;

  String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
  if (lastModified != null) {                   // try parse last-modified
    time = getTime(lastModified,url);           // use as time
    // NOTE(review): assumes getTime() returns -1 when the header cannot be
    // parsed, as the fallback below implies -- confirm against getTime().
    // Without this guard a bogus Date(-1) would be indexed as lastModified.
    if (time != -1) {
      doc.add("lastModified", new Date(time));
    }
  }

  if (time == -1) {                             // no usable last-modified
    time = datum.getFetchTime();                // use fetch time
  }

  doc.add("date", new Date(time));

  return doc;
}
 
Example 6
Source File: LanguageDetectionFilter.java    From language-detection with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * Adds a {@code "lang"} field. The language comes from the parse metadata
 * when present; otherwise it is detected from the title and text of the
 * parse. A {@code null} result is indexed as {@code "unknown"}.
 *
 * @throws IndexingException if the filter has no configuration, if its
 *         initialization recorded a failure, or if detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (language == null) {
		// Nothing in the parse metadata: detect from "<title> <text>".
		String sample = new StringBuilder()
				.append(parse.getData().getTitle())
				.append(" ")
				.append(parse.getText())
				.toString();
		try {
			Detector detector = DetectorFactory.create();
			detector.setMaxTextLength(textsize_upper_limit);
			detector.append(sample);
			language = detector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}

	doc.add("lang", language != null ? language : "unknown");
	return doc;
}
 
Example 7
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the trimmed Content-Length metadata value as the
 * {@code "contentLength"} field, unless it is missing or blank.
 *
 * @param doc the document to add the field to
 * @param data parse data whose metadata holds the Content-Length value
 * @param url not used by this method
 * @return the same {@code doc} instance
 */
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
  String rawLength = data.getMeta(Response.CONTENT_LENGTH);
  if (rawLength == null) {
    return doc;
  }

  // NUTCH-1010: the value is not trimmed when it reaches this point.
  String length = rawLength.trim();
  if (length.length() > 0) {
    doc.add("contentLength", length);
  }
  return doc;
}
 
Example 8
Source File: SubcollectionIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * "Marks" the document as part of every subcollection returned for the URL.
 *
 * For each subcollection, its name is added under the subcollection's own
 * key when it has one, and under {@code fieldName} otherwise.
 *
 * @param doc the document to mark
 * @param url the URL used to look up subcollections
 */
private void addSubCollectionField(NutchDocument doc, String url) {
  for (Subcollection subcollection
      : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
    String key = subcollection.getKey();
    doc.add(key != null ? key : fieldName, subcollection.getName());
  }
}
 
Example 9
Source File: SubcollectionIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Tags the document with the name of each subcollection returned for the URL.
 *
 * The field used is the subcollection's key, or {@code fieldName} when the
 * subcollection has no key.
 *
 * @param doc the document to tag
 * @param url the URL used to look up subcollections
 */
private void addSubCollectionField(NutchDocument doc, String url) {
  for (Subcollection match
      : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
    String targetField = match.getKey();
    if (targetField == null) {
      targetField = fieldName;
    }
    doc.add(targetField, match.getName());
  }
}
 
Example 10
Source File: RelTagIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Adds one {@code "tag"} field per Rel-Tag value found in the parse metadata.
 *
 * @param doc the document to add the fields to
 * @param parse the parse whose metadata is read
 * @param url not used by this filter
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Rel-Tags, if any, were possibly put there by RelTagParser.
  String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  if (relTags == null) {
    return doc;
  }

  for (String relTag : relTags) {
    doc.add("tag", relTag);
  }
  return doc;
}
 
Example 11
Source File: AnchorIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the anchor texts of the inlinks as {@code "anchor"} fields.
 *
 * When {@code deduplicate} is set (see
 * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml), an anchor
 * is skipped if an earlier one had the same lower-cased text. The first
 * occurrence is indexed with its original casing.
 *
 * @param doc The {@link NutchDocument} receiving the fields
 * @param parse The {@link Parse} passing through the filter (not used here)
 * @param url The document URL (not used here)
 * @param datum The {@link CrawlDatum} entry (not used here)
 * @param inlinks The {@link Inlinks} supplying anchor text; may be null
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  String[] anchors;
  if (inlinks == null) {
    anchors = new String[0];
  } else {
    anchors = inlinks.getAnchors();
  }

  // Lower-cased anchors already indexed; allocated on first use.
  HashSet<String> seen = null;

  for (String anchor : anchors) {
    if (!deduplicate) {
      doc.add("anchor", anchor);
      continue;
    }

    if (seen == null) {
      seen = new HashSet<String>();
    }

    // add() returns false when this lower-cased anchor was already recorded.
    if (seen.add(anchor.toLowerCase())) {
      doc.add("anchor", anchor);
    }
  }

  return doc;
}
 
Example 12
Source File: LanguageDetectionFilter.java    From weslang with Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * Indexes the document language as {@code "lang"}: the value from the parse
 * metadata if there is one, otherwise the detector's result for the parse
 * title and text, otherwise {@code "unknown"}.
 *
 * @throws IndexingException if the filter has no configuration, if its
 *         initialization recorded a failure, or if detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
		CrawlDatum datum, Inlinks inlinks) throws IndexingException {
	if (conf == null) {
		throw new IndexingException("Not Yet Initialization.");
	}
	if (cause != null) {
		throw new IndexingException("Initialization Failed.", cause);
	}

	String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
	if (lang == null) {
		lang = detectLanguage(parse);
	}
	if (lang == null) {
		lang = "unknown";
	}

	doc.add("lang", lang);
	return doc;
}

/** Runs the language detector over the parse's title and text, joined by a space. */
private String detectLanguage(Parse parse) throws IndexingException {
	StringBuilder buffer = new StringBuilder();
	buffer.append(parse.getData().getTitle());
	buffer.append(" ");
	buffer.append(parse.getText());
	try {
		Detector detector = DetectorFactory.create();
		detector.setMaxTextLength(textsize_upper_limit);
		detector.append(buffer.toString());
		return detector.detect();
	} catch (LangDetectException e) {
		throw new IndexingException("Detection failed.", e);
	}
}
 
Example 13
Source File: RelTagIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes every Rel-Tag value from the parse metadata as a {@code "tag"} field.
 *
 * @param doc the document to add the fields to
 * @param parse the parse whose metadata is read
 * @param url not used by this filter
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
  throws IndexingException {

  // Values are possibly put into the parse metadata by RelTagParser.
  String[] values = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
  int count = (values == null) ? 0 : values.length;

  for (int index = 0; index < count; index++) {
    doc.add("tag", values[index]);
  }

  return doc;
}
 
Example 14
Source File: TLDIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a {@code "tld"} field holding the domain of the URL's domain suffix.
 *
 * This is best effort: any exception raised while resolving the suffix is
 * logged at WARN level and the document is returned without the field.
 *
 * @param doc the document to add the field to
 * @param parse not used by this filter
 * @param urlText the document URL
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {

  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    doc.add("tld", suffix.getDomain());
  } catch (Exception e) {
    // Deliberately broad: a malformed URL or any lookup failure only skips the field.
    LOG.warn(e.toString());
  }

  return doc;
}
 
Example 15
Source File: StaticFieldIndexer.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the configured static fields to the document. See
 * {@code index.static} in nutch-default.xml.
 *
 * When {@code addStaticFields} is set, every value of every entry in
 * {@code fields} is added under that entry's key. Otherwise the document
 * is returned untouched.
 *
 * @param doc The {@link NutchDocument} receiving the fields
 * @param parse The {@link Parse} passing through the filter (not used here)
 * @param url The document URL (not used here)
 * @param datum The {@link CrawlDatum} entry (not used here)
 * @param inlinks The {@link Inlinks} of the document (not used here)
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> field : this.fields.entrySet()) {
    String name = field.getKey();
    for (String value : field.getValue()) {
      doc.add(name, value);
    }
  }
  return doc;
}
 
Example 16
Source File: AnchorIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes inlink anchor texts under the {@code "anchor"} field.
 *
 * If {@code deduplicate} is enabled (see
 * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml), anchors
 * whose lower-cased text has already been indexed are dropped. Otherwise
 * every anchor is added.
 *
 * @param doc The {@link NutchDocument} receiving the fields
 * @param parse The {@link Parse} passing through the filter (not used here)
 * @param url The document URL (not used here)
 * @param datum The {@link CrawlDatum} entry (not used here)
 * @param inlinks The {@link Inlinks} supplying anchor text; may be null
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  Inlinks inlinks) throws IndexingException {

  String[] anchorTexts = new String[0];
  if (inlinks != null) {
    anchorTexts = inlinks.getAnchors();
  }

  // Lower-cased forms of the anchors indexed so far; created on demand.
  HashSet<String> indexed = null;

  for (String anchorText : anchorTexts) {
    boolean shouldAdd = true;

    if (deduplicate) {
      if (indexed == null) {
        indexed = new HashSet<String>();
      }
      // add() reports whether the lower-cased form is new to the set.
      shouldAdd = indexed.add(anchorText.toLowerCase());
    }

    if (shouldAdd) {
      doc.add("anchor", anchorText);
    }
  }

  return doc;
}
 
Example 17
Source File: TLDIndexingFilter.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes the domain of the URL's domain suffix as the {@code "tld"} field.
 *
 * Failures are not propagated: whatever exception occurs is logged at WARN
 * level and the document is returned without the field.
 *
 * @param doc the document to add the field to
 * @param parse not used by this filter
 * @param urlText the document URL
 * @param datum not used by this filter
 * @param inlinks not used by this filter
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {

  try {
    String rawUrl = urlText.toString();
    DomainSuffix domainSuffix = URLUtil.getDomainSuffix(new URL(rawUrl));
    String tld = domainSuffix.getDomain();
    doc.add("tld", tld);
  } catch (Exception failure) {
    // Intentionally catches everything so indexing continues without the field.
    LOG.warn(failure.toString());
  }

  return doc;
}
 
Example 18
Source File: MoreIndexingFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Adds the resolved MIME type to the field "type" and, when
 * {@code moreIndexingFilter.indexMimeTypeParts} is enabled (the default),
 * also adds each of its parts to the same field, so that search results
 * can be confined by contentType or by one of its parts.
 * </p>
 * <p>
 * For example, if contentType is application/vnd.ms-powerpoint, search can
 * be done with one of the following qualifiers
 * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
 * The query filter is implemented in {@link TypeQueryFilter}.
 * </p>
 *
 * @param doc the document to add the fields to
 * @param data parse data consulted for Content-Type when the datum has none
 * @param url used to guess the MIME type when no Content-Type is known
 * @param datum crawl datum whose metadata is checked first for Content-Type
 * @return the same {@code doc} instance, unchanged if no MIME type was resolved
 */
private NutchDocument addType(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  // A Content-Type in the CrawlDatum metadata takes precedence over the
  // one in the parse metadata.
  Writable datumContentType = datum.getMetaData().get(
      new Text(Response.CONTENT_TYPE));
  String contentType = (datumContentType != null)
      ? datumContentType.toString()
      : data.getMeta(Response.CONTENT_TYPE);

  // No Content-Type from a previous plugin: fall back to the URL alone.
  // (Note by Jerome Charron, 20050415: using the document content as well
  // would be better, but it is not available here.)
  String mimeType = (contentType == null)
      ? MIME.getMimeType(url)
      : MIME.forName(MimeUtil.cleanMimeType(contentType));

  // Content type could not be resolved: nothing to index.
  if (mimeType == null) {
    return doc;
  }

  // Replace the MIME type with its mapped value when mapping is enabled.
  if (mapMimes && mimeMap.containsKey(mimeType)) {
    mimeType = mimeMap.get(mimeType);
  }

  doc.add("type", mimeType);

  // Optionally index the sub parts of the content type as well.
  if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
    for (String part : getParts(mimeType)) {
      doc.add("type", part);
    }
  }

  // Left for future improvement: MIME type parameters are not indexed.

  return doc;
}
 
Example 19
Source File: CCIndexingFilter.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Adds {@code feature} as a value of the {@code FIELD} field of the document.
 *
 * @param doc the document to add the value to
 * @param feature the value to add
 */
private void addFeature(NutchDocument doc, String feature) {
  doc.add(FIELD, feature);
}
 
Example 20
Source File: MoreIndexingFilter.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * <p>
 * Indexes the resolved MIME type in the field "type". Unless
 * {@code moreIndexingFilter.indexMimeTypeParts} is switched off, each of
 * its parts is indexed in the same field too, so that search results can
 * be confined by contentType or by one of its parts.
 * </p>
 * <p>
 * For example, if contentType is application/vnd.ms-powerpoint, search can
 * be done with one of the following qualifiers
 * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint
 * The query filter is implemented in {@link TypeQueryFilter}.
 * </p>
 *
 * @param doc the document to add the fields to
 * @param data parse data consulted for Content-Type when the datum has none
 * @param url used to detect the MIME type when no Content-Type is known
 * @param datum crawl datum whose metadata is checked first for Content-Type
 * @return the same {@code doc} instance, unchanged if no MIME type was resolved
 */
private NutchDocument addType(NutchDocument doc, ParseData data, String url,
    CrawlDatum datum) {
  // Look in the CrawlDatum metadata first, then in the parse metadata.
  String contentType;
  Writable fromDatum = datum.getMetaData().get(
      new Text(Response.CONTENT_TYPE));
  if (fromDatum == null) {
    contentType = data.getMeta(Response.CONTENT_TYPE);
  } else {
    contentType = fromDatum.toString();
  }

  String mimeType;
  if (contentType != null) {
    mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
  } else {
    // Content type not solved by a previous plugin: detect it from the URL.
    // (Note by Jerome Charron, 20050415: using the document content as well
    // would be better, but it is not available here.)
    mimeType = tika.detect(url);
  }

  // Content type could not be resolved: nothing to index.
  if (mimeType == null) {
    return doc;
  }

  // Swap in the mapped MIME type when mapping is enabled and an entry exists.
  boolean mapped = mapMimes && mimeMap.containsKey(mimeType);
  if (mapped) {
    mimeType = mimeMap.get(mimeType);
  }

  doc.add("type", mimeType);

  // Optionally index the sub parts of the content type as well.
  boolean indexParts =
      conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true);
  if (indexParts) {
    String[] parts = getParts(mimeType);
    for (int i = 0; i < parts.length; i++) {
      doc.add("type", parts[i]);
    }
  }

  // Left for future improvement: MIME type parameters are not indexed.

  return doc;
}