org.apache.nutch.indexer.NutchDocument Java Examples
The following examples show how to use org.apache.nutch.indexer.NutchDocument.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) { long time = -1; String lastModified = data.getMeta(Metadata.LAST_MODIFIED); if (lastModified != null) { // try parse last-modified time = getTime(lastModified,url); // use as time // store as string doc.add("lastModified", new Date(time)); } if (time == -1) { // if no last-modified specified in HTTP header time = datum.getModifiedTime(); // use value in CrawlDatum if (time <= 0) { // if also unset time = datum.getFetchTime(); // use time the fetch took place (fetchTime of fetchDatum) } } // un-stored, indexed and un-tokenized doc.add("date", new Date(time)); return doc; }
Example #2
Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #3
Source File: TestAnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * With {@code anchorIndexingFilter.deduplicate} enabled, three inlinks that
 * carry only two distinct anchor texts must produce two {@code anchor} values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // test2.com and test3.com share the anchor text "text2".
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    anchorFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}
Example #4
Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #5
Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) { long time = -1; String lastModified = data.getMeta(Metadata.LAST_MODIFIED); if (lastModified != null) { // try parse last-modified time = getTime(lastModified,url); // use as time // store as string doc.add("lastModified", new Date(time)); } if (time == -1) { // if no last-modified time = datum.getFetchTime(); // use fetch time } // un-stored, indexed and un-tokenized doc.add("date", new Date(time)); return doc; }
Example #6
Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * With {@code moreIndexingFilter.indexMimeTypeParts} disabled, the {@code type}
 * field must hold exactly one value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    moreFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
Example #7
Source File: URLMetaIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Copies the configured URL metatags from the {@link CrawlDatum} metadata into
 * the document. For every name in {@code urlMetaTags} that has a value in the
 * datum's metadata, a field with that name and value is added.
 *
 * <p>If {@code conf} is set it is re-applied through {@code setConf} first.
 * When {@code urlMetaTags} or {@code doc} is {@code null} the document is
 * returned untouched.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  if (urlMetaTags != null && doc != null) {
    for (String tagName : urlMetaTags) {
      Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
      if (tagValue != null) {
        doc.add(tagName, tagValue.toString());
      }
    }
  }
  return doc;
}
Example #8
Source File: TestAnchorIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * With {@code anchorIndexingFilter.deduplicate} enabled, three inlinks that
 * carry only two distinct anchor texts must produce two {@code anchor} values.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // test2.com and test3.com share the anchor text "text2".
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    anchorFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all", document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2, document.getField("anchor").getValues().size());
}
Example #9
Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/** Add the features represented by a license URL. Urls are of the form * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a * license feature. */ public void addUrlFeatures(NutchDocument doc, String urlString) { try { URL url = new URL(urlString); // tokenize the path of the url, breaking at slashes and dashes StringTokenizer names = new StringTokenizer(url.getPath(), "/-"); if (names.hasMoreTokens()) names.nextToken(); // throw away "licenses" // add a feature per component after "licenses" while (names.hasMoreTokens()) { String feature = names.nextToken(); addFeature(doc, feature); } } catch (MalformedURLException e) { if (LOG.isWarnEnabled()) { LOG.warn("CC: failed to parse url: " + urlString + " : " + e); } } }
Example #10
Source File: TLDScoringFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Scales the initial score by the boosts of the document's top-level domains.
 *
 * <p>Each value of the {@code tld} field is looked up in {@code tldEntries};
 * the boosts of all entries found are multiplied together. Without a
 * {@code tld} field, or when no value has an entry, the boost stays at 1.
 *
 * @param initScore score to scale
 * @return {@code initScore} multiplied by the accumulated boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum,
    Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
  float boost = 1.0f;

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore * boost;
  }

  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    boost *= suffix.getBoost();
  }
  return initScore * boost;
}
Example #11
Source File: CCIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/** Add the features represented by a license URL. Urls are of the form * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a * license feature. */ public void addUrlFeatures(NutchDocument doc, String urlString) { try { URL url = new URL(urlString); // tokenize the path of the url, breaking at slashes and dashes StringTokenizer names = new StringTokenizer(url.getPath(), "/-"); if (names.hasMoreTokens()) names.nextToken(); // throw away "licenses" // add a feature per component after "licenses" while (names.hasMoreTokens()) { String feature = names.nextToken(); addFeature(doc, feature); } } catch (MalformedURLException e) { if (LOG.isWarnEnabled()) { LOG.warn("CC: failed to parse url: " + urlString + " : " + e); } } }
Example #12
Source File: TLDScoringFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Scales the initial score by the boosts of the document's top-level domains.
 *
 * <p>Each value of the {@code tld} field is looked up in {@code tldEntries};
 * the boosts of all entries found are multiplied together. Without a
 * {@code tld} field, or when no value has an entry, the boost stays at 1.
 *
 * @param initScore score to scale
 * @return {@code initScore} multiplied by the accumulated boost
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum,
    Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
  float boost = 1.0f;

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore * boost;
  }

  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    boost *= suffix.getBoost();
  }
  return initScore * boost;
}
Example #13
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * With {@code moreIndexingFilter.indexMimeTypeParts} disabled, the {@code type}
 * field must hold exactly one value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);
  assertNotNull(moreFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    moreFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
Example #14
Source File: URLMetaIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Copies the configured URL metatags from the {@link CrawlDatum} metadata into
 * the document. For every name in {@code urlMetaTags} that has a value in the
 * datum's metadata, a field with that name and value is added.
 *
 * <p>If {@code conf} is set it is re-applied through {@code setConf} first.
 * When {@code urlMetaTags} or {@code doc} is {@code null} the document is
 * returned untouched.
 *
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  if (conf != null) {
    this.setConf(conf);
  }

  if (urlMetaTags != null && doc != null) {
    for (String tagName : urlMetaTags) {
      Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
      if (tagValue != null) {
        doc.add(tagName, tagValue.toString());
      }
    }
  }
  return doc;
}
Example #15
Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Checks that filtering with an empty {@code index.static} setting leaves the
 * document without any fields.
 *
 * @throws Exception if the test fails unexpectedly
 */
public void testEmptyIndexStatic() throws Exception {
  assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument document = new NutchDocument();
  try {
    filter.filter(document, parse, url, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("tests if no field is set for empty index.static",
      document.getFieldNames().isEmpty());
}
Example #16
Source File: StaticFieldIndexer.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Adds the statically configured fields to the document. See
 * {@code index.static} in nutch-default.xml.
 *
 * <p>When {@code addStaticFields} is set, every value of every entry in
 * {@code fields} is added under the entry's key; otherwise the document is
 * returned unchanged.
 *
 * @param doc     The {@link NutchDocument} object
 * @param parse   The relevant {@link Parse} object passing through the filter
 * @param url     URL to be filtered for anchor text
 * @param datum   The {@link CrawlDatum} entry
 * @param inlinks The {@link Inlinks} containing anchor text
 * @return filtered NutchDocument
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> field : this.fields.entrySet()) {
    String name = field.getKey();
    for (String value : field.getValue()) {
      doc.add(name, value);
    }
  }
  return doc;
}
Example #17
Source File: ElasticIndexWriter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
@Override public void write(NutchDocument doc) throws IOException { String id = (String)doc.getFieldValue("url"); String type = doc.getDocumentMeta().get("type"); if (type == null) type = "doc"; IndexRequestBuilder request = client.prepareIndex(defaultIndex, type, id); Map<String, Object> source = new HashMap<String, Object>(); // Loop through all fields of this doc for (String fieldName : doc.getFieldNames()) { if (doc.getField(fieldName).getValues().size() > 1) { source.put(fieldName, doc.getFieldValue(fieldName)); // Loop through the values to keep track of the size of this document for (Object value : doc.getField(fieldName).getValues()) { bulkLength += value.toString().length(); } } else { source.put(fieldName, doc.getFieldValue(fieldName)); bulkLength += doc.getFieldValue(fieldName).toString().length(); } } request.setSource(source); // Add this indexing request to a bulk request bulk.add(request); indexedDocs++; bulkDocs++; if (bulkDocs >= maxBulkDocs || bulkLength >= maxBulkLength) { LOG.info("Processing bulk request [docs = " + bulkDocs + ", length = " + bulkLength + ", total docs = " + indexedDocs + ", last doc in bulk = '" + id + "']"); // Flush the bulk of indexing requests createNewBulk = true; commit(); } }
Example #18
Source File: RelTagIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // Check if some Rel-Tags found, possibly put there by RelTagParser String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG); if (tags != null) { for (int i=0; i<tags.length; i++) { doc.add("tag", tags[i]); } } return doc; }
Example #19
Source File: TLDIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Adds a {@code tld} field holding the domain of the URL's domain suffix, as
 * resolved by {@code URLUtil.getDomainSuffix}.
 *
 * <p>Any exception raised while resolving the suffix is logged at warn level
 * and the document is returned without the field.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    doc.add("tld", suffix.getDomain());
  } catch (Exception ex) {
    LOG.warn(ex.toString());
  }
  return doc;
}
Example #20
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Runs the individual enrichment steps on the document, in order: time,
 * content length, type, and title reset.
 *
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  final String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}
Example #21
Source File: S2jhIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Adds the {@code sku} and {@code price} parse metadata values to the document.
 *
 * @return the document, or {@code null} when the {@code sku} metadata value is
 *         blank
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
    Inlinks inlinks) {
  ParseData meta = parse.getData();
  String sku = meta.getMeta("sku");

  if (!StringUtils.isBlank(sku)) {
    doc.add("sku", sku);
    doc.add("price", meta.getMeta("price"));
    return doc;
  }
  return null;
}
Example #22
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
private NutchDocument addLength(NutchDocument doc, ParseData data, String url) { String contentLength = data.getMeta(Response.CONTENT_LENGTH); if (contentLength != null) { // NUTCH-1010 ContentLength not trimmed String trimmed = contentLength.toString().trim(); if (!trimmed.isEmpty()) doc.add("contentLength", trimmed); } return doc; }
Example #23
Source File: TestBasicIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Runs {@link BasicIndexingFilter} with a title limit of 10, a content limit
 * of 20 and domain indexing enabled, then checks the resulting title, domain,
 * host, url, content and tstamp fields.
 */
public void testBasicIndexingFilter() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setInt("indexer.max.title.length", 10);
  configuration.setBoolean("indexer.add.domain", true);
  configuration.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter basicFilter = new BasicIndexingFilter();
  basicFilter.setConf(configuration);
  assertNotNull(basicFilter);

  NutchDocument document = new NutchDocument();

  String pageTitle = "The Foo Page";
  Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata parseMeta = new Metadata();
  parseMeta.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, parseMeta);
  ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

  CrawlDatum datum = new CrawlDatum();
  datum.setFetchTime(100L);

  Inlinks incoming = new Inlinks();

  try {
    basicFilter.filter(document, parsed, new Text("http://nutch.apache.org/index.html"),
        datum, incoming);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
      document.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org",
      document.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org",
      document.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"",
      "http://nutch.apache.org/index.html", document.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo",
      document.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L),
      (Date) document.getField("tstamp").getValues().get(0));
}
Example #24
Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Checks that the well-formed field:value pairs of {@code index.static} end up
 * in the document: three fields are expected (field1, field2, field4).
 *
 * @throws Exception if the test fails unexpectedly
 */
public void testNormalScenario() throws Exception {
  conf.set("index.static", "field1:val1, field2 : val2 val3 , field3, field4 :val4 , ");
  assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument document = new NutchDocument();
  try {
    filter.filter(document, parse, url, crawlDatum, inlinks);
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertFalse("test if doc is not empty", document.getFieldNames().isEmpty());
  assertEquals("test if doc has 3 fields", 3, document.getFieldNames().size());
  assertTrue("test if doc has field1",
      document.getField("field1").getValues().contains("val1"));
  assertTrue("test if doc has field2",
      document.getField("field2").getValues().contains("val2"));
  assertTrue("test if doc has field4",
      document.getField("field4").getValues().contains("val4"));
}
Example #25
Source File: AbstractIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Logs the invocation and delegates to {@code filterInternal}, unless the
 * incoming document is {@code null}, in which case the call is skipped.
 *
 * @return the result of {@code filterInternal}, or {@code null} when
 *         {@code doc} is {@code null}
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
    throws IndexingException {
  LOG.debug("Invoking indexer {} for url: {}", this.getClass().getName(), url);

  if (doc != null) {
    return filterInternal(doc, parse, url, datum, inlinks);
  }

  LOG.debug("Skipped as NutchDocument doc is null");
  return null;
}
Example #26
Source File: IndexWriters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Forwards the update to every configured index writer, in array order.
 *
 * @param doc document to update
 * @throws IOException propagated unchanged from the first writer that fails;
 *         later writers are then not invoked
 */
public void update(NutchDocument doc) throws IOException {
  int index = 0;
  while (index < this.indexWriters.length) {
    this.indexWriters[index].update(doc);
    index++;
  }
}
Example #27
Source File: IndexWriters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Forwards the document to every configured index writer, in array order.
 *
 * @param doc document to write
 * @throws IOException propagated unchanged from the first writer that fails;
 *         later writers are then not invoked
 */
public void write(NutchDocument doc) throws IOException {
  int index = 0;
  while (index < this.indexWriters.length) {
    this.indexWriters[index].write(doc);
    index++;
  }
}
Example #28
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Checks that a Content-Disposition value of {@code filename=filename.ext}
 * results in a {@code title} field value of {@code filename.ext}.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();

  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(configuration);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  NutchDocument indexed = moreFilter.filter(new NutchDocument(), new ParseImpl("text", parseData),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext", indexed.getFieldValue("title"));
}
Example #29
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Filters a document whose Content-Type metadata is {@code source} and asserts
 * that the resulting {@code type} field value equals {@code expected}.
 *
 * @param conf     configuration applied to the filter
 * @param source   Content-Type value placed in the metadata
 * @param expected expected value of the {@code type} field
 */
private void assertContentType(Configuration conf, String source, String expected)
    throws IndexingException {
  Metadata headers = new Metadata();
  headers.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter moreFilter = new MoreIndexingFilter();
  moreFilter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], headers);
  NutchDocument indexed = moreFilter.filter(new NutchDocument(), new ParseImpl("text", parseData),
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}
Example #30
Source File: CCIndexingFilter.java From anthelion with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); // index the license String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); if (licenseUrl != null) { if (LOG.isInfoEnabled()) { LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); } // add the entire license as cc:license=xxx addFeature(doc, "license=" + licenseUrl); // index license attributes extracted of the license url addUrlFeatures(doc, licenseUrl); } // index the license location as cc:meta=xxx String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION); if (licenseLocation != null) { addFeature(doc, "meta=" + licenseLocation); } // index the work type cc:type=xxx String workType = metadata.get(CreativeCommons.WORK_TYPE); if (workType != null) { addFeature(doc, workType); } return doc; }