org.apache.nutch.crawl.Inlinks Java Examples
The following examples show how to use
org.apache.nutch.crawl.Inlinks.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #2
Source File: URLMetaIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * {@link CrawlDatum} metadata into the {@link NutchDocument}. Each tag that
 * has a value in the datum is added as a field named after the tag.
 *
 * @param doc     document to enrich; returned untouched when {@code null}
 * @param parse   not used by this method
 * @param url     not used by this method
 * @param datum   source of the metadata values
 * @param inlinks not used by this method
 * @return {@code doc}, with one field added per tag found in the datum
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (conf != null) {
    this.setConf(conf);
  }

  // Nothing to copy without configured tags or without a document.
  if (urlMetaTags == null || doc == null) {
    return doc;
  }

  for (String tagName : urlMetaTags) {
    Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
    if (tagValue == null) {
      continue;
    }
    doc.add(tagName, tagValue.toString());
  }
  return doc;
}
Example #3
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Checks that with {@code moreIndexingFilter.indexMimeTypeParts} set to
 * {@code false} the filtered document carries exactly one {@code type}
 * value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(configuration);
  assertNotNull(indexingFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    indexingFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
Example #4
Source File: TestMoreIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Checks that with {@code moreIndexingFilter.indexMimeTypeParts} set to
 * {@code false} the filtered document carries exactly one {@code type}
 * value, {@code text/html}.
 *
 * @since NUTCH-901
 */
public void testNoParts() {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(configuration);
  assertNotNull(indexingFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  try {
    indexingFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue(document.getFieldNames().contains("type"));
  assertEquals(1, document.getField("type").getValues().size());
  assertEquals("text/html", document.getFieldValue("type"));
}
Example #5
Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #6
Source File: URLMetaIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Copies the metatags listed in the "urlmeta.tags" property from the
 * {@link CrawlDatum} metadata into the {@link NutchDocument}. Each tag that
 * has a value in the datum is added as a field named after the tag.
 *
 * @param doc     document to enrich; returned untouched when {@code null}
 * @param parse   not used by this method
 * @param url     not used by this method
 * @param datum   source of the metadata values
 * @param inlinks not used by this method
 * @return {@code doc}, with one field added per tag found in the datum
 * @see IndexingFilter#filter
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (conf != null) {
    this.setConf(conf);
  }

  // Nothing to copy without configured tags or without a document.
  if (urlMetaTags == null || doc == null) {
    return doc;
  }

  for (String tagName : urlMetaTags) {
    Text tagValue = (Text) datum.getMetaData().get(new Text(tagName));
    if (tagValue == null) {
      continue;
    }
    doc.add(tagName, tagValue.toString());
  }
  return doc;
}
Example #7
Source File: TestAnchorIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Feeds three inlinks, two of which share the anchor text {@code "text2"},
 * through an {@link AnchorIndexingFilter} configured with
 * {@code anchorIndexingFilter.deduplicate=true} and expects two
 * {@code anchor} values in the resulting document.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // Three sources, but only two distinct anchor texts.
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    anchorFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), incoming);
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all",
      document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2,
      document.getField("anchor").getValues().size());
}
Example #8
Source File: TestAnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Feeds three inlinks, two of which share the anchor text {@code "text2"},
 * through an {@link AnchorIndexingFilter} configured with
 * {@code anchorIndexingFilter.deduplicate=true} and expects two
 * {@code anchor} values in the resulting document.
 */
public void testDeduplicateAnchor() throws Exception {
  Configuration configuration = NutchConfiguration.create();
  configuration.setBoolean("anchorIndexingFilter.deduplicate", true);

  AnchorIndexingFilter anchorFilter = new AnchorIndexingFilter();
  anchorFilter.setConf(configuration);
  assertNotNull(anchorFilter);

  NutchDocument document = new NutchDocument();
  ParseImpl parsed = new ParseImpl("foo bar", new ParseData());

  // Three sources, but only two distinct anchor texts.
  Inlinks incoming = new Inlinks();
  incoming.add(new Inlink("http://test1.com/", "text1"));
  incoming.add(new Inlink("http://test2.com/", "text2"));
  incoming.add(new Inlink("http://test3.com/", "text2"));

  try {
    anchorFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), incoming);
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertTrue("test if there is an anchor at all",
      document.getFieldNames().contains("anchor"));
  assertEquals("test dedup, we expect 2", 2,
      document.getField("anchor").getValues().size());
}
Example #9
Source File: TLDScoringFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Scales the initial score by the boost of every value in the document's
 * {@code tld} field that has an entry in {@code tldEntries}.
 *
 * @param url        not used by this method
 * @param doc        document whose {@code tld} field is read
 * @param dbDatum    not used by this method
 * @param fetchDatum not used by this method
 * @param parse      not used by this method
 * @param inlinks    not used by this method
 * @param initScore  score to be scaled
 * @return {@code initScore} multiplied by the product of the matching
 *         boosts; the multiplier stays {@code 1.0f} when the field is
 *         absent or no value matches
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float boost = 1.0f;

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore * boost;
  }

  for (Object tldValue : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(tldValue.toString());
    if (suffix == null) {
      continue;
    }
    boost *= suffix.getBoost();
  }
  return initScore * boost;
}
Example #10
Source File: TLDScoringFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Scales the initial score by the boost of every value in the document's
 * {@code tld} field that has an entry in {@code tldEntries}.
 *
 * @param url        not used by this method
 * @param doc        document whose {@code tld} field is read
 * @param dbDatum    not used by this method
 * @param fetchDatum not used by this method
 * @param parse      not used by this method
 * @param inlinks    not used by this method
 * @param initScore  score to be scaled
 * @return {@code initScore} multiplied by the product of the matching
 *         boosts; the multiplier stays {@code 1.0f} when the field is
 *         absent or no value matches
 */
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
    throws ScoringFilterException {

  float boost = 1.0f;

  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore * boost;
  }

  for (Object tldValue : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(tldValue.toString());
    if (suffix == null) {
      continue;
    }
    boost *= suffix.getBoost();
  }
  return initScore * boost;
}
Example #11
Source File: TLDIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Adds a {@code tld} field holding the domain of the URL's
 * {@link DomainSuffix}, as resolved by {@code URLUtil.getDomainSuffix}.
 *
 * <p>Any exception raised while parsing the URL or resolving the suffix is
 * logged at warn level and otherwise ignored, so the document is always
 * returned.
 *
 * @param doc     document receiving the {@code tld} field
 * @param parse   not used by this method
 * @param urlText URL of the document being indexed
 * @param datum   not used by this method
 * @param inlinks not used by this method
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  try {
    DomainSuffix suffix = URLUtil.getDomainSuffix(new URL(urlText.toString()));
    doc.add("tld", suffix.getDomain());
  } catch (Exception ex) {
    // Best effort: a failure here must not stop indexing of the document.
    LOG.warn(ex.toString());
  }
  return doc;
}
Example #12
Source File: RelTagIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // Check if some Rel-Tags found, possibly put there by RelTagParser String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG); if (tags != null) { for (int i=0; i<tags.length; i++) { doc.add("tag", tags[i]); } } return doc; }
Example #13
Source File: StaticFieldIndexer.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Adds the statically configured fields to the document when
 * {@code addStaticFields} is enabled. See {@code index.static} in
 * nutch-default.xml.
 *
 * <p>Every key of {@code fields} becomes a field name, and each of its
 * values is added under that name.
 *
 * @param doc     the {@link NutchDocument} receiving the fields
 * @param parse   not used by this method
 * @param url     not used by this method
 * @param datum   not used by this method
 * @param inlinks not used by this method
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  if (!this.addStaticFields) {
    return doc;
  }

  for (Entry<String, String[]> field : this.fields.entrySet()) {
    String fieldName = field.getKey();
    for (String fieldValue : field.getValue()) {
      doc.add(fieldName, fieldValue);
    }
  }
  return doc;
}
Example #14
Source File: AbstractIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Logs the invocation and delegates to {@code filterInternal} unless the
 * document is {@code null}, in which case the call is skipped and
 * {@code null} is returned.
 *
 * @return the result of {@code filterInternal}, or {@code null} when
 *         {@code doc} is {@code null}
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  LOG.debug("Invoking indexer {} for url: {}", this.getClass().getName(), url);

  if (doc != null) {
    return filterInternal(doc, parse, url, datum, inlinks);
  }

  LOG.debug("Skipped as NutchDocument doc is null");
  return null;
}
Example #15
Source File: S2jhDiscardIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Keeps the document only when its URL contains a match for
 * {@code keepIndexPattern}; otherwise the document is discarded.
 *
 * @return {@code doc} when the URL matches, {@code null} otherwise
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) {

  boolean keep = keepIndexPattern.matcher(url.toString()).find();
  if (keep) {
    return doc;
  }

  LOG.debug("Cancel index for {} as not match regex [{}]", url, keepIndexPattern);
  return null;
}
Example #16
Source File: S2jhIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Indexes the {@code sku} and {@code price} parse metadata values.
 * Documents whose {@code sku} is blank are discarded.
 *
 * @return {@code doc} with the two fields added, or {@code null} when the
 *         {@code sku} is blank
 */
@Override
public NutchDocument filterInternal(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) {

  ParseData data = parse.getData();
  String sku = data.getMeta("sku");

  if (!StringUtils.isBlank(sku)) {
    doc.add("sku", sku);
    doc.add("price", data.getMeta("price"));
    return doc;
  }

  // No SKU: drop the document from the index.
  return null;
}
Example #17
Source File: TestStaticFieldIndexerTest.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Creates fresh fixtures: a Nutch configuration, an empty parse, the URL
 * {@code http://nutch.apache.org/index.html}, an empty crawl datum, empty
 * inlinks and a new {@link StaticFieldIndexer}.
 */
protected void setUp() throws Exception {
  // Environment
  conf = NutchConfiguration.create();

  // Arguments later handed to the filter
  parse = new ParseImpl();
  url = new Text("http://nutch.apache.org/index.html");
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();

  // Object under test
  filter = new StaticFieldIndexer();
}
Example #18
Source File: IndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** Run all defined filters. */ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { for (int i = 0; i < this.indexingFilters.length; i++) { doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks); // break the loop if an indexing filter discards the doc if (doc == null) return null; } return doc; }
Example #19
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Applies this filter's helpers to the document in a fixed order:
 * {@code addTime}, {@code addLength}, {@code addType}, then
 * {@code resetTitle}.
 *
 * @param doc     document handed to each helper
 * @param parse   parse result; its data is handed to each helper
 * @param url     document URL, passed on in string form
 * @param datum   crawl datum, passed to {@code addTime} and {@code addType}
 * @param inlinks not used by this method
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}
Example #20
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Checks that a {@code Content-Disposition} style metadata value of
 * {@code filename=filename.ext} makes {@link MoreIndexingFilter} index
 * {@code filename.ext} as the document title.
 */
public void testContentDispositionTitle() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();

  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(configuration);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], meta);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument filtered = indexingFilter.filter(
      new NutchDocument(),
      parsed,
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals("content-disposition not detected", "filename.ext",
      filtered.getFieldValue("title"));
}
Example #21
Source File: TestMoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Runs {@link MoreIndexingFilter} over a document whose metadata carries
 * the given content type and asserts the resulting {@code type} field.
 *
 * @param conf     configuration handed to the filter
 * @param source   value stored under {@code Response.CONTENT_TYPE}
 * @param expected expected value of the {@code type} field
 */
private void assertContentType(Configuration conf, String source, String expected)
    throws IndexingException {

  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_TYPE, source);

  MoreIndexingFilter indexingFilter = new MoreIndexingFilter();
  indexingFilter.setConf(conf);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], meta);
  ParseImpl parsed = new ParseImpl("text", parseData);

  NutchDocument filtered = indexingFilter.filter(
      new NutchDocument(),
      parsed,
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertEquals("mime type not detected", expected, filtered.getFieldValue("type"));
}
Example #22
Source File: TestBasicIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Runs {@link BasicIndexingFilter} with a title limit of 10, a content
 * limit of 20 and domain indexing enabled, then checks the {@code title},
 * {@code domain}, {@code host}, {@code url}, {@code content} and
 * {@code tstamp} fields of the resulting document.
 */
public void testBasicIndexingFilter() throws Exception {
  // Configuration: truncate title and content, index the domain.
  Configuration configuration = NutchConfiguration.create();
  configuration.setInt("indexer.max.title.length", 10);
  configuration.setBoolean("indexer.add.domain", true);
  configuration.setInt("indexer.max.content.length", 20);

  BasicIndexingFilter basicFilter = new BasicIndexingFilter();
  basicFilter.setConf(configuration);
  assertNotNull(basicFilter);

  // Input document and parse.
  NutchDocument document = new NutchDocument();

  String pageTitle = "The Foo Page";
  Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
  Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, pageTitle, links, meta);
  ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);

  CrawlDatum fetched = new CrawlDatum();
  fetched.setFetchTime(100L);

  Inlinks incoming = new Inlinks();

  try {
    basicFilter.filter(document, parsed,
        new Text("http://nutch.apache.org/index.html"), fetched, incoming);
  } catch (Exception e) {
    // Any exception from the filter is reported as a test failure.
    e.printStackTrace();
    fail(e.getMessage());
  }

  assertNotNull(document);
  assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa",
      document.getField("title").getValues().get(0));
  assertEquals("test domain, expect \"apache.org\"", "apache.org",
      document.getField("domain").getValues().get(0));
  assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org",
      document.getField("host").getValues().get(0));
  assertEquals("test url, expect \"http://nutch.apache.org/index.html\"",
      "http://nutch.apache.org/index.html",
      document.getField("url").getValues().get(0));
  assertEquals("test content", "this is a sample foo",
      document.getField("content").getValues().get(0));
  assertEquals("test fetch time", new Date(100L),
      (Date) document.getField("tstamp").getValues().get(0));
}
Example #23
Source File: AnchorIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** * The {@link AnchorIndexingFilter} filter object which supports boolean * configuration settings for the deduplication of anchors. * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. * * @param doc The {@link NutchDocument} object * @param parse The relevant {@link Parse} object passing through the filter * @param url URL to be filtered for anchor text * @param datum The {@link CrawlDatum} entry * @param inlinks The {@link Inlinks} containing anchor text * @return filtered NutchDocument */ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]); HashSet<String> set = null; for (int i = 0; i < anchors.length; i++) { if (deduplicate) { if (set == null) set = new HashSet<String>(); String lcAnchor = anchors[i].toLowerCase(); // Check if already processed the current anchor if (!set.contains(lcAnchor)) { doc.add("anchor", anchors[i]); // Add to map set.add(lcAnchor); } } else { doc.add("anchor", anchors[i]); } } return doc; }
Example #24
Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); // index the license String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); if (licenseUrl != null) { if (LOG.isInfoEnabled()) { LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); } // add the entire license as cc:license=xxx addFeature(doc, "license=" + licenseUrl); // index license attributes extracted of the license url addUrlFeatures(doc, licenseUrl); } // index the license location as cc:meta=xxx String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION); if (licenseLocation != null) { addFeature(doc, "meta=" + licenseLocation); } // index the work type cc:type=xxx String workType = metadata.get(CreativeCommons.WORK_TYPE); if (workType != null) { addFeature(doc, workType); } return doc; }
Example #25
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Tests the behaviour when a filter named in the filter order does not
 * exist: the order lists {@code NonExistingFilter} ahead of the basic
 * indexing filter, and filtering is expected to complete without throwing.
 *
 * @throws IndexingException if filtering fails
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");

  String missingFilter = "NonExistingFilter";
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  configuration.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filterChain = new IndexingFilters(configuration);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  filterChain.filter(
      new NutchDocument(),
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}
Example #26
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Tests the behaviour when the NutchDocument passed in is {@code null}:
 * the filter chain is expected to return {@code null}.
 */
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");

  IndexingFilters filterChain = new IndexingFilters(configuration);

  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  NutchDocument filtered = filterChain.filter(
      null,
      new ParseImpl("text", parseData),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());

  assertNull(filtered);
}
Example #27
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** * Test behaviour when reset the index filter order will not take effect * * @throws IndexingException */ public void testFilterCacheIndexingFilter() throws IndexingException{ Configuration conf = NutchConfiguration.create(); conf.addResource("nutch-default.xml"); conf.addResource("crawl-tests.xml"); String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter"; conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1); IndexingFilters filters1 = new IndexingFilters(conf); NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData( new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"), new CrawlDatum(),new Inlinks()); // add another index filter String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer"; // set content metadata Metadata md = new Metadata(); md.add("example","data"); // set content metadata property defined in MetadataIndexer conf.set("index.content.md","example"); // add MetadataIndxer filter conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2); IndexingFilters filters2 = new IndexingFilters(conf); NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData( new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"), new CrawlDatum(),new Inlinks()); assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size()); }
Example #28
Source File: LanguageDetectionFilter.java From language-detection with Apache License 2.0 | 5 votes |
/**
 * {@inheritDoc}
 *
 * <p>Adds a {@code lang} field. The language is read from the parse
 * metadata ({@code Metadata.LANGUAGE}); when absent it is detected from the
 * title and text of the parse. A {@code null} result is indexed as
 * {@code "unknown"}.
 *
 * @throws IndexingException if the filter has no configuration, if its
 *         initialization failed, or if language detection fails
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  // Refuse to run when setup never happened or went wrong.
  if (conf == null) {
    throw new IndexingException("Not Yet Initialization.");
  }
  if (cause != null) {
    throw new IndexingException("Initialization Failed.", cause);
  }

  String language = parse.getData().getParseMeta().get(Metadata.LANGUAGE);

  if (language == null) {
    // No language in the metadata: detect it from title + body text.
    String sample = parse.getData().getTitle() + " " + parse.getText();
    try {
      Detector detector = DetectorFactory.create();
      detector.setMaxTextLength(textsize_upper_limit);
      detector.append(sample);
      language = detector.detect();
    } catch (LangDetectException e) {
      throw new IndexingException("Detection failed.", e);
    }
  }

  doc.add("lang", language == null ? "unknown" : language);
  return doc;
}
Example #29
Source File: IndexingFilters.java From anthelion with Apache License 2.0 | 5 votes |
/** Run all defined filters. */ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { for (int i = 0; i < this.indexingFilters.length; i++) { doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks); // break the loop if an indexing filter discards the doc if (doc == null) return null; } return doc; }
Example #30
Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Applies this filter's helpers to the document in a fixed order:
 * {@code addTime}, {@code addLength}, {@code addType}, then
 * {@code resetTitle}.
 *
 * @param doc     document handed to each helper
 * @param parse   parse result; its data is handed to each helper
 * @param url     document URL, passed on in string form
 * @param datum   crawl datum, passed to {@code addTime} and {@code addType}
 * @param inlinks not used by this method
 * @return the same {@code doc} instance
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
    CrawlDatum datum, Inlinks inlinks) throws IndexingException {

  final String urlString = url.toString();

  addTime(doc, parse.getData(), urlString, datum);
  addLength(doc, parse.getData(), urlString);
  addType(doc, parse.getData(), urlString, datum);
  resetTitle(doc, parse.getData(), urlString);

  return doc;
}