org.apache.nutch.metadata.Metadata Java Examples
The following examples show how to use
org.apache.nutch.metadata.Metadata.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MoreIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) { long time = -1; String lastModified = data.getMeta(Metadata.LAST_MODIFIED); if (lastModified != null) { // try parse last-modified time = getTime(lastModified,url); // use as time // store as string doc.add("lastModified", new Date(time)); } if (time == -1) { // if no last-modified specified in HTTP header time = datum.getModifiedTime(); // use value in CrawlDatum if (time <= 0) { // if also unset time = datum.getFetchTime(); // use time the fetch took place (fetchTime of fetchDatum) } } // un-stored, indexed and un-tokenized doc.add("date", new Date(time)); return doc; }
Example #2
Source File: RelTagParser.java From anthelion with Apache License 2.0 | 6 votes |
/** * Scan the HTML document looking at possible rel-tags */ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { // get parse obj Parse parse = parseResult.get(content.getUrl()); // Trying to find the document's rel-tags Parser parser = new Parser(doc); Set tags = parser.getRelTags(); Iterator iter = tags.iterator(); Metadata metadata = parse.getData().getParseMeta(); while (iter.hasNext()) { metadata.add(REL_TAG, (String) iter.next()); } return parseResult; }
Example #3
Source File: HtmlParser.java From anthelion with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception { //LOG.setLevel(Level.FINE); String name = args[0]; String url = "file:"+name; File file = new File(name); byte[] bytes = new byte[(int)file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); Configuration conf = NutchConfiguration.create(); HtmlParser parser = new HtmlParser(); parser.setConf(conf); Parse parse = parser.getParse( new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url); System.out.println("data: "+parse.getData()); System.out.println("text: "+parse.getText()); }
Example #4
Source File: WdcParser.java From anthelion with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception { // LOG.setLevel(Level.FINE); String name = args[0]; String url = "file:" + name; File file = new File(name); byte[] bytes = new byte[(int) file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); Configuration conf = NutchConfiguration.create(); WdcParser parser = new WdcParser(); parser.setConf(conf); Parse parse = parser.getParse(new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url); System.out.println("data: " + parse.getData()); System.out.println("text: " + parse.getText()); String contains = parse.getData().getMeta(META_CONTAINS_SEM); System.out.println("contains: " + contains); }
Example #5
Source File: MoreIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
private NutchDocument addTime(NutchDocument doc, ParseData data, String url, CrawlDatum datum) { long time = -1; String lastModified = data.getMeta(Metadata.LAST_MODIFIED); if (lastModified != null) { // try parse last-modified time = getTime(lastModified,url); // use as time // store as string doc.add("lastModified", new Date(time)); } if (time == -1) { // if no last-modified time = datum.getFetchTime(); // use fetch time } // un-stored, indexed and un-tokenized doc.add("date", new Date(time)); return doc; }
Example #6
Source File: FeedParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Runs a command line version of this {@link Parser}.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);

  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Always release the file handle, even when reading fails.
    in.close();
  }

  ParseResult parseResult = parser.getParse(
      new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
Example #7
Source File: HtmlParser.java From anthelion with Apache License 2.0 | 6 votes |
public static void main(String[] args) throws Exception { //LOG.setLevel(Level.FINE); String name = args[0]; String url = "file:"+name; File file = new File(name); byte[] bytes = new byte[(int)file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); Configuration conf = NutchConfiguration.create(); HtmlParser parser = new HtmlParser(); parser.setConf(conf); Parse parse = parser.getParse( new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url); System.out.println("data: "+parse.getData()); System.out.println("text: "+parse.getText()); }
Example #8
Source File: FeedParser.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Runs a command line version of this {@link Parser}.
 *
 * @param args
 *          A single argument (expected at arg[0]) representing a path on the
 *          local filesystem that points to a feed file.
 *
 * @throws Exception
 *           If any error occurs.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("Usage: FeedParser <feed>");
    System.exit(1);
  }
  String name = args[0];
  String url = "file:" + name;
  Configuration conf = NutchConfiguration.create();
  FeedParser parser = new FeedParser();
  parser.setConf(conf);

  File file = new File(name);
  byte[] bytes = new byte[(int) file.length()];
  DataInputStream in = new DataInputStream(new FileInputStream(file));
  try {
    in.readFully(bytes);
  } finally {
    // Always release the file handle, even when reading fails.
    in.close();
  }

  ParseResult parseResult = parser.getParse(
      new Content(url, url, bytes, "application/rss+xml", new Metadata(), conf));
  for (Entry<Text, Parse> entry : parseResult) {
    System.out.println("key: " + entry.getKey());
    Parse parse = entry.getValue();
    System.out.println("data: " + parse.getData());
    System.out.println("text: " + parse.getText() + "\n");
  }
}
Example #9
Source File: LanguageIndexingFilter.java From anthelion with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #10
Source File: TestHTMLLanguageParser.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Verifies that the language stored in the parse metadata of each test
 * document equals the corresponding entry of {@code metalanguages}.
 */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());

    // Walk the test documents and validate the detected language of each.
    int index = 0;
    while (index < docs.length) {
      Content content = getContent(docs[index]);
      Parse parse = parseUtil.parse(content).get(content.getUrl());
      assertEquals(metalanguages[index],
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
      index++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
Example #11
Source File: SWFParser.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Parses a local SWF file and prints the parse text and parse data.
 *
 * Arguments are: 0. Name of input SWF file.
 *
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  FileInputStream in = new FileInputStream(args[0]);
  byte[] buf;
  try {
    buf = new byte[in.available()];
    // A single read() may return fewer bytes than requested, so keep
    // reading until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < buf.length) {
      int count = in.read(buf, filled, buf.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    // Always release the file handle, even when reading fails.
    in.close();
  }

  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
      "file:" + args[0], buf, "application/x-shockwave-flash", new Metadata(),
      NutchConfiguration.create()));
  Parse p = parseResult.get("file:" + args[0]);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
Example #12
Source File: TestMetatagParser.java From anthelion with Apache License 2.0 | 6 votes |
public void testIt() { Configuration conf = NutchConfiguration.create(); String urlString = "file:" + sampleDir + fileSeparator + sampleFile; try { Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); // check that we get the same values Metadata parseMeta = parse.getData().getParseMeta(); assertEquals(description, parseMeta.get("metatag.description")); assertEquals(keywords, parseMeta.get("metatag.keywords")); } catch (Exception e) { e.printStackTrace(); fail(e.toString()); } }
Example #13
Source File: LanguageIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { lang = "unknown"; } doc.add("lang", lang); return doc; }
Example #14
Source File: TestParseData.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Builds a {@link ParseData} with a title, two outlinks and two metadata
 * entries and hands it to {@code WritableTestUtils.testWritable}.
 *
 * @throws Exception if the writable check fails
 */
public void testParseData() throws Exception {
  Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  meta.add("Charset", "UTF-8");

  Outlink[] links = {
      new Outlink("http://foo.com/", "Foo"),
      new Outlink("http://bar.com/", "Bar")
  };

  ParseData parseData =
      new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, meta);
  WritableTestUtils.testWritable(parseData, null);
}
Example #15
Source File: TestHTMLLanguageParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Verifies that the language stored in the parse metadata of each test
 * document equals the corresponding entry of {@code metalanguages}.
 */
public void testMetaHTMLParsing() {
  try {
    ParseUtil parseUtil = new ParseUtil(NutchConfiguration.create());

    // Walk the test documents and validate the detected language of each.
    int index = 0;
    while (index < docs.length) {
      Content content = getContent(docs[index]);
      Parse parse = parseUtil.parse(content).get(content.getUrl());
      assertEquals(metalanguages[index],
          (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
      index++;
    }
  } catch (Exception e) {
    e.printStackTrace(System.out);
    fail(e.toString());
  }
}
Example #16
Source File: SWFParser.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Parses a local SWF file and prints the parse text and parse data.
 *
 * Arguments are: 0. Name of input SWF file.
 *
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
  FileInputStream in = new FileInputStream(args[0]);
  byte[] buf;
  try {
    buf = new byte[in.available()];
    // A single read() may return fewer bytes than requested, so keep
    // reading until the buffer is full or the stream ends.
    int filled = 0;
    while (filled < buf.length) {
      int count = in.read(buf, filled, buf.length - filled);
      if (count < 0) {
        break;
      }
      filled += count;
    }
  } finally {
    // Close in finally so the handle is released even when reading fails.
    in.close();
  }

  SWFParser parser = new SWFParser();
  ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
      "file:" + args[0], buf, "application/x-shockwave-flash", new Metadata(),
      NutchConfiguration.create()));
  Parse p = parseResult.get("file:" + args[0]);
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
Example #17
Source File: TestParseData.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
/**
 * Builds a {@link ParseData} with a title, two outlinks and two metadata
 * entries and hands it to {@code WritableTestUtils.testWritable}.
 *
 * @throws Exception if the writable check fails
 */
public void testParseData() throws Exception {
  Metadata meta = new Metadata();
  meta.add("Language", "en/us");
  meta.add("Charset", "UTF-8");

  Outlink[] links = {
      new Outlink("http://foo.com/", "Foo"),
      new Outlink("http://bar.com/", "Bar")
  };

  ParseData parseData =
      new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, meta);
  WritableTestUtils.testWritable(parseData, null);
}
Example #18
Source File: HttpAuthenticationFactory.java From nutch-htmlunit with Apache License 2.0 | 6 votes |
public HttpAuthentication findAuthentication(Metadata header) { if (header == null) return null; try { Collection<String> challenge = new ArrayList<String>(); challenge.add(header.get(WWW_AUTHENTICATE)); for(String challengeString: challenge) { if (challengeString.equals("NTLM")) challengeString="Basic realm=techweb"; if (LOG.isTraceEnabled()) LOG.trace("Checking challengeString=" + challengeString); HttpAuthentication auth = HttpBasicAuthentication.getAuthentication(challengeString, conf); if (auth != null) return auth; //TODO Add additional Authentication lookups here } } catch (Exception e) { LOG.error("Error: ", e); } return null; }
Example #19
Source File: Content.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Creates a content record.
 *
 * @param url         URL of the content; must not be {@code null}
 * @param base        base URL; must not be {@code null}
 * @param content     raw bytes; must not be {@code null}
 * @param contentType content type hint passed to {@code getContentType}
 * @param metadata    metadata of the content; must not be {@code null}
 * @param conf        configuration used to create the {@link MimeUtil}
 * @throws IllegalArgumentException if url, base, content or metadata is
 *         {@code null}
 */
public Content(String url, String base, byte[] content, String contentType,
    Metadata metadata, Configuration conf) {

  // Reject missing mandatory arguments up front.
  if (url == null) {
    throw new IllegalArgumentException("null url");
  }
  if (base == null) {
    throw new IllegalArgumentException("null base");
  }
  if (content == null) {
    throw new IllegalArgumentException("null content");
  }
  if (metadata == null) {
    throw new IllegalArgumentException("null metadata");
  }

  this.metadata = metadata;
  this.content = content;
  this.base = base;
  this.url = url;

  // mimeTypes is assigned before getContentType is called, as in the
  // original statement order.
  this.mimeTypes = new MimeUtil(conf);
  this.contentType = getContentType(contentType, url, content);
}
Example #20
Source File: TestCCParseFilter.java From anthelion with Apache License 2.0 | 6 votes |
/**
 * Parses the given HTML file and checks the Creative Commons entries of the
 * resulting parse metadata.
 *
 * @param file     HTML file to read
 * @param url      URL under which the content is parsed
 * @param license  expected value of {@code License-Url}
 * @param location expected value of {@code License-Location}
 * @param type     expected value of {@code Work-Type}
 * @throws Exception if reading or parsing the file fails
 */
public void pageTest(File file, String url, String license, String location,
    String type) throws Exception {
  String contentType = "text/html";

  ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
  InputStream in = new FileInputStream(file);
  try {
    byte[] buffer = new byte[1024];
    int i;
    while ((i = in.read(buffer)) != -1) {
      out.write(buffer, 0, i);
    }
  } finally {
    // Always release the file handle, even when reading fails.
    in.close();
  }
  byte[] bytes = out.toByteArray();

  Configuration conf = NutchConfiguration.create();
  Content content =
      new Content(url, url, bytes, contentType, new Metadata(), conf);
  Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

  Metadata metadata = parse.getData().getParseMeta();
  assertEquals(license, metadata.get("License-Url"));
  assertEquals(location, metadata.get("License-Location"));
  assertEquals(type, metadata.get("Work-Type"));
}
Example #21
Source File: TestMetatagParser.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Fetches and parses a file from the sample directory and returns its parse
 * metadata. Any exception is printed and reported through {@code fail}.
 *
 * @param fileName name of the file inside {@code sampleDir}
 * @param conf     configuration used for the protocol and the parser
 * @return the parse metadata of the file
 */
public Metadata parseMeta(String fileName, Configuration conf) {
  Metadata result = null;
  try {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;

    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(urlString);
    Content content = protocol
        .getProtocolOutput(new Text(urlString), new CrawlDatum())
        .getContent();

    ParseUtil parseUtil = new ParseUtil(conf);
    Parse parse = parseUtil.parse(content).get(content.getUrl());
    result = parse.getData().getParseMeta();
  } catch (Exception e) {
    e.printStackTrace();
    fail(e.toString());
  }
  return result;
}
Example #22
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/** * Test behaviour when reset the index filter order will not take effect * * @throws IndexingException */ public void testFilterCacheIndexingFilter() throws IndexingException{ Configuration conf = NutchConfiguration.create(); conf.addResource("nutch-default.xml"); conf.addResource("crawl-tests.xml"); String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter"; conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1); IndexingFilters filters1 = new IndexingFilters(conf); NutchDocument fdoc1 = filters1.filter(new NutchDocument(),new ParseImpl("text",new ParseData( new ParseStatus(),"title",new Outlink[0],new Metadata())),new Text("http://www.example.com/"), new CrawlDatum(),new Inlinks()); // add another index filter String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer"; // set content metadata Metadata md = new Metadata(); md.add("example","data"); // set content metadata property defined in MetadataIndexer conf.set("index.content.md","example"); // add MetadataIndxer filter conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2); IndexingFilters filters2 = new IndexingFilters(conf); NutchDocument fdoc2 = filters2.filter(new NutchDocument(),new ParseImpl("text",new ParseData( new ParseStatus(),"title",new Outlink[0],md)),new Text("http://www.example.com/"), new CrawlDatum(),new Inlinks()); assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size()); }
Example #23
Source File: ParseData.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Creates parse data from its five components; each argument is stored in
 * the field of the same name without copying.
 *
 * @param status      status of the parse
 * @param title       title of the page
 * @param outlinks    outlinks found on the page
 * @param contentMeta metadata of the content
 * @param parseMeta   metadata produced by the parse
 */
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
    Metadata contentMeta, Metadata parseMeta) {
  // Metadata first, then the remaining fields; the assignments are independent.
  this.parseMeta = parseMeta;
  this.contentMeta = contentMeta;
  this.outlinks = outlinks;
  this.title = title;
  this.status = status;
}
Example #24
Source File: ParseSegment.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Checks whether the content of a page is truncated by comparing the
 * Content-Length metadata entry with the number of bytes actually held.
 *
 * @param content the content to inspect
 * @return <code>true</code> if the declared length is larger than the
 *         actual length; <code>false</code> if it is not, or if this could
 *         not be determined (no bytes, no metadata, missing or malformed
 *         Content-Length).
 */
public static boolean isTruncated(Content content) {
  final byte[] payload = content.getContent();
  if (payload == null) {
    return false;
  }

  final Metadata headers = content.getMetadata();
  if (headers == null) {
    return false;
  }

  String declared = headers.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  final String url = content.getUrl();
  final int inHeaderSize;
  try {
    inHeaderSize = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  final int actualSize = payload.length;
  final boolean truncated = inHeaderSize > actualSize;
  if (truncated) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
  } else if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
  }
  return truncated;
}
Example #25
Source File: TestIndexingFilters.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Runs the indexing filters with an order that names a filter which does
 * not exist, followed by the basic indexing filter.
 *
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(
      new NutchDocument(),
      new ParseImpl("text",
          new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}
Example #26
Source File: CCIndexingFilter.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { Metadata metadata = parse.getData().getParseMeta(); // index the license String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL); if (licenseUrl != null) { if (LOG.isInfoEnabled()) { LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString()); } // add the entire license as cc:license=xxx addFeature(doc, "license=" + licenseUrl); // index license attributes extracted of the license url addUrlFeatures(doc, licenseUrl); } // index the license location as cc:meta=xxx String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION); if (licenseLocation != null) { addFeature(doc, "meta=" + licenseLocation); } // index the work type cc:type=xxx String workType = metadata.get(CreativeCommons.WORK_TYPE); if (workType != null) { addFeature(doc, workType); } return doc; }
Example #27
Source File: ParseSegment.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Checks whether the content of a page is truncated by comparing the
 * Content-Length metadata entry with the number of bytes actually held.
 *
 * @param content the content to inspect
 * @return <code>true</code> if the declared length is larger than the
 *         actual length; <code>false</code> if it is not, or if this could
 *         not be determined (no bytes, no metadata, missing or malformed
 *         Content-Length).
 */
public static boolean isTruncated(Content content) {
  final byte[] payload = content.getContent();
  if (payload == null) {
    return false;
  }

  final Metadata headers = content.getMetadata();
  if (headers == null) {
    return false;
  }

  String declared = headers.get(Response.CONTENT_LENGTH);
  if (declared != null) {
    declared = declared.trim();
  }
  if (StringUtil.isEmpty(declared)) {
    return false;
  }

  final String url = content.getUrl();
  final int inHeaderSize;
  try {
    inHeaderSize = Integer.parseInt(declared);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }

  final int actualSize = payload.length;
  final boolean truncated = inHeaderSize > actualSize;
  if (truncated) {
    LOG.info(url + " skipped. Content of size " + inHeaderSize
        + " was truncated to " + actualSize);
  } else if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
  }
  return truncated;
}
Example #28
Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0 | 5 votes |
private static String getLanguageFromMetadata(Metadata meta) { if (meta == null) return null; // dublin core String lang = meta.get("dc.language"); if (lang != null) return lang; // meta content-language lang = meta.get("content-language"); if (lang != null) return lang; // lang attribute return meta.get("lang"); }
Example #29
Source File: HTMLLanguageParser.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Scan the HTML document looking at possible indications of content
 * language:
 * <ol>
 * <li>html lang attribute
 * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1)</li>
 * <li>meta dc.language
 * (http://dublincore.org/documents/2000/07/16/usageguide/qualified-html.shtml#language)</li>
 * <li>meta http-equiv (content-language)
 * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)</li>
 * </ol>
 *
 * <p>The {@code detect} and {@code identify} settings decide which strategy
 * runs, and in which order when both apply. When a language is found it is
 * stored in the parse metadata under {@code Metadata.LANGUAGE}.
 *
 * @return the {@code parseResult} that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
    HTMLMetaTags metaTags, DocumentFragment doc) {
  final Parse parse = parseResult.get(content.getUrl());
  String language = null;

  if (detect >= 0 && identify < 0) {
    // Only detection applies.
    language = detectLanguage(parse, doc);
  } else if (detect < 0 && identify >= 0) {
    // Only identification applies.
    language = identifyLanguage(parse);
  } else if (detect < identify) {
    // Detection first, identification as fallback.
    language = detectLanguage(parse, doc);
    if (language == null) {
      language = identifyLanguage(parse);
    }
  } else if (identify < detect) {
    // Identification first, detection as fallback.
    language = identifyLanguage(parse);
    if (language == null) {
      language = detectLanguage(parse, doc);
    }
  } else {
    LOG.warn("No configuration for language extraction policy is provided");
    return parseResult;
  }

  if (language != null) {
    parse.getData().getParseMeta().set(Metadata.LANGUAGE, language);
  }
  return parseResult;
}
Example #30
Source File: TestIndexingFilters.java From anthelion with Apache License 2.0 | 5 votes |
/**
 * Runs the indexing filters with an order that names a filter which does
 * not exist, followed by the basic indexing filter.
 *
 * @throws IndexingException
 */
public void testNonExistingIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();

  final String missingFilter = "NonExistingFilter";
  final String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, missingFilter + " " + basicFilter);

  IndexingFilters filters = new IndexingFilters(conf);
  filters.filter(
      new NutchDocument(),
      new ParseImpl("text",
          new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
      new Text("http://www.example.com/"),
      new CrawlDatum(),
      new Inlinks());
}