Java Code Examples for org.apache.nutch.metadata.Metadata#set()
The following examples show how to use org.apache.nutch.metadata.Metadata#set().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ZipTextExtractor.java From anthelion with Apache License 2.0 | 4 votes |
public String extractText(InputStream input, String url, List outLinksList) throws IOException { String resultText = ""; byte temp; ZipInputStream zin = new ZipInputStream(input); ZipEntry entry; while ((entry = zin.getNextEntry()) != null) { if (!entry.isDirectory()) { int size = (int) entry.getSize(); byte[] b = new byte[size]; for(int x = 0; x < size; x++) { int err = zin.read(); if(err != -1) { b[x] = (byte)err; } } String newurl = url + "/"; String fname = entry.getName(); newurl += fname; URL aURL = new URL(newurl); String base = aURL.toString(); int i = fname.lastIndexOf('.'); if (i != -1) { // Trying to resolve the Mime-Type String contentType = MIME.getMimeType(fname); try { Metadata metadata = new Metadata(); metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize())); metadata.set(Response.CONTENT_TYPE, contentType); Content content = new Content(newurl, base, b, contentType, metadata, this.conf); Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl()); ParseData theParseData = parse.getData(); Outlink[] theOutlinks = theParseData.getOutlinks(); for(int count = 0; count < theOutlinks.length; count++) { outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor())); } resultText += entry.getName() + " " + parse.getText() + " "; } catch (ParseException e) { if (LOG.isInfoEnabled()) { LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage()); } } } } } return resultText; }
Example 2
Source File: TestEncodingDetector.java From anthelion with Apache License 2.0 | 4 votes |
public void testGuessing() { // first disable auto detection conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1); Metadata metadata = new Metadata(); EncodingDetector detector; Content content; String encoding; content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); encoding = detector.guessEncoding(content, "windows-1252"); // no information is available, so it should return default encoding assertEquals("windows-1252", encoding.toLowerCase()); metadata.clear(); metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16"); content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); encoding = detector.guessEncoding(content, "windows-1252"); assertEquals("utf-16", encoding.toLowerCase()); metadata.clear(); content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); detector.addClue("windows-1254", "sniffed"); encoding = detector.guessEncoding(content, "windows-1252"); assertEquals("windows-1254", encoding.toLowerCase()); // enable autodetection conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50); metadata.clear(); metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16"); content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); detector.addClue("utf-32", "sniffed"); encoding = detector.guessEncoding(content, "windows-1252"); assertEquals("utf-8", encoding.toLowerCase()); }
Example 3
Source File: ZipTextExtractor.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException { String resultText = ""; ZipInputStream zin = new ZipInputStream(input); ZipEntry entry; while ((entry = zin.getNextEntry()) != null) { if (!entry.isDirectory()) { int size = (int) entry.getSize(); byte[] b = new byte[size]; for(int x = 0; x < size; x++) { int err = zin.read(); if(err != -1) { b[x] = (byte)err; } } String newurl = url + "/"; String fname = entry.getName(); newurl += fname; URL aURL = new URL(newurl); String base = aURL.toString(); int i = fname.lastIndexOf('.'); if (i != -1) { // Trying to resolve the Mime-Type Tika tika = new Tika(); String contentType = tika.detect(fname); try { Metadata metadata = new Metadata(); metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize())); metadata.set(Response.CONTENT_TYPE, contentType); Content content = new Content(newurl, base, b, contentType, metadata, this.conf); Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl()); ParseData theParseData = parse.getData(); Outlink[] theOutlinks = theParseData.getOutlinks(); for(int count = 0; count < theOutlinks.length; count++) { outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor())); } resultText += entry.getName() + " " + parse.getText() + " "; } catch (ParseException e) { if (LOG.isInfoEnabled()) { LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage()); } } } } } return resultText; }
Example 4
Source File: TestEncodingDetector.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public void testGuessing() { // first disable auto detection conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1); Metadata metadata = new Metadata(); EncodingDetector detector; Content content; String encoding; content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); encoding = detector.guessEncoding(content, "windows-1252"); // no information is available, so it should return default encoding assertEquals("windows-1252", encoding.toLowerCase()); metadata.clear(); metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16"); content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); encoding = detector.guessEncoding(content, "windows-1252"); assertEquals("utf-16", encoding.toLowerCase()); metadata.clear(); content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); detector.addClue("windows-1254", "sniffed"); encoding = detector.guessEncoding(content, "windows-1252"); assertEquals("windows-1254", encoding.toLowerCase()); // enable autodetection conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50); metadata.clear(); metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16"); content = new Content("http://www.example.com", "http://www.example.com/", contentInOctets, "text/plain", metadata, conf); detector = new EncodingDetector(conf); detector.autoDetectClues(content, true); detector.addClue("utf-32", "sniffed"); encoding = detector.guessEncoding(content, "windows-1252"); assertEquals("utf-8", encoding.toLowerCase()); }