Java Code Examples for org.apache.nutch.metadata.Metadata#set()

The following examples show how to use org.apache.nutch.metadata.Metadata#set(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.
Example 1
Source File: ZipTextExtractor.java    From anthelion with Apache License 2.0 (4 votes)
/**
 * Extracts the text of the parseable files contained in a ZIP stream.
 *
 * <p>Every non-directory entry whose name contains a '.' is wrapped in a
 * {@code Content} (URL = {@code url + "/" + entryName}), parsed through
 * {@code ParseUtil}, and its text is appended to the result as
 * {@code "<entry name> <parsed text> "}. A copy of each outlink found by the
 * parser is appended to {@code outLinksList}. Entries that fail with a
 * {@code ParseException} are logged at info level and skipped.
 *
 * @param input        stream positioned at the start of a ZIP archive; it is
 *                     NOT closed by this method
 * @param url          URL of the archive, used as the prefix of each entry URL
 * @param outLinksList list that receives the outlinks of every parsed entry
 * @return the concatenated text of all successfully parsed entries, or an
 *         empty string if there are none
 * @throws IOException if the archive cannot be read or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();

  // Deliberately not closed: closing the ZipInputStream would also close the
  // caller-owned input stream.
  ZipInputStream zin = new ZipInputStream(input);
  byte[] chunk = new byte[8192];

  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }

    String fname = entry.getName();
    String newurl = url + "/" + fname;
    // Building the URL validates it; a malformed one surfaces as an IOException.
    String base = new URL(newurl).toString();

    // Only entries with a file extension are handed to the parser.
    if (fname.lastIndexOf('.') == -1) {
      continue;
    }

    // Read until end-of-entry instead of trusting entry.getSize(): the declared
    // size is -1 when unknown (which used to blow up the array allocation) and
    // may not fit in an int.
    // NOTE(review): the uncompressed size is unbounded here; consider a cap if
    // archives come from untrusted sources.
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    int read;
    while ((read = zin.read(chunk, 0, chunk.length)) != -1) {
      buffer.write(chunk, 0, read);
    }
    byte[] b = buffer.toByteArray();

    // Trying to resolve the Mime-Type
    String contentType = MIME.getMimeType(fname);
    try {
      Metadata metadata = new Metadata();
      // Report the number of bytes actually read, not the header value.
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      ParseData theParseData = parse.getData();

      for (Outlink link : theParseData.getOutlinks()) {
        outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
      }

      resultText.append(fname).append(' ').append(parse.getText()).append(' ');
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }

  return resultText.toString();
}
 
Example 2
Source File: TestEncodingDetector.java    From anthelion with Apache License 2.0 (4 votes)
/**
 * Checks {@code EncodingDetector#guessEncoding} in four situations: no clues
 * at all, a charset in the Content-Type header, a manually added clue, and
 * finally with the minimum-confidence threshold raised to 50.
 */
public void testGuessing() {
  final String pageUrl = "http://www.example.com";
  final String baseUrl = "http://www.example.com/";
  final String fallback = "windows-1252";

  // Start with the confidence threshold at -1 (auto detection disabled).
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);
  Metadata headers = new Metadata();

  // 1) Nothing to go on: the supplied default must come back.
  Content page = new Content(pageUrl, baseUrl, contentInOctets, "text/plain", headers, conf);
  EncodingDetector guesser = new EncodingDetector(conf);
  guesser.autoDetectClues(page, true);
  String guessed = guesser.guessEncoding(page, fallback);
  assertEquals("windows-1252", guessed.toLowerCase());

  // 2) Charset declared in the Content-Type header.
  headers.clear();
  headers.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
  page = new Content(pageUrl, baseUrl, contentInOctets, "text/plain", headers, conf);
  guesser = new EncodingDetector(conf);
  guesser.autoDetectClues(page, true);
  guessed = guesser.guessEncoding(page, fallback);
  assertEquals("utf-16", guessed.toLowerCase());

  // 3) No header, but a clue added by hand after collecting the automatic ones.
  headers.clear();
  page = new Content(pageUrl, baseUrl, contentInOctets, "text/plain", headers, conf);
  guesser = new EncodingDetector(conf);
  guesser.autoDetectClues(page, true);
  guesser.addClue("windows-1254", "sniffed");
  guessed = guesser.guessEncoding(page, fallback);
  assertEquals("windows-1254", guessed.toLowerCase());

  // 4) Threshold raised to 50 (auto detection enabled): utf-8 is expected even
  //    though both the header and the added clue name other charsets.
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
  headers.clear();
  headers.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
  page = new Content(pageUrl, baseUrl, contentInOctets, "text/plain", headers, conf);
  guesser = new EncodingDetector(conf);
  guesser.autoDetectClues(page, true);
  guesser.addClue("utf-32", "sniffed");
  guessed = guesser.guessEncoding(page, fallback);
  assertEquals("utf-8", guessed.toLowerCase());
}
 
Example 3
Source File: ZipTextExtractor.java    From nutch-htmlunit with Apache License 2.0 (4 votes)
/**
 * Extracts the text of the parseable files contained in a ZIP stream.
 *
 * <p>Every non-directory entry whose name contains a '.' has its MIME type
 * detected from the file name via Tika, is wrapped in a {@code Content}
 * (URL = {@code url + "/" + entryName}) and parsed through {@code ParseUtil}.
 * The entry's text is appended to the result as
 * {@code "<entry name> <parsed text> "} and a copy of each outlink found by
 * the parser is appended to {@code outLinksList}. Entries that fail with a
 * {@code ParseException} are logged at info level and skipped.
 *
 * @param input        stream positioned at the start of a ZIP archive; it is
 *                     NOT closed by this method
 * @param url          URL of the archive, used as the prefix of each entry URL
 * @param outLinksList list that receives the outlinks of every parsed entry
 * @return the concatenated text of all successfully parsed entries, or an
 *         empty string if there are none
 * @throws IOException if the archive cannot be read or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();

  // Deliberately not closed: closing the ZipInputStream would also close the
  // caller-owned input stream.
  ZipInputStream zin = new ZipInputStream(input);
  byte[] chunk = new byte[8192];
  // One detector for the whole archive instead of a new one per entry.
  Tika tika = new Tika();

  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }

    String fname = entry.getName();
    String newurl = url + "/" + fname;
    // Building the URL validates it; a malformed one surfaces as an IOException.
    String base = new URL(newurl).toString();

    // Only entries with a file extension are handed to the parser.
    if (fname.lastIndexOf('.') == -1) {
      continue;
    }

    // Read until end-of-entry instead of trusting entry.getSize(): the declared
    // size is -1 when unknown (which used to blow up the array allocation) and
    // may not fit in an int.
    // NOTE(review): the uncompressed size is unbounded here; consider a cap if
    // archives come from untrusted sources.
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    int read;
    while ((read = zin.read(chunk, 0, chunk.length)) != -1) {
      buffer.write(chunk, 0, read);
    }
    byte[] b = buffer.toByteArray();

    // Trying to resolve the Mime-Type
    String contentType = tika.detect(fname);
    try {
      Metadata metadata = new Metadata();
      // Report the number of bytes actually read, not the header value.
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      ParseData theParseData = parse.getData();

      for (Outlink link : theParseData.getOutlinks()) {
        outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
      }

      resultText.append(fname).append(' ').append(parse.getText()).append(' ');
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }

  return resultText.toString();
}
 
Example 4
Source File: TestEncodingDetector.java    From nutch-htmlunit with Apache License 2.0 (4 votes)
/**
 * Verifies the encoding returned by {@code EncodingDetector#guessEncoding}
 * for the same content under different combinations of Content-Type header,
 * manually added clue and minimum-confidence setting.
 */
public void testGuessing() {
  // The threshold is first set to -1, i.e. auto detection is switched off.
  conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);

  // A single Metadata instance is reused and cleared between scenarios.
  Metadata meta = new Metadata();

  {
    // No header and no extra clue: the default passed in is returned.
    Content doc = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", meta, conf);
    EncodingDetector det = new EncodingDetector(conf);
    det.autoDetectClues(doc, true);
    String result = det.guessEncoding(doc, "windows-1252");
    assertEquals("windows-1252", result.toLowerCase());
  }

  {
    // The Content-Type header carries a charset, which wins over the default.
    meta.clear();
    meta.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    Content doc = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", meta, conf);
    EncodingDetector det = new EncodingDetector(conf);
    det.autoDetectClues(doc, true);
    String result = det.guessEncoding(doc, "windows-1252");
    assertEquals("utf-16", result.toLowerCase());
  }

  {
    // Header removed again; a "sniffed" clue is added explicitly instead.
    meta.clear();
    Content doc = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", meta, conf);
    EncodingDetector det = new EncodingDetector(conf);
    det.autoDetectClues(doc, true);
    det.addClue("windows-1254", "sniffed");
    String result = det.guessEncoding(doc, "windows-1252");
    assertEquals("windows-1254", result.toLowerCase());
  }

  {
    // Auto detection on (threshold 50): the test expects utf-8 regardless of
    // the UTF-16 header and the utf-32 clue.
    conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
    meta.clear();
    meta.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
    Content doc = new Content("http://www.example.com", "http://www.example.com/",
        contentInOctets, "text/plain", meta, conf);
    EncodingDetector det = new EncodingDetector(conf);
    det.autoDetectClues(doc, true);
    det.addClue("utf-32", "sniffed");
    String result = det.guessEncoding(doc, "windows-1252");
    assertEquals("utf-8", result.toLowerCase());
  }
}