Java Code Examples for org.apache.nutch.parse.Parse#getText()

The following examples show how to use org.apache.nutch.parse.Parse#getText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: JSParseFilter.java    From anthelion with Apache License 2.0 6 votes vote down vote up
/**
 * Adds the outlinks collected by {@code walk} to the parse stored under the
 * content's URL.
 *
 * <p>If {@code walk} collects at least one outlink, a new {@code ParseData} is
 * constructed from the existing status, title, content metadata and parse
 * metadata, with the newly collected outlinks followed by the previously
 * recorded ones. It is put back into {@code parseResult} under the content's
 * URL together with the unchanged parse text. Otherwise {@code parseResult}
 * is not modified.
 *
 * @param content     supplies the URL key and the base URL handed to {@code walk}
 * @param parseResult the parse result that is read and, possibly, updated
 * @param metaTags    passed through to {@code walk}
 * @param doc         document fragment passed to {@code walk}
 * @return the {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse parse = parseResult.get(content.getUrl());

  String url = content.getBaseUrl();
  // Typed list (instead of a raw ArrayList) so that toArray needs no unchecked cast.
  ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
  walk(doc, parse, metaTags, url, outlinks);
  if (!outlinks.isEmpty()) {
    ParseData oldData = parse.getData();
    // Newly found links come first, the links already on the parse are appended.
    outlinks.addAll(Arrays.asList(oldData.getOutlinks()));
    Outlink[] newlinks = outlinks.toArray(new Outlink[outlinks.size()]);
    ParseData parseData = new ParseData(oldData.getStatus(), oldData.getTitle(), newlinks,
                                        oldData.getContentMeta(),
                                        oldData.getParseMeta());

    // replace original parse obj with new one
    parseResult.put(content.getUrl(), new ParseText(parse.getText()), parseData);
  }
  return parseResult;
}
 
Example 2
Source File: JSParseFilter.java    From nutch-htmlunit with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code walk} over the document fragment and, when it yields outlinks,
 * replaces the parse stored under the content's URL with one whose outlink
 * array holds the newly collected links followed by the existing ones.
 *
 * @param content     supplies the URL key and the base URL given to {@code walk}
 * @param parseResult the parse result to read from and write back to
 * @param metaTags    forwarded to {@code walk}
 * @param doc         document fragment forwarded to {@code walk}
 * @return the {@code parseResult} instance that was passed in
 */
public ParseResult filter(Content content, ParseResult parseResult,
  HTMLMetaTags metaTags, DocumentFragment doc) {

  Parse current = parseResult.get(content.getUrl());
  String baseUrl = content.getBaseUrl();

  ArrayList<Outlink> collected = new ArrayList<Outlink>();
  walk(doc, current, metaTags, baseUrl, collected);

  // Nothing was collected: leave the parse result as it is.
  if (collected.isEmpty()) {
    return parseResult;
  }

  Outlink[] existing = current.getData().getOutlinks();
  String pageTitle = current.getData().getTitle();
  for (Outlink link : existing) {
    collected.add(link);
  }

  ParseStatus parseStatus = current.getData().getStatus();
  String pageText = current.getText();
  Outlink[] merged = collected.toArray(new Outlink[collected.size()]);
  ParseData mergedData = new ParseData(
      parseStatus,
      pageTitle,
      merged,
      current.getData().getContentMeta(),
      current.getData().getParseMeta());

  // Swap the stored parse for the one carrying the merged outlinks.
  parseResult.put(content.getUrl(), new ParseText(pageText), mergedData);
  return parseResult;
}
 
Example 3
Source File: HTMLLanguageParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Use statistical language identification to extract page language.
 *
 * <p>The identifier is fed the page title followed by a space and the parse
 * text (each part only when non-null). Unless {@code contentMaxlength} is
 * {@code -1}, the combined string is cut to at most that many characters.
 *
 * @param parse the parse to inspect; may be {@code null}
 * @return the identified language, or {@code null} when {@code parse} is
 *         {@code null} or when {@code onlyCertain} is set and the identifier
 *         is not reasonably certain
 */
private String identifyLanguage(Parse parse) {
    if (parse == null) {
        return null;
    }

    StringBuilder sample = new StringBuilder();

    String pageTitle = parse.getData().getTitle();
    if (pageTitle != null) {
        sample.append(pageTitle);
    }

    String body = parse.getText();
    if (body != null) {
        sample.append(" ");
        sample.append(body);
    }

    // Cap the sample length when a maximum is configured (-1 means no limit).
    String titleAndBody = sample.toString();
    boolean limited = this.contentMaxlength != -1;
    if (limited && titleAndBody.length() > this.contentMaxlength) {
        titleAndBody = titleAndBody.substring(0, contentMaxlength);
    }

    LanguageIdentifier identifier = new LanguageIdentifier(titleAndBody);
    if (onlyCertain && !identifier.isReasonablyCertain()) {
        return null;
    }
    return identifier.getLanguage();
}
 
Example 4
Source File: TestMSWordParser.java    From anthelion with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches a sample file through its protocol, parses it with the
 * {@code "parse-tika"} extension and returns the parse text.
 *
 * @param fileName name of the file, resolved against {@code sampleDir}
 * @return the text of the parse stored under the content's URL
 * @throws ProtocolException declared for the protocol lookup/fetch calls
 * @throws ParseException    declared for the parsing call
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  final String sampleUrl = "file:" + sampleDir + fileSeparator + fileName;

  // Fetch the raw content of the sample file.
  ProtocolFactory factory = new ProtocolFactory(conf);
  Protocol protocol = factory.getProtocol(sampleUrl);
  Text key = new Text(sampleUrl);
  CrawlDatum datum = new CrawlDatum();
  Content fetched = protocol.getProtocolOutput(key, datum).getContent();

  // Parse it and pick the parse registered under the content's own URL.
  ParseUtil parser = new ParseUtil(conf);
  Parse parsed = parser.parseByExtensionId("parse-tika", fetched).get(fetched.getUrl());
  return parsed.getText();
}
 
Example 5
Source File: HTMLLanguageParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Use statistical language identification to extract page language.
 *
 * <p>Input to the identifier is the title (if non-null) followed by a space
 * and the parse text (if non-null), truncated to {@code contentMaxlength}
 * characters unless that limit is {@code -1}.
 *
 * @param parse the parse whose title and text are examined; may be {@code null}
 * @return the language reported by the identifier; {@code null} if
 *         {@code parse} is {@code null}, or if {@code onlyCertain} is set and
 *         the identifier is not reasonably certain
 */
private String identifyLanguage(Parse parse) {
    if (parse == null) {
        return null;
    }

    final StringBuilder buffer = new StringBuilder();

    final String heading = parse.getData().getTitle();
    if (heading != null) {
        buffer.append(heading);
    }

    final String bodyText = parse.getText();
    if (bodyText != null) {
        buffer.append(" ").append(bodyText);
    }

    String candidate = buffer.toString();
    if (this.contentMaxlength != -1) {
        // A limit is configured: keep only the leading part of the text.
        if (candidate.length() > this.contentMaxlength) {
            candidate = candidate.substring(0, contentMaxlength);
        }
    }

    final LanguageIdentifier identifier = new LanguageIdentifier(candidate);

    if (!onlyCertain) {
        return identifier.getLanguage();
    }
    return identifier.isReasonablyCertain() ? identifier.getLanguage() : null;
}
 
Example 6
Source File: TestMSWordParser.java    From nutch-htmlunit with Apache License 2.0 5 votes vote down vote up
/**
 * Loads a sample file via the protocol resolved for its {@code file:} URL,
 * parses it using the {@code "parse-tika"} extension and returns the
 * resulting text.
 *
 * @param fileName file name relative to {@code sampleDir}
 * @return text of the parse registered under the fetched content's URL
 * @throws ProtocolException declared for the protocol lookup/fetch calls
 * @throws ParseException    declared for the parsing call
 */
public String getTextContent(String fileName) throws ProtocolException, ParseException {
  String location = "file:" + sampleDir + fileSeparator + fileName;

  Protocol handler = new ProtocolFactory(conf).getProtocol(location);
  Content raw = handler
      .getProtocolOutput(new Text(location), new CrawlDatum())
      .getContent();

  ParseUtil tikaParser = new ParseUtil(conf);
  Parse result = tikaParser
      .parseByExtensionId("parse-tika", raw)
      .get(raw.getUrl());

  return result.getText();
}
 
Example 7
Source File: ZipTextExtractor.java    From anthelion with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the file entries of a zip stream and concatenates their text.
 *
 * <p>Every non-directory entry is read completely. Entries whose name
 * contains a {@code '.'} are wrapped in a {@code Content} (URL
 * {@code url + "/" + entryName}, MIME type resolved from the entry name) and
 * parsed; the outlinks of each parse are copied into {@code outLinksList} and
 * {@code entryName + " " + parseText + " "} is appended to the result.
 * Entries that fail with a {@code ParseException} are logged at info level
 * and skipped.
 *
 * @param input        stream holding the zip data; it is not closed by this method
 * @param url          URL of the archive, used as prefix for the entry URLs
 * @param outLinksList receives a copy of every outlink found in the entries
 * @return the concatenated text, or an empty string if nothing was parsed
 * @throws IOException if reading the zip stream fails or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();
  ZipInputStream zin = new ZipInputStream(input);
  byte[] chunk = new byte[8192];

  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }

    // ZipEntry.getSize() is -1 when the size is not known, so the buffer
    // cannot be sized from it; read until the end of the entry instead.
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    int n;
    while ((n = zin.read(chunk, 0, chunk.length)) != -1) {
      buffer.write(chunk, 0, n);
    }
    byte[] b = buffer.toByteArray();

    String fname = entry.getName();
    String newurl = url + "/" + fname;
    // Round-trip through URL so that a malformed entry URL is rejected early.
    String base = new URL(newurl).toString();

    if (fname.lastIndexOf('.') == -1) {
      // No extension: no MIME type can be resolved from the name.
      continue;
    }

    // Trying to resolve the Mime-Type
    String contentType = MIME.getMimeType(fname);
    try {
      Metadata metadata = new Metadata();
      // Use the number of bytes actually read; the declared size may be -1.
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      Outlink[] theOutlinks = parse.getData().getOutlinks();

      for (int count = 0; count < theOutlinks.length; count++) {
        outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
      }

      resultText.append(fname).append(" ").append(parse.getText()).append(" ");
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }

  return resultText.toString();
}
 
Example 8
Source File: ZipTextExtractor.java    From nutch-htmlunit with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the file entries of a zip stream and concatenates their text.
 *
 * <p>Every non-directory entry is read completely. Entries whose name
 * contains a {@code '.'} are wrapped in a {@code Content} (URL
 * {@code url + "/" + entryName}, content type detected by Tika from the entry
 * name) and parsed; the outlinks of each parse are copied into
 * {@code outLinksList} and {@code entryName + " " + parseText + " "} is
 * appended to the result. Entries that fail with a {@code ParseException} are
 * logged at info level and skipped.
 *
 * @param input        stream holding the zip data; it is not closed by this method
 * @param url          URL of the archive, used as prefix for the entry URLs
 * @param outLinksList receives a copy of every outlink found in the entries
 * @return the concatenated text, or an empty string if nothing was parsed
 * @throws IOException if reading the zip stream fails or an entry URL is malformed
 */
public String extractText(InputStream input, String url, List<Outlink> outLinksList) throws IOException {
  StringBuilder resultText = new StringBuilder();
  ZipInputStream zin = new ZipInputStream(input);
  byte[] chunk = new byte[8192];

  ZipEntry entry;
  while ((entry = zin.getNextEntry()) != null) {
    if (entry.isDirectory()) {
      continue;
    }

    // ZipEntry.getSize() is -1 when the size is not known, so the buffer
    // cannot be sized from it; read until the end of the entry instead.
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    int n;
    while ((n = zin.read(chunk, 0, chunk.length)) != -1) {
      buffer.write(chunk, 0, n);
    }
    byte[] b = buffer.toByteArray();

    String fname = entry.getName();
    String newurl = url + "/" + fname;
    // Round-trip through URL so that a malformed entry URL is rejected early.
    String base = new URL(newurl).toString();

    if (fname.lastIndexOf('.') == -1) {
      // No extension: skip, as the original did for names without a '.'.
      continue;
    }

    // Trying to resolve the Mime-Type
    Tika tika = new Tika();
    String contentType = tika.detect(fname);
    try {
      Metadata metadata = new Metadata();
      // Use the number of bytes actually read; the declared size may be -1.
      metadata.set(Response.CONTENT_LENGTH, Long.toString(b.length));
      metadata.set(Response.CONTENT_TYPE, contentType);
      Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
      Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
      Outlink[] theOutlinks = parse.getData().getOutlinks();

      for (int count = 0; count < theOutlinks.length; count++) {
        outLinksList.add(new Outlink(theOutlinks[count].getToUrl(), theOutlinks[count].getAnchor()));
      }

      resultText.append(fname).append(" ").append(parse.getText()).append(" ");
    } catch (ParseException e) {
      if (LOG.isInfoEnabled()) {
        LOG.info("fetch okay, but can't parse " + fname + ", reason: " + e.getMessage());
      }
    }
  }

  return resultText.toString();
}