org.htmlparser.Parser Java Examples

Source File: HtmlTextParser.java From onboard with Apache License 2.0

6 votes

/**
 * Extracts the text content of an HTML string using a {@code StringBean} visitor.
 *
 * @param htmlStr the HTML markup to convert
 * @return whatever {@code StringBean.getStrings()} yields after visiting all nodes,
 *         or an empty string if a {@code ParserException} is raised
 */
public static String getPlainText(String htmlStr) {
    Parser htmlParser = new Parser();
    try {
        htmlParser.setInputHTML(htmlStr);

        StringBean textCollector = new StringBean();
        // Do not include the link information contained in the page.
        textCollector.setLinks(false);
        // Replace non-breaking spaces with regular spaces.
        textCollector.setReplaceNonBreakingSpaces(true);
        // Collapse each run of whitespace into a single space.
        textCollector.setCollapse(true);

        htmlParser.visitAllNodesWith(textCollector);
        return textCollector.getStrings();
    } catch (ParserException e) {
        // Best effort: report the failure and fall back to an empty result.
        e.printStackTrace();
        return "";
    }
}

Source File: HTMLParser.java From PADListener with GNU General Public License v2.0

6 votes

/**
   * Parses the body of an HTML message into the list of nodes it contains.
   * See <a href="http://htmlparser.sourceforge.net/">the HTML Parser project</a> for details.
   *
   * @param url the url that the message resulted from (not read by this method)
   * @param message the Message to parse
   * @return a NodeList containing the various Nodes making up the page, or
   *         <code>null</code> if the Content-Type header is missing or not text/html,
   *         the content is empty, or parsing fails
   */
  public Object parseMessage(HttpUrl url, Message message) {
      String mimeType = message.getHeader("Content-Type");
      boolean isHtml = mimeType != null && mimeType.matches("text/html.*");
      if (!isHtml) {
          return null;
      }

      byte[] body = message.getContent();
      boolean hasBody = body != null && body.length > 0;
      if (!hasBody) {
          return null;
      }

      // Accept-all filter: every node of the document ends up in the result.
      NodeFilter acceptAll = new NodeFilter() {
          public boolean accept(Node node) {
              return true;
          }
      };

      Parser parser = Parser.createParser(new String(body), null);
      try {
          return parser.extractAllNodesThatMatch(acceptAll);
      } catch (ParserException pe) {
          _logger.severe(pe.toString());
          return null;
      }
  }

Source File: GangliaHttpParser.java From Hue-Ctrip-DI with MIT License

6 votes

/**
 * Downloads the Ganglia metrics page for a cluster and collects the text of every
 * option found inside the <code>select</code> element whose id is "metrics-picker".
 *
 * @param clusterName value substituted for <code>clusterPattern</code> in the metric URL
 * @return the option texts in document order; empty if no matching option is found
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is not valid
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
		throws ParserException, MalformedURLException, IOException {
	String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
	Parser parser = new Parser(new URL(url).openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	List<String> metricList = new ArrayList<String>();

	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		// A select without any content has no child list at all.
		NodeList children = iterator.nextNode().getChildren();
		if (children == null) {
			continue;
		}

		SimpleNodeIterator childIterator = children.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// Children may also be text/whitespace nodes between the options;
			// casting those blindly would throw ClassCastException.
			if (child instanceof OptionTag) {
				metricList.add(((OptionTag) child).getOptionText());
			}
		}
	}

	return metricList;

}

Source File: TestGangliaHttpParser.java From Hue-Ctrip-DI with MIT License

6 votes

/**
 * Manual smoke test: fetches a fixed Ganglia page and prints the text of every
 * option inside the <code>select</code> element whose id is "metrics-picker".
 *
 * @param args ignored
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
	Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
	NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
			new HasAttributeFilter("id", "metrics-picker"));
	NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
	SimpleNodeIterator iterator = nodeList.elements();
	while (iterator.hasMoreNodes()) {
		// A select without any content has no child list at all.
		NodeList children = iterator.nextNode().getChildren();
		if (children == null) {
			continue;
		}

		SimpleNodeIterator childIterator = children.elements();
		while (childIterator.hasMoreNodes()) {
			Node child = childIterator.nextNode();
			// Skip text/whitespace nodes between options instead of failing on the cast.
			if (child instanceof OptionTag) {
				System.out.println(((OptionTag) child).getOptionText());
			}
		}
	}

}

Source File: HTMLConverter.java From OpenEphyra with GNU General Public License v2.0

6 votes

/**
 * Turns an HTML document into plain text.
 * 
 * @param html HTML document
 * @return the extracted text (empty when the extractor yields nothing), or
 *         <code>null</code> if the conversion failed
 */
public static synchronized String html2text(String html) {
	// extractor setup: drop links, normalize non-breaking spaces, collapse whitespace
	StringBean textCollector = new StringBean();
	textCollector.setLinks(false);
	textCollector.setReplaceNonBreakingSpaces(true);
	textCollector.setCollapse(true);

	Parser htmlParser = new Parser();
	try {
		htmlParser.setInputHTML(html);
		htmlParser.visitAllNodesWith(textCollector);
	} catch (ParserException e) {
		return null;
	}

	// a missing result means "no content", not a failure
	String extracted = textCollector.getStrings();
	return (extracted != null) ? extracted : "";
}

Source File: HTMLConverter.java From OpenEphyra with GNU General Public License v2.0

6 votes

/**
 * Reads an HTML document from a file and converts it into plain text.
 * 
 * @param filename name of file containing HTML documents
 * @return plain text (empty if the document has no text content) or
 *         <code>null</code> if the reading or conversion failed
 */
public static synchronized String file2text(String filename) {
	// read from file and convert HTML document
	StringBean sb = new StringBean();
	sb.setLinks(false);  // no links
	sb.setReplaceNonBreakingSpaces(true);  // replace non-breaking spaces
	sb.setCollapse(true);  // replace sequences of whitespaces
	Parser parser = new Parser();
	try {
		parser.setResource(filename);
		parser.visitAllNodesWith(sb);
	} catch (ParserException e) {
		return null;
	}
	String docText = sb.getStrings();

	// Keep null reserved for failures, as html2text does: a document that
	// yields no strings is reported as empty text rather than as an error.
	if (docText == null) {
		docText = "";  // no content
	}

	return docText;
}

Source File: SendMailService.java From cs-actions with Apache License 2.0

5 votes

/**
 * When the input body contains the {@code Encodings.BASE64} marker, parses the body,
 * runs an {@code HtmlImageNodeVisitor} over the parsed nodes, replaces the body with
 * the re-serialized HTML and hands the images collected by the visitor to
 * {@code addAllBase64ImagesToMimeMultipart}. Does nothing for a null body or a body
 * without the marker.
 *
 * @param multipart the multipart that receives the collected images
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart) throws ParserException,
        MessagingException, NoSuchAlgorithmException, SMIMEException, java.security.NoSuchProviderException {
    boolean hasBase64Content = input.getBody() != null && input.getBody().contains(Encodings.BASE64);
    if (!hasBase64Content) {
        return;
    }

    NodeList bodyNodes = new Parser(input.getBody()).parse(null);
    HtmlImageNodeVisitor imageVisitor = new HtmlImageNodeVisitor();
    bodyNodes.visitAllNodesWith(imageVisitor);

    // Serialize only after the visitor has run, so any changes it made are included.
    input.setBody(bodyNodes.toHtml());

    addAllBase64ImagesToMimeMultipart(multipart, imageVisitor.getBase64Images());
}

Source File: DouBanParsePage.java From JewelCrawler with GNU General Public License v3.0

4 votes

/**
 * Extracts the links of a crawled page and queues the new, relevant ones in the
 * <code>record</code> table.
 *
 * <p>A link is queued when it starts with {@code Constants.MAINURL}, matches the movie
 * or the comment pattern, and is not yet present in <code>record</code>. Nothing is
 * inserted once the table holds more than {@code Constants.maxCycle} rows.
 *
 * <p>Failures while parsing or while talking to the database after the initial row
 * count are printed and swallowed (best effort). As before, auto-commit is switched
 * off on the connection when links are inserted and is not restored.
 *
 * @param content the page to parse
 * @param conn    open connection to the crawler database
 * @throws Exception if the parser cannot be created or the initial row count fails
 */
public static void parseFromString(String content, Connection conn) throws Exception {

    Parser parser = new Parser(content);
    HasAttributeFilter filter = new HasAttributeFilter("href");

    // once rowCount is bigger than maxCycle, newly crawled links are not inserted into the record table
    if (countRecordRows(conn) > Constants.maxCycle) {
        return;
    }

    PreparedStatement existsStmt = null;
    PreparedStatement insertStmt = null;
    try {
        NodeList list = parser.parse(filter);
        int count = list.size();

        // Compile once instead of once per link.
        Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP);
        Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP);
        String mainUrl = Constants.MAINURL;

        // Parameterized lookup: the URL comes from a crawled page and must never be
        // concatenated into the SQL text.
        existsStmt = conn.prepareStatement("SELECT * FROM record WHERE URL = ?");

        List<String> nextLinkList = new ArrayList<String>();

        //process every link on this page
        for (int i = 0; i < count; i++) {
            Node node = list.elementAt(i);
            if (!(node instanceof LinkTag)) {
                continue;
            }

            String nextLink = ((LinkTag) node).extractLink();
            if (!nextLink.startsWith(mainUrl)) {
                continue;
            }

            // Cheap in-memory pattern test first; only relevant links cost a database round trip.
            boolean relevant = moviePattern.matcher(nextLink).find()
                    || commentPattern.matcher(nextLink).find();
            if (relevant && !recordExists(existsStmt, nextLink)) {
                nextLinkList.add(nextLink);
            }
        }

        if (nextLinkList.size() > 0) {
            conn.setAutoCommit(false);
            //links that do not exist in the database yet are inserted as not crawled
            insertStmt = conn.prepareStatement("INSERT INTO record (URL, crawled) VALUES (?,0)",
                    Statement.RETURN_GENERATED_KEYS);
            for (String nextLinkStr : nextLinkList) {
                insertStmt.setString(1, nextLinkStr);
                insertStmt.addBatch();
                System.out.println(nextLinkStr);
            }
            insertStmt.executeBatch();
            conn.commit();
        }
    } catch (Exception e) {
        //handle the exceptions
        e.printStackTrace();
        System.out.println("SQLException: " + e.getMessage());
    } finally {
        //release the prepared statements; result sets are closed where they are opened
        closeQuietly(insertStmt);
        closeQuietly(existsStmt);
    }
}

/** Returns the number of rows currently stored in the record table. */
private static int countRecordRows(Connection conn) throws SQLException {
    Statement stmt = null;
    ResultSet rs = null;
    try {
        stmt = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE);
        rs = stmt.executeQuery("select count(*) as rowCount from record");
        return rs.next() ? rs.getInt("rowCount") : 0;
    } finally {
        closeQuietly(rs);
        closeQuietly(stmt);
    }
}

/** Reports whether the given URL is already present in the record table. */
private static boolean recordExists(PreparedStatement existsStmt, String url) throws SQLException {
    existsStmt.setString(1, url);
    ResultSet rs = existsStmt.executeQuery();
    try {
        return rs.next();
    } finally {
        closeQuietly(rs);
    }
}

/** Closes a result set, ignoring a null argument and any failure to close. */
private static void closeQuietly(ResultSet rs) {
    if (rs != null) {
        try {
            rs.close();
        } catch (SQLException ignored) {
            // best effort: nothing useful can be done if close fails
        }
    }
}

/** Closes a statement, ignoring a null argument and any failure to close. */
private static void closeQuietly(Statement stmt) {
    if (stmt != null) {
        try {
            stmt.close();
        } catch (SQLException ignored) {
            // best effort: nothing useful can be done if close fails
        }
    }
}

org.htmlparser.Parser Java Examples