org.htmlparser.Parser Java Examples
The following examples show how to use
org.htmlparser.Parser.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlTextParser.java From onboard with Apache License 2.0 | 6 votes |
public static String getPlainText(String htmlStr) { Parser parser = new Parser(); String plainText = ""; try { parser.setInputHTML(htmlStr); StringBean stringBean = new StringBean(); // 设置不需要得到页面所包含的链接信息 stringBean.setLinks(false); // 设置将不间断空格由正规空格所替代 stringBean.setReplaceNonBreakingSpaces(true); // 设置将一序列空格由单一空格替代 stringBean.setCollapse(true); parser.visitAllNodesWith(stringBean); plainText = stringBean.getStrings(); } catch (ParserException e) { e.printStackTrace(); } return plainText; }
Example #2
Source File: HTMLParser.java From PADListener with GNU General Public License v2.0 | 6 votes |
/**
 * Parses the body of a message and returns its parsed representation.
 * See <a href="http://htmlparser.sourceforge.net/">the HTML Parser project</a>
 * for details.
 *
 * @param url the url that the message resulted from (not read by this method)
 * @param message the Message to parse
 * @return a NodeList containing the nodes making up the page, or
 *         <code>null</code> when the Content-Type header is missing or does
 *         not start with <code>text/html</code>, when the content is absent
 *         or empty, or when parsing fails (the failure is logged)
 */
public Object parseMessage(HttpUrl url, Message message) {
    String contentType = message.getHeader("Content-Type");
    boolean isHtml = contentType != null && contentType.matches("text/html.*");
    if (!isHtml) {
        return null;
    }

    byte[] body = message.getContent();
    boolean hasBody = body != null && body.length != 0;
    if (!hasBody) {
        return null;
    }

    // NOTE(review): the bytes are decoded with the platform default charset
    // and no charset is handed to the parser - confirm this is intended.
    Parser parser = Parser.createParser(new String(body), null);

    // A filter that accepts every node, so the whole page is returned.
    NodeFilter acceptEverything = new NodeFilter() {
        public boolean accept(Node node) {
            return true;
        }
    };

    try {
        return parser.extractAllNodesThatMatch(acceptEverything);
    } catch (ParserException pe) {
        _logger.severe(pe.toString());
        return null;
    }
}
Example #3
Source File: GangliaHttpParser.java From Hue-Ctrip-DI with MIT License | 6 votes |
/**
 * Fetches the Ganglia metrics page for a cluster and collects the option
 * texts of every <code>&lt;select id="metrics-picker"&gt;</code> element.
 *
 * The page URL is built by replacing matches of <code>clusterPattern</code>
 * in <code>gangliaMetricUrl</code> with the given cluster name.
 *
 * @param clusterName the cluster name substituted into the metric URL
 * @return the option texts found, in document order; empty when the page has
 *         no matching select element or the select has no options
 * @throws ParserException if the page cannot be parsed
 * @throws MalformedURLException if the resulting URL is not valid
 * @throws IOException if the connection cannot be opened
 */
public List<String> getGangliaAttribute(String clusterName)
        throws ParserException, MalformedURLException, IOException {
    String url = gangliaMetricUrl.replaceAll(clusterPattern, clusterName);
    Parser parser = new Parser(new URL(url).openConnection());
    NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList selectNodes = parser.extractAllNodesThatMatch(nodeFilter);

    List<String> metricList = new ArrayList<String>();
    SimpleNodeIterator selectIterator = selectNodes.elements();
    while (selectIterator.hasMoreNodes()) {
        NodeList children = selectIterator.nextNode().getChildren();
        if (children == null) {
            // A select element without any child has no child list at all.
            continue;
        }
        SimpleNodeIterator childIterator = children.elements();
        while (childIterator.hasMoreNodes()) {
            Node child = childIterator.nextNode();
            // Skip anything that is not an option (e.g. whitespace text
            // between the option tags) instead of failing on the cast.
            if (child instanceof OptionTag) {
                metricList.add(((OptionTag) child).getOptionText());
            }
        }
    }
    return metricList;
}
Example #4
Source File: TestGangliaHttpParser.java From Hue-Ctrip-DI with MIT License | 6 votes |
/**
 * Manual check: downloads a fixed Ganglia page and prints the text of every
 * option inside the <code>&lt;select id="metrics-picker"&gt;</code> element.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void main(String[] args) throws Exception {
    Parser parser = new Parser(new URL("http://10.8.75.3/ganglia/?r=hour&cs=&ce=&s=by+name&c=Zookeeper_Cluster&tab=m&vn=&hide-hf=false").openConnection());
    NodeFilter nodeFilter = new AndFilter(new TagNameFilter("select"),
            new HasAttributeFilter("id", "metrics-picker"));
    NodeList selectNodes = parser.extractAllNodesThatMatch(nodeFilter);

    SimpleNodeIterator selectIterator = selectNodes.elements();
    while (selectIterator.hasMoreNodes()) {
        NodeList children = selectIterator.nextNode().getChildren();
        if (children == null) {
            // A select element without any child has no child list at all.
            continue;
        }
        SimpleNodeIterator childIterator = children.elements();
        while (childIterator.hasMoreNodes()) {
            Node child = childIterator.nextNode();
            // Skip anything that is not an option (e.g. whitespace text
            // between the option tags) instead of failing on the cast.
            if (child instanceof OptionTag) {
                System.out.println(((OptionTag) child).getOptionText());
            }
        }
    }
}
Example #5
Source File: HTMLConverter.java From OpenEphyra with GNU General Public License v2.0 | 6 votes |
/** * Converts an HTML document into plain text. * * @param html HTML document * @return plain text or <code>null</code> if the conversion failed */ public static synchronized String html2text(String html) { // convert HTML document StringBean sb = new StringBean(); sb.setLinks(false); // no links sb.setReplaceNonBreakingSpaces (true); // replace non-breaking spaces sb.setCollapse(true); // replace sequences of whitespaces Parser parser = new Parser(); try { parser.setInputHTML(html); parser.visitAllNodesWith(sb); } catch (ParserException e) { return null; } String docText = sb.getStrings(); if (docText == null) docText = ""; // no content return docText; }
Example #6
Source File: HTMLConverter.java From OpenEphyra with GNU General Public License v2.0 | 6 votes |
/** * Reads an HTML document from a file and converts it into plain text. * * @param filename name of file containing HTML documents * @return plain text or <code>null</code> if the reading or conversion failed */ public static synchronized String file2text(String filename) { // read from file and convert HTML document StringBean sb = new StringBean(); sb.setLinks(false); // no links sb.setReplaceNonBreakingSpaces (true); // replace non-breaking spaces sb.setCollapse(true); // replace sequences of whitespaces Parser parser = new Parser(); try { parser.setResource(filename); parser.visitAllNodesWith(sb); } catch (ParserException e) { return null; } String docText = sb.getStrings(); return docText; }
Example #7
Source File: SendMailService.java From cs-actions with Apache License 2.0 | 5 votes |
/**
 * Processes an HTML mail body that contains the {@code Encodings.BASE64}
 * marker: the body is parsed, every node is visited with an
 * {@code HtmlImageNodeVisitor}, the body is replaced by the HTML rendered
 * from the visited node list, and the images collected by the visitor are
 * handed to {@code addAllBase64ImagesToMimeMultipart}.
 *
 * Does nothing when the body is {@code null} or lacks the marker.
 *
 * @param multipart the multipart that receives the collected images
 */
private void processHTMLBodyWithBASE64Images(MimeMultipart multipart)
        throws ParserException, MessagingException, NoSuchAlgorithmException,
        SMIMEException, java.security.NoSuchProviderException {
    if (input.getBody() == null || !input.getBody().contains(Encodings.BASE64)) {
        return;
    }

    NodeList bodyNodes = new Parser(input.getBody()).parse(null);

    // NOTE(review): the visitor presumably rewrites the image nodes in place,
    // which is why the body is regenerated from the node list - confirm.
    HtmlImageNodeVisitor imageVisitor = new HtmlImageNodeVisitor();
    bodyNodes.visitAllNodesWith(imageVisitor);

    input.setBody(bodyNodes.toHtml());
    addAllBase64ImagesToMimeMultipart(multipart, imageVisitor.getBase64Images());
}
Example #8
Source File: DouBanParsePage.java From JewelCrawler with GNU General Public License v3.0 | 4 votes |
public static void parseFromString(String content, Connection conn) throws Exception { Parser parser = new Parser(content); HasAttributeFilter filter = new HasAttributeFilter("href"); String sql1 = null; ResultSet rs1 = null; PreparedStatement pstmt1 = null; Statement stmt1 = null; List<String> nextLinkList = new ArrayList<String>(); int rowCount = 0; sql1 = "select count(*) as rowCount from record"; stmt1 = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE); rs1 = stmt1.executeQuery(sql1); if (rs1.next()) { rowCount = rs1.getString("rowCount") != null ? Integer.parseInt(rs1.getString("rowCount")) : 0; } if (rowCount <= Constants.maxCycle) { //once rowCount is bigger than maxCycle, the new crawled link will not insert into record table try { NodeList list = parser.parse(filter); int count = list.size(); //process every link on this page for (int i = 0; i < count; i++) { Node node = list.elementAt(i); if (node instanceof LinkTag) { LinkTag link = (LinkTag) node; String nextLink = link.extractLink(); String mainUrl = Constants.MAINURL; if (nextLink.startsWith(mainUrl)) { //check if the link already exists in the database sql1 = "SELECT * FROM record WHERE URL = '" + nextLink + "'"; stmt1 = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE); rs1 = stmt1.executeQuery(sql1); if (rs1.next()) { } else { Pattern moviePattern = Pattern.compile(Constants.MOVIE_REGULAR_EXP); Matcher movieMatcher = moviePattern.matcher(nextLink); Pattern commentPattern = Pattern.compile(Constants.COMMENT_REGULAR_EXP); Matcher commentMatcher = commentPattern.matcher(nextLink); if (movieMatcher.find() || commentMatcher.find()) { nextLinkList.add(nextLink); } } } } } if (nextLinkList.size() > 0) { conn.setAutoCommit(false); //if the link does not exist in the database, insert it sql1 = "INSERT INTO record (URL, crawled) VALUES (?,0)"; pstmt1 = conn.prepareStatement(sql1, Statement.RETURN_GENERATED_KEYS); for (String nextLinkStr : nextLinkList) 
{ pstmt1.setString(1, nextLinkStr); pstmt1.addBatch(); System.out.println(nextLinkStr); } pstmt1.executeBatch(); conn.commit(); } } catch (Exception e) { //handle the exceptions e.printStackTrace(); System.out.println("SQLException: " + e.getMessage()); } finally { //close and release the resources of PreparedStatement, ResultSet and Statement if (pstmt1 != null) { try { pstmt1.close(); } catch (SQLException e2) { } } pstmt1 = null; if (rs1 != null) { try { rs1.close(); } catch (SQLException e1) { } } rs1 = null; if (stmt1 != null) { try { stmt1.close(); } catch (SQLException e3) { } } stmt1 = null; } } }