Java Code Examples for org.htmlcleaner.HtmlCleaner#clean()
The following examples show how to use
org.htmlcleaner.HtmlCleaner#clean().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: UserUtil.java From BigData with GNU General Public License v3.0 | 6 votes |
/**
 * Parses a "following / followers" page: extracts user URLs from the first
 * page, reads the total page count from the pagination buttons, then fetches
 * and extracts every remaining page.
 *
 * @param followUrl URL of the first follow page to process
 */
public static void processFollow(String followUrl) {
    String content = PageUtil.getContent(followUrl);
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tNode = htmlCleaner.clean(content);
    extractUserUrl(content);
    try {
        // Pagination buttons; the second-to-last button carries the total page count
        // (the last one is the "next page" arrow).
        Object[] pageNumObj = tNode
                .evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
        // BUG FIX: the original guard was `length > 0` while the code indexes
        // `length - 2`, which throws ArrayIndexOutOfBoundsException when exactly
        // one button is returned. Require at least two elements.
        if (pageNumObj != null && pageNumObj.length > 1) {
            TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
            int pagenum = Integer.parseInt(node.getText().toString());
            // Page 1 was already extracted above; fetch pages 2..pagenum.
            for (int i = 2; i <= pagenum; i++) {
                String url = followUrl + "?page=" + i;
                content = PageUtil.getContent(url);
                extractUserUrl(content);
            }
        }
    } catch (XPatherException e) {
        // Log with the throwable so the stack trace is preserved,
        // not just the message string.
        logger.error(e.getMessage(), e);
    }
}
Example 2
Source File: JDHtmlParserImpl.java From ispider with Apache License 2.0 | 5 votes |
@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    /**
     * cleaner.clean() blocks forever if page.getContent() is null, so the
     * earlier ISpider.start() code must skip parsing pages whose downloaded
     * content is empty.
     */
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis(); // parse start time
    // Dispatch on the URL type: product detail page vs. listing page.
    if (page.getUrl().startsWith("https://item.jd.com/")) { // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.jd.com/list.html")) { // listing page
        // Product URLs found on the current listing page.
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='plist']/ul/li/div/div[1]/a");
        // URL of the "next page" link.
        String nextUrl = HtmlUtil.getAttrByXpath(rootNode, "href", "//div[@id='J_topPage']/a[2]");
        if (!"javascript:;".equals(nextUrl)) {
            // "javascript:;" marks the last page — stop paginating there;
            // otherwise make the relative next-page URL absolute and enqueue it.
            nextUrl = "https://list.jd.com" + nextUrl;
            urls.add(nextUrl);
        }
        page.getUrls().addAll(urls);
        /**
         * Note: on the listing branch this method only collects URLs, it does
         * not scrape data. The collected URLs go into this Page object's list
         * and, after parsing, into the spider's high-priority URL queue. The
         * loop keeps draining that queue until all listing URLs are done, and
         * only then moves on to the low-priority (product) URLs. So on this
         * branch the Page object acts purely as a temporary URL container.
         */
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
        if (System.currentTimeMillis() - start == 0) {
            // Occasionally the next-page link cannot be resolved and parsing
            // takes 0ms — treat that as a failure and re-queue the URL.
            logger.info("解析列表页面:{}, 消耗时长:{}ms, 尝试将其重新添加到高优先级url队列中", page.getUrl(), System.currentTimeMillis() - start);
            HttpUtil.retryUrl(page.getUrl(), SpiderUtil.getTopDomain(page.getUrl()) + SpiderConstants.SPIDER_DOMAIN_HIGHER_SUFFIX);
        }
    }
}
Example 3
Source File: UtilsStaticAnalyzer.java From apogen with Apache License 2.0 | 5 votes |
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException { xp = xp.toLowerCase(); HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); props.setOmitDoctypeDeclaration(true); TagNode node = cleaner.clean(dom); dom = "<html>\n" + cleaner.getInnerHtml(node) + "\n</html>"; // workaround: htmlcleaner works with rel xpaths xp = xp.replace("html[1]/", "/"); try { Object[] result = node.evaluateXPath(xp); if (result.length > 0) { TagNode r = (TagNode) result[0]; return digTheTagTreeForAString(r); } } catch (XPatherException e) { e.printStackTrace(); } // couldn't find a representative string :( return ""; }
Example 4
Source File: ResponseRenderPrintWriter.java From zrlog with Apache License 2.0 | 5 votes |
/**
 * Cleans and post-processes a rendered HTML response body: strips the
 * trailing end-flag marker, rewrites static-resource and custom plugin tags,
 * re-serializes the document, and appends a render-time comment.
 *
 * @param inputBody the raw response body
 * @return the processed HTML string
 * @throws IOException declared for serialization; propagated to the caller
 */
private String getCompressAndParseHtml(String inputBody) throws IOException {
    String currentBody = inputBody;
    // Drop the hidden end-flag marker tag from the tail of the body.
    if (currentBody.endsWith(endFlag)) {
        currentBody = currentBody.substring(0, currentBody.length() - endFlag.length());
    }
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    htmlCleaner.getProperties().setCharset(charset);
    htmlCleaner.getProperties().setUseCdataForScriptAndStyle(false);
    TagNode tagNode = htmlCleaner.clean(currentBody);
    TagNode[] tagNodes = tagNode.getAllElements(true);
    // placeholder -> replacement pairs produced by custom plugin tags,
    // applied after serialization.
    Map<String, String> plugin = new HashMap<>();
    for (TagNode tag : tagNodes) {
        if (tag != null) {
            String tagName = tag.getName();
            addStaticResourceFlag(tag, tagName);
            parseCustomHtmlTag(htmlCleaner, plugin, tag, tagName);
        }
    }
    SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(htmlCleaner.getProperties());
    StringWriter stringWriter = new StringWriter();
    tagNode.serialize(serializer, stringWriter);
    currentBody = stringWriter.toString();
    // HtmlCleaner drops the doctype during cleaning; re-prepend it if present.
    if (tagNode.getDocType() != null) {
        currentBody = tagNode.getDocType() + currentBody;
    }
    // Substitute plugin placeholders collected during the tag walk.
    for (Map.Entry<String, String> entry : plugin.entrySet()) {
        currentBody = currentBody.replace(entry.getKey(), entry.getValue());
    }
    // Append total render time (startTime is a field set when rendering began)
    // as a trailing HTML comment.
    currentBody = currentBody + "<!--" + (System.currentTimeMillis() - startTime) + "ms-->";
    return currentBody;
}
Example 5
Source File: XmlUtils.java From iaf with Apache License 2.0 | 5 votes |
/**
 * Converts an HTML document to XHTML. Input that is blank, or that does not
 * start with an {@code <html>} root element after doctype stripping, is
 * passed through unchanged (blank input yields {@code null}).
 *
 * @param htmlString the HTML source, may be null or empty
 * @return the XHTML string, or null when the input was empty
 */
public static String toXhtml(String htmlString) {
    if (StringUtils.isEmpty(htmlString)) {
        return null;
    }
    String xhtmlString = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
    // Only run the cleaner when the content actually looks like a full
    // HTML document; fragments are returned as-is.
    if (xhtmlString.startsWith("<html>") || xhtmlString.startsWith("<html ")) {
        CleanerProperties cleanerProperties = new CleanerProperties();
        TagNode rootNode = new HtmlCleaner(cleanerProperties).clean(xhtmlString);
        xhtmlString = new SimpleXmlSerializer(cleanerProperties).getXmlAsString(rootNode);
    }
    return xhtmlString;
}
Example 6
Source File: SNHtmlParserImpl.java From ispider with Apache License 2.0 | 4 votes |
/**
 * Suning's "next page" button appears to be loaded dynamically, so the next
 * page URL cannot be scraped from the DOM the way it is for JD. Instead the
 * total page count is read once and all listing URLs are generated up front.
 */
@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis(); // parse start time
    if (page.getUrl().startsWith("https://product.suning.com")) { // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.suning.com")) { // listing page
        // Product URLs found on the current listing page.
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='filter-results']/ul/li/div/div/div/div[1]/div[1]/a");
        page.getUrls().addAll(urls);
        // Generate all remaining listing-page URLs, but only once per run
        // (ifGetAll is an instance field acting as a done-flag).
        if (!ifGetAll) {
            Integer totalPage = null;
            try {
                // Read the "current / total" pagination label, e.g. "\n\n1\n/100\n".
                Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
                TagNode tagNode = (TagNode) objects[0];
                String text = tagNode.getText().toString(); // "\n\n1\n/100\n"
                // NOTE(review): matches only 2-3 digit totals — a category with
                // fewer than 10 pages would not match; presumably intentional
                // for this category, but worth confirming.
                Pattern pattern = Pattern.compile("[0-9]{2,3}");
                Matcher matcher = pattern.matcher(text);
                if (matcher.find()) {
                    totalPage = Integer.valueOf(matcher.group()); // total page count
                }
            } catch (XPatherException e) {
                e.printStackTrace();
            }
            if (totalPage != null) {
                // Extract the zero-based page index from the URL,
                // e.g. https://list.suning.com/0-20006-0.html -> 0.
                String currentPageStr = page.getUrl().split("0-20006-")[1].split("\\.")[0]; // url: https://list.suning.com/0-20006-0.html
                int currentPage = Integer.valueOf(currentPageStr);
                // Page indices run 0..totalPage-1, so i < totalPage covers them all.
                for (int i = currentPage + 1; i < totalPage; i++) {
                    String url = "https://list.suning.com/0-20006-" + i + ".html";
                    page.getUrls().add(url);
                }
            }
            ifGetAll = true; // remember that the listing URLs were generated
        }
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    }
}
Example 7
Source File: XpathSelectorTest.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Micro-benchmark comparing HtmlCleaner, Jsoup and Xsoup: parse cost and
 * "//a" query cost, 2000 iterations each, timings printed in milliseconds.
 * Ignored by default because it takes a long time.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    final int rounds = 2000;
    System.out.println(html.length());
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tagNode = htmlCleaner.clean(html);
    Document document = Jsoup.parse(html);

    // HtmlCleaner: parse then XPath query.
    long begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        htmlCleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - begin);
    begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        tagNode.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - begin);
    System.out.println("=============");

    // Jsoup: parse then CSS select.
    begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        Jsoup.parse(html);
    }
    System.out.println(System.currentTimeMillis() - begin);
    begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        document.select("a");
    }
    System.out.println(System.currentTimeMillis() - begin);
    System.out.println("=============");

    // HtmlCleaner again (second pass, warmed-up JIT).
    begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        htmlCleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - begin);
    begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        tagNode.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - begin);
    System.out.println("=============");

    // Xsoup: pre-compiled XPath evaluated against the Jsoup document.
    XPathEvaluator compile = Xsoup.compile("//a");
    begin = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        compile.evaluate(document);
    }
    System.out.println(System.currentTimeMillis() - begin);
}