Java Code Examples for org.htmlcleaner.TagNode#evaluateXPath()

The following examples show how to use org.htmlcleaner.TagNode#evaluateXPath(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: HtmlUtil.java    From ispider with Apache License 2.0 6 votes vote down vote up
/**
 * Collects one attribute from every node matched by an XPath expression and
 * returns the values as a list of URLs.
 *
 * <p>Every attribute value is prefixed with {@code "https:"} unconditionally;
 * presumably the scraped values are protocol-relative ({@code //host/path}) —
 * confirm against the pages being crawled.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute to read from each matched node
 * @param xpath   XPath expression selecting the nodes
 * @return the prefixed attribute values in match order; an empty list (never
 *         {@code null}) when nothing matches or the expression cannot be evaluated
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        if (objs == null) {
            return urls;
        }
        for (Object obj : objs) {
            // An XPath expression may also yield non-element results; only
            // element nodes carry attributes, so anything else is skipped.
            if (!(obj instanceof TagNode)) {
                continue;
            }
            String url = ((TagNode) obj).getAttributeByName(attr);
            // A node without the attribute would otherwise produce "https:null".
            if (url != null) {
                urls.add("https:" + url);
            }
        }
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    // Returning the (possibly empty) list instead of null keeps callers that
    // pass the result straight to addAll(...) from failing.
    return urls;
}
 
Example 2
Source File: UserUtil.java    From BigData with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Parses a "following" page (followed and followed-by users) and every
 * further page of its pagination.
 *
 * <p>The first page is fetched and handed to {@code extractUserUrl}. The page
 * count is then read from the second-to-last pagination button, and pages
 * {@code 2..pageCount} are fetched as {@code followUrl + "?page=" + i} and
 * processed the same way.
 *
 * @param followUrl URL of the first "following" page
 */
public static void processFollow(String followUrl) {
	String content = PageUtil.getContent(followUrl);
	HtmlCleaner htmlCleaner = new HtmlCleaner();
	TagNode tNode = htmlCleaner.clean(content);
	extractUserUrl(content);
	try {
		Object[] pageNumObj = tNode
				.evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
		// The page count is taken from the second-to-last button, so at least
		// two buttons are required; with fewer there is nothing more to fetch.
		if (pageNumObj == null || pageNumObj.length < 2) {
			return;
		}
		TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
		// Trim first: surrounding whitespace in the markup would otherwise
		// make parseInt fail.
		int pagenum = Integer.parseInt(node.getText().toString().trim());
		for (int i = 2; i <= pagenum; i++) {
			String url = followUrl + "?page=" + i;
			content = PageUtil.getContent(url);
			extractUserUrl(content);
		}
	} catch (XPatherException e) {
		logger.error(e.getMessage(), e);
	} catch (NumberFormatException e) {
		// The button did not contain a plain number; only the first page is processed.
		logger.error(e.getMessage(), e);
	}
}
 
Example 3
Source File: HtmlUtil.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the trimmed text of the first node selected by an XPath expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param xpath   XPath expression selecting the node
 * @return the trimmed text of the first match, or {@code null} when nothing
 *         matches or the expression cannot be evaluated
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
    try {
        Object[] matches = tagNode.evaluateXPath(xpath);
        if (matches == null || matches.length == 0) {
            return null;
        }
        TagNode firstMatch = (TagNode) matches[0];
        return firstMatch.getText().toString().trim();
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }
}
 
Example 4
Source File: HtmlUtil.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the value of an attribute on the first node selected by an XPath
 * expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression selecting the node
 * @return the attribute value of the first match, or {@code null} when nothing
 *         matches or the expression cannot be evaluated
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    Object[] matches;
    try {
        matches = tagNode.evaluateXPath(xpath);
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }
    if (matches == null || matches.length == 0) {
        return null;
    }
    TagNode firstMatch = (TagNode) matches[0];
    return firstMatch.getAttributeByName(attr);
}
 
Example 5
Source File: UtilsStaticAnalyzer.java    From apogen with Apache License 2.0 5 votes vote down vote up
/**
 * Cleans {@code dom} with HtmlCleaner, evaluates {@code xp} against the
 * cleaned tree and derives a string from the first matching element via
 * {@code digTheTagTreeForAString}.
 *
 * @param xp  XPath expression; it is lower-cased and any {@code html[1]/}
 *            segment is replaced by {@code /} before evaluation
 * @param dom HTML source to clean and query
 * @return the string derived from the first matching element, or an empty
 *         string when nothing matches or the expression cannot be evaluated
 * @throws UnsupportedEncodingException declared by the original signature;
 *         presumably propagated from {@code digTheTagTreeForAString}
 */
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException {

	// Locale.ROOT keeps the result independent of the JVM default locale
	// (a Turkish locale, for instance, maps "I" to a dotless "ı").
	xp = xp.toLowerCase(java.util.Locale.ROOT);

	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setOmitDoctypeDeclaration(true);

	TagNode node = cleaner.clean(dom);

	// workaround: htmlcleaner works with rel xpaths
	xp = xp.replace("html[1]/", "/");
	try {
		Object[] result = node.evaluateXPath(xp);

		// Guard against a missing result and against non-element matches
		// before casting.
		if (result != null && result.length > 0 && result[0] instanceof TagNode) {
			return digTheTagTreeForAString((TagNode) result[0]);
		}

	} catch (XPatherException e) {
		e.printStackTrace();
	}

	// couldn't find a representative string :(

	return "";
}
 
Example 6
Source File: SNHtmlParserImpl.java    From ispider with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a Suning page: product pages are delegated to {@code parserProduct};
 * list pages contribute their product URLs and, once, the URLs of the
 * remaining list pages.
 *
 * <p>The URL of Suning's "next page" button also appears to be loaded
 * dynamically, so it cannot be read from the markup the way it is for JD;
 * the list-page URLs are generated from the total page count instead.
 *
 * @param page the downloaded page; discovered URLs are added to {@code page.getUrls()}
 */
@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis();    // parse start time

    if (page.getUrl().startsWith("https://product.suning.com")) {    // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.suning.com")) {    // list page
        // Product URLs on the current list page.
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='filter-results']/ul/li/div/div/div/div[1]/div[1]/a");
        if (urls != null) {
            page.getUrls().addAll(urls);
        }
        // Generate the URLs of all list pages (done only once).
        if (!ifGetAll) {
            Integer totalPage = null;
            try {
                // Read the total page count.
                Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
                if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
                    String text = ((TagNode) objects[0]).getText().toString(); // "\n\n1\n/100\n"
                    // Take the number after the slash: the text is "current/total",
                    // and matching the first multi-digit run would return the
                    // current page once it reaches 10.
                    Matcher matcher = Pattern.compile("/\\s*([0-9]+)").matcher(text);
                    if (matcher.find()) {
                        totalPage = Integer.valueOf(matcher.group(1)); // total page count
                    }
                }
            } catch (XPatherException | NumberFormatException e) {
                logger.error("Failed to read the total page count from " + page.getUrl(), e);
            }
            if (totalPage != null) {
                // Extract the current page index from the URL.
                // url: https://list.suning.com/0-20006-0.html
                String[] urlParts = page.getUrl().split("0-20006-");
                String[] pageParts = urlParts.length > 1 ? urlParts[1].split("\\.") : new String[0];
                if (pageParts.length > 0) {
                    try {
                        int currentPage = Integer.parseInt(pageParts[0]);
                        for (int i = currentPage + 1; i < totalPage; i++) {
                            String url = "https://list.suning.com/0-20006-" + i + ".html";
                            page.getUrls().add(url);
                        }
                    } catch (NumberFormatException e) {
                        logger.error("Failed to read the current page index from " + page.getUrl(), e);
                    }
                }
            }
            ifGetAll = true;    // mark as done once the list has been parsed
        }
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    }
}
 
Example 7
Source File: XpathSelectorTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Manual benchmark comparing HtmlCleaner, Jsoup and Xsoup on the {@code html}
 * fixture. Each measurement runs one operation 2000 times and prints the
 * elapsed milliseconds; groups are separated by a line of equals signs.
 * Nothing is asserted.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    System.out.println(html.length());

    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode cleanedRoot = cleaner.clean(html);
    Document jsoupDocument = Jsoup.parse(html);

    // HtmlCleaner: parse, then XPath query.
    long startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        cleaner.clean(html);
    }
    long elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // Jsoup: parse, then CSS select.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        Jsoup.parse(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        jsoupDocument.select("a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // HtmlCleaner again: same two measurements as the first group.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        cleaner.clean(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // Xsoup: pre-compiled XPath evaluated against the Jsoup document.
    XPathEvaluator xsoupEvaluator = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        xsoupEvaluator.evaluate(jsoupDocument);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

}