org.htmlcleaner.TagNode#evaluateXPath

Source File: HtmlUtil.java From ispider with Apache License 2.0

6 votes

/**
 * Collects URLs from one attribute of every element matched by an XPath expression.
 *
 * <p>Every attribute value found is prefixed with {@code "https:"} before being added
 * (presumably the scraped links are protocol-relative, e.g. {@code //host/path} —
 * confirm against the target pages). Matches that are not {@link TagNode}s, or that
 * do not carry the requested attribute, are skipped.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute that holds the URL
 * @param xpath   XPath expression selecting the elements
 * @return the prefixed URLs in match order; an empty list (never {@code null}) when
 *         nothing matches or the XPath evaluation fails
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        if (objs == null) {
            return urls;
        }
        for (Object obj : objs) {
            // An XPath may select non-element results; only elements carry attributes.
            if (!(obj instanceof TagNode)) {
                continue;
            }
            String url = ((TagNode) obj).getAttributeByName(attr);
            // Without this check a missing attribute would produce the bogus URL "https:null".
            if (url != null) {
                urls.add("https:" + url);
            }
        }
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    // Always hand back a list so callers can pass the result to addAll() safely.
    return urls;
}

Source File: UserUtil.java From BigData with GNU General Public License v3.0

6 votes

/**
 * Parses a "following" page (who the user follows / is followed by) and all of its
 * subsequent pages, passing each page's content to {@code extractUserUrl}.
 *
 * <p>The page count is read from the second-to-last pagination button (presumably the
 * last button is "next" — confirm against the page markup). Pages 2..N are then
 * fetched with a {@code ?page=} query parameter appended to {@code followUrl}.
 * When the pagination cannot be found or its text is not a number, only the first
 * page is processed.
 *
 * @param followUrl URL of the first follow page
 */
public static void processFollow(String followUrl) {
	String content = PageUtil.getContent(followUrl);
	HtmlCleaner htmlCleaner = new HtmlCleaner();
	TagNode tNode = htmlCleaner.clean(content);
	extractUserUrl(content);
	try {
		Object[] pageNumObj = tNode
				.evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
		// The second-to-last button is read below, so at least two buttons are required;
		// with a single match the index would be -1.
		if (pageNumObj == null || pageNumObj.length < 2) {
			return;
		}
		TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
		// Trim so surrounding whitespace in the button text does not break parsing.
		int pagenum = Integer.parseInt(node.getText().toString().trim());
		for (int i = 2; i <= pagenum; i++) {
			String url = followUrl + "?page=" + i;
			content = PageUtil.getContent(url);
			extractUserUrl(content);
		}
	} catch (XPatherException e) {
		logger.error(e.getMessage());
	} catch (NumberFormatException e) {
		// The button text was not a page number; the first page has already been processed.
		logger.error("Cannot parse page count for " + followUrl + ": " + e.getMessage());
	}
}

Source File: HtmlUtil.java From ispider with Apache License 2.0

5 votes

/**
 * Returns the trimmed text of the first node selected by an XPath expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param xpath   XPath expression selecting the node
 * @return the trimmed text of the first match, or {@code null} when nothing matches
 *         or the XPath evaluation fails
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
    try {
        Object[] matches = tagNode.evaluateXPath(xpath);
        boolean nothingFound = matches == null || matches.length == 0;
        if (nothingFound) {
            return null;
        }
        TagNode first = (TagNode) matches[0];
        return first.getText().toString().trim();
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }
}

Source File: HtmlUtil.java From ispider with Apache License 2.0

5 votes

/**
 * Returns the value of an attribute on the first node selected by an XPath expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression selecting the node
 * @return whatever {@code getAttributeByName} returns for the first match, or
 *         {@code null} when nothing matches or the XPath evaluation fails
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    Object[] matches;
    try {
        matches = tagNode.evaluateXPath(xpath);
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }
    if (matches == null || matches.length == 0) {
        return null;
    }
    return ((TagNode) matches[0]).getAttributeByName(attr);
}

Source File: UtilsStaticAnalyzer.java From apogen with Apache License 2.0

5 votes

/**
 * Looks up the element addressed by {@code xp} inside {@code dom} and derives a name
 * from it via {@code digTheTagTreeForAString}.
 *
 * <p>The XPath is lower-cased and its leading {@code html[1]/} step is rewritten to
 * {@code /} before evaluation against the cleaned DOM.
 *
 * @param xp  XPath of the element of interest
 * @param dom HTML source the XPath refers to
 * @return the string produced by {@code digTheTagTreeForAString} for the first
 *         matching tag, or an empty string when nothing matches or the XPath
 *         evaluation fails
 * @throws UnsupportedEncodingException declared for callers; presumably propagated
 *         from {@code digTheTagTreeForAString} — confirm against that helper
 */
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException {

	// Locale.ROOT keeps the lower-casing independent of the JVM's default locale
	// (e.g. a Turkish locale would turn "I" into a dotless "ı" and break tag names).
	xp = xp.toLowerCase(java.util.Locale.ROOT);

	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setOmitDoctypeDeclaration(true);

	TagNode node = cleaner.clean(dom);

	// workaround: htmlcleaner works with rel xpaths
	xp = xp.replace("html[1]/", "/");
	try {
		Object[] result = node.evaluateXPath(xp);

		// Guard against a missing result and against non-element matches before casting.
		if (result != null && result.length > 0 && result[0] instanceof TagNode) {
			return digTheTagTreeForAString((TagNode) result[0]);
		}

	} catch (XPatherException e) {
		e.printStackTrace();
	}

	// couldn't find a representative string :(

	return "";
}

Source File: SNHtmlParserImpl.java From ispider with Apache License 2.0

4 votes

/**
 * Parses a Suning page. Product pages are handed to {@code parserProduct}; list pages
 * contribute the product URLs they contain and, the first time a list page is seen,
 * the URLs of the remaining list pages.
 *
 * <p>Suning's next-page button URL also appears to be loaded dynamically, so it cannot
 * be read from the page the way it is for JD; the remaining list URLs are instead
 * generated from the total page count shown on the page.
 *
 * @param page the downloaded page; its URL list is extended in place
 */
@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis();    // parse start time

    if (page.getUrl().startsWith("https://product.suning.com")) {    // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.suning.com")) {    // list page
        // Product URLs on the current list page. The helper may return null on failure.
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='filter-results']/ul/li/div/div/div/div[1]/div[1]/a");
        if (urls != null) {
            page.getUrls().addAll(urls);
        }
        // Generate the URLs of all remaining list pages (done only once).
        if (!ifGetAll) {
            Integer totalPage = null;
            try {
                // Read the total page count.
                Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
                if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
                    String text = ((TagNode) objects[0]).getText().toString(); // e.g. "\n\n1\n/100\n"
                    // The total is the number after the slash. Matching "any 2-3 digit run"
                    // would pick the current page on "10/100" and miss totals below 10.
                    Matcher matcher = Pattern.compile("/\\s*([0-9]{1,9})").matcher(text);
                    if (matcher.find()) {
                        totalPage = Integer.valueOf(matcher.group(1));
                    }
                }
            } catch (XPatherException e) {
                e.printStackTrace();
            }
            if (totalPage != null) {
                // Current page index taken from the URL, e.g. https://list.suning.com/0-20006-0.html
                Matcher currentMatcher = Pattern.compile("0-20006-([0-9]{1,9})\\.").matcher(page.getUrl());
                if (currentMatcher.find()) {
                    int currentPage = Integer.parseInt(currentMatcher.group(1));
                    for (int i = currentPage + 1; i < totalPage; i++) {
                        String url = "https://list.suning.com/0-20006-" + i + ".html";
                        page.getUrls().add(url);
                    }
                } else {
                    logger.warn("Cannot read current page index from list url: {}", page.getUrl());
                }
            }
            ifGetAll = true;    // remember that the list pages have been expanded
        }
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    }
}

Source File: XpathSelectorTest.java From webmagic with Apache License 2.0

4 votes

/**
 * Manual benchmark (ignored by default) that prints, in milliseconds, how long 2000
 * rounds of each operation take: HtmlCleaner parse and XPath, Jsoup parse and select,
 * HtmlCleaner parse and XPath a second time, and finally a pre-compiled Xsoup XPath.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    final int rounds = 2000;

    System.out.println(html.length());

    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tagNode = htmlCleaner.clean(html);
    Document document = Jsoup.parse(html);

    // HtmlCleaner: parse
    long cleanStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        htmlCleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - cleanStart);

    // HtmlCleaner: XPath
    long xpathStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        tagNode.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - xpathStart);

    System.out.println("=============");

    // Jsoup: parse
    long jsoupParseStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        Jsoup.parse(html);
    }
    System.out.println(System.currentTimeMillis() - jsoupParseStart);

    // Jsoup: CSS select
    long jsoupSelectStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        document.select("a");
    }
    System.out.println(System.currentTimeMillis() - jsoupSelectStart);

    System.out.println("=============");

    // HtmlCleaner again: parse
    long secondCleanStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        htmlCleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - secondCleanStart);

    // HtmlCleaner again: XPath
    long secondXpathStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        tagNode.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - secondXpathStart);

    System.out.println("=============");

    // Xsoup: expression compiled once, outside the timed section
    XPathEvaluator compile = Xsoup.compile("//a");
    long xsoupStart = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        compile.evaluate(document);
    }
    System.out.println(System.currentTimeMillis() - xsoupStart);

}

Java Code Examples for org.htmlcleaner.TagNode#evaluateXPath()