org.htmlcleaner.HtmlCleaner Java Examples

The following examples show how to use org.htmlcleaner.HtmlCleaner. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ResponseRenderPrintWriter.java    From zrlog with Apache License 2.0 6 votes vote down vote up
/**
 * Handles a custom {@code <plugin name="...">} tag: serializes the tag to a string and maps
 * that string to the body returned by the plugin endpoint, so the caller can later substitute
 * one for the other.
 *
 * Tags that are not named "plugin", or that lack a "name" attribute, are left untouched.
 * Failures while fetching the plugin content are logged and otherwise ignored.
 *
 * @param htmlCleaner cleaner whose properties configure the serializer
 * @param plugin      receives the serialized-tag to plugin-output mapping
 * @param tag         the tag being inspected
 * @param tagName     name of {@code tag}
 * @throws IOException if serializing the tag fails
 */
private void parseCustomHtmlTag(HtmlCleaner htmlCleaner, Map<String, String> plugin, TagNode tag, String tagName) throws IOException {
    if (!"plugin".equals(tagName) || !tag.hasAttribute("name")) {
        return;
    }

    tag.setForeignMarkup(true);
    // Stamp the tag with the current time before serializing it
    // (presumably so each serialized placeholder is distinct -- TODO confirm).
    Map<String, String> attributes = new LinkedHashMap<>(tag.getAttributes());
    attributes.put("_tmp", String.valueOf(System.currentTimeMillis()));
    tag.setAttributes(attributes);

    StringWriter serializedTag = new StringWriter();
    tag.serialize(new SimpleHtmlSerializer(htmlCleaner.getProperties()), serializedTag);
    String placeholder = serializedTag.toString();

    try {
        StringBuilder pluginUrl = new StringBuilder("/")
                .append(tag.getAttributeByName("name"))
                .append("/")
                .append(tag.getAttributeByName("view"));
        if (tag.hasAttribute("param")) {
            pluginUrl.append("?").append(tag.getAttributeByName("param"));
        }
        CloseResponseHandle handle = PluginHelper.getContext(pluginUrl.toString(), "GET", request, false, adminTokenVO);
        byte[] responseBody = IOUtil.getByteByInputStream(handle.getT().getEntity().getContent());
        plugin.put(placeholder, new String(responseBody, StandardCharsets.UTF_8));
    } catch (Exception e) {
        LOGGER.error("", e);
    }
}
 
Example #2
Source File: UserUtil.java    From BigData with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Parses a "following" page: extracts user URLs from the first page and then from each
 * further page reported by the pagination control.
 *
 * The page count is read from the second-to-last pagination button, so paging is only
 * attempted when at least two buttons are present.
 *
 * @param followUrl URL of the first "following" page
 */
public static void processFollow(String followUrl) {
	String content = PageUtil.getContent(followUrl);
	HtmlCleaner htmlCleaner = new HtmlCleaner();
	TagNode tNode = htmlCleaner.clean(content);
	extractUserUrl(content);
	try {
		Object[] pageNumObj = tNode
				.evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
		// Index (length - 2) is read below; with fewer than two buttons it would be negative.
		if (pageNumObj != null && pageNumObj.length >= 2) {
			TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
			// Trim so surrounding whitespace in the button text does not break parsing.
			int pagenum = Integer.parseInt(node.getText().toString().trim());
			// Page 1 was already handled above.
			for (int i = 2; i <= pagenum; i++) {
				String url = followUrl + "?page=" + i;
				content = PageUtil.getContent(url);
				extractUserUrl(content);
			}
		}
	} catch (XPatherException e) {
		// Keep the stack trace in the log, not only the message.
		logger.error(e.getMessage(), e);
	}
}
 
Example #3
Source File: HtmlSpanner.java    From SDHtmlTextView with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the HtmlCleaner instance with the properties configured below.
 *
 * @return a newly created, configured HtmlCleaner
 */
private static HtmlCleaner createHtmlCleaner() {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();

    // Tags to prune
    props.setPruneTags("script,title");

    // Declarations
    props.setOmitDoctypeDeclaration(false);
    props.setOmitXmlDeclaration(true);

    // Escaping, entities and character handling
    props.setAdvancedXmlEscape(true);
    props.setRecognizeUnicodeChars(true);
    props.setTransResCharsToNCR(true);
    props.setTranslateSpecialEntities(true);

    // Miscellaneous
    props.setUseEmptyElementTags(false);
    props.setIgnoreQuestAndExclam(true);

    return cleaner;
}
 
Example #4
Source File: XMLEscape.java    From xframium-java with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Serializes the given markup with a namespace-aware HtmlCleaner and a pretty XML serializer
 * (two-space indent), removes lines that contain only whitespace, and passes the result
 * through {@code escapeXML}.
 *
 * @param htmlIn the markup to convert
 * @return the escaped, pretty-printed markup, or {@code null} if any exception occurs
 *         (the stack trace is printed)
 */
public static String toHTML( String htmlIn )
{
    try
    {
        HtmlCleaner cleaner = new HtmlCleaner();
        cleaner.getProperties().setNamespacesAware( true );

        String prettyPrinted = new PrettyXmlSerializer( cleaner.getProperties(), "  " ).getAsString( htmlIn );

        // Drop lines that are empty or contain only spaces/tabs
        String withoutBlankLines = prettyPrinted.replaceAll("(?m)^[ \t]*\r?\n", "");

        return escapeXML( withoutBlankLines );
    }
    catch( Exception e )
    {
        e.printStackTrace();
        return null;
    }
}
 
Example #5
Source File: JDHtmlParserImpl.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Parses a downloaded JD page. Product pages are handed to {@code parserProduct}; list pages
 * only contribute URLs (the product links on the page plus the next list page) to
 * {@code page.getUrls()}.
 *
 * @param page the downloaded page; its URL decides which branch runs
 */
@Override
public void parser(Page page) {
    /*
     * According to the original author, cleaner.clean() blocks forever when the content is
     * null. ISpider.start() is expected to skip empty downloads; this guard is a second line
     * of defence.
     */
    if (page.getContent() == null) {
        return;
    }
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());

    long start = System.currentTimeMillis();    // parse start time
    // Decide by URL type whether this is a product page or a list page
    if (page.getUrl().startsWith("https://item.jd.com/")) {  // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.jd.com/list.html")) {  // list page
        // Product URLs on the current page
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='plist']/ul/li/div/div[1]/a");
        // URL of the next list page
        String nextUrl = HtmlUtil.getAttrByXpath(rootNode, "href", "//div[@id='J_topPage']/a[2]");
        // "javascript:;" means the last page has been reached. A missing link must be skipped
        // too, otherwise "https://list.jd.comnull" would be queued.
        if (nextUrl != null && !nextUrl.isEmpty() && !"javascript:;".equals(nextUrl)) {
            urls.add("https://list.jd.com" + nextUrl);
        }
        page.getUrls().addAll(urls);
        /*
         * Note: for a list URL this branch only collects URLs, it does not crawl any data.
         * The URLs are stored in this Page's url list; after parsing they are moved to the
         * spider's URL repository (high-priority queue). The main loop keeps parsing until
         * the high-priority queue is drained and only then handles the low-priority product
         * URLs. In this branch the Page object is just a temporary container for URLs.
         */
        // Measure once so the logged value and the retry decision agree.
        long elapsed = System.currentTimeMillis() - start;
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), elapsed);
        if (elapsed == 0) {   // occasionally the next page cannot be read and parsing takes 0ms; retry then
            logger.info("解析列表页面:{}, 消耗时长:{}ms, 尝试将其重新添加到高优先级url队列中", page.getUrl(), elapsed);
            HttpUtil.retryUrl(page.getUrl(), SpiderUtil.getTopDomain(page.getUrl()) + SpiderConstants.SPIDER_DOMAIN_HIGHER_SUFFIX);
        }
    }
}
 
Example #6
Source File: XmlUtils.java    From iaf with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an HTML string to XHTML.
 *
 * The input is trimmed and passed through {@code skipDocTypeDeclaration}. Only when the
 * result starts with {@code <html>} or {@code <html } is it cleaned with HtmlCleaner and
 * serialized as XML; otherwise the result is returned as is.
 *
 * @param htmlString the HTML to convert
 * @return the converted string, or {@code null} when {@code htmlString} is empty
 */
public static String toXhtml(String htmlString) {
	if (!StringUtils.isNotEmpty(htmlString)) {
		return null;
	}

	String withoutDocType = XmlUtils.skipDocTypeDeclaration(htmlString.trim());
	boolean looksLikeHtmlDocument = withoutDocType.startsWith("<html>") || withoutDocType.startsWith("<html ");
	if (!looksLikeHtmlDocument) {
		return withoutDocType;
	}

	CleanerProperties props = new CleanerProperties();
	TagNode root = new HtmlCleaner(props).clean(withoutDocType);
	return new SimpleXmlSerializer(props).getXmlAsString(root);
}
 
Example #7
Source File: ResponseRenderPrintWriter.java    From zrlog with Apache License 2.0 5 votes vote down vote up
/**
 * Cleans the response body with HtmlCleaner, runs {@code addStaticResourceFlag} and
 * {@code parseCustomHtmlTag} on every element, re-serializes the document, substitutes the
 * collected plugin placeholders and appends an HTML comment with the elapsed time.
 *
 * @param inputBody the raw response body
 * @return the processed body
 * @throws IOException if serialization fails
 */
private String getCompressAndParseHtml(String inputBody) throws IOException {
    // Do not render the trailing "none" tag (end flag)
    String body = inputBody.endsWith(endFlag)
            ? inputBody.substring(0, inputBody.length() - endFlag.length())
            : inputBody;

    HtmlCleaner htmlCleaner = new HtmlCleaner();
    htmlCleaner.getProperties().setCharset(charset);
    htmlCleaner.getProperties().setUseCdataForScriptAndStyle(false);
    TagNode root = htmlCleaner.clean(body);

    // Serialized plugin tag -> plugin output, filled by parseCustomHtmlTag
    Map<String, String> plugin = new HashMap<>();
    for (TagNode element : root.getAllElements(true)) {
        if (element == null) {
            continue;
        }
        String elementName = element.getName();
        addStaticResourceFlag(element, elementName);
        parseCustomHtmlTag(htmlCleaner, plugin, element, elementName);
    }

    StringWriter out = new StringWriter();
    root.serialize(new SimpleHtmlSerializer(htmlCleaner.getProperties()), out);
    String rendered = out.toString();
    if (root.getDocType() != null) {
        rendered = root.getDocType() + rendered;
    }
    for (Map.Entry<String, String> replacement : plugin.entrySet()) {
        rendered = rendered.replace(replacement.getKey(), replacement.getValue());
    }
    return rendered + "<!--" + (System.currentTimeMillis() - startTime) + "ms-->";
}
 
Example #8
Source File: HTTPLinkCheck.java    From xframium-java with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Reads markup from the stream and, if it contains the text "html", serializes it with a
 * namespace-aware HtmlCleaner and a pretty XML serializer (two-space indent), removes
 * whitespace-only lines, passes it through {@code escapeXML} and strips the HTML 2.0 DOCTYPE.
 *
 * Bytes are decoded and encoded with the platform default charset, as before.
 *
 * @param htmlIn stream supplying the raw markup; it is read to the end but not closed here
 * @return a stream over the converted markup, or {@code null} when the input does not
 *         contain "html" or when any exception occurs
 */
public InputStream toHTML( InputStream htmlIn )
{
    try
    {
        // Collect all bytes first and decode once: decoding each 512-byte chunk separately
        // corrupts multi-byte characters that straddle a chunk boundary.
        java.io.ByteArrayOutputStream rawBytes = new java.io.ByteArrayOutputStream();
        byte[] buffer = new byte[ 512 ];
        int bytesRead;
        while ( ( bytesRead = htmlIn.read( buffer ) ) != -1 )
        {
            rawBytes.write( buffer, 0, bytesRead );
        }

        java.nio.charset.Charset charset = java.nio.charset.Charset.defaultCharset();
        String rawMarkup = new String( rawBytes.toByteArray(), charset );

        if ( rawMarkup.indexOf( "html" ) == -1 )
        {
            return null;
        }

        HtmlCleaner cleaner = new HtmlCleaner();
        cleaner.getProperties().setNamespacesAware( true );

        XmlSerializer xmlSerializer = new PrettyXmlSerializer( cleaner.getProperties(), "  " );
        String htmlData = xmlSerializer.getAsString( rawMarkup );

        // Drop lines that are empty or contain only spaces/tabs
        htmlData = escapeXML( htmlData.replaceAll("(?m)^[ \t]*\r?\n", "") );

        htmlData = htmlData.replace( "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">", "" );
        return new ByteArrayInputStream( htmlData.getBytes( charset ) );
    }
    catch( Exception e )
    {
        // Best effort by design: any failure is reported to the caller as "no result".
        return null;
    }
}
 
Example #9
Source File: UtilsStaticAnalyzer.java    From apogen with Apache License 2.0 5 votes vote down vote up
/**
 * Evaluates the given XPath against the cleaned DOM and, if the first hit is an element,
 * returns the string that {@code digTheTagTreeForAString} extracts from it.
 *
 * @param xp  XPath expression; it is lower-cased and a leading "html[1]/" step is rewritten
 * @param dom the HTML document to search
 * @return the extracted string, or "" when nothing suitable is found or the XPath is invalid
 * @throws UnsupportedEncodingException declared for callers; not thrown directly here
 *         (presumably it can come from {@code digTheTagTreeForAString} -- TODO confirm)
 */
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException {

	// Locale.ROOT keeps tag names stable under locales with special casing rules
	// (e.g. Turkish, where "DIV" would otherwise become "dıv").
	xp = xp.toLowerCase(java.util.Locale.ROOT);

	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	props.setAllowHtmlInsideAttributes(true);
	props.setAllowMultiWordAttributes(true);
	props.setRecognizeUnicodeChars(true);
	props.setOmitComments(true);
	props.setOmitDoctypeDeclaration(true);

	TagNode node = cleaner.clean(dom);

	// workaround: htmlcleaner works with rel xpaths
	xp = xp.replace("html[1]/", "/");
	try {
		Object[] result = node.evaluateXPath(xp);

		// An XPath can also yield non-element results, so check the type before casting.
		if (result != null && result.length > 0 && result[0] instanceof TagNode) {
			return digTheTagTreeForAString((TagNode) result[0]);
		}

	} catch (XPatherException e) {
		e.printStackTrace();
	}

	// couldn't find a representative string :(
	return "";
}
 
Example #10
Source File: HtmlSpanner.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new HtmlSpanner using the given HtmlCleaner instance.
 *
 * This allows for a custom-configured HtmlCleaner.
 *
 * @param cleaner      the HtmlCleaner to use; handed to {@code initBaseComponents}
 * @param fontResolver the font resolver to use; handed to {@code initBaseComponents}
 * @param textColor    value passed to {@code setTextColor}
 * @param textSize     value passed to {@code setTextSize} and {@code calculateBaseDimensions}
 */
public HtmlSpanner(HtmlCleaner cleaner, FontResolver fontResolver,int textColor, float textSize) {
    // Base components are set up first; the built-in handlers are registered last.
    initBaseComponents(cleaner, fontResolver);
    setTextColor(textColor);
    setTextSize(textSize);
    calculateBaseDimensions(textSize);
    registerBuiltInHandlers();
}
 
Example #11
Source File: HtmlSpanner.java    From SDHtmlTextView with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the collaborators of this spanner and starts with an empty handler map.
 *
 * @param cleaner      the HtmlCleaner to keep
 * @param fontResolver the font resolver to keep
 */
private void initBaseComponents(HtmlCleaner cleaner, FontResolver fontResolver) {
    this.htmlCleaner = cleaner;
    this.fontResolver = fontResolver;
    this.handlers = new HashMap<>();
}
 
Example #12
Source File: Http.java    From BotLibre with Eclipse Public License 1.0 4 votes vote down vote up
/**
 * Returns the HtmlCleaner stored in the {@code htmlCleaner} holder, creating and storing a
 * new one when the holder is empty.
 * NOTE(review): the holder looks like a ThreadLocal (get/set) -- confirm against the field
 * declaration.
 *
 * @return the stored or newly created HtmlCleaner
 */
public HtmlCleaner getHtmlCleaner() {
	HtmlCleaner cleaner = this.htmlCleaner.get();
	if (cleaner == null) {
		cleaner = new HtmlCleaner();
		this.htmlCleaner.set(cleaner);
	}
	return cleaner;
}
 
Example #13
Source File: Http.java    From BotLibre with Eclipse Public License 1.0 4 votes vote down vote up
/**
 * Lazily provides the HtmlCleaner kept in the {@code htmlCleaner} holder: an existing
 * instance is returned as is, otherwise a new one is created, stored and returned.
 * NOTE(review): the holder looks like a ThreadLocal (get/set) -- confirm against the field
 * declaration.
 *
 * @return the stored or newly created HtmlCleaner
 */
public HtmlCleaner getHtmlCleaner() {
	HtmlCleaner existing = this.htmlCleaner.get();
	if (existing != null) {
		return existing;
	}
	HtmlCleaner created = new HtmlCleaner();
	this.htmlCleaner.set(created);
	return created;
}
 
Example #14
Source File: MagnetWServiceModelImp.java    From AndroidDownload with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads a search-result page and builds one {@code MagnetInfo} per result node.
 *
 * The page is fetched with Jsoup, cleaned with HtmlCleaner and converted to a W3C DOM; the
 * XPath arguments are then evaluated against it. Result nodes with empty text, or without a
 * magnet or name node, are skipped. Size, count, hot and detail URL are filled in only when
 * the corresponding node or attribute exists.
 *
 * @param rootUrl site root, used to build the detail URL
 * @param url     search URL template, passed to {@code transformUrl}
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @param group   XPath selecting the result nodes
 * @param magnet  XPath (relative to a result node) of the magnet link
 * @param name    XPath (relative to a result node) of the name link
 * @param size    XPath (relative to a result node) of the size
 * @param count   XPath (relative to a result node) of the count
 * @param hot     XPath (relative to a result node) of the hot value
 * @return the parsed entries, possibly empty
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath evaluation returns null when nothing matches; an entry without a magnet
        // link or a name is unusable, so skip it instead of failing the whole page.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // Magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));
        // Name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // Detail URL: getAttributes() is null for non-element nodes and the href may be absent
        Node hrefAttr = nameNote.getAttributes() == null ? null : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // Size
        Node sizeNote = ((Node) xPath.evaluate(size, node, XPathConstants.NODE));
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);

            info.setSize(transformSize(sizeValue));
        }
        // Count (the original comment labelled this field "time")
        Node dateNote=((Node) xPath.evaluate(count, node, XPathConstants.NODE));
        if(dateNote!=null){
            String countValue = dateNote.getTextContent();
            info.setCount(countValue);
        }
        // Hot value
        Node hotNote=((Node) xPath.evaluate(hot, node, XPathConstants.NODE));
        if(hotNote!=null){
            String hotValue = hotNote.getTextContent();
            info.setHot(hotValue);
        }
        // Extra derived information
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}
 
Example #15
Source File: MagnetWServiceModelImp.java    From AndroidMagnetSearch with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads a search-result page and builds one {@code MagnetInfo} per result node.
 *
 * The page is fetched with Jsoup, cleaned with HtmlCleaner and converted to a W3C DOM; the
 * XPath arguments are then evaluated against it. Result nodes with empty text, or without a
 * magnet or name node, are skipped. Size, count, hot and detail URL are filled in only when
 * the corresponding node or attribute exists.
 *
 * @param rootUrl site root, used to build the detail URL
 * @param url     search URL template, passed to {@code transformUrl}
 * @param keyword search keyword
 * @param sort    sort option
 * @param page    page number
 * @param group   XPath selecting the result nodes
 * @param magnet  XPath (relative to a result node) of the magnet link
 * @param name    XPath (relative to a result node) of the name link
 * @param size    XPath (relative to a result node) of the size
 * @param count   XPath (relative to a result node) of the count
 * @param hot     XPath (relative to a result node) of the hot value
 * @return the parsed entries, possibly empty
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword,sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath evaluation returns null when nothing matches; an entry without a magnet
        // link or a name is unusable, so skip it instead of failing the whole page.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();
        // Magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));
        // Name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);
        // Detail URL: getAttributes() is null for non-element nodes and the href may be absent
        Node hrefAttr = nameNote.getAttributes() == null ? null : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }
        // Size
        Node sizeNote = ((Node) xPath.evaluate(size, node, XPathConstants.NODE));
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);

            info.setSize(transformSize(sizeValue));
        }
        // Count (the original comment labelled this field "time")
        Node dateNote=((Node) xPath.evaluate(count, node, XPathConstants.NODE));
        if(dateNote!=null){
            String countValue = dateNote.getTextContent();
            info.setCount(countValue);
        }
        // Hot value
        Node hotNote=((Node) xPath.evaluate(hot, node, XPathConstants.NODE));
        if(hotNote!=null){
            String hotValue = hotNote.getTextContent();
            info.setHot(hotValue);
        }
        // Extra derived information
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}
 
Example #16
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the HtmlCleaner parser of this handle, creating it with {@code makeParser()} on
 * first access.
 *
 * @return the existing or newly created parser
 */
public HtmlCleaner getParser() {
  if (parser != null) {
    return parser;
  }
  parser = makeParser();
  return parser;
}
 
Example #17
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 4 votes vote down vote up
/**
 * Replaces the HtmlCleaner parser held by this handle.
 *
 * @param parser the parser to store
 */
public void setParser(HtmlCleaner parser) {
  this.parser = parser;
}
 
Example #18
Source File: HTMLCleanerHandle.java    From java-client-api with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a new HtmlCleaner from the values returned by {@code getRulesProvider()} and
 * {@code getConfiguration()}.
 *
 * @return a newly constructed HtmlCleaner
 */
protected HtmlCleaner makeParser() {
  return new HtmlCleaner(getRulesProvider(), getConfiguration());
}
 
Example #19
Source File: SNHtmlParserImpl.java    From ispider with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a downloaded Suning page.
 *
 * The URL of Suning's "next page" button appears to be loaded dynamically, so it cannot be
 * read from the page as it is for JD. Instead, the first list page that is parsed reads the
 * total page count and generates the URLs of the remaining list pages.
 *
 * @param page the downloaded page; its URL decides which branch runs
 */
@Override
public void parser(Page page) {
    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode rootNode = cleaner.clean(page.getContent());
    long start = System.currentTimeMillis();    // parse start time

    if (page.getUrl().startsWith("https://product.suning.com")) {    // product page
        parserProduct(page, rootNode);
        logger.info("解析商品页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    } else if (page.getUrl().startsWith("https://list.suning.com")) {    // list page
        // Product URLs on the current page
        List<String> urls = HtmlUtil.getListUrlByXpath(rootNode, "href", "//div[@id='filter-results']/ul/li/div/div/div/div[1]/div[1]/a");
        page.getUrls().addAll(urls);
        // Generate the URLs of all list pages (only once)
        if (!ifGetAll) {
            Integer totalPage = null;
            try {
                // Read the total number of pages
                Object[] objects = rootNode.evaluateXPath("//div[@id='second-filter']/div[2]/div/span");
                // The span may be missing; do not index into an empty result.
                if (objects != null && objects.length > 0 && objects[0] instanceof TagNode) {
                    String text = ((TagNode) objects[0]).getText().toString(); // e.g. "\n\n1\n/100\n"
                    // The total is the number after the slash. Taking the first 2-3 digit
                    // number would pick the current page once it reaches 10, and would miss
                    // totals with one digit or more than three.
                    Matcher afterSlash = Pattern.compile("/\\s*([0-9]+)").matcher(text);
                    if (afterSlash.find()) {
                        totalPage = Integer.valueOf(afterSlash.group(1));
                    } else {
                        // Fallback for text without a slash: previous behaviour
                        Matcher legacy = Pattern.compile("[0-9]{2,3}").matcher(text);
                        if (legacy.find()) {
                            totalPage = Integer.valueOf(legacy.group());
                        }
                    }
                }
            } catch (XPatherException e) {
                logger.error("Failed to read the total page count: " + page.getUrl(), e);
            }
            if (totalPage != null) {
                // Current page index from the URL, e.g. https://list.suning.com/0-20006-0.html
                String[] afterPrefix = page.getUrl().split("0-20006-");
                if (afterPrefix.length > 1) {
                    String currentPageStr = afterPrefix[1].split("\\.")[0];
                    int currentPage = Integer.parseInt(currentPageStr);
                    for (int i = currentPage + 1; i < totalPage; i++) {
                        String url = "https://list.suning.com/0-20006-" + i + ".html";
                        page.getUrls().add(url);
                    }
                }
            }
            ifGetAll = true;    // mark as done once the list has been processed
        }
        logger.info("解析列表页面:{}, 消耗时长:{}ms", page.getUrl(), System.currentTimeMillis() - start);
    }
}
 
Example #20
Source File: XpathSelectorTest.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Manual benchmark: prints the time in milliseconds that 2000 rounds of parsing and of
 * evaluating {@code //a} take with HtmlCleaner, Jsoup, HtmlCleaner again, and finally a
 * pre-compiled Xsoup expression.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    final int rounds = 2000;

    System.out.println(html.length());

    HtmlCleaner cleaner = new HtmlCleaner();
    TagNode cleanedRoot = cleaner.clean(html);
    Document jsoupDocument = Jsoup.parse(html);

    // HtmlCleaner: parse
    long startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // HtmlCleaner: XPath
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    System.out.println("=============");

    // Jsoup: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        Jsoup.parse(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // Jsoup: select
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        jsoupDocument.select("a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    System.out.println("=============");

    // HtmlCleaner again: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleaner.clean(html);
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    // HtmlCleaner again: XPath
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    System.out.println(System.currentTimeMillis() - startedAt);

    System.out.println("=============");

    // Xsoup: pre-compiled XPath
    XPathEvaluator compiledXPath = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        compiledXPath.evaluate(jsoupDocument);
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}