org.htmlcleaner.CleanerProperties Java Examples
The following examples show how to use
org.htmlcleaner.CleanerProperties.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlSpanner.java From SDHtmlTextView with Apache License 2.0 | 6 votes |
/**
 * Creates an {@link HtmlCleaner} and applies a fixed set of options to its
 * {@link CleanerProperties} before handing it back.
 *
 * @return the newly created cleaner, with its properties configured as listed below
 */
private static HtmlCleaner createHtmlCleaner() {
    final HtmlCleaner cleaner = new HtmlCleaner();

    // The properties object is owned by the cleaner; mutating it configures the cleaner.
    final CleanerProperties props = cleaner.getProperties();

    props.setAdvancedXmlEscape(true);
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(false);
    props.setTranslateSpecialEntities(true);
    props.setTransResCharsToNCR(true);
    props.setRecognizeUnicodeChars(true);
    props.setIgnoreQuestAndExclam(true);
    props.setUseEmptyElementTags(false);

    // Prune list is passed as a single comma-separated string.
    props.setPruneTags("script,title");

    return cleaner;
}
Example #2
Source File: UtilsStaticAnalyzer.java From apogen with Apache License 2.0 | 5 votes |
private static String digForAMeaningfulName(String xp, String dom) throws UnsupportedEncodingException { xp = xp.toLowerCase(); HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setAllowHtmlInsideAttributes(true); props.setAllowMultiWordAttributes(true); props.setRecognizeUnicodeChars(true); props.setOmitComments(true); props.setOmitDoctypeDeclaration(true); TagNode node = cleaner.clean(dom); dom = "<html>\n" + cleaner.getInnerHtml(node) + "\n</html>"; // workaround: htmlcleaner works with rel xpaths xp = xp.replace("html[1]/", "/"); try { Object[] result = node.evaluateXPath(xp); if (result.length > 0) { TagNode r = (TagNode) result[0]; return digTheTagTreeForAString(r); } } catch (XPatherException e) { e.printStackTrace(); } // couldn't find a representative string :( return ""; }
Example #3
Source File: HTMLCleanerHandle.java From java-client-api with Apache License 2.0 | 5 votes |
/**
 * Creates the serializer used by this handle.
 *
 * @return a {@link CompactXmlSerializer} built from {@code getConfiguration()}, or from the
 *         parser's own properties when that configuration is {@code null}
 */
protected XmlSerializer makeSerializer() {
    CleanerProperties props = getConfiguration();
    if (props == null) {
        // No explicit configuration available: fall back to the parser's properties.
        props = getParser().getProperties();
    }
    return new CompactXmlSerializer(props);
}
Example #4
Source File: XmlUtils.java From iaf with Apache License 2.0 | 5 votes |
/**
 * Converts an HTML string to XML text using HtmlCleaner.
 *
 * <p>The input is trimmed and passed through {@code XmlUtils.skipDocTypeDeclaration}. Only when
 * the result starts with {@code <html>} or {@code <html } is it cleaned and serialized;
 * otherwise the doctype-stripped text is returned as is.
 *
 * @param htmlString the markup to convert
 * @return {@code null} when {@code StringUtils.isNotEmpty(htmlString)} is false; otherwise the
 *         serialized XML or the doctype-stripped input, as described above
 */
public static String toXhtml(String htmlString) {
    if (!StringUtils.isNotEmpty(htmlString)) {
        return null;
    }

    String stripped = XmlUtils.skipDocTypeDeclaration(htmlString.trim());

    boolean startsWithHtmlTag = stripped.startsWith("<html>") || stripped.startsWith("<html ");
    if (!startsWithHtmlTag) {
        return stripped;
    }

    // The same properties instance drives both the cleaning and the serialization.
    CleanerProperties cleanerProps = new CleanerProperties();
    TagNode root = new HtmlCleaner(cleanerProps).clean(stripped);
    return new SimpleXmlSerializer(cleanerProps).getXmlAsString(root);
}
Example #5
Source File: MagnetWServiceModelImp.java From AndroidMagnetSearch with Apache License 2.0 | 4 votes |
/**
 * Downloads a search-result page and turns every node matched by {@code group} into a
 * {@code MagnetInfo}.
 *
 * <p>The page URL is built by {@code transformUrl}; the fetched body is cleaned with HtmlCleaner,
 * converted to a W3C DOM and queried with the supplied XPath expressions. Group nodes whose text
 * is empty, or that contain no magnet or name node, are skipped.
 *
 * @param rootUrl base URL handed to {@code transformDetailUrl} together with the name link's href
 * @param url     URL template handed to {@code transformUrl}
 * @param keyword search keyword handed to {@code transformUrl}
 * @param sort    sort key handed to {@code transformUrl}
 * @param page    page number handed to {@code transformUrl}
 * @param group   XPath selecting one node per result entry
 * @param magnet  XPath, relative to an entry, selecting the magnet-link node
 * @param name    XPath, relative to an entry, selecting the name node (its {@code href} is read)
 * @param size    XPath, relative to an entry, selecting the size node (optional)
 * @param count   XPath, relative to an entry, selecting the count node (optional)
 * @param hot     XPath, relative to an entry, selecting the hot node (optional)
 * @return the extracted entries, in document order; never {@code null}
 * @throws IOException                  if the page cannot be fetched
 * @throws XPathExpressionException     if one of the XPath expressions cannot be evaluated
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException             kept from the original signature
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword, sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath NODE evaluation yields null when nothing matches; an entry without a
        // magnet link or a name is unusable, so skip it instead of failing the whole page.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();

        // magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));

        // name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);

        // detail URL: only set when the name node actually carries an href attribute
        Node hrefAttr = (nameNote.getAttributes() == null)
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }

        // size
        Node sizeNote = (Node) xPath.evaluate(size, node, XPathConstants.NODE);
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);
            info.setSize(transformSize(sizeValue));
        }

        // count
        Node dateNote = (Node) xPath.evaluate(count, node, XPathConstants.NODE);
        if (dateNote != null) {
            info.setCount(dateNote.getTextContent());
        }

        // hot
        Node hotNote = (Node) xPath.evaluate(hot, node, XPathConstants.NODE);
        if (hotNote != null) {
            info.setHot(hotNote.getTextContent());
        }

        // extra information derived from the name
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}
Example #6
Source File: MagnetWServiceModelImp.java From AndroidDownload with Apache License 2.0 | 4 votes |
/**
 * Downloads a search-result page and turns every node matched by {@code group} into a
 * {@code MagnetInfo}.
 *
 * <p>The page URL is built by {@code transformUrl}; the fetched body is cleaned with HtmlCleaner,
 * converted to a W3C DOM and queried with the supplied XPath expressions. Group nodes whose text
 * is empty, or that contain no magnet or name node, are skipped.
 *
 * @param rootUrl base URL handed to {@code transformDetailUrl} together with the name link's href
 * @param url     URL template handed to {@code transformUrl}
 * @param keyword search keyword handed to {@code transformUrl}
 * @param sort    sort key handed to {@code transformUrl}
 * @param page    page number handed to {@code transformUrl}
 * @param group   XPath selecting one node per result entry
 * @param magnet  XPath, relative to an entry, selecting the magnet-link node
 * @param name    XPath, relative to an entry, selecting the name node (its {@code href} is read)
 * @param size    XPath, relative to an entry, selecting the size node (optional)
 * @param count   XPath, relative to an entry, selecting the count node (optional)
 * @param hot     XPath, relative to an entry, selecting the hot node (optional)
 * @return the extracted entries, in document order; never {@code null}
 * @throws IOException                  if the page cannot be fetched
 * @throws XPathExpressionException     if one of the XPath expressions cannot be evaluated
 * @throws ParserConfigurationException if the DOM cannot be created
 * @throws XPatherException             kept from the original signature
 */
public List<MagnetInfo> parser(String rootUrl, String url, String keyword,String sort, int page, String group, String magnet, String name, String size, String count,String hot) throws IOException, XPathExpressionException, ParserConfigurationException, XPatherException {
    String newUrl = transformUrl(url, keyword, sort, page);
    String html = Jsoup.connect(newUrl).get().body().html();

    XPath xPath = XPathFactory.newInstance().newXPath();
    TagNode tagNode = new HtmlCleaner().clean(html);
    Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);

    NodeList result = (NodeList) xPath.evaluate(group, dom, XPathConstants.NODESET);
    List<MagnetInfo> infos = new ArrayList<MagnetInfo>();
    for (int i = 0; i < result.getLength(); i++) {
        Node node = result.item(i);
        if (node == null || StringUtil.isEmpty(node.getTextContent().trim())) {
            continue;
        }

        // XPath NODE evaluation yields null when nothing matches; an entry without a
        // magnet link or a name is unusable, so skip it instead of failing the whole page.
        Node magnetNote = (Node) xPath.evaluate(magnet, node, XPathConstants.NODE);
        Node nameNote = (Node) xPath.evaluate(name, node, XPathConstants.NODE);
        if (magnetNote == null || nameNote == null) {
            continue;
        }

        MagnetInfo info = new MagnetInfo();

        // magnet link
        String magnetValue = magnetNote.getTextContent();
        info.setMagnet(transformMagnet(magnetValue));

        // name
        String nameValue = nameNote.getTextContent();
        info.setName(nameValue);

        // detail URL: only set when the name node actually carries an href attribute
        Node hrefAttr = (nameNote.getAttributes() == null)
                ? null
                : nameNote.getAttributes().getNamedItem("href");
        if (hrefAttr != null) {
            info.setDetailUrl(transformDetailUrl(rootUrl, hrefAttr.getTextContent()));
        }

        // size
        Node sizeNote = (Node) xPath.evaluate(size, node, XPathConstants.NODE);
        if (sizeNote != null) {
            String sizeValue = sizeNote.getTextContent();
            info.setFormatSize(sizeValue);
            info.setSize(transformSize(sizeValue));
        }

        // count
        Node dateNote = (Node) xPath.evaluate(count, node, XPathConstants.NODE);
        if (dateNote != null) {
            info.setCount(dateNote.getTextContent());
        }

        // hot
        Node hotNote = (Node) xPath.evaluate(hot, node, XPathConstants.NODE);
        if (hotNote != null) {
            info.setHot(hotNote.getTextContent());
        }

        // extra information derived from the name
        String resolution = transformResolution(nameValue);
        info.setResolution(resolution);

        infos.add(info);
    }
    return infos;
}
Example #7
Source File: HTMLCleanerHandle.java From java-client-api with Apache License 2.0 | 4 votes |
/**
 * Returns the cleaner configuration held by this handle, creating it through
 * {@code makeConfiguration()} the first time it is found to be {@code null}.
 *
 * @return the stored configuration, or the freshly created one
 */
public CleanerProperties getConfiguration() {
    if (configuration != null) {
        return configuration;
    }
    // Nothing stored yet: create one and remember it for later calls.
    configuration = makeConfiguration();
    return configuration;
}
Example #8
Source File: HTMLCleanerHandle.java From java-client-api with Apache License 2.0 | 4 votes |
/**
 * Replaces the cleaner configuration held by this handle.
 *
 * @param configuration the properties to store; stored as given, without copying
 */
public void setConfiguration(CleanerProperties configuration) {
    this.configuration = configuration;
}
Example #9
Source File: HTMLCleanerHandle.java From java-client-api with Apache License 2.0 | 4 votes |
/**
 * Factory hook that supplies a new configuration object.
 *
 * @return a new {@link CleanerProperties} created with its no-argument constructor
 */
protected CleanerProperties makeConfiguration() {
    final CleanerProperties fresh = new CleanerProperties();
    return fresh;
}