Java Code Examples for org.jsoup.nodes.Element#outerHtml()
The following examples show how to use
org.jsoup.nodes.Element#outerHtml().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You can also find related API usage in the sidebar.
Example 1
Source File: JsoupTesting.java From Java-Data-Science-Cookbook with MIT License | 6 votes |
public void extractDataWithJsoup(String href){ Document doc = null; try { doc = Jsoup.connect(href).timeout(10*1000).userAgent("Mozilla").ignoreHttpErrors(true).get(); } catch (IOException e) { //Your exception handling here } if(doc != null){ String title = doc.title(); String text = doc.body().text(); Elements links = doc.select("a[href]"); for (Element link : links) { String linkHref = link.attr("href"); String linkText = link.text(); String linkOuterHtml = link.outerHtml(); String linkInnerHtml = link.html(); } } }
Example 2
Source File: CssQueryMethodInterceptor.java From mica with GNU Lesser General Public License v3.0 | 5 votes |
@Nullable private String getValue(@Nullable Element element, CssQuery cssQuery) { if (element == null) { return null; } // 读取的属性名 String attrName = cssQuery.attr(); // 读取的值 String attrValue; if (StringUtil.isBlank(attrName)) { attrValue = element.outerHtml(); } else if ("html".equalsIgnoreCase(attrName)) { attrValue = element.html(); } else if ("text".equalsIgnoreCase(attrName)) { attrValue = getText(element); } else if ("allText".equalsIgnoreCase(attrName)) { attrValue = element.text(); } else { attrValue = element.attr(attrName); } // 判断是否需要正则处理 String regex = cssQuery.regex(); if (StringUtil.isBlank(attrValue) || StringUtil.isBlank(regex)) { return attrValue; } // 处理正则表达式 return getRegexValue(regex, cssQuery.regexGroup(), attrValue); }
Example 3
Source File: ScriptFinder.java From burp-javascript-security-extension with GNU General Public License v3.0 | 5 votes |
/**
 * Parses the HTML held by this object and records every {@code <script>} element that has a
 * {@code src} attribute.
 *
 * <p>For each such element the {@code src} value is passed through
 * {@code conditionReceivedUrl} together with {@code url}; the result keys a new
 * {@code JavascriptResource} in {@code htmlScriptData} and is appended to {@code htmlScripts}.
 */
private void getScriptsFromHtml() {
    for (Element candidate : Jsoup.parse(html).getElementsByTag("script")) {
        if (!candidate.hasAttr("src")) {
            continue; // no src attribute, nothing to record
        }
        String resolvedSrc = conditionReceivedUrl(candidate.attr("src"), url);
        JavascriptResource resource =
                new JavascriptResource(myCallbacks, resolvedSrc, candidate.outerHtml());
        htmlScriptData.put(resolvedSrc, resource);
        htmlScripts.add(resolvedSrc);
    }
}
Example 4
Source File: CssSelector.java From NetDiscovery with Apache License 2.0 | 5 votes |
/**
 * Extracts a value from {@code element} according to the configured {@code attrName}.
 *
 * <p>A {@code null} name yields the outer HTML; {@code innerHtml} the inner HTML; {@code text}
 * the result of {@code getText(element)}; {@code allText} the element's {@code text()}; any
 * other name the attribute of that name. Names are compared case-insensitively.
 *
 * @param element element to read from
 * @return the selected representation of the element
 */
private String getValue(Element element) {
    if (attrName == null) {
        return element.outerHtml();
    }
    if ("innerHtml".equalsIgnoreCase(attrName)) {
        return element.html();
    }
    if ("text".equalsIgnoreCase(attrName)) {
        return getText(element);
    }
    if ("allText".equalsIgnoreCase(attrName)) {
        return element.text();
    }
    // Anything else is treated as a plain attribute name.
    return element.attr(attrName);
}
Example 5
Source File: HtmlField.java From jspoon with MIT License | 5 votes |
/**
 * Reads the value configured by {@code spec} from {@code node}.
 *
 * <p>The spec's attribute selects the source: {@code ""} or {@code text} reads the text,
 * {@code html} or {@code innerHtml} the inner HTML, {@code outerHtml} the outer HTML, and
 * anything else the attribute of that name. If the spec has a regex and it is found in the
 * value, the result becomes the first capture group, or the default value when the pattern
 * has no groups or the captured text is {@code null} or empty. If the regex is not found,
 * the raw value is returned unchanged.
 *
 * @param node      element to read from; may be {@code null}
 * @param fieldType target field type (not used by this method)
 * @param <U>       type of the target field
 * @return the extracted value, or the spec's default value when {@code node} is {@code null}
 */
private <U> String getValue(Element node, Class<U> fieldType) {
    if (node == null) {
        return spec.getDefaultValue();
    }

    final String raw;
    switch (spec.getAttribute()) {
        case "outerHtml":
            raw = node.outerHtml();
            break;
        case "html":
        case "innerHtml":
            raw = node.html();
            break;
        case "":
        case "text":
            raw = node.text();
            break;
        default:
            raw = node.attr(spec.getAttribute());
            break;
    }

    // Without a regex the raw value is the answer.
    if (spec.getRegex() == null) {
        return raw;
    }
    Matcher matcher = Pattern.compile(spec.getRegex()).matcher(raw);
    if (!matcher.find()) {
        return raw;
    }

    // Only the first capture group is used; a group-less pattern falls back to the default.
    String captured = matcher.groupCount() > 0 ? matcher.group(1) : spec.getDefaultValue();
    if (captured == null || captured.isEmpty()) {
        return spec.getDefaultValue();
    }
    return captured;
}
Example 6
Source File: ElementOperator.java From zongtui-webcrawler with GNU General Public License v2.0 | 5 votes |
/**
 * Returns the text this operator works on: the element's outer HTML when no attribute is
 * configured, otherwise the value of the configured attribute.
 *
 * @param element element to read from
 * @return the outer HTML or the attribute value
 */
protected String getSource(Element element) {
    if (attribute == null) {
        return element.outerHtml();
    }
    String value = element.attr(attribute);
    // Validate rejects a null attribute value with a descriptive message.
    Validate.notNull(value, "Attribute " + attribute + " of " + element + " is not exist!");
    return value;
}
Example 7
Source File: CssSelector.java From zongtui-webcrawler with GNU General Public License v2.0 | 5 votes |
/**
 * Picks the representation of {@code element} selected by {@code attrName}.
 *
 * <p>{@code null} selects the outer HTML, {@code innerHtml} the inner HTML, {@code text} the
 * result of {@code getText(element)}, {@code allText} the element's {@code text()}, and any
 * other name the attribute of that name. Matching ignores case.
 *
 * @param element element to read from
 * @return the selected value
 */
private String getValue(Element element) {
    final String value;
    if (attrName == null) {
        value = element.outerHtml();
    } else if ("innerHtml".equalsIgnoreCase(attrName)) {
        value = element.html();
    } else if ("text".equalsIgnoreCase(attrName)) {
        value = getText(element);
    } else if ("allText".equalsIgnoreCase(attrName)) {
        value = element.text();
    } else {
        // Fall back to reading a regular attribute.
        value = element.attr(attrName);
    }
    return value;
}
Example 8
Source File: ZeppelinRDisplay.java From zeppelin with Apache License 2.0 | 5 votes |
/**
 * Builds an HTML {@code RDisplay} from the children of {@code body}.
 *
 * <p>For every child, the outer HTML is taken, the {@code “%html } prefix and the closing
 * {@code ”} quote are stripped, and, when the child's inner HTML fully matches the
 * class-level {@code pattern}, that matched text is removed as well. The pieces are
 * concatenated, protocol-relative {@code src}/{@code href} URLs are rewritten to
 * {@code http://}, and the result replaces the content of {@code body}. Finally every
 * {@code img} element receives the given width.
 *
 * @param body       element whose children are rendered; its content is replaced in place
 * @param imageWidth value written to the {@code width} attribute of every image
 * @return an {@code RDisplay} carrying the rewritten HTML with type HTML and code SUCCESS
 */
private static RDisplay htmlDisplay(Element body, String imageWidth) {
    // StringBuilder avoids re-copying the accumulated markup on every child.
    StringBuilder div = new StringBuilder();
    for (Element element : body.children()) {
        String eHtml = element.html();
        String eOuterHtml = element.outerHtml()
            .replace("“%html ", "")
            .replace("”", "");

        Matcher matcher = pattern.matcher(eHtml);
        if (matcher.matches()) {
            eOuterHtml = eOuterHtml.replace(matcher.group(), "");
        }

        div.append(eOuterHtml);
    }

    // The search strings are literals, so plain replace() is used instead of regex replaceAll().
    String content = div.toString()
        .replace("src=\"//", "src=\"http://")
        .replace("href=\"//", "href=\"http://");

    body.html(content);

    for (Element image : body.getElementsByTag("img")) {
        image.attr("width", imageWidth);
    }

    return new RDisplay(body.html(), Type.HTML, Code.SUCCESS);
}
Example 9
Source File: ContentExtractor.java From WebCollector with GNU General Public License v3.0 | 5 votes |
/**
 * Searches the markup around {@code contentElement} for a date-time and returns it formatted
 * as {@code year-month-day hour:minute:second}, using the digits exactly as they were found.
 *
 * <p>The search starts up to two ancestors above {@code contentElement} (never above the
 * document body) and then widens to the parent after each miss, for at most six attempts.
 * When no date-time is found, {@code getDate(contentElement)} is used as a fallback.
 *
 * @param contentElement element that holds the main content
 * @return the first date-time found, or the result of {@code getDate} when none is found
 * @throws Exception with message {@code "time not found"} when the fallback fails too; the
 *                   fallback's exception is attached as the cause
 */
protected String getTime(Element contentElement) throws Exception {
    // Groups: year, month, day, hour, minute, second, separated by 1-5 non-digit characters.
    // The hour group accepts a trailing 0 so that hours 00, 10 and 20 are matched as well.
    String regex = "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})";
    Pattern pattern = Pattern.compile(regex);

    // Climb up to two levels so that metadata placed next to the content is included.
    Element current = contentElement;
    for (int i = 0; i < 2; i++) {
        if (current != null && current != doc.body()) {
            Element parent = current.parent();
            if (parent != null) {
                current = parent;
            }
        }
    }

    for (int i = 0; i < 6; i++) {
        if (current == null) {
            break;
        }
        String currentHtml = current.outerHtml();
        Matcher matcher = pattern.matcher(currentHtml);
        if (matcher.find()) {
            return matcher.group(1) + "-" + matcher.group(2) + "-" + matcher.group(3)
                    + " " + matcher.group(4) + ":" + matcher.group(5) + ":" + matcher.group(6);
        }
        if (current == doc.body()) {
            // The body is the widest scope searched; scanning it again cannot succeed.
            break;
        }
        current = current.parent();
    }

    try {
        return getDate(contentElement);
    } catch (Exception ex) {
        // Keep the original failure as the cause instead of discarding it.
        throw new Exception("time not found", ex);
    }
}
Example 10
Source File: ContentExtractor.java From WebCollector with GNU General Public License v3.0 | 5 votes |
/**
 * Searches the markup around {@code contentElement} for a date and returns it formatted as
 * {@code year-month-day}, using the digits exactly as they were found.
 *
 * <p>The search starts up to two ancestors above {@code contentElement} (never above the
 * document body) and then moves to the parent after each miss, for at most six attempts.
 *
 * @param contentElement element that holds the main content
 * @return the first date found
 * @throws Exception with message {@code "date not found"} when no attempt matches
 */
protected String getDate(Element contentElement) throws Exception {
    // Groups: year, month, day, separated by 1-5 non-digit characters.
    final Pattern datePattern = Pattern.compile(
            "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");

    // Step up to two levels, stopping at the body or at an element without a parent.
    Element scope = contentElement;
    int climbs = 0;
    while (climbs < 2) {
        if (scope != null && scope != doc.body()) {
            Element above = scope.parent();
            if (above != null) {
                scope = above;
            }
        }
        climbs++;
    }

    for (int attempt = 0; attempt < 6 && scope != null; attempt++) {
        Matcher found = datePattern.matcher(scope.outerHtml());
        if (found.find()) {
            return found.group(1) + "-" + found.group(2) + "-" + found.group(3);
        }
        // Widen the search, but never beyond the document body.
        if (scope != doc.body()) {
            scope = scope.parent();
        }
    }

    throw new Exception("date not found");
}
Example 11
Source File: JsoupParserIntegrationTest.java From tutorials with MIT License | 5 votes |
/**
 * Demonstrates the jsoup extraction calls on the first {@code article} of {@code doc}: an
 * attribute value, an element's text, the inner HTML and the outer HTML. The values are only
 * read; nothing is asserted.
 */
@Test
public void examplesExtracting() {
    final Element article = doc.select("article").first();

    // Attribute value of the first <time> inside the article.
    final Element time = article.select("time").first();
    final String articleDateTime = time.attr("datetime");

    // Text of the first div nested in a section.
    final Element div = article.select("section div").first();
    final String divText = div.text();

    // Inner markup versus markup including the <article> tag itself.
    final String inner = article.html();
    final String outer = article.outerHtml();
}
Example 12
Source File: ElementOperator.java From xsoup with MIT License | 5 votes |
/**
 * Supplies the text this operator works on.
 *
 * <p>With a configured attribute the attribute's value is returned (after a non-null check);
 * without one the element's outer HTML is returned.
 *
 * @param element element to read from
 * @return the attribute value or the outer HTML
 */
protected String getSource(Element element) {
    if (attribute != null) {
        final String attributeValue = element.attr(attribute);
        Validate.notNull(attributeValue,
                "Attribute " + attribute + " of " + element + " is not exist!");
        return attributeValue;
    }
    // No attribute configured: use the whole element markup.
    return element.outerHtml();
}
Example 13
Source File: CssSelector.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Resolves the value to extract from {@code element} based on {@code attrName}.
 *
 * <p>No name ({@code null}) means the outer HTML. The names {@code innerHtml}, {@code text}
 * and {@code allText} (case-insensitive) select the inner HTML, {@code getText(element)} and
 * the element's {@code text()} respectively. Every other name is read as an attribute.
 *
 * @param element element to read from
 * @return the resolved value
 */
private String getValue(Element element) {
    if (attrName != null) {
        if ("innerHtml".equalsIgnoreCase(attrName)) {
            return element.html();
        }
        if ("text".equalsIgnoreCase(attrName)) {
            return getText(element);
        }
        return "allText".equalsIgnoreCase(attrName)
                ? element.text()
                : element.attr(attrName);
    }
    return element.outerHtml();
}
Example 14
Source File: ElementOperator.java From zongtui-webcrawler with GNU General Public License v2.0 | 4 votes |
/**
 * Returns the outer HTML of the given element.
 *
 * @param element element to serialize
 * @return the element's outer HTML
 */
@Override
public String operate(Element element) {
    final String markup = element.outerHtml();
    return markup;
}
Example 15
Source File: ElementOperator.java From xsoup with MIT License | 4 votes |
/**
 * Serializes the given element, including its own tag, to HTML.
 *
 * @param element element to serialize
 * @return the result of {@code element.outerHtml()}
 */
@Override
public String operate(Element element) {
    final String serialized = element.outerHtml();
    return serialized;
}
Example 16
Source File: MlMessageParser.java From symphony-java-client with Apache License 2.0 | 3 votes |
public void parseMessage(String message) throws SymException { Document doc = Jsoup.parse(message); originalDoc = doc.clone(); Element elementErrors = doc.body().getElementsByTag("errors").first(); if (elementErrors != null) { if (elementErrors.outerHtml() != null) logger.debug("Errors found in message: {}", elementErrors.outerHtml()); } //Lets remove the errors elements doc.select("errors").remove(); elementMessageML = doc.select("messageML").first(); if(elementMessageML==null) elementMessageML = doc.select("div").first(); if (elementMessageML != null) { if (elementMessageML.outerHtml() != null) logger.debug("Doc parsed: {}", elementMessageML.outerHtml()); } else { logger.error("Could not parse document for message {}", message); throw new SymException("Malformed message"); } textDoc = new StringBuilder(); stripTags(textDoc, elementMessageML.childNodes()); textChunks = textDoc.toString().split("\\s+"); }