Java Code Examples for org.jsoup.nodes.Element#html()
The following examples show how to use
org.jsoup.nodes.Element#html().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Rgaa3Extractor.java From Asqatasun with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Collects one {@code key=content} line for every element matched by
 * {@code CRITERION_SELECTOR} that has a non-blank id, records the level of
 * each such criterion in {@code levelFromCrit}, and, when
 * {@code writeCritInFile} is set, writes the collected lines to
 * {@code CRITERION_I18N_FILE_PATH}.
 *
 * @param doc the parsed document to scan
 * @throws IOException if writing the criterion file fails
 */
private static void extractLevelFromCriterionAndWrite(Document doc) throws IOException {
    StringBuilder lines = new StringBuilder();
    for (Element criterion : doc.select(CRITERION_SELECTOR)) {
        if (!StringUtils.isNotBlank(criterion.id())) {
            continue;
        }

        // Key: the element id with "crit" replaced by "Rgaa30".
        lines.append(criterion.id().replace("crit", "Rgaa30")).append("=");

        // Value: inner HTML starting right after the "]" of the first "] "
        // occurrence (the whole HTML when there is none), post-processed.
        String innerHtml = criterion.html();
        String afterBracket = innerHtml.substring(innerHtml.indexOf("] ") + 1);
        lines.append(extractRuleContent(afterBracket)).append("\n");

        // Level: the text between the first "[" and the first "]".
        String text = criterion.text();
        String level = text.substring(text.indexOf("[") + 1, text.indexOf("]"));
        levelFromCrit.put(criterion.id().replaceAll("crit-", ""), level);
    }

    if (writeCritInFile) {
        FileUtils.write(new File(CRITERION_I18N_FILE_PATH), lines.toString());
    }
}
Example 2
Source File: RosiMM.java From PicKing with Apache License 2.0 | 6 votes |
/**
 * Looks for the "next page" link inside the page's script elements.
 *
 * <p>The response bytes are decoded as GBK and parsed; every non-empty
 * script body is searched for the next-page marker. The first match wins.
 *
 * @param baseUrl    prefix prepended to the returned relative path
 * @param currentUrl not used by this implementation
 * @param result     raw response body
 * @return {@code baseUrl + "rosimm/" + <matched file name>}, or an empty
 *         string when no script contains the marker
 * @throws UnsupportedEncodingException if the "gbk" charset is unavailable
 */
@Override
public String getContentNext(String baseUrl, String currentUrl, byte[] result) throws UnsupportedEncodingException {
    Document document = Jsoup.parse(new String(result, "gbk"));

    // Compiled once up front: the pattern does not depend on the script being inspected.
    Pattern nextPagePattern = Pattern.compile("index_\\d*.htm\">下一页");

    for (Element script : document.select("script")) {
        // Serialize the script body once and reuse it for both the emptiness check and the match.
        String code = script.html();
        if (code.isEmpty()) {
            continue;
        }
        Matcher matcher = nextPagePattern.matcher(code);
        if (matcher.find()) {
            String match = matcher.group();
            // Strip the 5 trailing characters (quote, '>' and the three-character label)
            // so only the file name remains.
            return baseUrl + "rosimm/" + match.substring(0, match.length() - 5);
        }
    }
    return "";
}
Example 3
Source File: GithubHotProcessor.java From hot-crawler with MIT License | 6 votes |
@Override protected Info getInfoByElement(Element element) { Element urlElement = element.getElementsByTag("h1").get(0).getElementsByTag("a").get(0); Element descElement = null; if (! element.getElementsByTag("p").isEmpty()) { descElement = element.getElementsByTag("p").get(0); } String repositoryName = urlElement.attr("href"); // Title StringBuilder infoTitle = new StringBuilder(); infoTitle.append(repositoryName.substring(repositoryName.indexOf('/', 1) + 1)); infoTitle.append(". "); String desc = descElement == null ? "" : descElement.html(); infoTitle.append(desc); // Url StringBuilder infoUrl = new StringBuilder(); infoUrl.append(this.prefix); infoUrl.append(repositoryName); return new Info(infoTitle.toString(), infoUrl.toString()); }
Example 4
Source File: WeiboHotProcessor.java From hot-crawler with MIT License | 6 votes |
@Override protected List<Info> getInfoDataByElements(Elements elements) { List<Info> list = new ArrayList<>(); if (elements != null) { // remove two tr elements elements.remove(0); elements.remove(0); int i = 0; for (Element element : elements) { Element itemElement = element.getElementsByClass("td-02").get(0).getElementsByTag("a").get(0); String id = String.valueOf(++i); String infoUrl = itemElement.attr("href"); String infoTitle = itemElement.html(); infoUrl = this.prefix + infoUrl; list.add(new Info(id, infoTitle, infoUrl)); } } return list; }
Example 5
Source File: CloudmusicHotProcessor.java From hot-crawler with MIT License | 6 votes |
/**
 * Converts the selected elements into {@link Info} entries.
 *
 * <p>For each element the first {@code a} descendant supplies the title
 * (its inner HTML) and the URL ({@code prefix + "#" + href}). Elements for
 * which the lookup fails are logged and skipped; ids are assigned
 * sequentially, starting at 1, to the elements that succeed.
 *
 * @param elements elements to convert; may be {@code null}
 * @return the converted entries, empty when {@code elements} is {@code null}
 */
@Override
protected List<Info> getInfoDataByElements(Elements elements) {
    List<Info> infos = new ArrayList<>();
    if (elements == null) {
        return infos;
    }

    int counter = 0;
    for (Element row : elements) {
        try {
            Element anchor = row.getElementsByTag("a").get(0);
            String id = String.valueOf(++counter);

            StringBuilder url = new StringBuilder()
                    .append(this.prefix)
                    .append("#")
                    .append(anchor.attr("href"));

            infos.add(new Info(id, anchor.html(), url.toString()));
        } catch (NullPointerException | IndexOutOfBoundsException e) {
            log.error("Can't found item element by attribute!", e);
        }
    }
    return infos;
}
Example 6
Source File: TestSession.java From actframework with Apache License 2.0 | 6 votes |
private static boolean matches(Object a, Object b) { if ($.eq(a, b)) { return true; } if (!((b instanceof String) && (a instanceof Element))) { return false; } String test = S.string(b); Element element = (Element) a; // try html String html = element.html(); if (S.eq(html, test, S.IGNORECASE)) { return true; } // try text String text = element.text(); if (S.eq(text, test, S.IGNORECASE)) { return true; } // try val String val = element.val(); if (S.eq(val, test, S.IGNORECASE)) { return true; } return false; }
Example 7
Source File: RedgifsRipper.java From ripme with MIT License | 6 votes |
/**
 * Extracts the media URLs from a page.
 *
 * <p>Profile and search pages are delegated to {@code hasURLs}. For any
 * other page, every script element whose body starts with "{" is parsed as
 * JSON and its {@code video.contentUrl} value is collected.
 *
 * @param doc the parsed page
 * @return the URLs found
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    if (isProfile().matches() || isSearch().matches()) {
        return hasURLs(doc);
    }

    List<String> urls = new ArrayList<>();
    for (Element script : doc.select("script")) {
        String body = script.html();
        if (!body.startsWith("{")) {
            continue;
        }
        JSONObject page = new JSONObject(body);
        urls.add(page.getJSONObject("video").getString("contentUrl"));
    }
    return urls;
}
Example 8
Source File: JSVarFieldRender.java From gecco with MIT License | 5 votes |
@Override @SuppressWarnings({ "unchecked" }) public void render(HttpRequest request, HttpResponse response, BeanMap beanMap, SpiderBean bean) { Context cx = Context.enter(); ScriptableObject scope = cx.initSafeStandardObjects(); String windowScript = "var window = {};var document = {};"; cx.evaluateString(scope, windowScript, "window", 1, null); HtmlParser parser = new HtmlParser(request.getUrl(), response.getContent()); for (Element ele : parser.$("script")) { String sc = ele.html(); if (StringUtils.isNotEmpty(sc)) { try { cx.evaluateString(scope, sc, "", 1, null); } catch (Exception ex) { // ex.printStackTrace(); } } } Map<String, Object> fieldMap = new HashMap<String, Object>(); Set<Field> jsVarFields = ReflectionUtils.getAllFields(bean.getClass(), ReflectionUtils.withAnnotation(JSVar.class)); for (Field jsVarField : jsVarFields) { Object value = injectJsVarField(request, beanMap, jsVarField, cx, scope); if(value != null) { fieldMap.put(jsVarField.getName(), value); } } beanMap.putAll(fieldMap); Context.exit(); }
Example 9
Source File: TianLaiReadUtil.java From MissZzzReader with Apache License 2.0 | 5 votes |
/**
 * Extracts the chapter list from an HTML page.
 *
 * <p>Chapters are read from the {@code a} links inside the {@code dd}
 * elements of the first {@code dl} under the element with id "list". A link
 * whose title equals the previously accepted title is skipped. Chapter
 * URLs are prefixed according to the book's source.
 *
 * @param html the page HTML
 * @param book the book the chapters belong to; its source selects the URL prefix
 * @return the chapters in document order; empty when the page has no
 *         "list" element or that element contains no {@code dl}
 */
public static ArrayList<Chapter> getChaptersFromHtml(String html,Book book) {
    ArrayList<Chapter> chapters = new ArrayList<>();
    Document doc = Jsoup.parse(html);

    // A page without the expected structure yields no chapters instead of
    // a NullPointerException / IndexOutOfBoundsException.
    Element divList = doc.getElementById("list");
    if (divList == null) {
        return chapters;
    }
    Elements dls = divList.getElementsByTag("dl");
    if (dls.isEmpty()) {
        return chapters;
    }
    Element dl = dls.get(0);

    String lastTile = null;
    int i = 0;
    for (Element dd : dl.getElementsByTag("dd")) {
        Elements links = dd.getElementsByTag("a");
        if (links.size() > 0) {
            Element a = links.get(0);
            String title = a.html();
            // Skip a link that repeats the title of the previous chapter.
            if (!StringHelper.isEmpty(lastTile) && title.equals(lastTile)) {
                continue;
            }
            Chapter chapter = new Chapter();
            chapter.setNumber(i++);
            chapter.setTitle(title);
            String url = a.attr("href");
            if (StringHelper.isEmpty(book.getSource()) || BookSource.tianlai.toString().equals(book.getSource())) {
                url = URLCONST.nameSpace_tianlai + url;
            } else if (BookSource.biquge.toString().equals(book.getSource())) {
                url = book.getChapterUrl() + url;
            }
            chapter.setUrl(url);
            chapters.add(chapter);
            lastTile = title;
        }
    }
    return chapters;
}
Example 10
Source File: IfanrHotProcessor.java From hot-crawler with MIT License | 5 votes |
/**
 * Builds an {@link Info} from the first descendant carrying the class
 * "js-title-transform": its inner HTML becomes the title and its
 * {@code href} attribute the URL.
 *
 * @param element the element describing one entry
 * @return the assembled info
 */
@Override
protected Info getInfoByElement(Element element) {
    Element titleLink = element.getElementsByClass("js-title-transform").get(0);
    String url = titleLink.attr("href");
    String title = titleLink.html();
    return new Info(title, url);
}
Example 11
Source File: HuxiuHotProcessor.java From hot-crawler with MIT License | 5 votes |
/**
 * Builds an {@link Info} from one article element.
 *
 * <p>The title is the inner HTML of the first
 * "article-item__content__title" descendant. The URL is {@code prefix}
 * followed by the {@code href} of the parent of the first
 * "article-item__img" descendant.
 *
 * @param element the element describing one article
 * @return the assembled info
 */
@Override
protected Info getInfoByElement(Element element) {
    String title = element.getElementsByClass("article-item__content__title").get(0).html();

    Element linkHolder = element.getElementsByClass("article-item__img").get(0).parent();
    String url = new StringBuilder(this.prefix)
            .append(linkHolder.attr("href"))
            .toString();

    Info result = new Info();
    result.setTitle(title);
    result.setUrl(url);
    return result;
}
Example 12
Source File: ImageExtensions.java From Android-WYSIWYG-Editor with Apache License 2.0 | 5 votes |
/**
 * Loads a remote image and, when a source node is given, uses the node's
 * inner HTML as the description and applies the node's styles to the
 * description view.
 *
 * @param _path path of the image to load
 * @param node  element the image was built from; may be {@code null}
 */
public void loadImage(String _path, Element node) {
    final boolean hasNode = node != null;

    String desc = hasNode ? node.html() : null;
    final View childLayout = loadImageRemote(_path, desc);

    CustomEditText text = childLayout.findViewById(R.id.desc);
    if (hasNode) {
        componentsWrapper.getInputExtensions().applyStyles(text, node);
    }
}
Example 13
Source File: HtmlField.java From jspoon with MIT License | 5 votes |
/**
 * Reads the raw string value for this field from the given element.
 *
 * <p>The attribute named by the spec selects the source: "" or "text" read
 * the element text, "html"/"innerHtml" the inner HTML, "outerHtml" the
 * outer HTML, anything else the attribute of that name. When the spec has
 * a regex and it is found in the value, the first capturing group replaces
 * the value; the spec's default is used instead when the regex has no
 * group or the group is null or empty. A regex that is not found leaves
 * the value untouched.
 *
 * @param node      element to read from; {@code null} yields the default value
 * @param fieldType target field type (not used here)
 * @return the extracted value
 */
private <U> String getValue(Element node, Class<U> fieldType) {
    if (node == null) {
        return spec.getDefaultValue();
    }

    String raw;
    switch (spec.getAttribute()) {
        case "":
        case "text":
            raw = node.text();
            break;
        case "html":
        case "innerHtml":
            raw = node.html();
            break;
        case "outerHtml":
            raw = node.outerHtml();
            break;
        default:
            raw = node.attr(spec.getAttribute());
            break;
    }

    if (spec.getRegex() == null) {
        return raw;
    }
    Matcher matcher = Pattern.compile(spec.getRegex()).matcher(raw);
    if (!matcher.find()) {
        return raw;
    }

    String captured = (matcher.groupCount() > 0) ? matcher.group(1) : spec.getDefaultValue();
    if (captured == null || captured.isEmpty()) {
        return spec.getDefaultValue();
    }
    return captured;
}
Example 14
Source File: InputExtensions.java From Android-WYSIWYG-Editor with Apache License 2.0 | 5 votes |
/**
 * Renders an HTML element into the editor.
 *
 * <p>{@code h1}-{@code h3} are rendered as headers. {@code p}, {@code div}
 * and {@code blockquote} are inserted as an edit text at the end of the
 * parent view with the element's inner HTML; block quotes additionally get
 * the BLOCKQUOTE text style before the element's styles are applied. Other
 * tags known to {@code HtmlTag} are ignored.
 *
 * @param element the element to render
 * @return always {@code null}
 */
@Override
public Node buildNodeFromHTML(Element element) {
    HtmlTag tag = HtmlTag.valueOf(element.tagName().toLowerCase());
    switch (tag) {
        case h1:
        case h2:
        case h3:
            RenderHeader(tag, element);
            break;
        case p:
        case div:
        case blockquote: {
            String innerHtml = element.html();
            int position = editorCore.getParentView().getChildCount();
            TextView inserted = insertEditText(position, null, innerHtml);
            if (tag == HtmlTag.blockquote) {
                UpdateTextStyle(EditorTextStyle.BLOCKQUOTE, inserted);
            }
            applyStyles(inserted, element);
            break;
        }
    }
    return null;
}
Example 15
Source File: ModifyHTMLElement.java From localization_nifi with Apache License 2.0 | 4 votes |
@Override public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException { final FlowFile flowFile = session.get(); if (flowFile == null) { return; } final Document doc; final Elements eles; try { doc = parseHTMLDocumentFromFlowfile(flowFile, context, session); eles = doc.select(context.getProperty(CSS_SELECTOR).evaluateAttributeExpressions(flowFile).getValue()); } catch (Exception ex) { getLogger().error("Failed to extract HTML from {} due to {}; routing to {}", new Object[] {flowFile, ex.toString(), REL_INVALID_HTML.getName()}, ex); session.transfer(flowFile, REL_INVALID_HTML); return; } final String modifiedValue = context.getProperty(MODIFIED_VALUE).evaluateAttributeExpressions(flowFile).getValue(); if (eles == null || eles.size() == 0) { // No element found session.transfer(flowFile, REL_NOT_FOUND); } else { for (Element ele : eles) { switch (context.getProperty(OUTPUT_TYPE).getValue()) { case ELEMENT_HTML: ele.html(modifiedValue); break; case ELEMENT_ATTRIBUTE: ele.attr(context.getProperty(ATTRIBUTE_KEY).evaluateAttributeExpressions(flowFile).getValue(), modifiedValue); break; case ELEMENT_TEXT: ele.text(modifiedValue); break; } } FlowFile ff = session.write(session.create(flowFile), new StreamCallback() { @Override public void process(InputStream in, OutputStream out) throws IOException { out.write(doc.html().getBytes(StandardCharsets.UTF_8)); } }); ff = session.putAttribute(ff, NUM_ELEMENTS_MODIFIED_ATTR, new Integer(eles.size()).toString()); session.transfer(ff, REL_SUCCESS); // Transfer the original HTML session.transfer(flowFile, REL_ORIGINAL); } }
Example 16
Source File: WebComponentBootstrapHandler.java From flow with Apache License 2.0 | 4 votes |
/** * Copies the {@link org.jsoup.nodes.Element Elements} found in the given * {@code head} elements into the head of the embedding website using * JavaScript. Drops {@code <base>} element. * * @param contentType * Content type of the response. * @param response * {@link com.vaadin.flow.server.VaadinResponse} into which the * script is written * @param head * head element of Vaadin Bootstrap page. The child elements are * copied into the embedding page's head using JavaScript. * @param serviceUrl * base path to use for the head elements' URLs * @throws IOException * if writing fails */ protected void writeBootstrapPage(String contentType, VaadinResponse response, Element head, String serviceUrl) throws IOException { /* * The elements found in the head are reconstructed using JavaScript and * document.createElement(...). Since innerHTML and related methods do * not execute <script> blocks, the contents cannot be copied as pure * string into the head. The each element is created separately and then * attributes are copied and innerHTML set, if the element has * innerHTML. The innerHTMLs are in-lined for easier copying. 
*/ response.setContentType(contentType); /* * Collection of Elements that should be transferred to the web * component shadow DOMs rather than the page head */ ArrayList<com.vaadin.flow.dom.Element> elementsForShadows = new ArrayList<>(); try (BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(response.getOutputStream(), UTF_8))) { String varName = "headElem"; // generated head element writer.append("var ").append(varName).append("=null;"); for (Element element : head.children()) { if (elementShouldNotBeTransferred(element)) { getElementForShadowDom(element) .ifPresent(elementsForShadows::add); continue; } writer.append(varName).append("="); writer.append("document.createElement('") .append(element.tagName()).append("');"); transferAttribute(writer, varName, element, serviceUrl); // set cleaned html as innerHTML for the element String elementHtml = element.html(); if (elementHtml != null && elementHtml.length() > 0) { writer.append(varName).append(".innerHTML=\"") .append(inlineHTML(elementHtml)).append("\";"); } writer.append("document.head.appendChild(").append(varName) .append(");"); } } WebComponentConfigurationRegistry .getInstance(response.getService().getContext()) .setShadowDomElements(elementsForShadows); }
Example 17
Source File: JsoupUtil.java From materialup with Apache License 2.0 | 4 votes |
/**
 * Null-safe accessor for an element's inner HTML.
 *
 * @param e the element; may be {@code null}
 * @return the element's inner HTML, or {@code null} when {@code e} is {@code null}
 */
private static String html(Element e) {
    return (e == null) ? null : e.html();
}
Example 18
Source File: SubHDCommon.java From SubTitleSearcher with Apache License 2.0 | 4 votes |
/** * 获取下载网址列表 * @return */ public static JSONArray getDetailList(String url) { String result = HtHttpUtil.http.get(baseUrl+url, HtHttpUtil.http.default_charset, HtHttpUtil.http._ua, baseUrl+url); Document doc = Jsoup.parse(result); Elements matchList = doc.select(".d_table tr"); //System.out.println(matchList.html()); JSONArray detailList = new JSONArray(); for (Element matchRow : matchList) { if(matchRow.select(".dt_edition").size() == 0)continue; String html = matchRow.html(); String htmlLower = html.toLowerCase(); String downUrl = matchRow.select(".dt_down a").attr("href"); String title = matchRow.select(".dt_edition a").text().trim(); int downCount = Integer.valueOf(RegexUtil.getMatchStr(matchRow.select(".dt_count").text(), "([\\d]+)")); String ext = ""; for(String extName : AppConfig.subExtNames) { //if(StrUtil.isNotEmpty(RegexUtil.getMatchStr(html, "(>"+extName+"<)", Pattern.CASE_INSENSITIVE))) { if(htmlLower.contains(">"+extName+"<")) { ext += extName; ext += ","; } } if(ext.endsWith(",")) { ext=ext.substring(0, ext.length()-1); }else { ext="其它"; } String lang = ""; String[] langList = new String[] {"双语", "简体", "繁体", "英文"}; for(String langName : langList) { if(htmlLower.contains(">"+langName+"<")) { lang += langName; lang += ","; } } if(lang.endsWith(",")) { lang=lang.substring(0, lang.length()-1); }else { lang="其它"; } Elements labels = matchRow.select(".label"); StringBuffer labelInfo = new StringBuffer(); labels.forEach(element ->{ labelInfo.append(element.text() + ","); }); if(labelInfo.length() > 0) { labelInfo.delete(labelInfo.length()-1, labelInfo.length()); } String zimuzu = matchRow.select("a.gray").text(); JSONObject dataRow = new JSONObject(); dataRow.put("url", downUrl); dataRow.put("title", title); dataRow.put("ext", ext); dataRow.put("lang",lang); dataRow.put("rate", "-"); dataRow.put("downCount", downCount); dataRow.put("labelInfo", labelInfo); dataRow.put("zimuzu", zimuzu); detailList.add(dataRow); } return detailList; }
Example 19
Source File: ParsePxgav.java From v9porn with MIT License | 4 votes |
/** * @param html 原网页 * @return json=== */ public static BaseResult<PxgavVideoParserJsonResult> parserVideoUrl(String html) { BaseResult<PxgavVideoParserJsonResult> baseResult = new BaseResult<>(); Document document = Jsoup.parse(html); Element videoWrapper = document.getElementsByClass("penci-entry-content entry-content").first(); String videoHtml = videoWrapper.html(); Logger.t(TAG).d(videoHtml); int index = videoHtml.indexOf("setup") + 6; int endIndexV = videoHtml.indexOf(");"); String videoUrl = videoHtml.substring(index, endIndexV); Logger.t(TAG).d(videoUrl); PxgavVideoParserJsonResult pxgavVideoParserJsonResult = new Gson().fromJson(videoUrl, PxgavVideoParserJsonResult.class); Elements items = document.getElementsByClass("penci-block_content").first().select("article"); List<PxgavModel> pxgavModelList = new ArrayList<>(); for (Element element : items) { PxgavModel pxgavModel = new PxgavModel(); Element a = element.selectFirst("a"); String title = a.attr("title"); pxgavModel.setTitle(title); String contentUrl = a.attr("href"); pxgavModel.setContentUrl(contentUrl); String imgUrl = a.attr("style"); String bigImg = StringUtils.subString(imgUrl, imgUrl.indexOf("url(") + 4, imgUrl.lastIndexOf("-")); Logger.t(TAG).d(bigImg); if (TextUtils.isEmpty(bigImg)) { pxgavModel.setImgUrl(imgUrl); } else { pxgavModel.setImgUrl(bigImg + ".jpg"); } int beginIndex = bigImg.lastIndexOf("/"); int endIndex = bigImg.lastIndexOf("-"); String pId = StringUtils.subString(imgUrl, beginIndex + 1, endIndex); //Logger.t(TAG).d(pId); pxgavModel.setpId(pId); pxgavModelList.add(pxgavModel); } pxgavVideoParserJsonResult.setPxgavModelList(pxgavModelList); baseResult.setData(pxgavVideoParserJsonResult); return baseResult; }
Example 20
Source File: ParsePxgav.java From v9porn with MIT License | 4 votes |
/** * @param html 原网页 * @return json=== */ public static BaseResult<PxgavVideoParserJsonResult> parserVideoUrl(String html) { BaseResult<PxgavVideoParserJsonResult> baseResult = new BaseResult<>(); Document document = Jsoup.parse(html); Element videoWrapper = document.getElementsByClass("penci-entry-content entry-content").first(); String videoHtml = videoWrapper.html(); Logger.t(TAG).d(videoHtml); int index = videoHtml.indexOf("setup") + 6; int endIndexV = videoHtml.indexOf(");"); String videoUrl = videoHtml.substring(index, endIndexV); Logger.t(TAG).d(videoUrl); PxgavVideoParserJsonResult pxgavVideoParserJsonResult = new Gson().fromJson(videoUrl, PxgavVideoParserJsonResult.class); Elements items = document.getElementsByClass("penci-block_content").first().select("article"); List<PxgavModel> pxgavModelList = new ArrayList<>(); for (Element element : items) { PxgavModel pxgavModel = new PxgavModel(); Element a = element.selectFirst("a"); String title = a.attr("title"); pxgavModel.setTitle(title); String contentUrl = a.attr("href"); pxgavModel.setContentUrl(contentUrl); String imgUrl = a.attr("style"); String bigImg = StringUtils.subString(imgUrl, imgUrl.indexOf("url(") + 4, imgUrl.lastIndexOf("-")); Logger.t(TAG).d(bigImg); if (TextUtils.isEmpty(bigImg)) { pxgavModel.setImgUrl(imgUrl); } else { pxgavModel.setImgUrl(bigImg + ".jpg"); } int beginIndex = bigImg.lastIndexOf("/"); int endIndex = bigImg.lastIndexOf("-"); String pId = StringUtils.subString(imgUrl, beginIndex + 1, endIndex); //Logger.t(TAG).d(pId); pxgavModel.setpId(pId); pxgavModelList.add(pxgavModel); } pxgavVideoParserJsonResult.setPxgavModelList(pxgavModelList); baseResult.setData(pxgavVideoParserJsonResult); return baseResult; }