org.jsoup.Jsoup Java Examples
The following examples show how to use
org.jsoup.Jsoup.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParseTest.java From astor with GNU General Public License v2.0 | 6 votes |
/**
 * Parses a saved copy of the news.com.au homepage and checks the title, one headline,
 * and the values returned for {@code href} versus {@code abs:href} on two links.
 */
@Test
public void testNewsHomepage() throws IOException {
    File homepage = getFile("/htmltests/news-com-au-home.html");
    Document page = Jsoup.parse(homepage, "UTF-8", "http://www.news.com.au/");

    String title = page.title();
    assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", title);

    String headline = page.select(".id1225817868581 h4").text().trim();
    assertEquals("Brace yourself for Metro meltdown", headline);

    // Relative link: "href" is returned as written, "abs:href" is resolved against the base URI.
    Element horoscopes = page.select("a[href=/entertainment/horoscopes]").first();
    assertEquals("/entertainment/horoscopes", horoscopes.attr("href"));
    assertEquals("http://www.news.com.au/entertainment/horoscopes", horoscopes.attr("abs:href"));

    // Link that is already absolute: both attribute forms are expected to match.
    Element heraldSun = page.select("a[href*=naughty-corners-are-a-bad-idea]").first();
    String rawHref = heraldSun.attr("href");
    assertEquals(
            "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
            rawHref);
    assertEquals(rawHref, heraldSun.attr("abs:href"));
}
Example #2
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 6 votes |
/**
 * Removes every sibling that precedes the {@code XXX} paragraph and checks that only
 * the marker paragraph and the ones after it remain in the body.
 */
@Test
public void testRemoveBeforeIndex() {
    String html =
            "<html><body><div><p>before1</p><p>before2</p><p>XXX</p><p>after1</p><p>after2</p></div></body></html>";
    Document doc = Jsoup.parse(html, "");
    Element body = doc.select("body").first();

    // Locate the marker paragraph, then collect the siblings with a lower element index.
    Element marker = body.select("p:matchesOwn(XXX)").first();
    Element container = marker.parent();
    Elements preceding = container.getElementsByIndexLessThan(marker.elementSiblingIndex());
    for (Element sibling : preceding) {
        sibling.remove();
    }

    String remaining = TextUtil.stripNewlines(body.outerHtml());
    assertEquals("<body><div><p>XXX</p><p>after1</p><p>after2</p></div></body>", remaining);
}
Example #3
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void testClone() { Document doc = Jsoup.parse("<div><p>One<p><span>Two</div>"); Element p = doc.select("p").get(1); Element clone = p.clone(); assertNull(clone.parent()); // should be orphaned assertEquals(0, clone.siblingIndex); assertEquals(1, p.siblingIndex); assertNotNull(p.parent()); clone.append("<span>Three"); assertEquals("<p><span>Two</span><span>Three</span></p>", TextUtil.stripNewlines(clone.outerHtml())); assertEquals("<div><p>One</p><p><span>Two</span></p></div>", TextUtil.stripNewlines(doc.body().html())); // not modified doc.body().appendChild(clone); // adopt assertNotNull(clone.parent()); assertEquals("<div><p>One</p><p><span>Two</span></p></div><p><span>Two</span><span>Three</span></p>", TextUtil.stripNewlines(doc.body().html())); }
Example #4
Source File: CleanerTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void testIsValidBodyHtml() { String ok = "<p>Test <b><a href='http://example.com/' rel='nofollow'>OK</a></b></p>"; String ok1 = "<p>Test <b><a href='http://example.com/'>OK</a></b></p>"; // missing enforced is OK because still needs run thru cleaner String nok1 = "<p><script></script>Not <b>OK</b></p>"; String nok2 = "<p align=right>Test Not <b>OK</b></p>"; String nok3 = "<!-- comment --><p>Not OK</p>"; // comments and the like will be cleaned String nok4 = "<html><head>Foo</head><body><b>OK</b></body></html>"; // not body html String nok5 = "<p>Test <b><a href='http://example.com/' rel='nofollowme'>OK</a></b></p>"; String nok6 = "<p>Test <b><a href='http://example.com/'>OK</b></p>"; // missing close tag String nok7 = "</div>What"; assertTrue(Jsoup.isValid(ok, Whitelist.basic())); assertTrue(Jsoup.isValid(ok1, Whitelist.basic())); assertFalse(Jsoup.isValid(nok1, Whitelist.basic())); assertFalse(Jsoup.isValid(nok2, Whitelist.basic())); assertFalse(Jsoup.isValid(nok3, Whitelist.basic())); assertFalse(Jsoup.isValid(nok4, Whitelist.basic())); assertFalse(Jsoup.isValid(nok5, Whitelist.basic())); assertFalse(Jsoup.isValid(nok6, Whitelist.basic())); assertFalse(Jsoup.isValid(ok, Whitelist.none())); assertFalse(Jsoup.isValid(nok7, Whitelist.basic())); }
Example #5
Source File: AlbumRequest.java From meizhi with Apache License 2.0 | 6 votes |
/**
 * Decodes the response body with the charset taken from the response headers, then
 * collects the non-empty {@code src} of every image matched by the album selector.
 *
 * @param response raw network response
 * @return a success response carrying the images, or an error response wrapping a
 *         {@code ParseError} when the charset is not supported
 */
@Override
protected Response<List<Image>> parseNetworkResponse(NetworkResponse response) {
    final String html;
    try {
        String charset = HttpHeaderParser.parseCharset(response.headers);
        html = new String(response.data, charset);
    } catch (UnsupportedEncodingException e) {
        return Response.error(new ParseError(e));
    }

    List<Image> images = new ArrayList<>();
    Document document = Jsoup.parse(html);
    for (Element img : document.select(".container.main .box.show-box img")) {
        String url = img.attr("src");
        if (!TextUtils.isEmpty(url)) {
            Image image = new Image();
            image.url = url;
            images.add(image);
        }
    }
    return Response.success(images, HttpHeaderParser.parseCacheHeaders(response));
}
Example #6
Source File: SteamWebHandler.java From UpdogFarmer with GNU General Public License v3.0 | 6 votes |
/**
 * Unlock Steam parental controls with a pin.
 *
 * <p>POSTs the pin to the store's {@code parental/ajaxunlock} endpoint and reads the
 * {@code steamparental} cookie from the response.
 *
 * @param pin the parental-control pin to submit
 * @return the value of the {@code steamparental} response cookie, or {@code null} if the
 *         cookie is absent or the request threw an exception
 */
private String unlockParental(String pin) {
    final String url = STEAM_STORE + "parental/ajaxunlock";
    try {
        final Connection request = Jsoup.connect(url)
                .referrer(STEAM_STORE)
                .followRedirects(true)
                .ignoreContentType(true)
                .cookies(generateWebCookies())
                .data("pin", pin)
                .method(Connection.Method.POST);
        final Map<String, String> responseCookies = request.execute().cookies();
        return responseCookies.get("steamparental");
    } catch (Exception e) {
        // Any failure is printed and reported to the caller as null.
        e.printStackTrace();
        return null;
    }
}
Example #7
Source File: SourcePrinterTest.java From warnings-ng-plugin with MIT License | 6 votes |
/**
 * Renders {@code format-java.txt} for a default-built issue and checks that both the
 * document text and the text of the {@code pre} elements match the file content,
 * ignoring whitespace.
 */
@Test
void shouldCreateSourceWithoutLineNumber() {
    SourcePrinter printer = new SourcePrinter();
    Issue issue = new IssueBuilder().build();

    String rendered = printer.render(asStream("format-java.txt"), issue, NO_DESCRIPTION, ICON_URL);
    Document document = Jsoup.parse(rendered);
    String expectedFile = toString("format-java.txt");

    assertThat(document.text()).isEqualToIgnoringWhitespace(expectedFile);

    Elements preformatted = document.getElementsByTag("pre");
    assertThat(preformatted.text()).isEqualToIgnoringWhitespace(expectedFile);
}
Example #8
Source File: Book.java From nju-lib-downloader with GNU General Public License v3.0 | 6 votes |
public List<Node> getOutline() throws IOException { for (int i = 0; i < 20; i++) { try { String url = CoreService.baseUrl + "/book/getDirectoryTree.jsps?bookId=" + idInt + "&type=PDF"; //http://sxnju.chineseall.cn/book/getDirectoryTree.jsps?bookId=10060602592&type=PDF&_=1504844448871 String result = MyHttpRequest.get(url, null, "UTF-8", 3000); result = new ObjectMapper().readValue(result, ObjectNode.class).get("data").textValue(); Document doc = Jsoup.parse(result); Elements elements = doc.select("ul[id=directoryTree]"); return parseUL(elements.get(0)); } catch (Exception e) { if (i == 19) { throw e; } } } return null; }
Example #9
Source File: IPUtils.java From superword with Apache License 2.0 | 6 votes |
/**
 * Looks up an IP address on ip138.com and extracts location strings from the result page.
 *
 * <p>Each non-blank {@code ul li} text is split on {@code ":"}; when that yields exactly
 * two parts, the second part is added to the result.
 *
 * @param ip the IP address appended to the lookup URL
 * @return the extracted locations; empty if nothing matched or the lookup failed
 */
public static List<String> getIPLocation(String ip) {
    List<String> locations = new ArrayList<>();
    try {
        URL lookup = new URL("http://ip138.com/ips138.asp?ip=" + ip);
        Elements items = Jsoup.parse(lookup, 60000).select("ul li");
        for (Element item : items) {
            String text = item.text();
            if (!StringUtils.isNotBlank(text)) {
                continue;
            }
            String[] parts = text.split(":");
            if (parts.length == 2) {
                locations.add(parts[1]);
            }
        }
    } catch (Exception e) {
        // Failures are logged and whatever was collected so far is returned.
        LOG.error("获取IP地址的地理位置", e);
    }
    return locations;
}
Example #10
Source File: TextFilterManage.java From bbs with GNU Affero General Public License v3.0 | 6 votes |
/** * 读取上传图片路径名称 * @param html * @param item 项目 * @return */ public List<String> readImageName(String html,String item) { //上传图片文件名称 List<String> imageNameList = new ArrayList<String>(); if(!StringUtils.isBlank(html)){ Document doc = Jsoup.parseBodyFragment(html); //图片 Elements image_elements = doc.select("img[src]"); for (Element element : image_elements) { String imageUrl = element.attr("src"); if(StringUtils.startsWithIgnoreCase(imageUrl, "file/"+item+"/")){ imageNameList.add(imageUrl); } } } return imageNameList; }
Example #11
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 6 votes |
@Test public void testAppendTo() { String parentHtml = "<div class='a'></div>"; String childHtml = "<div class='b'></div><p>Two</p>"; Document parentDoc = Jsoup.parse(parentHtml); Element parent = parentDoc.body(); Document childDoc = Jsoup.parse(childHtml); Element div = childDoc.select("div").first(); Element p = childDoc.select("p").first(); Element appendTo1 = div.appendTo(parent); assertEquals(div, appendTo1); Element appendTo2 = p.appendTo(div); assertEquals(p, appendTo2); assertEquals("<div class=\"a\"></div>\n<div class=\"b\">\n <p>Two</p>\n</div>", parentDoc.body().html()); assertEquals("", childDoc.body().html()); // got moved out }
Example #12
Source File: WordClassifierForOxford.java From superword with Apache License 2.0 | 6 votes |
public static String getContent(String word) { String url = OXFORD + word + "?renovate=" + (new Random(System.currentTimeMillis()).nextInt(899999)+100000); LOGGER.debug("url:"+url); Connection conn = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("Referer", REFERER) .header("Host", HOST) .header("User-Agent", USER_AGENT) .timeout(60000) .ignoreContentType(true); String html = ""; try { html = conn.post().html(); html = html.replaceAll("[\n\r]", ""); }catch (Exception e){ //LOGGER.error("获取URL:"+url+"页面出错", e); LOGGER.error("获取URL:"+url+"页面出错"); } return html; }
Example #13
Source File: NewService.java From Pixiv-Illustration-Collection-Backend with Apache License 2.0 | 6 votes |
private void pullACGMHNews() throws IOException, InterruptedException { HttpRequest request = HttpRequest.newBuilder() .uri(URI.create("https://www.acgmh.com/category/news")).POST(HttpRequest.BodyPublishers.ofString("type=catL3&paged=1")).build(); String body = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body(); //ACGMHNewsDTO acgmhNewsDTO = objectMapper.readValue(body, ACGMHNewsDTO.class); Document doc = Jsoup.parse(body); Elements elements = doc.getElementsByClass("pos-r pd10 post-list box mar10-b content"); List<ACGNew> acgNewList = elements.stream().map(e -> { String style = e.getElementsByClass("preview thumb-in").get(0).attr("style"); String cover = style.substring(style.indexOf("('") + 2, style.length() - 2); String author = e.getElementsByClass("users").text(); String createDate = e.getElementsByClass("timeago").text(); Elements es = e.getElementsByClass("entry-title"); String title = es.text(); String refererUrl = es.get(0).getElementsByTag("a").get(0).attr("href"); String intro = e.getElementsByClass("mar10-b post-ex mar10-t mobile-hide").text(); return new ACGNew(title, intro, author, cover, refererUrl, LocalDate.parse(createDate.substring(0, 10)), NewsCrawlerConstant.ACGMH); }).collect(Collectors.toList()); process(acgNewList, "id", "content-innerText"); }
Example #14
Source File: CDTClassifierEvaluation.java From NLIWOD with GNU Affero General Public License v3.0 | 6 votes |
public static ArrayList<String> loadSystemR(String system){ Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html"); ArrayList<String> result = Lists.newArrayList(); try{ String loadedData = Files.lines(datapath).collect(Collectors.joining()); Document doc = Jsoup.parse(loadedData); Element table = doc.select("table").get(5); Elements tableRows = table.select("tr"); for(Element row: tableRows){ Elements tableEntry = row.select("td"); result.add(tableEntry.get(1).ownText()); } result.remove(0); //remove the head of the table return result; }catch(IOException e){ e.printStackTrace(); log.debug("loading failed."); return result; } }
Example #15
Source File: FuckBroDomain.java From TrackRay with GNU General Public License v3.0 | 6 votes |
/**
 * Queries icp.aizhan.com for a domain and maps each listed site to its title.
 *
 * <p>The page is only parsed when it does not contain the "not found" marker and does
 * contain both expected section markers. Any exception is recorded on the task.
 *
 * @param domain the domain inserted into the lookup URL
 * @return a map keyed by the third column's text with the second column's text as value;
 *         empty when nothing was parsed
 */
public Map<String,String> aizhanIcp(String domain) {
    HashMap<String, String> map = new HashMap<>();
    HttpClient httpClient = new HttpClient();
    String url = "https://icp.aizhan.com/%s/";
    try {
        ResponseStatus responseStatus = httpClient.get(String.format(url, domain));
        String html = responseStatus.getContent();
        boolean hasRecords = !html.contains("未找到")
                && html.contains("该单位备案网站")
                && html.contains("缓存于");
        if (hasRecords) {
            Elements rows = Jsoup.parse(html).select("div#company .table-s1 tbody tr");
            for (Element row : rows) {
                Elements cells = row.select("td");
                String title = cells.get(1).text();
                String dom = cells.get(2).text();
                map.put(dom, title);
            }
        }
    } catch (Exception e) {
        task.getExceptions().add(e);
    }
    SysLog.info("ICP反查结束");
    return map;
}
Example #16
Source File: Class.java From nju-lib-downloader with GNU General Public License v3.0 | 5 votes |
/**
 * Extracts the total book count from the {@code value} attribute of the
 * {@code input[id=totalSize]} element of a page.
 *
 * <p>jsoup's {@code attr} returns an empty string rather than {@code null} when the
 * attribute is missing, so absence is detected by checking for an empty value instead of
 * a null check.
 *
 * @param html the page HTML
 * @return the parsed size, or {@code -1} when the input element or its value is missing
 * @throws NumberFormatException if the value is present but is not a valid integer
 */
public static int getBookSizeFromHtml(String html) {
    Document doc = Jsoup.parse(html);
    Elements sizeNode = doc.select("input[id=totalSize]");
    if (sizeNode == null || sizeNode.isEmpty()) {
        return -1;
    }
    String sizeString = sizeNode.attr("value");
    if (sizeString == null || sizeString.trim().isEmpty()) {
        // Missing or blank value attribute: report "unknown" instead of failing in parseInt.
        return -1;
    }
    return Integer.parseInt(sizeString.trim());
}
Example #17
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Checks the difference between {@code children()} (elements only) and
 * {@code childNodes()} (all nodes, including text) on several elements.
 */
@Test
public void testChildrenElements() {
    String html = "<div><p><a>One</a></p><p><a>Two</a></p>Three</div><span>Four</span><foo></foo><img>";
    Document doc = Jsoup.parse(html);
    Element div = doc.select("div").first();
    Element p = doc.select("p").first();
    Element span = doc.select("span").first();
    Element foo = doc.select("foo").first();
    Element img = doc.select("img").first();

    // div: two element children, plus a trailing text node.
    Elements divChildren = div.children();
    assertEquals(2, divChildren.size());
    assertEquals("<p><a>One</a></p>", divChildren.get(0).outerHtml());
    assertEquals("<p><a>Two</a></p>", divChildren.get(1).outerHtml());
    assertEquals(3, div.childNodes().size());
    assertEquals("Three", div.childNodes().get(2).outerHtml());

    // p: a single element child.
    assertEquals(1, p.children().size());
    assertEquals("One", p.children().text());

    // span: text only, so no element children but one child node.
    assertEquals(0, span.children().size());
    assertEquals(1, span.childNodes().size());
    assertEquals("Four", span.childNodes().get(0).outerHtml());

    // foo and img: no children of either kind.
    assertEquals(0, foo.children().size());
    assertEquals(0, foo.childNodes().size());
    assertEquals(0, img.children().size());
    assertEquals(0, img.childNodes().size());
}
Example #18
Source File: StatusReportGenerationErrorTest.java From kubernetes-elastic-agents with Apache License 2.0 | 5 votes |
/**
 * Handles a "no running pod" exception and checks that the response is a 200 whose
 * {@code view} HTML contains the expected error header and description.
 */
@Test
public void shouldGenerateErrorViewForException() {
    final StatusReportGenerationException exception = StatusReportGenerationException.noRunningPod("foo");

    final GoPluginApiResponse response =
            StatusReportGenerationErrorHandler.handle(PluginStatusReportViewBuilder.instance(), exception);

    assertThat(response.responseCode(), is(200));

    // The rendered HTML is carried in the "view" property of the JSON response body.
    final String view = new JsonParser()
            .parse(response.responseBody())
            .getAsJsonObject()
            .get("view")
            .getAsString();
    final Document document = Jsoup.parse(view);

    final String headerText =
            document.select(".outer-container .container .error-container blockquote header").text();
    assertThat(headerText, is("Pod is not running."));

    final String descriptionText =
            document.select(".outer-container .container .error-container blockquote p").text();
    assertThat(descriptionText, is("Can not find a running pod for the provided elastic agent id 'foo'."));
}
Example #19
Source File: BaseElementSelector.java From zongtui-webcrawler with GNU General Public License v2.0 | 5 votes |
/**
 * Parses the given text as HTML and delegates to the element-based {@code select}.
 *
 * @param text HTML text to parse; may be {@code null}
 * @return the result of selecting on the parsed document, or {@code null} when
 *         {@code text} is {@code null}
 */
@Override
public String select(String text) {
    if (text == null) {
        return null;
    }
    return select(Jsoup.parse(text));
}
Example #20
Source File: ElementsTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Empties every paragraph and checks that the paragraphs themselves remain, without
 * content, in the body HTML.
 */
@Test
public void empty() {
    String html = "<div><p>Hello <b>there</b></p> <p>now!</p></div>";
    Document doc = Jsoup.parse(html);
    // Pretty-printing is switched off so the output can be compared as a single line.
    doc.outputSettings().prettyPrint(false);

    Elements paragraphs = doc.select("p");
    paragraphs.empty();

    assertEquals("<div><p></p> <p></p></div>", doc.body().html());
}
Example #21
Source File: ParagraphMarkedClassificationTest.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Runs the manipulator on a paragraph that starts with a classification marking and
 * checks that the marking ends up in the {@code classification} attribute while the
 * remaining text is left as the body text.
 *
 * <p>Arguments are passed to {@code assertEquals} as (expected, actual) so that failure
 * messages report the two values the right way round.
 */
@Test
public void testMarking() {
    Document doc = Jsoup.parseBodyFragment("<p>(UK OFFICIAL)This is some text</p>");

    m.manipulate(doc);

    assertEquals(
        "UK OFFICIAL",
        MarkupUtils.getAttribute(doc.body().select("p").first(), "classification"));
    assertEquals("This is some text", doc.body().text());
}
Example #22
Source File: PreviewTextUtils.java From mblog with GNU General Public License v3.0 | 5 votes |
/**
 * Extracts the img URLs contained in an article.
 *
 * @param html the article HTML; may be {@code null}
 * @return the {@code src} value of every {@code img} element, in document order; empty
 *         when {@code html} is {@code null}
 */
public static List<String> extractImage(String html) {
    List<String> urls = new ArrayList<>();
    if (html != null) {
        Elements images = Jsoup.parseBodyFragment(html).select("img");
        if (images != null) {
            for (Element image : images) {
                urls.add(image.attr("src"));
            }
        }
    }
    return urls;
}
Example #23
Source File: OnnmyoujiSpider.java From SpringBootUnity with MIT License | 5 votes |
/**
 * Gets the links to the mitama detail pages.
 *
 * <p>Fetches {@code URL} and collects the {@code href} of every anchor inside the first
 * element matching {@code .heroList-1}.
 *
 * @return the collected {@code href} values, in document order
 */
private static List<String> getMitamaDetailInfoUrl() {
    List<String> links = new ArrayList<>();
    Document doc = Jsoup.parse(HttpUtil.get(URL));
    Element heroList = doc.select(".heroList-1").get(0);
    for (Element anchor : heroList.select("a")) {
        links.add(anchor.attr("href"));
    }
    return links;
}
Example #24
Source File: ElementTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Checks the difference between {@code children()} (elements only) and
 * {@code childNodes()} (all nodes, including text) on several elements.
 */
@Test
public void testChildrenElements() {
    String html = "<div><p><a>One</a></p><p><a>Two</a></p>Three</div><span>Four</span><foo></foo><img>";
    Document doc = Jsoup.parse(html);
    Element div = doc.select("div").first();
    Element p = doc.select("p").first();
    Element span = doc.select("span").first();
    Element foo = doc.select("foo").first();
    Element img = doc.select("img").first();

    // div: two element children, plus a trailing text node.
    Elements divChildren = div.children();
    assertEquals(2, divChildren.size());
    assertEquals("<p><a>One</a></p>", divChildren.get(0).outerHtml());
    assertEquals("<p><a>Two</a></p>", divChildren.get(1).outerHtml());
    assertEquals(3, div.childNodes().size());
    assertEquals("Three", div.childNodes().get(2).outerHtml());

    // p: a single element child.
    assertEquals(1, p.children().size());
    assertEquals("One", p.children().text());

    // span: text only, so no element children but one child node.
    assertEquals(0, span.children().size());
    assertEquals(1, span.childNodes().size());
    assertEquals("Four", span.childNodes().get(0).outerHtml());

    // foo and img: no children of either kind.
    assertEquals(0, foo.children().size());
    assertEquals(0, foo.childNodes().size());
    assertEquals(0, img.children().size());
    assertEquals(0, img.childNodes().size());
}
Example #25
Source File: HtmlParserTest.java From astor with GNU General Public License v2.0 | 5 votes |
@Test public void parsesUnterminatedTextarea() { // don't parse right to end, but break on <p> Document doc = Jsoup.parse("<body><p><textarea>one<p>two"); Element t = doc.select("textarea").first(); assertEquals("one", t.text()); assertEquals("two", doc.select("p").get(1).text()); }
Example #26
Source File: UrlConnectTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Checks that issuing a GET with a request body set raises an
 * {@code IllegalArgumentException}.
 */
@Test
public void throwsIfRequestBodyForGet() throws IOException {
    String url = "https://jsoup.org";
    boolean rejected;
    try {
        Jsoup.connect(url).requestBody("fail").get();
        rejected = false;
    } catch (IllegalArgumentException e) {
        rejected = true;
    }
    assertTrue(rejected);
}
Example #27
Source File: ParseMeiZiTu.java From v9porn with MIT License | 5 votes |
/**
 * Builds the list of image URLs for a picture gallery page.
 *
 * <p>The page count is read from the second-to-last anchor of the {@code pagenavi}
 * element when it has more than three anchors and that anchor's text is all digits;
 * otherwise the gallery is treated as a single page. URLs for each page are derived from
 * the first image URL by replacing {@code "01."} with the zero-padded page number.
 *
 * <p>A single-page gallery yields the image URL exactly once, and a page without a
 * {@code pagenavi} element is treated as a single page.
 *
 * @param html the gallery page HTML
 * @return a result whose data is the list of image URLs
 */
public static BaseResult<List<String>> parsePicturePage(String html) {
    BaseResult<List<String>> baseResult = new BaseResult<>();
    Document doc = Jsoup.parse(html);

    int totalPage = 1;
    Element pageElement = doc.getElementsByClass("pagenavi").first();
    // No pagination bar: keep the single-page default instead of failing on a null element.
    if (pageElement != null) {
        Elements aElements = pageElement.select("a");
        if (aElements != null && aElements.size() > 3) {
            String pageStr = aElements.get(aElements.size() - 2).text();
            if (!TextUtils.isEmpty(pageStr) && TextUtils.isDigitsOnly(pageStr)) {
                totalPage = Integer.parseInt(pageStr);
            }
        }
    }

    List<String> imageUrlList = new ArrayList<>();
    String imageUrl = doc.getElementsByClass("main-image").first().selectFirst("img").attr("src");
    if (totalPage == 1) {
        // Single page: the first image URL is the only entry.
        imageUrlList.add(imageUrl);
    } else {
        for (int i = 1; i <= totalPage; i++) {
            // Page numbers below 10 are zero-padded to two digits in the file name.
            String pageNumber = i < 10 ? "0" + i : "" + i;
            imageUrlList.add(imageUrl.replace("01.", pageNumber + "."));
        }
    }

    baseResult.setData(imageUrlList);
    return baseResult;
}
Example #28
Source File: Header.java From viritin with Apache License 2.0 | 5 votes |
/**
 * Renders the pending header text as an {@code <hN>} element, where N is
 * {@code headerLevel}, after cleaning the text with the configured whitelist.
 *
 * <p>Does nothing when there is no pending text; clears the pending text afterwards.
 */
private void render() {
    if (text == null) {
        return;
    }
    setContentMode(ContentMode.HTML);
    String markup = "<h" + headerLevel + ">" + Jsoup.clean(text, getWhitelist()) + "</h" + headerLevel + ">";
    super.setValue(markup);
    text = null;
}
Example #29
Source File: XmlTreeBuilderTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Passes the XML parser to {@code Jsoup.parse} and checks the output produced for
 * input that contains an unclosed {@code val} and a stray {@code </bar>}.
 */
@Test
public void testSupplyParserToJsoupClass() {
    String xml = "<doc><val>One<val>Two</val></bar>Three</doc>";

    Document doc = Jsoup.parse(xml, "http://foo.com/", Parser.xmlParser());

    String serialized = TextUtil.stripNewlines(doc.html());
    assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", serialized);
}
Example #30
Source File: SelectorTest.java From astor with GNU General Public License v2.0 | 5 votes |
/**
 * Selects by id and checks that every element carrying the id is returned, and that
 * an id with no match gives an empty result.
 */
@Test
public void testById() {
    Document duplicated = Jsoup.parse("<div><p id=foo>Hello</p><p id=foo>Foo two!</p></div>");
    Elements matches = duplicated.select("#foo");
    assertEquals(2, matches.size());
    assertEquals("Hello", matches.get(0).text());
    assertEquals("Foo two!", matches.get(1).text());

    Document unrelated = Jsoup.parse("<div id=1></div>");
    Elements none = unrelated.select("#foo");
    assertEquals(0, none.size());
}