org.jsoup.Jsoup Java Examples

Source File: ParseTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Parses a saved copy of the news.com.au homepage with an explicit base URI and checks the
 * page title, one headline, and the plain and "abs:" forms of two link hrefs.
 */
@Test
public void testNewsHomepage() throws IOException {
    File fixture = getFile("/htmltests/news-com-au-home.html");
    Document page = Jsoup.parse(fixture, "UTF-8", "http://www.news.com.au/");

    String title = page.title();
    assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", title);
    String headline = page.select(".id1225817868581 h4").text().trim();
    assertEquals("Brace yourself for Metro meltdown", headline);

    // Site-relative link: the "abs:href" form is expected to carry the base URI in front.
    Element horoscopes = page.select("a[href=/entertainment/horoscopes]").first();
    assertEquals("/entertainment/horoscopes", horoscopes.attr("href"));
    assertEquals("http://www.news.com.au/entertainment/horoscopes", horoscopes.attr("abs:href"));

    // Already-absolute link: the "abs:href" form is expected to equal the raw href.
    Element heraldSun = page.select("a[href*=naughty-corners-are-a-bad-idea]").first();
    String rawHref = heraldSun.attr("href");
    assertEquals(
            "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
            rawHref);
    assertEquals(rawHref, heraldSun.attr("abs:href"));
}

Source File: ElementTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Removes every sibling positioned before the "XXX" paragraph and verifies that only the
 * marker paragraph and the paragraphs after it remain in the body.
 */
@Test
public void testRemoveBeforeIndex() {
    String html =
            "<html><body><div><p>before1</p><p>before2</p><p>XXX</p><p>after1</p><p>after2</p></div></body></html>";
    Document parsed = Jsoup.parse(html, "");
    Element body = parsed.select("body").first();

    // Locate the marker paragraph by its own text, then gather the siblings with a lower index.
    Elements markerMatches = body.select("p:matchesOwn(XXX)");
    Element marker = markerMatches.first();
    Elements preceding = marker.parent().getElementsByIndexLessThan(marker.elementSiblingIndex());

    for (Element sibling : preceding) {
        sibling.remove();
    }

    String remaining = TextUtil.stripNewlines(body.outerHtml());
    assertEquals("<body><div><p>XXX</p><p>after1</p><p>after2</p></div></body>", remaining);
}

Source File: ElementTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Checks that cloning an element yields a parentless copy that can be changed without
 * affecting the source document, and that the copy can afterwards be attached to the document.
 */
@Test public void testClone() {
    Document doc = Jsoup.parse("<div><p>One<p><span>Two</div>");
    Element original = doc.select("p").get(1);
    Element copy = original.clone();

    // The copy has no parent yet; the source element keeps its parent and sibling index.
    assertNull(copy.parent());
    assertEquals(0, copy.siblingIndex);
    assertEquals(1, original.siblingIndex);
    assertNotNull(original.parent());

    // Appending to the copy must leave the document body as it was.
    copy.append("<span>Three");
    String copyHtml = TextUtil.stripNewlines(copy.outerHtml());
    assertEquals("<p><span>Two</span><span>Three</span></p>", copyHtml);
    String bodyBeforeAdopt = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div><p>One</p><p><span>Two</span></p></div>", bodyBeforeAdopt);

    // Once appended to the body, the copy has a parent and appears in the body's HTML.
    doc.body().appendChild(copy);
    assertNotNull(copy.parent());
    String bodyAfterAdopt = TextUtil.stripNewlines(doc.body().html());
    assertEquals("<div><p>One</p><p><span>Two</span></p></div><p><span>Two</span><span>Three</span></p>", bodyAfterAdopt);
}

Source File: CleanerTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Runs a set of body-HTML fragments through {@code Jsoup.isValid} and checks which ones the
 * basic whitelist accepts or rejects, plus one check against the empty whitelist.
 */
@Test public void testIsValidBodyHtml() {
    String withEnforcedRel = "<p>Test <b><a href='http://example.com/' rel='nofollow'>OK</a></b></p>";
    String[] accepted = {
        withEnforcedRel,
        // missing enforced attribute is OK because the input still needs a run through the cleaner
        "<p>Test <b><a href='http://example.com/'>OK</a></b></p>",
    };
    String[] rejected = {
        "<p><script></script>Not <b>OK</b></p>",
        "<p align=right>Test Not <b>OK</b></p>",
        "<!-- comment --><p>Not OK</p>", // comments and the like will be cleaned
        "<html><head>Foo</head><body><b>OK</b></body></html>", // not body html
        "<p>Test <b><a href='http://example.com/' rel='nofollowme'>OK</a></b></p>",
        "<p>Test <b><a href='http://example.com/'>OK</b></p>", // missing close tag
    };

    for (String html : accepted) {
        assertTrue(Jsoup.isValid(html, Whitelist.basic()));
    }
    for (String html : rejected) {
        assertFalse(Jsoup.isValid(html, Whitelist.basic()));
    }
    // Markup accepted by the basic whitelist is rejected when nothing is whitelisted.
    assertFalse(Jsoup.isValid(withEnforcedRel, Whitelist.none()));
    // A stray closing tag in front of text is rejected as well.
    assertFalse(Jsoup.isValid("</div>What", Whitelist.basic()));
}

Source File: AlbumRequest.java From meizhi with Apache License 2.0

6 votes

/**
 * Decodes the response body with the charset taken from the response headers, parses it as
 * HTML and collects the {@code src} of every image matched by the album selector.
 *
 * @param response raw network response to decode
 * @return a success response carrying the collected images (images with an empty
 *         {@code src} are left out), or an error response wrapping the
 *         {@link UnsupportedEncodingException} raised while decoding the body
 */
@Override
protected Response<List<Image>> parseNetworkResponse(NetworkResponse response) {
    final String html;
    try {
        html = new String(response.data, HttpHeaderParser.parseCharset(response.headers));
    } catch (UnsupportedEncodingException e) {
        return Response.error(new ParseError(e));
    }

    List<Image> images = new ArrayList<>();
    for (Element img : Jsoup.parse(html).select(".container.main .box.show-box img")) {
        String src = img.attr("src");
        if (!TextUtils.isEmpty(src)) {
            Image image = new Image();
            image.url = src;
            images.add(image);
        }
    }
    return Response.success(images, HttpHeaderParser.parseCacheHeaders(response));
}

Source File: SteamWebHandler.java From UpdogFarmer with GNU General Public License v3.0

6 votes

/**
 * Unlock Steam parental controls with a pin.
 *
 * Posts the pin to the store's "parental/ajaxunlock" endpoint and reads the
 * "steamparental" cookie from the response.
 *
 * @param pin the parental-control pin to submit
 * @return the value of the "steamparental" response cookie, or null when the request
 *         throws or the cookie is not among the response cookies
 */
private String unlockParental(String pin) {
    final String endpoint = STEAM_STORE + "parental/ajaxunlock";
    try {
        final Connection request = Jsoup.connect(endpoint)
                .referrer(STEAM_STORE)
                .followRedirects(true)
                .ignoreContentType(true)
                .cookies(generateWebCookies())
                .data("pin", pin)
                .method(Connection.Method.POST);
        final Connection.Response response = request.execute();
        return response.cookies().get("steamparental");
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}

Source File: SourcePrinterTest.java From warnings-ng-plugin with MIT License

6 votes

/**
 * Renders "format-java.txt" for an issue built from an untouched builder and verifies that
 * both the whole document text and the text of its {@code pre} elements match the source
 * file, ignoring whitespace.
 */
@Test
void shouldCreateSourceWithoutLineNumber() {
    SourcePrinter sourcePrinter = new SourcePrinter();
    Issue defaultIssue = new IssueBuilder().build();

    String rendered = sourcePrinter.render(asStream("format-java.txt"), defaultIssue,
            NO_DESCRIPTION, ICON_URL);
    Document html = Jsoup.parse(rendered);
    String expectedSource = toString("format-java.txt");

    // The complete page text must equal the file content ...
    assertThat(html.text()).isEqualToIgnoringWhitespace(expectedSource);
    // ... and so must the text inside the <pre> elements alone.
    assertThat(html.getElementsByTag("pre").text()).isEqualToIgnoringWhitespace(expectedSource);
}

Source File: Book.java From nju-lib-downloader with GNU General Public License v3.0

6 votes

/**
 * Fetches the book's directory tree from the server and converts it into outline nodes.
 *
 * The request is attempted up to 20 times. An exception raised by an earlier attempt is
 * discarded and the request is tried again; an exception raised by the final attempt is
 * rethrown to the caller.
 *
 * @return the result of {@code parseUL} for the first {@code ul[id=directoryTree]} element
 *         of the HTML carried in the "data" field of the JSON reply
 * @throws IOException if the final attempt fails with an I/O error
 */
public List<Node> getOutline() throws IOException {
    final int lastAttempt = 19;
    int attempt = 0;
    while (attempt <= lastAttempt) {
        try {
            // e.g. http://sxnju.chineseall.cn/book/getDirectoryTree.jsps?bookId=10060602592&type=PDF&_=1504844448871
            String requestUrl = CoreService.baseUrl + "/book/getDirectoryTree.jsps?bookId=" + idInt + "&type=PDF";
            String json = MyHttpRequest.get(requestUrl, null, "UTF-8", 3000);

            // The reply is JSON; its "data" field holds the HTML of the directory tree.
            String treeHtml = new ObjectMapper().readValue(json, ObjectNode.class).get("data").textValue();

            Elements trees = Jsoup.parse(treeHtml).select("ul[id=directoryTree]");
            return parseUL(trees.get(0));
        } catch (Exception e) {
            if (attempt == lastAttempt) {
                throw e;
            }
        }
        attempt++;
    }
    // Not reached: the last attempt either returns or rethrows.
    return null;
}

Source File: IPUtils.java From superword with Apache License 2.0

6 votes

/**
 * Looks up location details for an IP address by fetching the ip138.com result page and
 * reading its list items.
 *
 * Every non-blank list item is split on the full-width colon; when that yields exactly two
 * parts, the second part is collected. Any exception is logged and whatever has been
 * collected up to that point is returned.
 *
 * @param ip the address appended to the lookup URL
 * @return the collected values; empty when nothing matched
 */
public static List<String> getIPLocation(String ip){
    List<String> locations = new ArrayList<>();
    try {
        URL lookup = new URL("http://ip138.com/ips138.asp?ip=" + ip);
        Elements items = Jsoup.parse(lookup, 60000).select("ul li");
        for (Element item : items) {
            String text = item.text();
            if (!StringUtils.isNotBlank(text)) {
                continue;
            }
            String[] parts = text.split("：");
            if (parts.length == 2) {
                locations.add(parts[1]);
            }
        }
    } catch (Exception e) {
        LOG.error("获取IP地址的地理位置", e);
    }
    return locations;
}

Source File: TextFilterManage.java From bbs with GNU Affero General Public License v3.0

6 votes

/**
 * Reads the path names of uploaded images referenced in the given HTML.
 * @param html HTML fragment to scan; blank input yields an empty list
 * @param item item name used to build the "file/{item}/" path prefix
 * @return the src values of img elements whose src starts with that prefix (case-insensitive)
 */
public List<String> readImageName(String html,String item) {
	// Path names of the uploaded image files
	List<String> imageNameList = new ArrayList<String>();
	if(StringUtils.isBlank(html)){
		return imageNameList;
	}

	String prefix = "file/"+item+"/";
	// Images: only img elements that carry a src attribute are selected
	for (Element img : Jsoup.parseBodyFragment(html).select("img[src]")) {
		String src = img.attr("src");
		if(StringUtils.startsWithIgnoreCase(src, prefix)){
			imageNameList.add(src);
		}
	}
	return imageNameList;
}

Source File: ElementTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Moves elements out of one parsed document with {@code appendTo} and checks the returned
 * elements as well as the resulting HTML of both documents.
 */
@Test
public void testAppendTo() {
    String parentHtml = "<div class='a'></div>";
    String childHtml = "<div class='b'></div><p>Two</p>";

    Document target = Jsoup.parse(parentHtml);
    Element targetBody = target.body();
    Document source = Jsoup.parse(childHtml);

    Element movedDiv = source.select("div").first();
    Element movedP = source.select("p").first();

    // appendTo is expected to hand back the element it was called on.
    Element divResult = movedDiv.appendTo(targetBody);
    assertEquals(movedDiv, divResult);
    Element pResult = movedP.appendTo(movedDiv);
    assertEquals(movedP, pResult);

    assertEquals("<div class=\"a\"></div>\n<div class=\"b\">\n <p>Two</p>\n</div>", target.body().html());
    assertEquals("", source.body().html()); // both elements were moved out of the source document
}

Source File: WordClassifierForOxford.java From superword with Apache License 2.0

6 votes

/**
 * Posts a request for the given word's page and returns the response HTML with all line
 * breaks removed.
 *
 * A random six-digit "renovate" query parameter is appended to the URL. If the request
 * fails, an error message is logged and an empty string is returned.
 *
 * @param word the word appended to the base URL
 * @return the page HTML without CR/LF characters, or "" on failure
 */
public static String getContent(String word) {
    int nonce = new Random(System.currentTimeMillis()).nextInt(899999) + 100000;
    String url = OXFORD + word + "?renovate=" + nonce;
    LOGGER.debug("url:"+url);

    Connection conn = Jsoup.connect(url)
            .header("Accept", ACCEPT)
            .header("Accept-Encoding", ENCODING)
            .header("Accept-Language", LANGUAGE)
            .header("Connection", CONNECTION)
            .header("Referer", REFERER)
            .header("Host", HOST)
            .header("User-Agent", USER_AGENT)
            .timeout(60000)
            .ignoreContentType(true);
    try {
        return conn.post().html().replaceAll("[\n\r]", "");
    } catch (Exception e) {
        // Only the message is logged; the exception object is not passed to the logger.
        LOGGER.error("获取URL："+url+"页面出错");
        return "";
    }
}

Source File: NewService.java From Pixiv-Illustration-Collection-Backend with Apache License 2.0

6 votes

/**
 * Posts a request to the ACGMH news category page, builds one {@code ACGNew} per post entry
 * found in the returned HTML and hands the list to {@code process}.
 *
 * @throws IOException          if sending the HTTP request fails
 * @throws InterruptedException if the send is interrupted
 */
private void pullACGMHNews() throws IOException, InterruptedException {
    HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("https://www.acgmh.com/category/news"))
            .POST(HttpRequest.BodyPublishers.ofString("type=catL3&paged=1"))
            .build();
    String body = httpClient.send(request, HttpResponse.BodyHandlers.ofString()).body();

    Elements posts = Jsoup.parse(body).getElementsByClass("pos-r pd10 post-list box mar10-b content");
    List<ACGNew> acgNewList = posts.stream().map(post -> {
        // The cover URL is cut out of the style attribute: it starts right after "('" and the
        // last two characters of the attribute are dropped.
        String style = post.getElementsByClass("preview thumb-in").get(0).attr("style");
        int coverStart = style.indexOf("('") + 2;
        String cover = style.substring(coverStart, style.length() - 2);

        String author = post.getElementsByClass("users").text();
        String createDate = post.getElementsByClass("timeago").text();

        Elements titleBlock = post.getElementsByClass("entry-title");
        String title = titleBlock.text();
        String refererUrl = titleBlock.get(0).getElementsByTag("a").get(0).attr("href");

        String intro = post.getElementsByClass("mar10-b post-ex mar10-t mobile-hide").text();

        // Only the first 10 characters of the timestamp text are parsed as the date.
        LocalDate published = LocalDate.parse(createDate.substring(0, 10));
        return new ACGNew(title, intro, author, cover, refererUrl, published, NewsCrawlerConstant.ACGMH);
    }).collect(Collectors.toList());

    process(acgNewList, "id", "content-innerText");
}

Source File: CDTClassifierEvaluation.java From NLIWOD with GNU Affero General Public License v3.0

6 votes

/**
 * Loads one column of results from the QALD-6 multilingual log page of the given system.
 *
 * The file "multilingual_{system}.html" is read with all line breaks dropped and parsed as
 * HTML. From the sixth table of the document (index 5), the own text of the second
 * {@code td} of every row is collected, and the first collected entry (the table head) is
 * removed.
 *
 * @param system system name used to build the log file name
 * @return the collected cell texts; an empty list if the file cannot be read
 */
public static ArrayList<String> loadSystemR(String system){
	Path datapath = Paths.get("./src/main/resources/QALD6MultilingualLogs/multilingual_" + system + ".html");
	ArrayList<String> result = Lists.newArrayList();

	try{
		// Read eagerly: readAllLines closes the file before returning, whereas the stream from
		// Files.lines keeps the file open until it is closed explicitly. Read errors also
		// arrive as IOException here and are handled by the catch below.
		String loadedData = String.join("", Files.readAllLines(datapath));
		Document doc = Jsoup.parse(loadedData);
		Element table = doc.select("table").get(5);
		Elements tableRows = table.select("tr");
		for(Element row: tableRows){
			Elements tableEntry = row.select("td");
			result.add(tableEntry.get(1).ownText());
		}
		result.remove(0); //remove the head of the table
		return result;
	}catch(IOException e){
		e.printStackTrace();
		log.debug("loading failed.");
		return result;
	}
}

Source File: FuckBroDomain.java From TrackRay with GNU General Public License v3.0

6 votes

/**
 * Queries icp.aizhan.com for the given domain and collects the entries of the company table.
 *
 * The table is only read when the page text lacks the "not found" marker and contains both
 * expected marker phrases. Any exception is added to the task's exception list; entries
 * collected before the failure are still returned.
 *
 * @param domain domain inserted into the lookup URL
 * @return map from the text of each row's third cell to the text of its second cell
 */
public Map<String,String> aizhanIcp(String domain){
    HashMap<String, String> map = new HashMap<>();
    HttpClient httpClient = new HttpClient();
    String url = "https://icp.aizhan.com/%s/";
    try {
        String html = httpClient.get(String.format(url, domain)).getContent();
        boolean hasRecords = !html.contains("未找到") && html.contains("该单位备案网站") && html.contains("缓存于");
        if (hasRecords) {
            for (Element row : Jsoup.parse(html).select("div#company .table-s1 tbody tr")) {
                Elements cells = row.select("td");
                String title = cells.get(1).text();
                String dom = cells.get(2).text();
                map.put(dom, title);
            }
        }
    } catch (Exception e) {
        task.getExceptions().add(e);
    }
    SysLog.info("ICP反查结束");
    return map;
}

Source File: Class.java From nju-lib-downloader with GNU General Public License v3.0

5 votes

/**
 * Reads the size value stored in the {@code input[id=totalSize]} element of the given HTML.
 *
 * @param html page HTML to parse
 * @return the {@code value} attribute parsed as an int, or -1 when no such input exists or
 *         its value is missing or not a valid integer
 */
public static int getBookSizeFromHtml(String html){
    Elements sizeNode = Jsoup.parse(html).select("input[id=totalSize]");
    if (sizeNode.isEmpty()) {
        return -1;
    }
    // Elements.attr yields "" (never null) when the attribute is absent, so a null check
    // cannot catch a missing value; let parseInt decide and map failures to the sentinel.
    String sizeString = sizeNode.attr("value").trim();
    try {
        return Integer.parseInt(sizeString);
    } catch (NumberFormatException e) {
        return -1;
    }
}

Source File: ElementTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Compares {@code children()} (child elements) with {@code childNodes()} (all child nodes,
 * text included) on a div, a paragraph, a span, an unknown tag and an img.
 */
@Test public void testChildrenElements() {
    Document doc = Jsoup.parse(
            "<div><p><a>One</a></p><p><a>Two</a></p>Three</div><span>Four</span><foo></foo><img>");
    Element div = doc.select("div").first();
    Element p = doc.select("p").first();
    Element span = doc.select("span").first();
    Element foo = doc.select("foo").first();
    Element img = doc.select("img").first();

    // div: two element children, plus a trailing text node visible only via childNodes()
    Elements divChildren = div.children();
    assertEquals(2, divChildren.size());
    assertEquals("<p><a>One</a></p>", divChildren.get(0).outerHtml());
    assertEquals("<p><a>Two</a></p>", divChildren.get(1).outerHtml());
    assertEquals(3, div.childNodes().size());
    assertEquals("Three", div.childNodes().get(2).outerHtml());

    // p: a single element child
    Elements pChildren = p.children();
    assertEquals(1, pChildren.size());
    assertEquals("One", pChildren.text());

    // span: no element children, one text node
    assertEquals(0, span.children().size());
    assertEquals(1, span.childNodes().size());
    assertEquals("Four", span.childNodes().get(0).outerHtml());

    // foo and img: nothing inside at all
    for (Element emptyElement : new Element[] {foo, img}) {
        assertEquals(0, emptyElement.children().size());
        assertEquals(0, emptyElement.childNodes().size());
    }
}

Source File: StatusReportGenerationErrorTest.java From kubernetes-elastic-agents with Apache License 2.0

5 votes

/**
 * Passes a "no running pod" exception to the error handler and verifies the response code
 * and the header and paragraph text of the rendered error view.
 */
@Test
public void shouldGenerateErrorViewForException() {
    final StatusReportGenerationException noRunningPod = StatusReportGenerationException.noRunningPod("foo");

    final GoPluginApiResponse response =
            StatusReportGenerationErrorHandler.handle(PluginStatusReportViewBuilder.instance(), noRunningPod);

    assertThat(response.responseCode(), is(200));

    // The rendered HTML is carried in the "view" property of the JSON response body.
    final String viewHtml = new JsonParser()
            .parse(response.responseBody())
            .getAsJsonObject()
            .get("view")
            .getAsString();
    final Document errorPage = Jsoup.parse(viewHtml);

    final String headerText =
            errorPage.select(".outer-container .container .error-container blockquote header").text();
    assertThat(headerText, is("Pod is not running."));
    final String detailText =
            errorPage.select(".outer-container .container .error-container blockquote p").text();
    assertThat(detailText, is("Can not find a running pod for the provided elastic agent id 'foo'."));
}

Source File: BaseElementSelector.java From zongtui-webcrawler with GNU General Public License v2.0

5 votes

/**
 * Parses the given text as HTML and delegates to the overload that works on the parsed
 * document.
 *
 * @param text HTML text to select from; may be null
 * @return the delegate's result, or null when {@code text} is null
 */
@Override
public String select(String text) {
    if (text == null) {
        return null;
    }
    return select(Jsoup.parse(text));
}

Source File: ElementsTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Empties all paragraphs of a document and verifies that the paragraph tags stay in place
 * without content.
 */
@Test public void empty() {
    Document parsed = Jsoup.parse("<div><p>Hello <b>there</b></p> <p>now!</p></div>");
    // Pretty-printing is switched off before the body HTML is compared.
    parsed.outputSettings().prettyPrint(false);

    parsed.select("p").empty();

    String bodyHtml = parsed.body().html();
    assertEquals("<div><p></p> <p></p></div>", bodyHtml);
}

Source File: ParagraphMarkedClassificationTest.java From baleen with Apache License 2.0

5 votes

/**
 * Runs the manipulator over a paragraph that starts with a bracketed marking and verifies
 * that the marking ends up in the paragraph's "classification" attribute and is no longer
 * part of the body text.
 */
@Test
public void testMarking() {
  Document doc = Jsoup.parseBodyFragment("<p>(UK OFFICIAL)This is some text</p>");
  m.manipulate(doc);

  // assertEquals takes (expected, actual); with the arguments in that order a failure
  // message reports the expected and the actual value correctly.
  assertEquals(
      "UK OFFICIAL", MarkupUtils.getAttribute(doc.body().select("p").first(), "classification"));
  assertEquals("This is some text", doc.body().text());
}

Source File: PreviewTextUtils.java From mblog with GNU General Public License v3.0

5 votes

/**
 * Collects the img URLs found in an article.
 * @param html article markup; may be null
 * @return the src attribute of every img element, or an empty list when {@code html} is null
 */
public static List<String> extractImage(String html) {
    List<String> urls = new ArrayList<>();
    if (html != null) {
        for (Element img : Jsoup.parseBodyFragment(html).select("img")) {
            urls.add(img.attr("src"));
        }
    }
    return urls;
}

Source File: OnnmyoujiSpider.java From SpringBootUnity with MIT License

5 votes

/**
 * Collects the links to the mitama detail pages.
 *
 * @return the href of every anchor inside the first ".heroList-1" element of the page
 *         fetched from {@code URL}
 */
private static List<String> getMitamaDetailInfoUrl() {
    List<String> links = new ArrayList<>();
    Document page = Jsoup.parse(HttpUtil.get(URL));
    Element heroList = page.select(".heroList-1").get(0);
    for (Element anchor : heroList.select("a")) {
        links.add(anchor.attr("href"));
    }
    return links;
}

Source File: ElementTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Checks element children against child nodes: {@code children()} counts elements only,
 * while {@code childNodes()} also includes text nodes.
 */
@Test public void testChildrenElements() {
    String markup = "<div><p><a>One</a></p><p><a>Two</a></p>Three</div><span>Four</span><foo></foo><img>";
    Document parsed = Jsoup.parse(markup);
    Element divEl = parsed.select("div").first();
    Element firstP = parsed.select("p").first();
    Element spanEl = parsed.select("span").first();
    Element fooEl = parsed.select("foo").first();
    Element imgEl = parsed.select("img").first();

    // The div holds two paragraphs followed by the text "Three".
    Elements paragraphs = divEl.children();
    assertEquals(2, paragraphs.size());
    assertEquals("<p><a>One</a></p>", paragraphs.get(0).outerHtml());
    assertEquals("<p><a>Two</a></p>", paragraphs.get(1).outerHtml());
    assertEquals(3, divEl.childNodes().size());
    assertEquals("Three", divEl.childNodes().get(2).outerHtml());

    // The first paragraph wraps exactly one anchor.
    assertEquals(1, firstP.children().size());
    assertEquals("One", firstP.children().text());

    // The span holds text only.
    assertEquals(0, spanEl.children().size());
    assertEquals(1, spanEl.childNodes().size());
    assertEquals("Four", spanEl.childNodes().get(0).outerHtml());

    // The unknown tag and the img hold nothing.
    assertEquals(0, fooEl.children().size());
    assertEquals(0, fooEl.childNodes().size());
    assertEquals(0, imgEl.children().size());
    assertEquals(0, imgEl.childNodes().size());
}

Source File: HtmlParserTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Verifies that a textarea without an end tag does not run to the end of the input: its
 * text stops at the following {@code <p>}.
 */
@Test public void parsesUnterminatedTextarea() {
    Document parsed = Jsoup.parse("<body><p><textarea>one<p>two");

    String textareaText = parsed.select("textarea").first().text();
    assertEquals("one", textareaText);

    // The text after the second <p> belongs to that paragraph.
    String secondParagraph = parsed.select("p").get(1).text();
    assertEquals("two", secondParagraph);
}

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Expects an {@link IllegalArgumentException} when a request body is supplied for a GET
 * request.
 */
@Test
public void throwsIfRequestBodyForGet() throws IOException {
    String url = "https://jsoup.org";
    boolean rejected;
    try {
        Jsoup.connect(url).requestBody("fail").get();
        rejected = false;
    } catch (IllegalArgumentException e) {
        rejected = true;
    }
    assertTrue(rejected);
}

Source File: ParseMeiZiTu.java From v9porn with MIT License

5 votes

/**
 * Builds the list of image URLs for a picture page from its HTML.
 *
 * The page count is taken from the second-to-last anchor inside the "pagenavi" element when
 * that element has more than three anchors and the anchor text is all digits; otherwise one
 * page is assumed. The URL of the image inside "main-image" is used as a template: for each
 * page number, the text "01." in the URL is replaced by the zero-padded (below 10) page
 * number followed by a dot.
 *
 * @param html HTML of the picture page
 * @return a result whose data is the list of generated image URLs
 */
public static BaseResult<List<String>> parsePicturePage(String html) {
    BaseResult<List<String>> baseResult = new BaseResult<>();

    Document doc = Jsoup.parse(html);

    int totalPage = 1;
    // first() yields null when the page has no "pagenavi" element; fall back to a single
    // page in that case instead of failing with a NullPointerException.
    Element pageElement = doc.getElementsByClass("pagenavi").first();
    if (pageElement != null) {
        Elements aElements = pageElement.select("a");
        if (aElements.size() > 3) {
            String pageStr = aElements.get(aElements.size() - 2).text();
            if (!TextUtils.isEmpty(pageStr) && TextUtils.isDigitsOnly(pageStr)) {
                totalPage = Integer.parseInt(pageStr);
            }
        }
    }

    List<String> imageUrlList = new ArrayList<>();

    String imageUrl = doc.getElementsByClass("main-image").first().selectFirst("img").attr("src");
    // NOTE(review): for a single page the URL is added here and once more by the loop below
    // (replacing "01." with "01." leaves it unchanged), so the list holds it twice. Kept
    // as-is because callers may rely on it; confirm whether the duplicate is intended.
    if (totalPage == 1) {
        imageUrlList.add(imageUrl);
    }
    for (int i = 1; i <= totalPage; i++) {
        String pageNumber = i < 10 ? "0" + i : "" + i;
        imageUrlList.add(imageUrl.replace("01.", pageNumber + "."));
    }
    baseResult.setData(imageUrlList);
    return baseResult;
}

Source File: Header.java From viritin with Apache License 2.0

5 votes

/**
 * Renders the pending text, if any, as an HTML header element of the configured level.
 *
 * The text is passed through {@code Jsoup.clean} with the component's whitelist before it
 * is wrapped in the header tag; afterwards the pending text is cleared.
 */
private void render() {
    if (text == null) {
        return;
    }
    setContentMode(ContentMode.HTML);
    String headerHtml = "<h" + headerLevel + ">"
            + Jsoup.clean(text, getWhitelist())
            + "</h" + headerLevel + ">";
    super.setValue(headerHtml);
    text = null;
}

Source File: XmlTreeBuilderTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses XML with an explicitly supplied XML parser and checks the serialised result for
 * input that leaves one {@code <val>} unclosed and contains a stray {@code </bar>} end tag.
 */
@Test
public void testSupplyParserToJsoupClass() {
    String malformedXml = "<doc><val>One<val>Two</val></bar>Three</doc>";
    Parser xmlParser = Parser.xmlParser();
    Document parsed = Jsoup.parse(malformedXml, "http://foo.com/", xmlParser);

    String serialised = TextUtil.stripNewlines(parsed.html());
    assertEquals("<doc><val>One<val>Two</val>Three</val></doc>", serialised);
}

Source File: SelectorTest.java From astor with GNU General Public License v2.0

5 votes

/**
 * Selects by id: both elements sharing the id "foo" are expected to match, and a document
 * without that id is expected to yield no matches.
 */
@Test public void testById() {
    String duplicateIds = "<div><p id=foo>Hello</p><p id=foo>Foo two!</p></div>";
    Elements matches = Jsoup.parse(duplicateIds).select("#foo");
    assertEquals(2, matches.size());
    assertEquals("Hello", matches.get(0).text());
    assertEquals("Foo two!", matches.get(1).text());

    String otherId = "<div id=1></div>";
    Elements noMatches = Jsoup.parse(otherId).select("#foo");
    assertEquals(0, noMatches.size());
}

org.jsoup.Jsoup Java Examples