org.jsoup.nodes.Document#getElementsByTag

Source File: SourcePrinterTest.java From warnings-ng-plugin with MIT License

6 votes

/**
 * Renders {@code format.xml} for an issue built from an unconfigured {@link IssueBuilder} and
 * checks that both the text of the whole rendered document and the text of its {@code pre}
 * elements equal the file content, ignoring whitespace.
 */
@Test
@org.jvnet.hudson.test.Issue("JENKINS-55679")
void shouldRenderXmlFiles() {
    SourcePrinter sourcePrinter = new SourcePrinter();
    Issue blankIssue = new IssueBuilder().build();

    String rendered = sourcePrinter.render(asStream("format.xml"), blankIssue,
            NO_DESCRIPTION, ICON_URL);
    Document html = Jsoup.parse(rendered);
    String expectedFile = toString("format.xml");

    // The complete page text must match the source file ...
    assertThat(html.text()).isEqualToIgnoringWhitespace(expectedFile);

    // ... and so must the text held in the <pre> elements alone.
    assertThat(html.getElementsByTag("pre").text()).isEqualToIgnoringWhitespace(expectedFile);
}

Source File: MeiziUtil.java From MoeQuest with Apache License 2.0

6 votes

/**
 * Parses the HTML of the "autodyne" (selfie) listing page into {@link MeiziTu} entries.
 * <p>
 * At most the first 15 {@code <p>} elements are inspected. A paragraph that contains no
 * {@code <img>} element is skipped, and pages with fewer than 15 paragraphs are handled
 * without raising an exception.
 *
 * @param html raw HTML of the page
 * @param type value stored as the type of every created entry
 * @return the parsed entries in document order; possibly empty, never {@code null}
 */
public List<MeiziTu> parserMeiziTuByAutodyne(String html, String type) {

  List<MeiziTu> list = new ArrayList<>();
  Document doc = Jsoup.parse(html);

  Elements paragraphs = doc.getElementsByTag("p");
  // The page may hold fewer than 15 paragraphs; never index past the end of the list.
  int limit = Math.min(15, paragraphs.size());
  for (int i = 0; i < limit; i++) {
    Element img = paragraphs.get(i).select("img").first();
    if (img == null) {
      // Paragraph without an image: nothing to extract.
      continue;
    }
    MeiziTu meiziTu = new MeiziTu();
    // The order keeps the paragraph index, exactly as before, even when paragraphs are skipped.
    meiziTu.setOrder(i);
    meiziTu.setType(type);
    meiziTu.setWidth(0);
    meiziTu.setHeight(0);
    meiziTu.setImageurl(img.attr("src"));
    meiziTu.setTitle(img.attr("alt"));
    list.add(meiziTu);
  }
  return list;
}

Source File: WhenRubyExtensionIsRegistered.java From asciidoctorj with Apache License 2.0

6 votes

/**
 * Loads the Ruby gist block macro, registers it under the block name {@code mygist}, converts a
 * document that uses the macro, and checks that the output contains exactly one {@code script}
 * element whose {@code src} is the expected gist URL.
 */
@Test
public void ruby_block_macro_processor_should_be_registered_with_block_name() {

    asciidoctor.rubyExtensionRegistry()
            .loadClass(getClass().getResourceAsStream("/ruby-extensions/gist-block-macro.rb"))
            .blockMacro("mygist", "GistBlockMacro");

    String source = ".My Gist\n" +
            "mygist::123456[]";
    String html = asciidoctor.convert(source, options().toFile(false).get());

    Elements scripts = Jsoup.parse(html, "UTF-8").getElementsByTag("script");
    assertThat(scripts.size(), is(1));
    assertThat(scripts.get(0).attr("src"), is("https://gist.github.com/123456.js"));

}

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

6 votes

/**
 * Loads the direct sub-categories of this category. Only one level is loaded, i.e. the
 * children of the sub-categories are not loaded.
 * When this method is called, the server is queried for the sub-categories of this category
 * and this object's {@link #children} is updated.
 * <p>
 * To load sub-categories recursively, call {@link #loadAllChild()}.
 *
 * @throws IOException if querying the server for the child nodes fails
 */
public void loadChild() throws IOException {
    // Terminal categories are left untouched.
    if (isTerminal()) {
        return;
    }
    checkCookie();
    String endpoint = NJULib.baseUrl + "/classifyview";
    String payload = "fenlei=" + this.getId() + "&lib=markbook";
    String response = MyHttpRequest.postWithCookie(payload, endpoint, null, cookie, "UTF-8", "UTF-8", 1000);

    // Every <li> of the response becomes one child category.
    for (Element item : Jsoup.parse(response).getElementsByTag("li")) {
        String id = item.attr("id");
        String rawName = item.getElementsByTag("a").text();
        // An <img> whose onClick mentions getSubTree marks an entry that has its own children.
        boolean expandable = item.getElementsByTag("img").attr("onClick").contains("getSubTree");

        BookClass child;
        if (expandable) {
            child = new BookClass(id, MyDecoder.decodeUrlUnicode(rawName), this);
        } else {
            child = new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(rawName), this);
        }
        child.setCookie(cookie);
        this.addChild(child);
    }
    this.isLoaded = true;
}

Source File: HtmlUtil.java From jbake with MIT License

6 votes

/**
 * Rewrites the {@code src} of every {@code img} element in the rendered body and stores the
 * result back under {@link Attributes#BODY}.
 * <p>
 * The per-image rewriting is delegated to {@code transformImageSource}. As originally
 * documented, image paths are given relative to the assets folder: the site host is prefixed to
 * all sources except those starting with http:// or https://, and a path starting with "./"
 * (relative to the source file) is first replaced by the output file directory.
 *
 * @param fileContents  Map representing file contents
 * @param configuration Configuration object
 */
public static void fixImageSourceUrls(Map<String, Object> fileContents, JBakeConfiguration configuration) {
    String body = fileContents.get(Attributes.BODY).toString();
    boolean prependSiteHost = configuration.getImgPathPrependHost();
    String siteHost = configuration.getSiteHost();
    String documentUri = getDocumentUri(fileContents);

    Document parsed = Jsoup.parseBodyFragment(body);
    Elements images = parsed.getElementsByTag("img");
    for (int i = 0; i < images.size(); i++) {
        transformImageSource(images.get(i), documentUri, siteHost, prependSiteHost);
    }

    // body().html() returns only the inner markup, so no <body></body> wrapper is stored.
    fileContents.put(Attributes.BODY, parsed.body().html());
}

Source File: EmailServiceImpl.java From gravitee-management-rest-api with Apache License 2.0

6 votes

/**
 * Rewrites non-http image sources of the given HTML to {@code cid:} references, sets the
 * resulting HTML as the message text and attaches each referenced file, resolved against
 * {@code templatesPath}, as an inline resource.
 *
 * @param mailMessage helper of the message that receives the text and the inline resources
 * @param htmlText    HTML body to process
 * @return the HTML after the image sources have been rewritten
 * @throws Exception if setting the text or adding an inline resource fails
 */
private String addResourcesInMessage(final MimeMessageHelper mailMessage, final String htmlText) throws Exception {
    final Document document = Jsoup.parse(htmlText);

    // Collect the original src of every local image while pointing the element at its content id.
    final List<String> inlineSources = document.getElementsByTag("img").stream()
            .filter(img -> img.hasAttr("src") && !img.attr("src").startsWith("http"))
            .map(img -> {
                final String originalSrc = img.attr("src");
                img.attr("src", "cid:" + originalSrc);
                return originalSrc;
            })
            .collect(Collectors.toCollection(ArrayList::new));

    final String html = document.html();
    mailMessage.setText(html, true);

    for (final String source : inlineSources) {
        final File resourceFile = new File(templatesPath, source);
        mailMessage.addInline(source, new FileSystemResource(resourceFile), getContentTypeByFileName(source));
    }

    return html;
}

Source File: AgentManagementServiceImpl.java From Insights with Apache License 2.0

6 votes

/**
 * Returns the available agents grouped by version.
 * <p>
 * When online registration is disabled the result of
 * {@code getOfflineSystemAvailableAgentList()} is returned. Otherwise the configured docroot URL
 * is fetched and every link whose text starts with "v" is treated as a version entry, mapped to
 * the result of {@code getAgents(version)}.
 *
 * @return map from version to the agents reported for it
 * @throws InsightsCustomException if the docroot page cannot be fetched
 */
@Override
public Map<String, ArrayList<String>> getSystemAvailableAgentList() throws InsightsCustomException {
	if (!ApplicationConfigProvider.getInstance().getAgentDetails().isOnlineRegistration()) {
		return getOfflineSystemAvailableAgentList();
	}

	Map<String, ArrayList<String>> agentsByVersion = new TreeMap<>();
	String docrootUrl = ApplicationConfigProvider.getInstance().getAgentDetails().getDocrootUrl();
	try {
		Document listing = Jsoup.connect(docrootUrl).get();
		for (Element link : listing.getElementsByTag("a")) {
			String label = link.text();
			if (label == null || !label.startsWith("v")) {
				continue;
			}
			// Trailing slashes are stripped from the link text to obtain the version.
			String version = StringUtils.stripEnd(label, "/");
			agentsByVersion.put(version, getAgents(version));
		}
	} catch (IOException e) {
		log.error("Error while getting system agent list ", e);
		throw new InsightsCustomException(e.toString());
	}
	return agentsByVersion;
}

Source File: LyricsChart.java From QuickLyric with GNU General Public License v3.0

6 votes

/**
 * Searches ChartLyrics for songs whose lyrics contain the given text.
 * <p>
 * The search endpoint is fetched over HTTP and every {@code SearchLyricResult} element of the
 * response is turned into a {@link Lyrics} search item that carries the artist, the title and
 * the URL from which the full lyrics can be retrieved.
 *
 * @param query lyric text to search for; it is URL-encoded before being sent
 * @return the matching search items, or an empty list when nothing matched or any error occurred
 */
public static ArrayList<Lyrics> search(String query) {
    ArrayList<Lyrics> results = new ArrayList<>();
    try {
        String url = "http://api.chartlyrics.com/apiv1.asmx/SearchLyricText?lyricText=";
        url += URLEncoder.encode(query, "UTF-8");
        // Jsoup.parse(String, String) treats its first argument as markup, so passing the URL
        // there parsed the address text itself and never produced a result. The document has
        // to be fetched from the URL instead.
        Document doc = Jsoup.connect(url).get();
        Elements elements = doc.getElementsByTag("SearchLyricResult");
        for (Element element : elements) {
            String id = element.getElementsByTag("TrackId").get(0).text();
            String checksum = element.getElementsByTag("TrackChecksum").get(0).text();
            Lyrics lyrics = new Lyrics(Lyrics.SEARCH_ITEM);
            lyrics.setArtist(element.getElementsByTag("artist").get(0).text());
            lyrics.setTitle(element.getElementsByTag("song").get(0).text());
            lyrics.setURL("http://api.chartlyrics.com/apiv1.asmx/GetLyric?lyricId=" + id + "&lyricCheckSum=" + checksum);
            results.add(lyrics);
        }
        return results;
    } catch (Exception e) {
        // Best effort: any failure (network, malformed response) yields an empty result.
        // The stack trace is printed only outside debug builds and only for non-I/O errors.
        if (!BuildConfig.DEBUG && !(e instanceof IOException)) {
            e.printStackTrace();
        }
    }

    return new ArrayList<>();
}

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

6 votes

/**
 * Loads the direct sub-categories of this category. Only one level is loaded, i.e. the
 * children of the sub-categories are not loaded.
 * When this method is called, the server is queried for the sub-categories of this category
 * and this object's {@link #children} is updated.
 * <p>
 * To load sub-categories recursively, call {@link #loadAllChild()}.
 *
 * @throws IOException if querying the server for the child nodes fails
 */
public void loadChild() throws IOException {
    // Terminal categories are left untouched.
    if (isTerminal()) {
        return;
    }
    checkCookie();
    String endpoint = NJULib.baseUrl + "/classifyview";
    String payload = "fenlei=" + this.getId() + "&lib=markbook";
    String response = MyHttpRequest.postWithCookie(payload, endpoint, null, cookie, "UTF-8", "UTF-8", 1000);

    // Every <li> of the response becomes one child category.
    for (Element item : Jsoup.parse(response).getElementsByTag("li")) {
        String id = item.attr("id");
        String rawName = item.getElementsByTag("a").text();
        // An <img> whose onClick mentions getSubTree marks an entry that has its own children.
        boolean expandable = item.getElementsByTag("img").attr("onClick").contains("getSubTree");

        BookClass child;
        if (expandable) {
            child = new BookClass(id, MyDecoder.decodeUrlUnicode(rawName), this);
        } else {
            child = new TerminalBookClass(id, MyDecoder.decodeUrlUnicode(rawName), this);
        }
        child.setCookie(cookie);
        this.addChild(child);
    }
    this.isLoaded = true;
}

Source File: WebDavFile.java From a with GNU General Public License v3.0

6 votes

/**
 * Parses a WebDAV listing response and returns one {@link WebDavFile} for every
 * {@code d:response} entry whose {@code d:href} does not end with "/".
 *
 * @param s response body to parse
 * @return the files found, addressed relative to this object's URL
 */
private List<WebDavFile> parseDir(String s) {
    List<WebDavFile> files = new ArrayList<>();
    Elements responses = Jsoup.parse(s).getElementsByTag("d:response");

    // Make sure the base URL ends with exactly one separator before file names are appended.
    String baseUrl;
    if (getUrl().endsWith("/")) {
        baseUrl = getUrl();
    } else {
        baseUrl = getUrl() + "/";
    }

    for (Element response : responses) {
        String href = response.getElementsByTag("d:href").get(0).text();
        if (href.endsWith("/")) {
            // Entries ending with "/" (presumably collections/directories) are not listed.
            continue;
        }
        String fileName = href.substring(href.lastIndexOf("/") + 1);
        try {
            WebDavFile file = new WebDavFile(baseUrl + fileName);
            file.setDisplayName(fileName);
            file.setUrlName(href);
            files.add(file);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }
    return files;
}

Source File: WhenAttributesAreUsedInAsciidoctor.java From asciidoctorj with Apache License 2.0

5 votes

/**
 * Converts {@code renderwithfrontmatter.adoc} with the skip-front-matter attribute enabled and
 * checks that the output contains no {@code hr} element.
 */
@Test
public void should_skip_front_matter_if_specified_by_skip_front_matter_attribute()
        throws IOException {

    Attributes skipFrontMatter = attributes().skipFrontMatter(true).get();
    Options conversionOptions = options()
            .toFile(false)
            .inPlace(false)
            .attributes(skipFrontMatter)
            .get();

    String html = asciidoctor.convertFile(classpath.getResource("renderwithfrontmatter.adoc"), conversionOptions);
    int hrCount = Jsoup.parse(html, "UTF-8").getElementsByTag("hr").size();

    assertThat(hrCount, is(0));

}

Source File: SessionUtil.java From snowflake-jdbc with Apache License 2.0

5 votes

/**
 * Extracts the post back url from the HTML returned by the IDP: the {@code action} attribute
 * of the first {@code form} element found inside the first {@code body} element.
 *
 * @param html The HTML that we are parsing to find the post back url
 * @return The post back url
 */
static private String getPostBackUrlFromHTML(String html)
{
  return Jsoup.parse(html)
      .getElementsByTag("body").get(0)
      .getElementsByTag("form").first()
      .attr("action");
}

Source File: VulnChecker.java From zap-extensions with Apache License 2.0

5 votes

/**
 * Queries the securiteam.com search page for the given application name and version and
 * returns the links of the results.
 * <p>
 * For every {@code dl} element of the result page the {@code href} of its first anchor is
 * printed to standard output and added to the result, followed by a newline character. Only
 * the links are returned for the moment; the linked pages are not fetched.
 *
 * @param appName application name used as the first search word
 * @param version version used as the second search word
 * @return the result links, each terminated by "\n"; empty when the page reports no matches
 * @throws Exception if the search URL cannot be built or the page cannot be loaded
 */
public static ArrayList<String> fromSecuritiTeam(String appName, String version)
        throws Exception {
    ArrayList<String> links = new ArrayList<String>();
    URL searchUrl = new URL("http://www.securiteam.com/cgi-bin/htsearch?words=" + appName + "+" + version);

    Document doc = new WebPage(searchUrl).getDocument();
    if (doc.outerHtml().contains("No matches were found for")) {
        System.out.println("No Results Found");
        return links;
    }

    for (Element entry : doc.getElementsByTag("dl")) {
        String link = entry.getElementsByTag("a").get(0).attr("href");
        System.out.println(link);
        links.add(link + "\n");
    }
    return links;
}

Source File: EncodingDetect.java From a with GNU General Public License v3.0

5 votes

/**
 * Tries to determine the character encoding declared inside an HTML (or XML) document.
 * <p>
 * The bytes are decoded as UTF-8 and parsed with Jsoup. The declaration is then looked up in
 * this order: an {@code encoding="..."} attribute in the document's first child node, a
 * {@code charset} attribute on a {@code meta} element, and the {@code content} of a
 * {@code meta http-equiv="content-type"} element. If none yields a value, or any exception
 * occurs, the result of {@code getJavaEncode(bytes)} is returned.
 *
 * @param bytes raw document bytes
 * @return the declared charset name, or the fallback from {@code getJavaEncode(bytes)}
 */
public static String getEncodeInHtml(@NonNull byte[] bytes) {
    try {
        String charsetStr = "UTF-8";
        Document doc = Jsoup.parse(new String(bytes, charsetStr));
        // Look for the word "encoding" in the textual form of the first node
        // (presumably an XML declaration such as <?xml ... encoding="..."?>).
        int a = doc.childNode(0).toString().indexOf("encoding");
        if (a > 0) {
            // Return the text between the first pair of double quotes following "encoding".
            String e = doc.childNode(0).toString().substring(a);
            int b = e.indexOf('"');
            int c = e.indexOf('"', b + 1);
            return e.substring(b + 1, c);
        }
        Elements metaTags = doc.getElementsByTag("meta");
        for (Element metaTag : metaTags) {
            String content = metaTag.attr("content");
            String http_equiv = metaTag.attr("http-equiv");
            // <meta charset="..."> form.
            charsetStr = metaTag.attr("charset");
            // NOTE(review): the two nested emptiness checks look redundant; isEmpty is a helper
            // defined outside this block — confirm before simplifying.
            if (!charsetStr.isEmpty()) {
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
            // <meta http-equiv="content-type" content="...; charset=..."> form.
            if (http_equiv.toLowerCase().equals("content-type")) {
                if (content.toLowerCase().contains("charset")) {
                    // Take everything after "charset" plus one more character (the "=").
                    charsetStr = content.substring(content.toLowerCase().indexOf("charset") + "charset=".length());
                } else {
                    // No "charset" keyword: take everything after the first ';' (or the whole
                    // content when there is no ';', since indexOf then returns -1).
                    charsetStr = content.substring(content.toLowerCase().indexOf(";") + 1);
                }
                if (!isEmpty(charsetStr)) {
                    return charsetStr;
                }
            }
        }
    } catch (Exception ignored) {
        // Deliberately ignored: any failure falls through to the fallback below.
    }
    return getJavaEncode(bytes);
}

Source File: VscoRipper.java From ripme with MIT License

5 votes

/**
 * Fetches the given page and returns the image URL advertised by its {@code og:image} meta
 * element, with a {@code ?h=<digits>} part removed.
 *
 * @param url address of the page to inspect
 * @return the image URL, or an empty string when none was found (an error is logged then)
 * @throws IOException if the page cannot be fetched
 */
private String vscoImageToURL(String url) throws IOException{
    Document page = Jsoup.connect(url).userAgent(USER_AGENT).get();

    String imageUrl = "";
    for (Element meta : page.getElementsByTag("meta")) {
        if (!"og:image".equals(meta.attr("property"))) {
            continue;
        }
        // Drop the "?h=xxx" part of the URL (where each x is a digit).
        imageUrl = meta.attr("content").replaceAll("\\?h=[0-9]+", "");
        LOGGER.debug("Found image URL: " + imageUrl);
        // Only the first og:image element is used; there should only be one image to download.
        break;
    }

    // An empty result means the website changed and this code needs to be adapted.
    if (imageUrl.isEmpty()){
        LOGGER.error("Could not find image URL at: " + url);
    }

    return imageUrl;

}

Source File: ResourceQuote.java From templatespider with Apache License 2.0

5 votes

/**
 * Replaces the {@code src} of the {@code img} tags: each value is passed through
 * {@code hierarchyReplace} together with this object's base URI, and the attribute is rewritten
 * only when the returned value differs.
 * @param doc document whose img elements are updated in place
 * @return the same document instance that was passed in
 */
public Document imgTag(Document doc){
	for (Element image : doc.getElementsByTag("img")) {
		String currentSrc = image.attr("src");
		String replacedSrc = hierarchyReplace(this.baseUri, currentSrc);
		if (currentSrc.equals(replacedSrc)) {
			continue;
		}
		image.attr("src", replacedSrc);
	}
	return doc;
}

Source File: DefaultEmailNotifier.java From jetlinks-community with Apache License 2.0

5 votes

/**
 * Collects the images of the given HTML text whose {@code src} does not start with "http".
 * Each such image gets a generated key; the element's {@code src} in the locally parsed
 * document is set to {@code cid:<key>} and the original source is recorded under that key.
 *
 * @param sendText HTML text to scan
 * @return map from generated key to the original image source
 */
private Map<String, String> extractSendTextImage(String sendText) {
    Map<String, String> images = new HashMap<>();
    Document parsed = Jsoup.parse(sendText);
    for (Element img : parsed.getElementsByTag("img")) {
        String source = img.attr("src");
        if (!source.startsWith("http")) {
            String contentId = IDGenerator.MD5.generate();
            img.attr("src", "cid:".concat(contentId));
            images.put(contentId, source);
        }
    }
    return images;
}

Source File: CustomVRaptorIntegration.java From mamute with Apache License 2.0

4 votes

/**
 * Parses the given HTML and returns all elements with the given tag name.
 *
 * @param html    markup to parse
 * @param tagName tag name to look up
 * @return the matching elements
 */
protected Elements getElementsByTag(String html, String tagName) {
	return Jsoup.parse(html).getElementsByTag(tagName);
}

Source File: ThechiveRipper.java From ripme with MIT License

4 votes

/**
 * Extracts the image and gif URLs of a gallery page.
 * <p>
 * The image urls are stored in a {@code <script>} tag of the document that mentions
 * {@code CHIVE_GALLERY_ITEMS}. All {@code <img>} tags matched by {@code imagePattern} in such
 * a script are concatenated, parsed as HTML, and their URLs collected. Query parameters, if
 * present, are stripped from every URL.
 *
 * @param doc the gallery page
 * @return the collected URLs without their query strings; possibly empty
 */
private List<String> getUrlsFromThechive(Document doc) {
    List<String> result = new ArrayList<>();
    Elements scripts = doc.getElementsByTag("script");

    for (Element script : scripts) {
        String data = script.data();

        if (!data.contains("CHIVE_GALLERY_ITEMS")) {
            continue;
        }

        /*
         * We add all the <img/> tags in a single StringBuilder and parse as HTML for
         * easy sorting of img/ gifs.
         */
        StringBuilder allImgTags = new StringBuilder();
        Matcher matcher = imagePattern.matcher(data);
        while (matcher.find()) {
            // Unescape '\' from the img tags, which also unescapes the img url.
            allImgTags.append(matcher.group(0).replaceAll("\\\\", ""));
        }

        // Now we parse and sort links.
        Document imgDoc = Jsoup.parse(allImgTags.toString());
        Elements imgs = imgDoc.getElementsByTag("img");
        for (Element img : imgs) {
            if (img.hasAttr("data-gifsrc")) {
                // For gifs.
                result.add(img.attr("data-gifsrc"));
            } else {
                // For jpeg images.
                result.add(img.attr("src"));
            }
        }
    }

    // Strip all GET parameters from the links (such as quality, width, height) to get the
    // original image. A link without '?' is kept unchanged: indexOf returns -1 for it, and
    // substring(0, -1) would throw StringIndexOutOfBoundsException.
    result.replaceAll(link -> {
        int queryStart = link.indexOf('?');
        return queryStart >= 0 ? link.substring(0, queryStart) : link;
    });

    return result;
}

Source File: FeedParser.java From WordPressHelper with MIT License

4 votes

/**
 * Downloads the feed at {@code FEED_URL} and converts every {@code item} element into a
 * {@link FeedItem}. An item is appended to {@code feedItems} only when splitting its
 * {@code guid} text on "p=" yields a second part, which is then used as the item id.
 * <p>
 * An {@link IOException} raised while fetching the feed is printed and otherwise ignored.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Object doInBackground(Object[] params) {
    try {
        Document document = Jsoup.connect(FEED_URL)
                .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22")
                .timeout(60000).ignoreContentType(true).get();
        Elements elements = document.getElementsByTag("item");
        for (Element element : elements) {
            FeedItem feedItem = new FeedItem();

            //get all simple information
            feedItem.setTitle(element.getElementsByTag("title").first().text());
            feedItem.setPubDate(element.getElementsByTag("pubDate").first().text());
            feedItem.setCreator(element.getElementsByTag("dc:creator").first().text());
            feedItem.setDescription(element.getElementsByTag("description").first().text());
            // The encoded content is needed twice (stored as-is and parsed for the first
            // image), so it is looked up only once.
            String encodedContent = element.getElementsByTag("content:encoded").first().text();
            feedItem.setContent(encodedContent);
            feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text());
            feedItem.setComments(element.getElementsByTag("slash:comments").first().text());
            // The link value is read from the node that follows the <link> element.
            feedItem.setLink(element.select("link").first().nextSibling().toString().trim());
            // The guid text is reused below to derive the post id.
            String guid = element.getElementsByTag("guid").first().text();
            feedItem.setGuid(guid);

            //get first image
            Elements images = Jsoup.parse(encodedContent).select("img");
            feedItem.setImage(images.attr("src"));

            //get all category
            // Query the categories once and iterate the result, instead of searching the
            // item again on every loop iteration.
            Elements categoryElements = element.getElementsByTag("category");
            ArrayList<String> category = new ArrayList<>(categoryElements.size());
            for (Element categoryElement : categoryElements) {
                category.add(categoryElement.text());
            }
            feedItem.setCategory(category);

            //get id
            String[] idPost = guid.split("p=");
            if (idPost.length > 1) {
                feedItem.setId(idPost[1]);
                //add feeditem to arraylist
                feedItems.add(feedItem);
            }

        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}

Java Code Examples for org.jsoup.nodes.Document#getElementsByTag()