Java Code Examples for org.jsoup.nodes.Document#body()

The following examples show how to use org.jsoup.nodes.Document#body(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Exercises the {@code :containsData(...)} pseudo-selector on a body holding a paragraph,
 * a script, a style and a span that wraps a comment. The expectations show that matching
 * ignores case ("function" finds {@code FUNCTION}, "ITEM" finds {@code item}) and that the
 * plain text of the {@code <p>} is not counted as data.
 */
@Test @MultiLocaleTest public void containsData() {
    Element root = Jsoup.parse(
        "<p>function</p><script>FUNCTION</script><style>item</style><span><!-- comments --></span>")
        .body();

    // Unqualified lookup for "function".
    Elements anyFunction = root.select(":containsData(function)");
    assertEquals(2, anyFunction.size()); // body and script

    // Same lookup restricted to script elements.
    Elements scriptFunction = root.select("script:containsData(function)");
    assertEquals(1, scriptFunction.size());
    assertEquals(anyFunction.last(), scriptFunction.first());
    assertEquals("<script>FUNCTION</script>", scriptFunction.outerHtml());

    // Comment content inside the span is matched too.
    Elements spanComments = root.select("span:containsData(comments)");
    assertEquals(1, spanComments.size());
    assertEquals("span", spanComments.first().tagName());

    // A single letter that occurs in the script and in the comment.
    Elements anyLetterO = root.select(":containsData(o)");
    assertEquals(3, anyLetterO.size());
    assertEquals("body", anyLetterO.first().tagName());
    assertEquals("script", anyLetterO.get(1).tagName());
    assertEquals("span", anyLetterO.get(2).tagName());

    // Upper-case query against lower-case style content.
    Elements styleItem = root.select("style:containsData(ITEM)");
    assertEquals(1, styleItem.size());
}
 
Example 2
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Parses markup that has no explicit html/head/body tags and checks the resulting tree:
 * one html element under the document, head and body under html, the three head-type
 * elements in the head and the paragraph alone in the body.
 */
@Test public void createsDocumentStructure() {
    Document parsed = Jsoup.parse(
        "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>");
    Element headEl = parsed.head();
    Element bodyEl = parsed.body();

    // Shape of the tree.
    assertEquals(1, parsed.children().size()); // root node: contains html node
    Element htmlEl = parsed.child(0);
    assertEquals(2, htmlEl.children().size()); // html node: head and body
    assertEquals(3, headEl.children().size());
    assertEquals(1, bodyEl.children().size());

    // Content ended up on the expected side of the head/body split.
    Element firstMeta = headEl.getElementsByTag("meta").get(0);
    assertEquals("keywords", firstMeta.attr("name"));
    assertEquals(0, bodyEl.getElementsByTag("meta").size());
    assertEquals("jsoup", parsed.title());
    assertEquals("Hello world", bodyEl.text());
    Element onlyBodyChild = bodyEl.children().get(0);
    assertEquals("Hello world", onlyBodyChild.text());
}
 
Example 3
Source File: SelectorTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Exercises the {@code :containsData(...)} pseudo-selector on a body holding a paragraph,
 * a script, a style and a span that wraps a comment. The expectations show that matching
 * ignores case ("function" finds {@code FUNCTION}, "ITEM" finds {@code item}) and that the
 * plain text of the {@code <p>} is not counted as data.
 */
@Test @MultiLocaleTest public void containsData() {
    Element root = Jsoup.parse(
        "<p>function</p><script>FUNCTION</script><style>item</style><span><!-- comments --></span>")
        .body();

    // Unqualified lookup for "function".
    Elements anyFunction = root.select(":containsData(function)");
    assertEquals(2, anyFunction.size()); // body and script

    // Same lookup restricted to script elements.
    Elements scriptFunction = root.select("script:containsData(function)");
    assertEquals(1, scriptFunction.size());
    assertEquals(anyFunction.last(), scriptFunction.first());
    assertEquals("<script>FUNCTION</script>", scriptFunction.outerHtml());

    // Comment content inside the span is matched too.
    Elements spanComments = root.select("span:containsData(comments)");
    assertEquals(1, spanComments.size());
    assertEquals("span", spanComments.first().tagName());

    // A single letter that occurs in the script and in the comment.
    Elements anyLetterO = root.select(":containsData(o)");
    assertEquals(3, anyLetterO.size());
    assertEquals("body", anyLetterO.first().tagName());
    assertEquals("script", anyLetterO.get(1).tagName());
    assertEquals("span", anyLetterO.get(2).tagName());

    // Upper-case query against lower-case style content.
    Elements styleItem = root.select("style:containsData(ITEM)");
    assertEquals(1, styleItem.size());
}
 
Example 4
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 6 votes vote down vote up
/**
 * Parses markup that has no explicit html/head/body tags and checks the resulting tree:
 * one html element under the document, head and body under html, the three head-type
 * elements in the head and the paragraph alone in the body.
 */
@Test public void createsDocumentStructure() {
    Document parsed = Jsoup.parse(
        "<meta name=keywords /><link rel=stylesheet /><title>jsoup</title><p>Hello world</p>");
    Element headEl = parsed.head();
    Element bodyEl = parsed.body();

    // Shape of the tree.
    assertEquals(1, parsed.children().size()); // root node: contains html node
    Element htmlEl = parsed.child(0);
    assertEquals(2, htmlEl.children().size()); // html node: head and body
    assertEquals(3, headEl.children().size());
    assertEquals(1, bodyEl.children().size());

    // Content ended up on the expected side of the head/body split.
    Element firstMeta = headEl.getElementsByTag("meta").get(0);
    assertEquals("keywords", firstMeta.attr("name"));
    assertEquals(0, bodyEl.getElementsByTag("meta").size());
    assertEquals("jsoup", parsed.title());
    assertEquals("Hello world", bodyEl.text());
    Element onlyBodyChild = bodyEl.children().get(0);
    assertEquals("Hello world", onlyBodyChild.text());
}
 
Example 5
Source File: BootstrapHandlerTest.java    From flow with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a bootstrap page for a second, separately initialised UI and checks the shape of
 * its {@code <body>}: exactly two child nodes, the first element being {@code <noscript>}.
 *
 * @throws IOException declared by the test signature; which call may raise it is not
 *         visible in this snippet
 */
@Test
public void renderUI() throws IOException {
    // A UI instance distinct from the testUI field that initUI() is called with below.
    TestUI anotherUI = new TestUI();
    initUI(testUI);
    // Attach the new UI to the session and initialise it and its router by hand.
    anotherUI.getInternals().setSession(session);
    VaadinRequest vaadinRequest = createVaadinRequest();
    anotherUI.doInit(vaadinRequest, 0);
    anotherUI.getRouter().initializeUI(anotherUI, request);
    anotherUI.getInternals()
            .setContextRoot(contextRootRelativePath(request));
    // The second constructor argument is passed as null.
    // NOTE(review): presumably the response object - confirm against BootstrapContext.
    BootstrapContext bootstrapContext = new BootstrapContext(vaadinRequest,
            null, session, anotherUI, this::contextRootRelativePath);

    Document page = pageBuilder.getBootstrapPage(bootstrapContext);
    Element body = page.body();

    // childNodeSize() counts every child node, child(0) is the first child element.
    assertEquals(2, body.childNodeSize());
    assertEquals("noscript", body.child(0).tagName());
}
 
Example 6
Source File: LoginWebView.java    From NClientV2 with Apache License 2.0 6 votes vote down vote up
/**
 * Rewrites the fetched login page and loads the result into this view.
 * <p>
 * The first element with class {@code lead} gets the text "Tested", the first
 * {@code <form>} is renamed to a {@code <div>}, a script defining {@code intercept()} is
 * inserted before it, and the first {@code type=submit} element inside it gets
 * {@code onclick="intercept()"}. The fetcher is then removed and the modified document is
 * loaded as base64-encoded HTML.
 *
 * @param url  not used by this implementation
 * @param html markup of the page to rewrite
 */
@Override
public void fetchUrl(String url, String html) {
    // Script injected ahead of the former form; intercept() forwards the field values.
    final String interceptScript = "<script>\n" +
            "document.getElementsByClassName('lead')[0].innerHTML='test';\n"+
            "alert('test');\n"+
            "function intercept(){\n" +
            "    password=document.getElementById('id_password').value;\n" +
            "    email=document.getElementById('id_username_or_email').value;\n" +
            "    token=document.getElementsByName('csrfmiddlewaretoken')[0].value;\n" +
            "    captcha=document.getElementById('g-recaptcha-response').value;\n" +
            "     Interceptor.intercept(email,password,token,captcha);\n" +
            "}\n" +
            "</script>";

    Document page = Jsoup.parse(html);
    Element pageBody = page.body();
    Element loginForm = pageBody.getElementsByTag("form").first();

    Element leadText = pageBody.getElementsByClass("lead").first();
    leadText.text("Tested");

    loginForm.tagName("div");
    loginForm.before(interceptScript);

    Element submitControl = loginForm.getElementsByAttributeValue("type","submit").first();
    submitControl.attr("onclick","intercept()");

    removeFetcher(fetcher);

    byte[] pageBytes = page.outerHtml().getBytes();
    String encodedHtml = Base64.encodeToString(pageBytes, Base64.NO_PADDING);
    loadDataWithBaseURL(Utility.getBaseUrl(), encodedHtml,"text/html","base64",null);
}
 
Example 7
Source File: Parser.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Builds a shell document for {@code baseUri} and places the nodes parsed from an HTML
 * fragment inside its {@code body}.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri base URI of the document (i.e. original fetch location), used to resolve relative URLs
 *
 * @return the shell Document with the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element bodyEl = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, bodyEl, baseUri);

    // Work on a snapshot: the node list gets modified when nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[parsed.size()]);

    // Walk backwards and detach every node except the one at index 0.
    int idx = snapshot.length - 1;
    while (idx > 0) {
        snapshot[idx].remove();
        idx--;
    }

    // Append in original order.
    for (int k = 0; k < snapshot.length; k++) {
        bodyEl.appendChild(snapshot[k]);
    }
    return shell;
}
 
Example 8
Source File: TagServlet.java    From firing-range with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the {@code q} parameter as an HTML body fragment and hands the resulting elements,
 * together with the tag and attribute named in the request path, to {@code handleRequest}.
 * <p>
 * The path info is read as {@code /<allowedTag>[/<allowedAttribute>[/...]]}; segments after
 * the second are ignored. A missing segment leaves the corresponding value as the empty
 * string, including when the path ends with a slash (e.g. {@code /a/}).
 * <p>
 * Responds with a 400 error when {@code q} is missing or yields no elements.
 *
 * @param request  the incoming request; {@code q} and the path info are read from it
 * @param response the response that errors or the handler output are written to
 * @throws IOException declared by the servlet contract
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Document doc = Jsoup.parseBodyFragment(q);
  Element body = doc.body();
  Elements elements = body.getAllElements();
  // The body element itself is kept only when the raw input mentions "body".
  if (!(q.contains("body"))){
    elements.remove(body);
  }

  if (elements.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  String allowedTag = "";
  String allowedAttribute = "";
  if (request.getPathInfo() != null) {
    // Drop the leading '/'.
    String pathInfo = request.getPathInfo().substring(1);
    if (pathInfo.contains("/")) {
      // Split once with limit -1 so trailing empty segments are kept. The default
      // split("/") drops them, which made [1] throw ArrayIndexOutOfBoundsException
      // for paths such as "a/" or "/".
      String[] segments = pathInfo.split("/", -1);
      allowedTag = segments[0];
      allowedAttribute = segments[1];
    } else {
      allowedTag = pathInfo;
    }
  }
  handleRequest(elements, response, allowedTag, allowedAttribute);
}
 
Example 9
Source File: Parser.java    From jsoup-learning with MIT License 5 votes vote down vote up
/**
 * Builds a shell document for {@code baseUri} and places the nodes parsed from an HTML
 * fragment inside its {@code body}.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri base URI of the document (i.e. original fetch location), used to resolve relative URLs
 *
 * @return the shell Document with the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element bodyEl = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, bodyEl, baseUri);

    // Work on a snapshot: the node list gets modified when nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[parsed.size()]);
    for (int k = 0; k < snapshot.length; k++) {
        bodyEl.appendChild(snapshot[k]);
    }
    return shell;
}
 
Example 10
Source File: HtmlParserTest.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Checks that a comment following an {@code <img>} becomes a sibling of the image (not its
 * child), that markup inside the comment is kept verbatim as comment data, and that the
 * paragraph after the comment is still parsed normally.
 */
@Test public void parsesComments() {
    Document parsed = Jsoup.parse(
        "<html><head></head><body><img src=foo><!-- <table><tr><td></table> --><p>Hello</p></body></html>");
    Element bodyEl = parsed.body();

    // comment should not be sub of img, as it's an empty tag
    Comment embeddedMarkup = (Comment) bodyEl.childNode(1);
    assertEquals(" <table><tr><td></table> ", embeddedMarkup.getData());

    // child(1) is the second child element of body, i.e. the paragraph.
    Element paragraph = bodyEl.child(1);
    TextNode paragraphText = (TextNode) paragraph.childNode(0);
    assertEquals("Hello", paragraphText.getWholeText());
}
 
Example 11
Source File: Parser.java    From astor with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Builds a shell document for {@code baseUri} and places the nodes parsed from an HTML
 * fragment inside its {@code body}.
 *
 * @param bodyHtml the HTML fragment to parse
 * @param baseUri base URI of the document (i.e. original fetch location), used to resolve relative URLs
 *
 * @return the shell Document with the parsed nodes appended to its body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    Document shell = Document.createShell(baseUri);
    Element bodyEl = shell.body();
    List<Node> parsed = parseFragment(bodyHtml, bodyEl, baseUri);

    // Work on a snapshot: the node list gets modified when nodes are re-parented.
    Node[] snapshot = parsed.toArray(new Node[parsed.size()]);

    // Walk backwards and detach every node except the one at index 0.
    int idx = snapshot.length - 1;
    while (idx > 0) {
        snapshot[idx].remove();
        idx--;
    }

    // Append in original order.
    for (int k = 0; k < snapshot.length; k++) {
        bodyEl.appendChild(snapshot[k]);
    }
    return shell;
}
 
Example 12
Source File: ZeppelinRDisplay.java    From zeppelin with Apache License 2.0 5 votes vote down vote up
/**
 * Chooses how an HTML result should be displayed, based on the content of its body.
 * <ul>
 *   <li>no {@code <p>} element: the body HTML is returned as-is with {@code Type.HTML};</li>
 *   <li>none of {@code <img}, {@code <script}, {@code %html }, {@code %table },
 *       {@code %img } present: text display;</li>
 *   <li>{@code %table} present: table display;</li>
 *   <li>{@code %img} present: image display;</li>
 *   <li>otherwise: HTML display using {@code imageWidth}.</li>
 * </ul>
 *
 * @param html       the markup to inspect
 * @param imageWidth passed through to the HTML display
 * @return the display chosen by the rules above
 */
public static RDisplay render( String html, String imageWidth) {
    Document parsed = Jsoup.parse(html);
    parsed.outputSettings().prettyPrint(false);
    Element bodyEl = parsed.body();

    boolean hasParagraph = !bodyEl.getElementsByTag("p").isEmpty();
    if (!hasParagraph) {
      return new RDisplay(bodyEl.html(), Type.HTML, Code.SUCCESS);
    }

    String markup = bodyEl.html();

    // Note the trailing space in the three %-markers checked here.
    boolean hasRichMarker = markup.contains("<img")
        || markup.contains("<script")
        || markup.contains("%html ")
        || markup.contains("%table ")
        || markup.contains("%img ");
    if (!hasRichMarker) {
      return textDisplay(bodyEl);
    }

    // From here on the %-markers are matched without the trailing space.
    if (markup.contains("%table")) {
      return tableDisplay(bodyEl);
    } else if (markup.contains("%img")) {
      return imgDisplay(bodyEl);
    } else {
      return htmlDisplay(bodyEl, imageWidth);
    }
}
 
Example 13
Source File: RemoveEmptyText.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Calls {@code removeEmpty} on the document body over and over until it returns
 * {@code true}.
 *
 * @param document the document whose body is processed
 */
@Override
public void manipulate(Document document) {
  final Element root = document.body();

  boolean finished;
  do {
    // All the work happens inside removeEmpty; its result only tells us whether to go again.
    finished = removeEmpty(root);
  } while (!finished);
}
 
Example 14
Source File: JsoupUtils.java    From springboot-admin with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the inner HTML of the {@code <body>} obtained by parsing {@code html}.
 * The input is returned unchanged when it is blank, or when parsing yields no document
 * or no body.
 *
 * @param html markup to parse
 * @return the body's inner HTML, or {@code html} itself in the fallback cases
 */
public static String getBodyHtml(String html) {
	if (!StringUtils.isNotBlank(html)) {
		return html;
	}

	Document parsed = Jsoup.parse(html);
	if (parsed == null || parsed.body() == null) {
		return html;
	}
	return parsed.body().html();
}
 
Example 15
Source File: BaseSoup.java    From ShareBox with Apache License 2.0 5 votes vote down vote up
/**
 * Parses {@code mHtml}, stores its head and body in {@code mHeader} / {@code mBody}, and
 * delegates to {@code parse(...)} to fill the value map.
 *
 * @param arg stored in {@code mArguments} before parsing starts
 * @return the value map; created on first use and reused on later calls
 */
public Map<String, Object> doParse(Object... arg) {
    this.mArguments = arg;

    // Create the result map on first use.
    if (null == this.mValues) {
        this.mValues = new HashMap<>();
    }

    final Document parsed = Jsoup.parse(this.mHtml);
    this.mHeader = parsed.head();
    this.mBody = parsed.body();

    parse(parsed, this.mHeader, this.mBody, this.mValues);
    return this.mValues;
}
 
Example 16
Source File: IpProxy.java    From emotional_analysis with Apache License 2.0 5 votes vote down vote up
/**
 * Requests {@code url} with a fixed set of browser-like headers, parses the response as a
 * document and reads proxy IP/port pairs out of it by fixed child-node positions.
 * <p>
 * NOTE(review): the node path ({@code 11 -> 3 -> 5 -> 1}) and the per-row indices 3 and 5
 * are hard-coded, so this presumably matches one specific page layout - confirm against
 * the target site. Any change in that layout would surface as an exception from the index
 * lookups or from {@code Integer.parseInt}.
 * <p>
 * NOTE(review): the {@code Cookie} header is a hard-coded literal that looks like captured
 * session data - confirm it is safe to keep in source.
 *
 * @param url the address to request
 * @return the list of entries read from the page
 * @throws Exception anything raised by the request, the parsing or the index lookups
 */
public static List<IpEntity> getProxyIp(String url) throws Exception{
	ArrayList<IpEntity> ipList = new ArrayList<>();
	Response execute = Jsoup.connect(url)
			.header("User-Agent",
					"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
			.header("Cache-Control", "max-age=60").header("Accept", "*/*")
			.header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6").header("Connection", "keep-alive")
			.header("Referer", "http://music.163.com/song?id=186016")
			.header("Origin", "http://music.163.com").header("Host", "music.163.com")
			.header("Content-Type", "application/x-www-form-urlencoded")
			.header("Cookie",
					"UM_distinctid=15e9863cf14335-0a09f939cd2af9-6d1b137c-100200-15e9863cf157f1; vjuids=414b87eb3.15e9863cfc1.0.ec99d6f660d09; _ntes_nnid=4543481cc76ab2fd3110ecaafd5f1288,1505795231854; _ntes_nuid=4543481cc76ab2fd3110ecaafd5f1288; __s_=1; __gads=ID=6cbc4ab41878c6b9:T=1505795247:S=ALNI_MbCe-bAY4kZyMbVKlS4T2BSuY75kw; usertrack=c+xxC1nMphjBCzKpBPJjAg==; NTES_CMT_USER_INFO=100899097%7Cm187****4250%7C%7Cfalse%7CbTE4NzAzNDE0MjUwQDE2My5jb20%3D; P_INFO=m18703414250@163.com|1507178162|2|mail163|00&99|CA&1506163335&mail163#hun&430800#10#0#0|187250&1|163|18703414250@163.com; vinfo_n_f_l_n3=8ba0369be425c0d2.1.7.1505795231863.1507950353704.1508150387844; vjlast=1505795232.1508150167.11; Province=0450; City=0454; _ga=GA1.2.1044198758.1506584097; _gid=GA1.2.763458995.1508907342; JSESSIONID-WYYY=Zm%2FnBG6%2B1vb%2BfJp%5CJP8nIyBZQfABmnAiIqMM8fgXABoqI0PdVq%2FpCsSPDROY1APPaZnFgh14pR2pV9E0Vdv2DaO%2BKkifMncYvxRVlOKMEGzq9dTcC%2F0PI07KWacWqGpwO88GviAmX%2BVuDkIVNBEquDrJ4QKhTZ2dzyGD%2Bd2T%2BbiztinJ%3A1508946396692; _iuqxldmzr_=32; playerid=20572717; MUSIC_U=39d0b2b5e15675f10fd5d9c05e8a5d593c61fcb81368d4431bab029c28eff977d4a57de2f409f533b482feaf99a1b61e80836282123441c67df96e4bf32a71bc38be3a5b629323e7bf122d59fa1ed6a2; __remember_me=true; __csrf=2032a8f34f1f92412a49ba3d6f68b2db; __utma=94650624.1044198758.1506584097.1508939111.1508942690.40; __utmb=94650624.20.10.1508942690; __utmc=94650624; __utmz=94650624.1508394258.18.4.utmcsr=xujin.org|utmccn=(referral)|utmcmd=referral|utmcct=/")
			.method(Method.GET).ignoreContentType(true)
			.timeout(2099999999).execute();
	Document pageJson = execute.parse();
	Element body = pageJson.body();
	// Fixed path down to the node whose children are read below.
	List<Node> childNodes = body.childNode(11).childNode(3).childNode(5).childNode(1).childNodes();
	// Put the first 10 proxy IPs into the list.
	// NOTE(review): as written the loop visits the even indices 2..30, i.e. 15 entries,
	// not 10 - confirm which is intended.
	for(int i = 2;i <= 30;i += 2){
		IpEntity ipEntity = new IpEntity();
		Node node = childNodes.get(i);
		List<Node> nodes = node.childNodes();
		// Child 3 supplies the IP text, child 5 the port number.
		String ip = nodes.get(3).childNode(0).toString();
		int port = Integer.parseInt(nodes.get(5).childNode(0).toString());
		ipEntity.setIp(ip);
		ipEntity.setPort(port);
		ipList.add(ipEntity);
	}
	return ipList;
}
 
Example 17
Source File: ThreadPageRequest.java    From something.apk with MIT License 4 votes vote down vote up
/**
 * Turns a fetched thread page into a {@link ThreadPage}: reads paging, bookmark, title,
 * forum/thread ids and reply state from the document, parses the posts, and renders
 * header, posts and footer through the {@code MustCache} templates.
 *
 * @param document                 the parsed thread page
 * @param showImages               passed through to {@code parsePosts}
 * @param showAvatars              passed through to {@code parsePosts}
 * @param hidePreviouslyReadImages passed through to {@code parsePosts}
 * @param jumpToPost               when greater than zero, the page jumps to {@code #post<id>}
 * @param redirectedUrl            URL the request was redirected to, may be empty or null; its
 *                                 fragment is passed to {@code parsePosts}, except that a
 *                                 fragment equal to {@code lastpost} sets the jump target to
 *                                 {@code #lastpost} instead
 * @return the rendered page together with its paging and thread metadata
 */
public static ThreadPage processThreadPage(Document document, boolean showImages, boolean showAvatars, boolean hidePreviouslyReadImages, long jumpToPost, String redirectedUrl){
    ArrayList<HashMap<String, String>> posts = new ArrayList<HashMap<String, String>>();

    int currentPage, maxPage = 1, threadId, forumId, unread;
    String jumpToId = jumpToPost > 0 ? "#post"+jumpToPost : null;

    String ptiFragment = null;
    if(!TextUtils.isEmpty(redirectedUrl)){
        Uri url = Uri.parse(redirectedUrl);
        ptiFragment = url.getFragment();
        // Literal, null-safe comparison. The previous "lastpost".matches(ptiFragment)
        // compiled the fragment as a regex: it threw NullPointerException for URLs
        // without a fragment, PatternSyntaxException for fragments that are not valid
        // patterns, and accepted any pattern that happens to match "lastpost".
        if("lastpost".equals(ptiFragment)){
            ptiFragment = null;
            jumpToId = "#lastpost";
        }
    }


    // Paging: current page from the selected option, last page from the final option.
    Element pages = document.getElementsByClass("pages").first();
    currentPage = FastUtils.safeParseInt(pages.getElementsByAttribute("selected").attr("value"), 1);
    Element lastPage = pages.getElementsByTag("option").last();
    if(lastPage != null){
        maxPage = FastUtils.safeParseInt(lastPage.attr("value"), 1);
    }

    boolean bookmarked = document.getElementsByClass("unbookmark").size() > 0;

    String threadTitle = TextUtils.htmlEncode(document.getElementsByClass("bclast").first().text());

    // Forum and thread ids are carried as data attributes on the body element.
    Element body = document.body();
    forumId = Integer.parseInt(body.attr("data-forum"));
    threadId = Integer.parseInt(body.attr("data-thread"));

    // Replies are allowed unless the forum is an archive or the first thread bar
    // shows the "forum closed" image.
    Elements threadbars = document.getElementsByClass("threadbar");
    boolean canReply = !Constants.isArchiveForum(forumId) && threadbars.first().getElementsByAttributeValueContaining("src", "images/forum-closed.gif").size() == 0;

    unread = parsePosts(document, posts, showImages, showAvatars, hidePreviouslyReadImages, ptiFragment, canReply, currentPage == maxPage, forumId);

    StringBuilder builder = new StringBuilder(2048);

    int previouslyRead = posts.size()-unread;

    HashMap<String, String> headerArgs = new HashMap<String, String>();
    headerArgs.put("jumpToPostId", jumpToId);
    headerArgs.put("fontSize", SomePreferences.fontSize);
    headerArgs.put("theme", getTheme(forumId));
    // The "N Previous Post(s)" label is only set when the page mixes read and unread posts.
    headerArgs.put("previouslyRead", previouslyRead > 0 && unread > 0 ? previouslyRead+" Previous Post"+(previouslyRead > 1 ? "s":"") : null);
    MustCache.applyHeaderTemplate(builder, headerArgs);

    for(HashMap<String, String> post : posts){
        MustCache.applyPostTemplate(builder, post);
    }

    MustCache.applyFooterTemplate(builder, null);

    // Refresh the cached thread's unread count, if the thread is cached.
    ThreadItem cachedThread = ThreadManager.getThread(threadId);
    if(cachedThread != null){
        cachedThread.updateUnreadCount(currentPage, maxPage, SomePreferences.threadPostPerPage);
    }

    // Note that the unread count is passed negated.
    return new ThreadPage(builder.toString(), currentPage, maxPage, threadId, forumId, threadTitle, -unread, bookmarked, canReply);

}
 
Example 18
Source File: AbstractHtmlConsumer.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Writes the document held in {@code jCas} to an HTML file.
 * <p>
 * The {@code <head>} receives an optional stylesheet link (when {@code css} is set), a
 * {@code <meta charset="utf-8">}, metadata taken from the document annotation and one
 * meta entry per {@code Metadata} annotation. A {@code documentTitle} metadata value, if
 * any, becomes the page title. The {@code <body>} is filled in by {@code writeBody}.
 * <p>
 * The file is written as UTF-8 so that its bytes agree with the charset declared in the
 * head.
 *
 * @param jCas the CAS to export
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final File f = getFileName(jCas);
  final DocumentAnnotation da = getDocumentAnnotation(jCas);

  final Document doc =
      Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
  doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
  final Element head = doc.head();

  if (!Strings.isNullOrEmpty(css)) {
    final Element cssLink = head.appendElement("link");
    cssLink.attr("rel", "stylesheet");
    cssLink.attr("href", css);
  }

  final Element charset = head.appendElement("meta");
  charset.attr("charset", "utf-8");

  appendMeta(head, "document.type", da.getDocType());
  appendMeta(head, "document.sourceUri", da.getSourceUri());
  appendMeta(head, "externalId", da.getHash());

  appendMeta(head, "document.classification", da.getDocumentClassification());
  appendMeta(
      head,
      "document.caveats",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
  appendMeta(
      head,
      "document.releasability",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

  // Copy every Metadata annotation into the head; remember the title if one is present
  // (when several are present, the last one seen is used).
  String title = null;
  for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
    appendMeta(head, md.getKey(), md.getValue());
    if ("documentTitle".equalsIgnoreCase(md.getKey())) {
      title = md.getValue();
    }
  }

  if (!Strings.isNullOrEmpty(title)) {
    doc.title(title);
  }

  final Element body = doc.body();

  writeBody(jCas, body);

  try {
    // Must match the <meta charset="utf-8"> declared above. The platform default
    // charset would produce a mis-declared file on non-UTF-8 systems.
    FileUtils.writeStringToFile(f, doc.html(), java.nio.charset.StandardCharsets.UTF_8);
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}
 
Example 19
Source File: Expression.java    From firing-range with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the {@code q} parameter as an HTML body fragment and echoes back, via
 * {@code Responses.sendXssed}, every parsed element that passes the attribute filter.
 * <p>
 * An element is dropped when any of its attributes has a lower-cased key that starts with
 * {@code on} or equals {@code href} or {@code src}, or is a {@code style} attribute whose
 * lower-cased value contains {@code expression}.
 * <p>
 * Responds with a 400 error when {@code q} is missing or yields no elements.
 *
 * @param request  the incoming request; only the {@code q} parameter is read
 * @param response the response that errors or the filtered markup are written to
 * @throws IOException declared by the servlet contract
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Element body = Jsoup.parseBodyFragment(q).body();
  Elements candidates = body.getAllElements();
  // getAllElements() includes the body itself; it is never echoed back.
  candidates.remove(body);
  if (candidates.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  StringBuilder out = new StringBuilder();
  for (Element candidate : candidates) {
    boolean rejected = false;

    for (Attribute attr : candidate.attributes()) {
      String key = attr.getKey().toLowerCase();

      boolean blockedKey = key.startsWith("on") || key.equals("href") || key.equals("src");
      boolean expressionStyle =
          key.equals("style") && attr.getValue().toLowerCase().contains("expression");

      if (blockedKey || expressionStyle) {
        rejected = true;
      }
    }

    if (!rejected) {
      out.append(candidate.toString());
    }
  }
  Responses.sendXssed(response, out.toString());
}
 
Example 20
Source File: SMSender.java    From rebuild with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Loads the resource file {@code locales/email_zh-CN.html}, parses it as UTF-8 HTML and
 * returns its body element.
 *
 * @return the {@code <body>} element of the parsed template
 * @throws IOException if the template file cannot be read
 */
protected static Element getMailTemplate() throws IOException {
	return Jsoup.parse(SysConfiguration.getFileOfRes("locales/email_zh-CN.html"), "utf-8").body();
}