org.jsoup.parser.Parser Java Examples

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

6 votes

@Test
public void handles200WithNoContent() throws IOException {
    // Both requests hit the same endpoint; only the parser configuration differs.
    final String noContentUrl = "http://direct.infohound.net/tools/200-no-content.pl";

    // First request: no parser is set explicitly on the connection.
    Connection.Response plainResponse = Jsoup
        .connect(noContentUrl)
        .userAgent(browserUa)
        .execute();
    plainResponse.parse(); // parsing must not throw; the resulting document is not inspected
    assertEquals(200, plainResponse.statusCode());

    // Second request: the XML parser is requested explicitly.
    Connection.Response xmlResponse = Jsoup
        .connect(noContentUrl)
        .parser(Parser.xmlParser())
        .userAgent(browserUa)
        .execute();
    xmlResponse.parse(); // parsing must not throw; the resulting document is not inspected
    assertEquals(200, xmlResponse.statusCode());
}

Source File: JsoupBasedFormatter.java From formatter-maven-plugin with Apache License 2.0

6 votes

/**
 * Re-serialises {@code code} through jsoup using the configured output settings.
 *
 * @param code   source text to format
 * @param ending line ending argument; not read by this method
 * @return the formatted text, or {@code null} when it equals {@code code}
 * @throws IllegalArgumentException if the configured syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    // Select the parser that matches the configured output syntax.
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    Document document = Jsoup.parse(code, "", parser);
    document.outputSettings(formatter);

    String formattedCode = document.outerHtml();
    return code.equals(formattedCode) ? null : formattedCode;
}

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

6 votes

@Test
public void wrongMetaCharsetFallback() {
    // The meta tag names a charset ("iso-8") that does not exist; parsing must still succeed.
    final String expected = "<html>\n" +
                            " <head>\n" +
                            "  <meta charset=\"iso-8\">\n" +
                            " </head>\n" +
                            " <body></body>\n" +
                            "</html>";
    try {
        final ByteBuffer inBuffer = ByteBuffer.wrap(
                "<html><head><meta charset=iso-8></head><body></body></html>".getBytes("UTF-8"));

        Document parsed = DataUtil.parseByteData(inBuffer, null, "http://example.com", Parser.htmlParser());

        assertEquals(expected, parsed.toString());
    } catch (UnsupportedEncodingException ex) {
        fail(ex.getMessage());
    }
}

Source File: Ch5Coz4.java From CrawlerPack with Apache License 2.0

6 votes

/**
 * Fetches JSON from {@code url}, converts it to XML, and prints the XML both
 * before and after it has been round-tripped through jsoup's XML parser.
 */
public static void normalXmlParse() {
    String xml = CrawlerPack.jsonToXml(CrawlerPack.getFromRemote(url));

    // Result of converting the original JSON to XML
    System.out.println("原始XML");
    System.out.println(xml);

    Document parsedXml = Jsoup.parse(xml, "", Parser.xmlParser());
    parsedXml.charset(StandardCharsets.UTF_8);

    // What happened? Print the document as jsoup serialises it.
    System.out.println("轉換後XML");
    String roundTripped = parsedXml.toString();
    System.out.println(roundTripped);
}

Source File: AppShellSettings.java From flow with Apache License 2.0

6 votes

/**
 * Builds the element for this inline content.
 *
 * Loads the content on first use, resolves an AUTOMATIC wrapping from the
 * file extension, and wraps the content in a style or script element; any
 * other wrapping is parsed with the XML parser and returned as-is.
 */
private Element element(VaadinRequest request) {
    // Load lazily; the loaded text is kept in the field.
    if (content == null) {
        content = BootstrapUtils.getDependencyContents(request, file);
    }

    // Resolve AUTOMATIC from the file extension when a file name is known.
    if (type == Wrapping.AUTOMATIC && file != null) {
        String lowerCaseName = file.toLowerCase();
        if (lowerCaseName.endsWith(".css")) {
            type = Wrapping.STYLESHEET;
        } else if (lowerCaseName.endsWith(".js")) {
            type = Wrapping.JAVASCRIPT;
        }
    }

    if (type == Wrapping.STYLESHEET) {
        return createElement("style", content, "type", "text/css");
    } else if (type == Wrapping.JAVASCRIPT) {
        return createElement("script", content, "type",
                "text/javascript");
    } else {
        return Jsoup.parse(content, "", Parser.xmlParser());
    }
}

Source File: JerryExtractor.java From web-data-extractor with Apache License 2.0

6 votes

/**
 * Parses {@code str} with jsoup's XML parser and renders it according to {@code outType}:
 * the inner HTML for TYPE_HTML, the text content for everything else.
 */
private String parse(String str) {
    Document parsed = Jsoup.parse(str, "", Parser.xmlParser());
    switch (outType) {
        case TYPE_HTML:
            return parsed.html();
        case TYPE_TEXT:
        default:
            // Text output is both the explicit TYPE_TEXT choice and the fallback.
            return parsed.text();
    }
}

Source File: SelectorTest.java From jsoup-learning with MIT License

6 votes

/**
 * Parses a sample document containing comments in several table positions
 * and prints every element matched by the selector {@code body div}.
 */
public static void main(String[] args) {
    final String sampleHtml = "<body>\n" +
            " <textarea>\n" +
            "        &lt;!-- Text --&gt;\n" +
            "        xxx\n" +
            "    </textarea> \n" +
            " <div> \n" +
            "  <table> \n" +
            "   <!-- InTable --> \n" +
            "   <!-- InTableText --> xxx \n" +
            "   <tbody> \n" +
            "    <tr> \n" +
            "     <!-- InRow --> \n" +
            "     <td> \n" +
            "      <!-- InCell --> </td> \n" +
            "    </tr> \n" +
            "   </tbody> \n" +
            "  </table> \n" +
            " </div> \n" +
            "</body>";

    Document parsed = Parser.htmlParser().parseInput(sampleHtml, "");
    Elements divsInBody = parsed.select("body div");
    System.out.println(divsInBody);
}

Source File: NicoAudioSourceManager.java From lavaplayer with Apache License 2.0

6 votes

/**
 * Requests the thumb-info XML for {@code videoId} and turns it into a track.
 *
 * Any IOException raised while fetching or parsing is rethrown as a
 * FriendlyException with SUSPICIOUS severity.
 */
private AudioTrack loadTrack(String videoId) {
  checkLoggedIn();

  String infoUrl = "http://ext.nicovideo.jp/api/getthumbinfo/" + videoId;

  // Resources close in reverse order: the response first, then the HTTP interface.
  try (HttpInterface httpInterface = getHttpInterface();
       CloseableHttpResponse response = httpInterface.execute(new HttpGet(infoUrl))) {
    int statusCode = response.getStatusLine().getStatusCode();
    if (!HttpClientTools.isSuccessWithContent(statusCode)) {
      throw new IOException("Unexpected response code from video info: " + statusCode);
    }

    Document infoXml = Jsoup.parse(response.getEntity().getContent(), StandardCharsets.UTF_8.name(), "", Parser.xmlParser());
    return extractTrackFromXml(videoId, infoXml);
  } catch (IOException e) {
    throw new FriendlyException("Error occurred when extracting video info.", SUSPICIOUS, e);
  }
}

Source File: BoxDotComAccount.java From neembuu-uploader with GNU General Public License v3.0

6 votes

/**
 * Exchanges the current ticket for an auth token.
 *
 * Calls the get_auth_token REST action, parses the XML response, and stores
 * the text of {@code response auth_token} as an encrypted property under
 * KEY_AUTH_TOKEN.
 * @throws Exception if the request or response handling fails
 */
private void getUserInfo() throws Exception {
    NULogger.getLogger().log(Level.INFO, "{0} Getting auth token value............", getClass());

    // The ticket is appended to the fixed get_auth_token endpoint.
    final String authTokenUrl =
            "https://www.box.net/api/1.0/rest?action=get_auth_token&api_key=vkf3k5dh0tg1ibvcikjcp8sx0f89d14u&ticket=" + ticket;
    httpGet = new NUHttpGet(authTokenUrl);
    httpResponse = httpclient.execute(httpGet, httpContext);
    responseString = EntityUtils.toString(httpResponse.getEntity());

    // The response is XML, so use the XML parser rather than the HTML one.
    doc = Jsoup.parse(responseString, "", Parser.xmlParser());
    String authToken = doc.select("response auth_token").text();

    // NOTE(review): the token is logged in clear text here although it is stored encrypted below.
    NULogger.getLogger().log(Level.INFO, "{0} Auth_token : {1}", new Object[]{getClass(), authToken});
    properties().setEncryptedProperty(KEY_AUTH_TOKEN, authToken);
}

Source File: Node.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses {@code html} as a fragment and inserts the resulting nodes into the
 * parent's children at {@code index}.
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // The parent is used as parsing context only when it is an Element.
    Element context = null;
    if (parent() instanceof Element) {
        context = (Element) parent();
    }

    List<Node> parsed = Parser.parseFragment(html, context, baseUri());
    Node[] siblings = parsed.toArray(new Node[0]);
    parentNode.addChildren(index, siblings);
}

Source File: DataUtil.java From jsoup-learning with MIT License

5 votes

/**
 * Reads a file and parses its bytes into a Document with the HTML parser.
 * @param in file to read
 * @param charsetName character set name passed through to the byte-data parser
 * @param baseUri base URI of the document, used to resolve relative links
 * @return the parsed Document
 * @throws IOException if opening, reading or closing the file fails
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    // Open before the try: if the constructor throws there is nothing to close.
    FileInputStream fileStream = new FileInputStream(in);
    try {
        ByteBuffer fileBytes = readToByteBuffer(fileStream);
        return parseByteData(fileBytes, charsetName, baseUri, Parser.htmlParser());
    } finally {
        fileStream.close();
    }
}

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses the supplied HTML in the context of this element and inserts the
 * resulting nodes before the element's existing children.
 * @param html HTML to insert at the start of this element's children; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    Node[] leading = parsed.toArray(new Node[0]);
    addChildren(0, leading);
    return this;
}

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Parses the supplied HTML in the context of this element and adds the
 * resulting nodes after the element's existing children.
 * @param html HTML to add at the end of this element's children; must not be null
 * @return this element
 * @see #html(String)
 */
public Element append(String html) {
    Validate.notNull(html);

    List<Node> parsed = Parser.parseFragment(html, this, baseUri());
    Node[] trailing = parsed.toArray(new Node[0]);
    addChildren(trailing);
    return this;
}

Source File: RssLoader.java From android-opensource-library-56 with Apache License 2.0

5 votes

/**
 * Downloads the feed, parses it with jsoup's XML parser and hands the
 * document to the CSS-selector based extraction.
 *
 * Failures are printed and otherwise ignored, so the list is returned in
 * whatever state it is in.
 */
@Override
public RssList loadInBackground() {
    try {
        Document feedDocument = Jsoup
                .connect(mFeed.url)
                .parser(Parser.xmlParser())
                .get();
        // Alternative extraction strategy: parseDomTraverse(feedDocument)
        parseCssSelector(feedDocument);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return mList;
}

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

5 votes

@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // An explicitly requested HTML parser must be used even though the resource is XML.
    final String expectedHtml =
        "<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>";

    Connection con = Jsoup
        .connect("http://direct.infohound.net/tools/parse-xml.xml")
        .parser(Parser.htmlParser());
    Document fetched = con.get();

    Connection.Request request = con.request();
    assertTrue(request.parser().getTreeBuilder() instanceof HtmlTreeBuilder);
    assertEquals(expectedHtml, StringUtil.normaliseWhitespace(fetched.outerHtml()));
}

Source File: CharsetIdentification.java From storm-crawler with Apache License 2.0

5 votes

/**
 * Looks for a META tag in the HTML that names the character set of the document.
 *
 * Only the first {@code maxlength} bytes are inspected when that value is
 * positive and smaller than the buffer. Returns the first charset found, or
 * {@code null} when none is declared or parsing fails.
 */
private static String getCharsetFromMeta(byte buffer[], int maxlength) {
    // Decode with the default charset; the markup we look for is expected to survive that.
    int limit = (maxlength > 0 && maxlength < buffer.length) ? maxlength
            : buffer.length;
    String html = new String(buffer, 0, limit, DEFAULT_CHARSET);

    try {
        Document doc = Parser.htmlParser().parseInput(html, "dummy");

        // Matches <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
        // as well as the HTML5 form <meta charset="gb2312">.
        Elements candidates = doc
                .select("meta[http-equiv=content-type], meta[charset]");
        for (Element meta : candidates) {
            String charset = null;
            if (meta.hasAttr("http-equiv")) {
                charset = getCharsetFromContentType(meta.attr("content"));
            }
            // Fall back to the charset attribute when the content type gave nothing.
            if (charset == null && meta.hasAttr("charset")) {
                charset = meta.attr("charset");
            }
            if (charset != null) {
                return charset;
            }
        }
    } catch (Exception e) {
        // Any failure while parsing or selecting is treated as "no charset declared".
        return null;
    }

    return null;
}

Source File: UrlConnectTest.java From astor with GNU General Public License v2.0

5 votes

@Test
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
    // XML would normally be auto-detected; here the HTML parser is forced on the connection.
    String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml";
    Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser());

    String normalisedOutput = StringUtil.normaliseWhitespace(con.get().outerHtml());

    boolean usedHtmlTreeBuilder = con.request().parser().getTreeBuilder() instanceof HtmlTreeBuilder;
    assertTrue(usedHtmlTreeBuilder);
    assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", normalisedOutput);
}

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

@Test
public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
    // Input starts with a byte order mark and no charset is passed to the parser.
    String htmlWithBom = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
    ByteBuffer encoded = Charset.forName("UTF-8").encode(htmlWithBom);

    Document parsed = DataUtil.parseByteData(encoded, null, "http://foo.com/", Parser.htmlParser());

    assertEquals("One", parsed.head().text());
    assertEquals("UTF-8", parsed.outputSettings().charset().displayName());
}

Source File: Cleaner.java From astor with GNU General Public License v2.0

5 votes

/**
 * Checks whether a body HTML fragment passes this cleaner unchanged.
 *
 * The fragment is valid when copying its nodes into a clean shell discards
 * nothing and the parser recorded no errors.
 */
public boolean isValidBodyHtml(String bodyHtml) {
    Document dirty = Document.createShell("");
    Document clean = Document.createShell("");

    // Track at most one parse error; a single one already makes the input invalid.
    ParseErrorList parseErrors = ParseErrorList.tracking(1);
    List<Node> fragment = Parser.parseFragment(bodyHtml, dirty.body(), "", parseErrors);
    dirty.body().insertChildren(0, fragment);

    int discardedCount = copySafeNodes(dirty.body(), clean.body());
    if (discardedCount != 0) {
        return false;
    }
    return parseErrors.size() == 0;
}

Source File: HttpConnection.java From astor with GNU General Public License v2.0

5 votes

/** Creates a request with the default settings and headers. */
Request() {
    // Request behaviour
    method = Method.GET;
    followRedirects = true;
    timeoutMilliseconds = 30000; // 30 seconds
    maxBodySizeBytes = 1024 * 1024; // 1MB

    // Payload and parsing
    data = new ArrayList<>();
    parser = Parser.htmlParser();

    // Default headers, added in the original order
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);
}

Source File: TextExtractorTest.java From storm-crawler with Apache License 2.0

5 votes

@Test
public void testExclusionCase() throws IOException {
    // The exclusion is configured in lower case while the markup uses upper case.
    Config conf = new Config();
    conf.put(TextExtractor.EXCLUDE_PARAM_NAME, "style");
    TextExtractor extractor = new TextExtractor(conf);

    Document parsed = Parser.htmlParser().parseInput(
            "<html>the<STYLE>main</STYLE>content of the page</html>",
            "http://stormcrawler.net");

    String extracted = extractor.text(parsed.body());
    assertEquals("the content of the page", extracted);
}

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Inserts parsed HTML at the start of this element's children.
 * The HTML is parsed as a fragment with this element as context.
 * @param html HTML to place before the existing children; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    List<Node> fragment = Parser.parseFragment(html, this, baseUri());
    Node[] newChildren = new Node[fragment.size()];
    addChildren(0, fragment.toArray(newChildren));
    return this;
}

Source File: Element.java From astor with GNU General Public License v2.0

5 votes

/**
 * Adds inner HTML in front of this element's current children. The HTML is
 * parsed first, and the parsed nodes are inserted at child index 0.
 * @param html HTML to add inside this element ahead of its existing content; must not be null
 * @return this element
 * @see #html(String)
 */
public Element prepend(String html) {
    Validate.notNull(html);

    final String base = baseUri();
    List<Node> parsedNodes = Parser.parseFragment(html, this, base);
    addChildren(0, parsedNodes.toArray(new Node[0]));
    return this;
}

Source File: Node.java From astor with GNU General Public License v2.0

5 votes

/**
 * Inserts the nodes parsed from {@code html} into the parent's child list at {@code index}.
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // Without an Element parent the fragment is parsed with no context.
    boolean parentIsElement = parent() instanceof Element;
    Element context = parentIsElement ? (Element) parent() : null;

    List<Node> fragment = Parser.parseFragment(html, context, baseUri());
    Node[] inserted = new Node[fragment.size()];
    parentNode.addChildren(index, fragment.toArray(inserted));
}

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

@Test
public void wrongMetaCharsetFallback() throws IOException {
    // "iso-8" is not a real charset; the document must still parse and keep the attribute.
    final String expected = "<html>\n" +
        " <head>\n" +
        "  <meta charset=\"iso-8\">\n" +
        " </head>\n" +
        " <body></body>\n" +
        "</html>";

    Document parsed = DataUtil.parseInputStream(
        stream("<html><head><meta charset=iso-8></head><body></body></html>"),
        null,
        "http://example.com",
        Parser.htmlParser());

    assertEquals(expected, parsed.toString());
}

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

@Test
public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception {
    // Only the second Content-Type meta element carries a charset parameter.
    final String bodyText = "한국어";
    String html = "<html><head>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html\">" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-kr\">" +
            "</head><body>" + bodyText + "</body></html>";

    Document parsed = DataUtil.parseInputStream(
            stream(html, "euc-kr"), null, "http://example.com", Parser.htmlParser());

    assertEquals(bodyText, parsed.body().text());
}

Source File: DataUtilTest.java From astor with GNU General Public License v2.0

5 votes

@Test
public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception {
    // Two meta elements declare different charsets; the input is encoded with the first one.
    final String bodyText = "Übergrößenträger";
    String html = "<html><head>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=koi8-u\">" +
            "</head><body>" + bodyText + "</body></html>";

    Document parsed = DataUtil.parseInputStream(
            stream(html, "iso-8859-1"), null, "http://example.com", Parser.htmlParser());

    assertEquals(bodyText, parsed.body().text());
}

Source File: Cleaner.java From astor with GNU General Public License v2.0

5 votes

/**
 * Tests a body HTML fragment for validity: it is valid when no nodes are
 * discarded while copying to a clean document and parsing reported no errors.
 */
public boolean isValidBodyHtml(String bodyHtml) {
    ParseErrorList errorList = ParseErrorList.tracking(1);
    Document cleanShell = Document.createShell("");
    Document dirtyShell = Document.createShell("");

    // Parse in the context of the dirty body, then attach the nodes to it.
    dirtyShell.body().insertChildren(
            0, Parser.parseFragment(bodyHtml, dirtyShell.body(), "", errorList));

    int numDiscarded = copySafeNodes(dirtyShell.body(), cleanShell.body());
    boolean nothingDiscarded = numDiscarded == 0;
    return nothingDiscarded && errorList.size() == 0;
}

Source File: HttpConnection.java From astor with GNU General Public License v2.0

5 votes

/**
 * Initialises the request defaults: GET, redirects followed, 30 second
 * timeout, 1MB body limit, HTML parser, and the gzip / user-agent headers.
 */
Request() {
    parser = Parser.htmlParser();
    data = new ArrayList<>();

    method = Method.GET;
    followRedirects = true;
    maxBodySizeBytes = 1024 * 1024; // 1MB
    timeoutMilliseconds = 30000; // 30 seconds

    // Header insertion order is kept as in the original.
    addHeader("Accept-Encoding", "gzip");
    addHeader(USER_AGENT, DEFAULT_UA);
}

Source File: Node.java From jsoup-learning with MIT License

5 votes

/**
 * Parses {@code html} and adds the parsed nodes to the parent node's children,
 * starting at position {@code index}.
 */
private void addSiblingHtml(int index, String html) {
    Validate.notNull(html);
    Validate.notNull(parentNode);

    // Use the parent as fragment context only if it is an Element.
    Element context;
    if (parent() instanceof Element) {
        context = (Element) parent();
    } else {
        context = null;
    }

    List<Node> parsedNodes = Parser.parseFragment(html, context, baseUri());
    parentNode.addChildren(index, parsedNodes.toArray(new Node[0]));
}

org.jsoup.parser.Parser Java Examples