org.jsoup.parser.Parser#parseInput

Source File: SelectorTest.java From jsoup-learning with MIT License

6 votes

/**
 * Demo entry point: parses a fixed HTML sample with the HTML parser and prints
 * every element matched by the selector {@code "body div"}.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // The sample mixes a textarea holding an escaped comment with a table that has
    // comments at several nesting levels. It is appended one source line at a time.
    StringBuilder markup = new StringBuilder();
    markup.append("<body>\n");
    markup.append(" <textarea>\n");
    markup.append("        &lt;!-- Text --&gt;\n");
    markup.append("        xxx\n");
    markup.append("    </textarea> \n");
    markup.append(" <div> \n");
    markup.append("  <table> \n");
    markup.append("   <!-- InTable --> \n");
    markup.append("   <!-- InTableText --> xxx \n");
    markup.append("   <tbody> \n");
    markup.append("    <tr> \n");
    markup.append("     <!-- InRow --> \n");
    markup.append("     <td> \n");
    markup.append("      <!-- InCell --> </td> \n");
    markup.append("    </tr> \n");
    markup.append("   </tbody> \n");
    markup.append("  </table> \n");
    markup.append(" </div> \n");
    markup.append("</body>");

    // Parse with an empty base URI, then print whatever the selector matches.
    Document parsed = Parser.htmlParser().parseInput(markup.toString(), "");
    Elements matches = parsed.select("body div");
    System.out.println(matches);
}

Source File: DataUtil.java From jsoup-learning with MIT License

5 votes

/**
 * Decodes a buffer of bytes to text and parses it into a {@link Document}.
 *
 * <p>When {@code charsetName} is {@code null} the bytes are first decoded with the default
 * charset and parsed, and the result is searched for a
 * {@code <meta http-equiv="Content-Type" ...>} or {@code <meta charset="...">} element. If that
 * element names a different charset that this JVM supports, the bytes are decoded again with it
 * and re-parsed. If the declared charset is missing, empty, malformed or unsupported, the
 * default-charset parse is kept as the best attempt.
 *
 * <p>A leading byte-order mark (U+FEFF) left in the decoded text is removed before every parse,
 * including the provisional default-charset parse, so the returned document never starts with it.
 *
 * @param byteData    raw document bytes; rewound if a second decode is needed
 * @param charsetName charset to decode with, or {@code null} to detect it from the markup;
 *                    must not be empty when non-null
 * @param baseUri     base URI handed to the parser
 * @param parser      parser used to build the document
 * @return the parsed document
 */
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
    String docData;
    Document doc = null;
    if (charsetName == null) { // determine from meta. safe parse as UTF-8
        // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        // Drop a leading BOM before the provisional parse too; this document is the one
        // returned whenever no re-decode is required.
        if (docData.length() > 0 && docData.charAt(0) == '\uFEFF')
            docData = docData.substring(1);
        doc = parser.parseInput(docData, baseUri);
        Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
        if (meta != null) { // if not found, will keep utf-8 as best attempt
            String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
            if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) {
                // The name comes from untrusted markup: only re-decode when the JVM can really
                // provide that charset, otherwise keep the default-charset parse.
                boolean supported;
                try {
                    supported = Charset.isSupported(foundCharset);
                } catch (IllegalArgumentException ignored) {
                    // syntactically illegal charset name in the page; treat as "not declared"
                    supported = false;
                }
                if (supported) { // need to re-decode
                    charsetName = foundCharset;
                    byteData.rewind();
                    docData = Charset.forName(foundCharset).decode(byteData).toString();
                    doc = null;
                }
            }
        }
    } else { // specified by content type header (or by user on file load)
        Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    if (doc == null) {
        // there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
        // in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
        // into head mode
        if (docData.length() > 0 && docData.charAt(0) == '\uFEFF')
            docData = docData.substring(1);

        doc = parser.parseInput(docData, baseUri);
        doc.outputSettings().charset(charsetName);
    }
    return doc;
}

Source File: ParserCorrectorTest.java From jsoup-learning with MIT License

5 votes

/**
 * Demo entry point: parses an HTML sample that contains a {@code <div>} which is never closed,
 * with error tracking enabled, and prints the parse errors the parser recorded.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // Same shape as a well-formed sample, except that a second <div> is opened
    // and only one closing </div> follows.
    StringBuilder markup = new StringBuilder();
    markup.append("<body>\n");
    markup.append(" <textarea>\n");
    markup.append("        &lt;!-- Text --&gt;\n");
    markup.append("        xxx\n");
    markup.append("    </textarea> \n");
    markup.append(" <div> \n");
    markup.append(" <div>\n");
    markup.append("  <table> \n");
    markup.append("   <!-- InTable --> \n");
    markup.append("   <!-- InTableText --> xxx \n");
    markup.append("   <tbody> \n");
    markup.append("    <tr> \n");
    markup.append("     <!-- InRow --> \n");
    markup.append("     <td> \n");
    markup.append("      <!-- InCell --> </td> \n");
    markup.append("    </tr> \n");
    markup.append("   </tbody> \n");
    markup.append("  </table> \n");
    markup.append(" </div> \n");
    markup.append("</body>");
    String htmlWithDivUnclosed = markup.toString();

    // Error tracking has to be switched on before parsing; 100 is the value passed to setTrackErrors.
    Parser errorTrackingParser = Parser.htmlParser();
    errorTrackingParser.setTrackErrors(100);
    Document document = errorTrackingParser.parseInput(htmlWithDivUnclosed, "");

    List<ParseError> recorded = errorTrackingParser.getErrors();
    System.out.println(recorded);
}

Source File: PageErrorChecker.java From jsoup-learning with MIT License

5 votes

/**
 * Downloads the page at {@code url} and returns the parse errors recorded while parsing its body.
 *
 * <p>The request is sent with a desktop Chrome user-agent string, and the parser is created
 * with {@code setTrackErrors(100)}.
 *
 * @param url address of the page to fetch; also passed to the parser as the base URI
 * @return the errors collected by the parser for this page
 * @throws IOException if fetching the page fails
 */
public static List<ParseError> check(String url) throws IOException {
    final String browserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36";

    Parser trackingParser = Parser.htmlParser();
    trackingParser.setTrackErrors(100);

    String pageSource = Jsoup.connect(url)
            .userAgent(browserAgent)
            .execute()
            .body();

    // The resulting document is not needed here; parsing is done only to populate the error list.
    trackingParser.parseInput(pageSource, url);
    return trackingParser.getErrors();
}

Source File: DataUtil.java From astor with GNU General Public License v2.0

4 votes

/**
 * Decodes a buffer of bytes to text and parses it into a {@link Document}.
 *
 * <p>The charset is chosen in this order: a byte-order mark at the start of the buffer
 * (which overrides everything else), then the supplied {@code charsetName}, then a charset
 * declared in a {@code <meta>} element found by a provisional parse with the default charset.
 * When none of those yields a different charset, the provisional parse is returned as is.
 *
 * @param byteData    raw document bytes; its position is moved while sniffing and decoding
 * @param charsetName charset to decode with, or {@code null} to detect it from the markup;
 *                    must not be empty when non-null
 * @param baseUri     base URI handed to the parser
 * @param parser      parser used to build the document
 * @return the parsed document
 */
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
    // Peek at the first four bytes (only when at least four are available) and rewind.
    byteData.mark();
    byte[] head = new byte[4];
    if (byteData.remaining() >= head.length) {
        byteData.get(head);
        byteData.rewind();
    }

    final byte ff = (byte) 0xFF;
    final byte fe = (byte) 0xFE;
    boolean utf32BigEndian = head[0] == 0x00 && head[1] == 0x00 && head[2] == fe && head[3] == ff;
    boolean utf32LittleEndian = head[0] == ff && head[1] == fe && head[2] == 0x00 && head[3] == 0x00;
    boolean utf16BigEndian = head[0] == fe && head[1] == ff;
    boolean utf16LittleEndian = head[0] == ff && head[1] == fe;
    boolean utf8Marker = head[0] == (byte) 0xEF && head[1] == (byte) 0xBB && head[2] == (byte) 0xBF;

    // A BOM overrides any charset passed in. UTF-32 must be tested before UTF-16 because the
    // little-endian UTF-32 mark starts with the little-endian UTF-16 mark.
    if (utf32BigEndian || utf32LittleEndian) {
        charsetName = "UTF-32"; // and I hope it's on your system
    } else if (utf16BigEndian || utf16LittleEndian) {
        charsetName = "UTF-16"; // in all Javas
    } else if (utf8Marker) {
        charsetName = "UTF-8"; // in all Javas
        // the 16 and 32 decoders consume the BOM to determine be/le; for utf-8 step over it here
        byteData.position(3);
    }

    String docData;
    Document doc = null;

    if (charsetName != null) { // from a BOM, a content type header, or the user on file load
        Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        docData = Charset.forName(charsetName).decode(byteData).toString();
    } else { // determine from meta. safe parse as the default charset
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        doc = parser.parseInput(docData, baseUri);

        // <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
        Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
        if (meta != null) { // no meta: keep the default-charset parse as best attempt
            String foundCharset = null;
            if (meta.hasAttr("http-equiv")) {
                foundCharset = getCharsetFromContentType(meta.attr("content"));
            }
            if (foundCharset == null && meta.hasAttr("charset")) {
                String declared = meta.attr("charset");
                try {
                    if (Charset.isSupported(declared)) {
                        foundCharset = declared;
                    }
                } catch (IllegalCharsetNameException e) {
                    // illegal charset name in the page: behave as if none was declared
                    foundCharset = null;
                }
            }

            boolean mustRedecode = foundCharset != null
                    && foundCharset.length() != 0
                    && !foundCharset.equals(defaultCharset);
            if (mustRedecode) {
                String cleaned = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = cleaned;
                byteData.rewind();
                docData = Charset.forName(cleaned).decode(byteData).toString();
                doc = null; // discard the provisional parse
            }
        }
    }

    if (doc != null) {
        return doc;
    }
    doc = parser.parseInput(docData, baseUri);
    doc.outputSettings().charset(charsetName);
    return doc;
}

Source File: DataUtil.java From astor with GNU General Public License v2.0

4 votes

/**
 * Reads a byte stream, works out its character set, and parses it into a {@link Document}.
 *
 * <p>The charset is chosen in this order: a byte-order mark found in the first bytes (which
 * overrides everything else), then the supplied {@code charsetName}, then a charset declared in
 * a {@code <meta>} element or an {@code <?xml encoding=...?>} prolog found by a provisional
 * parse of the first bytes with the default charset. If the provisional parse already covered
 * the whole stream and no other charset was found, it is returned directly. Otherwise the
 * stream is parsed from a reader using the chosen charset (the default one if none was found).
 *
 * <p>The stream is closed before this method returns, whether it returns normally or throws.
 *
 * @param input       stream to read; {@code null} yields an empty document
 * @param charsetName charset to decode with, or {@code null} to detect it from the content;
 *                    must not be empty when non-null
 * @param baseUri     base URI handed to the parser
 * @param parser      parser used to build the document
 * @return the parsed document
 * @throws IOException if reading from or closing the stream fails
 */
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException  {
    if (input == null) // empty body
        return new Document(baseUri);

    if (!(input instanceof ConstrainableInputStream))
        input = new ConstrainableInputStream(input, bufferSize, 0);

    // Everything that touches the stream runs inside try/finally so that a failure while
    // reading, decoding or parsing cannot leave the stream open.
    try {
        Document doc = null;

        // read the start of the stream and look for a BOM or meta charset
        input.mark(firstReadBufferSize);
        ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed
        boolean fullyRead = input.read() == -1;
        input.reset();

        // look for BOM - overrides any other header or input
        BomCharset bomCharset = detectCharsetFromBom(firstBytes, charsetName);
        if (bomCharset != null) {
            charsetName = bomCharset.charset;
            input.skip(bomCharset.offset);
        }

        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
            String docData = Charset.forName(defaultCharset).decode(firstBytes).toString();
            doc = parser.parseInput(docData, baseUri);

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
                XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
                if (prolog.name().equals("xml"))
                    foundCharset = prolog.attr("encoding");
            }
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharset)) { // need to re-decode. (case insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (!fullyRead) {
                // the provisional parse only saw the first bytes, so it cannot be the final result
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }
        if (doc == null) {
            if (charsetName == null)
                charsetName = defaultCharset;
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize);
            doc = parser.parseInput(reader, baseUri);
            doc.outputSettings().charset(charsetName);
        }
        return doc;
    } finally {
        input.close();
    }
}

Source File: DataUtil.java From astor with GNU General Public License v2.0

4 votes

/**
 * Reads a byte stream, works out its character set, and parses it into a {@link Document}.
 *
 * <p>The charset is chosen in this order: a byte-order mark found in the first bytes (which
 * overrides everything else), then the supplied {@code charsetName}, then a charset declared in
 * a {@code <meta>} element or an {@code <?xml encoding=...?>} prolog found by a provisional
 * parse of the first bytes with the default charset. If the provisional parse already covered
 * the whole stream and no other charset was found, it is returned directly. Otherwise the
 * stream is parsed from a reader using the chosen charset (the default one if none was found).
 *
 * <p>The stream is closed before this method returns, whether it returns normally or throws.
 *
 * @param input       stream to read; {@code null} yields an empty document
 * @param charsetName charset to decode with, or {@code null} to detect it from the content;
 *                    must not be empty when non-null
 * @param baseUri     base URI handed to the parser
 * @param parser      parser used to build the document
 * @return the parsed document
 * @throws IOException if reading from or closing the stream fails
 */
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException  {
    if (input == null) // empty body
        return new Document(baseUri);

    if (!(input instanceof ConstrainableInputStream))
        input = new ConstrainableInputStream(input, bufferSize, 0);

    // Everything that touches the stream runs inside try/finally so that a failure while
    // reading, decoding or parsing cannot leave the stream open.
    try {
        Document doc = null;

        // read the start of the stream and look for a BOM or meta charset
        input.mark(firstReadBufferSize);
        ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed
        boolean fullyRead = input.read() == -1;
        input.reset();

        // look for BOM - overrides any other header or input
        BomCharset bomCharset = detectCharsetFromBom(firstBytes, charsetName);
        if (bomCharset != null) {
            charsetName = bomCharset.charset;
            input.skip(bomCharset.offset);
        }

        if (charsetName == null) { // determine from meta. safe first parse as UTF-8
            String docData = Charset.forName(defaultCharset).decode(firstBytes).toString();
            doc = parser.parseInput(docData, baseUri);

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) {
                XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
                if (prolog.name().equals("xml"))
                    foundCharset = prolog.attr("encoding");
            }
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharset)) { // need to re-decode. (case insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (!fullyRead) {
                // the provisional parse only saw the first bytes, so it cannot be the final result
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }
        if (doc == null) {
            if (charsetName == null)
                charsetName = defaultCharset;
            BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize);
            doc = parser.parseInput(reader, baseUri);
            doc.outputSettings().charset(charsetName);
        }
        return doc;
    } finally {
        input.close();
    }
}

Source File: Jsoup.java From astor with GNU General Public License v2.0

votes

/**
 Parses the given markup into a Document by delegating to the supplied Parser. Passing a different parser, for
 example a plain XML (non-HTML) one, changes how the input is interpreted.

 @param html    markup to parse
 @param baseUri The URL the markup was retrieved from. Used to resolve relative URLs to absolute URLs that occur
 before the markup declares a {@code <base href>} tag.
 @param parser  the parser to run, e.g. the alternate {@link Parser#xmlParser() XML parser}.
 @return the Document returned by {@code parser.parseInput(html, baseUri)}
 */
public static Document parse(String html, String baseUri, Parser parser) {
    final Document parsed = parser.parseInput(html, baseUri);
    return parsed;
}

Source File: Jsoup.java From astor with GNU General Public License v2.0

votes

/**
 Parses the given markup into a Document by delegating to the supplied Parser. Passing a different parser, for
 example a plain XML (non-HTML) one, changes how the input is interpreted.

 @param html    markup to parse
 @param baseUri The URL the markup was retrieved from. Used to resolve relative URLs to absolute URLs that occur
 before the markup declares a {@code <base href>} tag.
 @param parser  the parser to run, e.g. the alternate {@link Parser#xmlParser() XML parser}.
 @return the Document returned by {@code parser.parseInput(html, baseUri)}
 */
public static Document parse(String html, String baseUri, Parser parser) {
    final Document parsed = parser.parseInput(html, baseUri);
    return parsed;
}

Source File: Jsoup.java From astor with GNU General Public License v2.0

votes

/**
 Parses the given markup into a Document by delegating to the supplied Parser. Passing a different parser, for
 example a plain XML (non-HTML) one, changes how the input is interpreted.

 @param html    markup to parse
 @param baseUri The URL the markup was retrieved from. Used to resolve relative URLs to absolute URLs that occur
 before the markup declares a {@code <base href>} tag.
 @param parser  the parser to run, e.g. the alternate {@link Parser#xmlParser() XML parser}.
 @return the Document returned by {@code parser.parseInput(html, baseUri)}
 */
public static Document parse(String html, String baseUri, Parser parser) {
    final Document parsed = parser.parseInput(html, baseUri);
    return parsed;
}

Source File: Jsoup.java From jsoup-learning with MIT License

votes

/**
 Parses the given markup into a Document by delegating to the supplied Parser. Passing a different parser, for
 example a plain XML (non-HTML) one, changes how the input is interpreted.

 @param html    markup to parse
 @param baseUri The URL the markup was retrieved from. Used to resolve relative URLs to absolute URLs that occur
 before the markup declares a {@code <base href>} tag.
 @param parser  the parser to run, e.g. the alternate {@link Parser#xmlParser() XML parser}.
 @return the Document returned by {@code parser.parseInput(html, baseUri)}
 */
public static Document parse(String html, String baseUri, Parser parser) {
    final Document parsed = parser.parseInput(html, baseUri);
    return parsed;
}

Java Code Examples for org.jsoup.parser.Parser#parseInput()