org.jsoup.nodes.Document#outputSettings

Source File: URLDownloadTests.java From java_in_examples with Apache License 2.0

8 votes

/**
 * Downloads {@code url} with jsoup and prints the parsed document's metadata,
 * its output settings, every child of {@code <head>} and, via
 * {@code printElements}, the children of {@code <body>} to standard output.
 *
 * @param url address of the page to fetch and parse
 * @throws Exception if fetching or parsing the page fails
 */
private static void testHtmlParser(String url) throws Exception {
    Document page = Jsoup.connect(url)
            .userAgent(USER_AGENT)
            .cookie("auth", "token")
            .timeout(30000)
            .get();

    // Document-level metadata.
    System.out.println("charset = " + page.charset());
    System.out.println("location = " + page.location());
    System.out.println("nodeName = " + page.nodeName());

    // Settings that control how the document is serialized back to markup.
    Document.OutputSettings settings = page.outputSettings();
    System.out.println("charset = " + settings.charset());
    System.out.println("indentAmount = " + settings.indentAmount());
    System.out.println("syntax = " + settings.syntax());
    System.out.println("escapeMode = " + settings.escapeMode());
    System.out.println("prettyPrint = " + settings.prettyPrint());
    System.out.println("outline = " + settings.outline());

    System.out.println("title = " + page.title());
    System.out.println("baseUri = " + page.baseUri());

    // One line per direct child of <head>: "<tag name> : <element markup>".
    for (Element headChild : page.head().children()) {
        System.out.print(headChild.tag().getName() + " : ");
        System.out.println(headChild);
    }
    printElements(page.body().children());
}

Source File: XHTMLDocumentHandler.java From docx4j-template with Apache License 2.0

6 votes

/**
 * Parses the given HTML stream with {@code Jsoup.parse(in, charsetName, baseUri)}
 * and disables pretty-printing on the resulting document.
 *
 * @param input stream containing the HTML to parse
 * @return the parsed document, with pretty-printing turned off
 * @throws IOException propagated from {@code Jsoup.parse}
 */
@Override
public Document handle( InputStream input) throws IOException{
	// Read the Jsoup parse parameters from the docx4j properties, with fallbacks.
	String charsetName = Docx4jProperties.getProperty(
			Docx4jConstants.DOCX4J_JSOUP_PARSE_CHARSETNAME, Docx4jConstants.DEFAULT_CHARSETNAME );
	String baseUri = Docx4jProperties.getProperty(Docx4jConstants.DOCX4J_JSOUP_PARSE_BASEURI,"");

	// Convert the HTML into a Document object using Jsoup.
	Document parsed = Jsoup.parse(input, charsetName, baseUri);

	// Only pretty-printing is customized; syntax and charset are not set here.
	parsed.outputSettings(new OutputSettings().prettyPrint(false));

	return parsed;
}

Source File: WinterShParser.java From substitution-schedule-parser with Mozilla Public License 2.0

6 votes

/**
 * Builds an {@link AdditionalInfo} from an XML feed: the text comes from the
 * first {@code item description} element and the title is suffixed with the
 * text of the first {@code pubDate} element.
 *
 * @param xml the XML feed content
 * @return the populated info object
 */
@NotNull static AdditionalInfo handleXML(String xml) {
	final String noInfoZurzeit = "Zurzeit gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.";
	final String noInfoAktuell = "Aktuell gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.";
	final String trailingBreak = "<br>";

	AdditionalInfo result = new AdditionalInfo();
	result.setTitle(TITLE);

	Document feed = Jsoup.parse(xml, "", Parser.xmlParser());
	// Keep html() output unindented so the line-break handling below sees the raw text.
	feed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	String descriptionHtml = feed.select("item description").first().html();
	String text = descriptionHtml.replace("\r\n", "<br>").trim();

	boolean nothingToReport = text.startsWith(noInfoZurzeit) || text.startsWith(noInfoAktuell);
	if (nothingToReport) {
		result.setHasInformation(false);
		// NOTE(review): this text is replaced by the unconditional setText(text) below.
		result.setText(noInfoAktuell);
	}

	// Drop a single trailing "<br>" left over from the line-break conversion.
	if (text.endsWith(trailingBreak)) {
		text = text.substring(0, text.length() - 4);
	}

	String published = feed.select("pubDate").first().text();
	result.setTitle(TITLE + " (Stand: " + published + ")");
	result.setText(text);

	return result;
}

Source File: JsoupBasedFormatter.java From formatter-maven-plugin with Apache License 2.0

6 votes

/**
 * Re-serializes {@code code} through jsoup using the configured output
 * settings ({@code formatter}).
 *
 * @param code   the source text to format
 * @param ending line ending argument; not used by this implementation
 * @return the formatted text, or {@code null} when formatting leaves {@code code} unchanged
 * @throws IllegalArgumentException if the configured syntax is neither html nor xml
 */
@Override
public String doFormat(String code, LineEnding ending) {
    // Pick the parser that matches the configured output syntax.
    final Parser parser;
    switch (formatter.syntax()) {
    case html:
        parser = Parser.htmlParser();
        break;
    case xml:
        parser = Parser.xmlParser();
        break;
    default:
        throw new IllegalArgumentException(formatter.syntax() + " is not allowed as syntax");
    }

    Document parsed = Jsoup.parse(code, "", parser);
    parsed.outputSettings(formatter);

    String result = parsed.outerHtml();
    // null signals "nothing changed" to the caller.
    return code.equals(result) ? null : result;
}

Source File: FuraffinityRipper.java From ripme with MIT License

5 votes

/**
 * Fetches an image page and extracts its description as plain text.
 *
 * <p>The response body is parsed exactly once and the resulting document is
 * reused for both the description lookup and the title lookup.
 *
 * @param page URL of the image page to fetch
 * @return the {@code og:title} content, a newline, and the description text;
 *         or {@code null} if the page could not be fetched or contains no description
 */
public String getDescription(String page) {
    try {
        // Fetch the image page
        Response resp = Http.url(page)
                .referrer(this.url)
                .response();
        cookies.putAll(resp.cookies());

        // Parse once: a response body that has not been buffered cannot be
        // reliably parsed a second time, and a second parse is wasted work anyway.
        Document doc = resp.parse();
        // Keep html() output unindented so whitespace in the description is preserved.
        doc.outputSettings(new Document.OutputSettings().prettyPrint(false));

        // Try to find the description.
        // Would break completely if FurAffinity changed site layout.
        Elements els = doc.select("td[class=alt1][width=\"70%\"]");
        if (els.isEmpty()) {
            LOGGER.debug("No description at " + page);
            throw new IOException("No description found");
        }
        LOGGER.debug("Description found!");
        Element ele = els.get(0); // This is where the description is.

        // Insert literal backslash-n markers so line breaks survive tag stripping;
        // they are turned into real line separators just before cleaning.
        ele.select("br").append("\\n");
        ele.select("p").prepend("\\n\\n");
        LOGGER.debug("Returning description at " + page);
        String tempPage = Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
        return doc.select("meta[property=og:title]").attr("content") + "\n" + tempPage; // Overridden saveText takes first line and makes it the file name.
    } catch (IOException ioe) {
        LOGGER.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'");
        return null;
    }
}

Source File: NetUtils.java From Shadbot with GNU General Public License v3.0

5 votes

/**
 * Converts HTML to plain text while keeping the line breaks implied by
 * {@code <br>} and {@code <p>} elements.
 *
 * @param html The HTML to convert to text with new lines preserved, may be {@code null}.
 * @return The provided HTML converted to text with new lines preserved; a {@code null}
 *         or blank input is returned unchanged.
 */
@Nullable
public static String cleanWithLinebreaks(@Nullable String html) {
    if (html == null || html.isBlank()) {
        return html;
    }
    final Document document = Jsoup.parse(html);
    // Makes html() preserve linebreak and spacing
    document.outputSettings(new Document.OutputSettings().prettyPrint(false));
    // Mark line breaks with a literal backslash-n (two characters) so they survive tag stripping.
    document.select("br").append("\\n");
    document.select("p").prepend("\\n\\n");
    // String.replace is literal (not a regex): the target must be exactly the
    // two-character marker inserted above, i.e. "\\n" in Java source.
    final String str = document.html().replace("\\n", "\n");
    return Jsoup.clean(str, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false));
}

Source File: HtmlParser.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Cleans {@code input} against the whitelist {@code wl}, parses the cleaned
 * markup and returns what {@code readNodes} collects from the child nodes of
 * the body.
 *
 * @param input the HTML to process
 * @param wl    the whitelist applied by {@code Jsoup.clean}
 * @return the list filled by {@code readNodes}
 */
private static List<String> parse(String input, Whitelist wl)
{
	Document parsed = Jsoup.parse(Jsoup.clean(input, "", wl, outputSettings));
	parsed.outputSettings(outputSettings);

	List<String> collected = new ArrayList<>();
	readNodes(parsed.body().childNodes(), collected);
	return collected;
}

Source File: HtmlParser.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Cleans {@code input} against the whitelist {@code wl}, parses the cleaned
 * markup and returns the entries that {@code readNodesWithTags} collects from
 * the child nodes of the body, starting with the tag label {@code "body"}.
 *
 * @param input the HTML to process
 * @param wl    the whitelist applied by {@code Jsoup.clean}
 * @return the list filled by {@code readNodesWithTags}
 */
private static List<Map.Entry<String,String>> parseWithTags(String input, Whitelist wl)
{
	Document parsed = Jsoup.parse(Jsoup.clean(input, "", wl, outputSettings));
	parsed.outputSettings(outputSettings);

	List<Map.Entry<String,String>> taggedText = new ArrayList<>();
	readNodesWithTags(parsed.body().childNodes(), taggedText, "body");
	return taggedText;
}

Source File: ElasticsearchDocumentWriter.java From newsleak with GNU Affero General Public License v3.0

5 votes

/**
 * Converts HTML to plain text, keeping the line breaks implied by
 * {@code <br>} and {@code <p>} elements and unescaping the {@code &gt;} and
 * {@code &lt;} entities in the result.
 *
 * @param html
 *            the HTML to convert, may be {@code null}
 * @return the converted text, or {@code null} when {@code html} is {@code null}
 */
public static String replaceHtmlLineBreaks(String html) {
	if (html == null) {
		return null;
	}

	Document parsed = Jsoup.parse(html);
	// Pretty-printing off so html() preserves line breaks and spacing.
	Document.OutputSettings rawOutput = new Document.OutputSettings().prettyPrint(false);
	parsed.outputSettings(rawOutput);

	// Mark breaks with a literal backslash-n that survives tag stripping.
	parsed.select("br").append("\\n");
	parsed.select("p").prepend("\\n\\n");
	String withNewlines = parsed.html().replace("\\n", "\n");

	String plain = Jsoup.clean(withNewlines, "", Whitelist.none(),
			new Document.OutputSettings().prettyPrint(false));
	return plain.replace("&gt;", ">").replace("&lt;", "<");
}

Source File: HtmlTextFilter.java From voj with GNU General Public License v3.0

5 votes

/**
 * Strips HTML markup from a string, keeping the line breaks implied by
 * {@code <br>} and {@code <p>} elements.
 *
 * @param text - the string to filter, may be {@code null}
 * @return the filtered string, or {@code null} when {@code text} is {@code null}
 */
public static String filter(String text) {
	if ( text == null ) {
		return null;
	}

	Document parsed = Jsoup.parse(text);
	// Pretty-printing off so html() keeps the original spacing.
	parsed.outputSettings(new Document.OutputSettings().prettyPrint(false));

	// Mark breaks with a literal backslash-n that survives tag stripping.
	parsed.select("br").append("\\n");
	parsed.select("p").prepend("\\n\\n");
	String withNewlines = parsed.html().replace("\\n", "\n");

	Document.OutputSettings rawOutput = new Document.OutputSettings().prettyPrint(false);
	return Jsoup.clean(withNewlines, "", Whitelist.none(), rawOutput);
}

Source File: AbstractHtmlConsumer.java From baleen with Apache License 2.0

4 votes

/**
 * Renders the document held in {@code jCas} as an HTML file.
 *
 * <p>Builds an HTML skeleton, fills {@code <head>} with an optional stylesheet
 * link, a charset declaration and metadata entries, delegates the body to
 * {@code writeBody}, and writes the result to the file returned by
 * {@code getFileName}.
 *
 * @param jCas the CAS holding the document to render
 * @throws AnalysisEngineProcessException if the file cannot be written
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final File f = getFileName(jCas);
  final DocumentAnnotation da = getDocumentAnnotation(jCas);

  final Document doc =
      Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>");
  doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
  final Element head = doc.head();

  // Link the stylesheet only when one is configured.
  if (!Strings.isNullOrEmpty(css)) {
    final Element cssLink = head.appendElement("link");
    cssLink.attr("rel", "stylesheet");
    cssLink.attr("href", css);
  }

  // The file is declared as UTF-8 here, so it must also be written as UTF-8 below.
  final Element charset = head.appendElement("meta");
  charset.attr("charset", "utf-8");

  appendMeta(head, "document.type", da.getDocType());
  appendMeta(head, "document.sourceUri", da.getSourceUri());
  appendMeta(head, "externalId", da.getHash());

  appendMeta(head, "document.classification", da.getDocumentClassification());
  appendMeta(
      head,
      "document.caveats",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats())));
  appendMeta(
      head,
      "document.releasability",
      String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability())));

  // Copy every Metadata annotation into <head>; the last "documentTitle" entry
  // (case-insensitive key match) also becomes the document title.
  String title = null;
  for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) {
    appendMeta(head, md.getKey(), md.getValue());
    if ("documentTitle".equalsIgnoreCase(md.getKey())) {
      title = md.getValue();
    }
  }

  if (!Strings.isNullOrEmpty(title)) {
    doc.title(title);
  }

  final Element body = doc.body();

  writeBody(jCas, body);

  try {
    // Write with the same encoding the <meta charset> tag declares; the platform
    // default charset would corrupt non-ASCII text wherever it is not UTF-8.
    FileUtils.writeStringToFile(f, doc.html(), java.nio.charset.StandardCharsets.UTF_8);
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}

Java Code Examples for org.jsoup.nodes.Document#outputSettings()