org.jsoup.nodes.TextNode Java Examples

Source File: HtmlToPlainText.java From intellij-quarkus with Eclipse Public License 2.0

7 votes

/**
 * Appends the plain-text output that corresponds to {@code node}.
 * <p>
 * Text nodes contribute their text, {@code ul} raises the list nesting
 * level, {@code li} starts a new indented bullet line, {@code dt} adds a
 * two-space indent, and {@code p}, {@code h1}-{@code h5} and {@code tr}
 * start a new line. Any other node produces no output.
 *
 * @param node  the node to render
 * @param depth not used by this method
 */
@Override
public void head(Node node, int depth) {
    final String tag = node.nodeName();

    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }
    if (tag.equals("ul")) {
        listNesting++;
        return;
    }
    if (tag.equals("li")) {
        append("\n ");
        // One extra two-space indent per nesting level beyond the first.
        int level = 1;
        while (level < listNesting) {
            append("  ");
            level++;
        }
        // Top-level items are marked with "*", nested items with "-".
        append(listNesting == 1 ? "* " : "- ");
        return;
    }
    if (tag.equals("dt")) {
        append("  ");
        return;
    }
    if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}

Source File: ContentExtractor.java From ContentExtractor with GNU General Public License v2.0

6 votes

/**
 * Registers a text node in the extractor's bookkeeping structures.
 * <p>
 * Nodes whose trimmed text is empty are ignored. Otherwise the node is
 * added to {@code tNodeList}, its xpath is stored in {@code xpathMap}, and
 * a new {@code CountInfo} is appended to the list kept in {@code countMap}
 * for that xpath (the list is created on first use).
 *
 * @param tNode the text node to register
 */
private void addTextNode(TextNode tNode) {

        if (tNode.text().trim().isEmpty()) {
            return;
        }

        final String path = JsoupHelper.getXpath(tNode);
        tNodeList.add(tNode);
        xpathMap.put(tNode, path);

        final CountInfo info = new CountInfo(tNode);
        ArrayList<CountInfo> bucket = countMap.get(path);
        if (bucket == null) {
            // First node seen for this xpath: start a fresh bucket.
            bucket = new ArrayList<CountInfo>();
            countMap.put(path, bucket);
        }
        bucket.add(info);
    }

Source File: CollectionsPresenter.java From OpenHub with GNU General Public License v3.0

6 votes

/**
 * Builds {@code Collection} entries from the collection containers found
 * in the given document.
 * <p>
 * For each matching container the id is the part of the title link's
 * {@code href} after the last {@code '/'}, the title is the link's first
 * text node, and the description is the last text node of the container's
 * last {@code div}, trimmed.
 * <p>
 * Containers without a title link are skipped. A title link without a
 * direct text node falls back to the link's combined text, and a missing
 * description yields an empty string, so one malformed entry does not
 * abort the whole parse.
 *
 * @param doc parsed page to scan
 * @return the collections found, in the order the containers were returned;
 *         never {@code null}
 */
private ArrayList<Collection> getBellowCollections(Document doc){
    ArrayList<Collection> collections = new ArrayList<>();
    Elements elements = doc.getElementsByClass(
            "d-flex border-bottom border-gray-light pb-4 mb-5");
    for (Element element : elements) {
        Element titleElement = element.select("div > h2 > a").first();
        if (titleElement == null) {
            // No title link matched, so there is no id to build an entry from.
            continue;
        }
        String id = titleElement.attr("href");
        id = id.substring(id.lastIndexOf("/") + 1);

        List<TextNode> titleTextNodes = titleElement.textNodes();
        String title = titleTextNodes.isEmpty()
                ? titleElement.text()
                : titleTextNodes.get(0).toString();

        String desc = "";
        Element descElement = element.select("div").last();
        if (descElement != null) {
            List<TextNode> descTextNodes = descElement.textNodes();
            if (!descTextNodes.isEmpty()) {
                // The description is the last direct text node of the div.
                desc = descTextNodes.get(descTextNodes.size() - 1).toString().trim();
            }
        }

        collections.add(new Collection(id, title, desc));
    }
    return collections;
}

Source File: PageLoaderEpub.java From a with GNU General Public License v3.0

6 votes

/**
 * Extracts the readable text of a chapter from the EPUB resource that the
 * chapter's URL points to.
 * <p>
 * The resource bytes are decoded with {@code mCharset}, parsed as HTML, and
 * the direct text nodes of every element are collected. When the document
 * has more than one element, each non-empty text becomes its own line,
 * prefixed with two U+3000 characters and separated by CRLF; otherwise the
 * text is appended as-is.
 *
 * @param chapter the chapter whose content is requested
 * @return the assembled chapter text
 * @throws Exception if reading or decoding the resource fails
 */
@Override
protected String getChapterContent(BookChapterBean chapter) throws Exception {
    Resource chapterResource = epubBook.getResources().getByHref(chapter.getDurChapterUrl());
    StringBuilder out = new StringBuilder();
    Document parsed = Jsoup.parse(new String(chapterResource.getData(), mCharset));
    Elements allElements = parsed.getAllElements();
    for (Element current : allElements) {
        for (TextNode textNode : current.textNodes()) {
            String piece = StringUtils.formatHtml(textNode.text().trim());
            if (allElements.size() <= 1) {
                out.append(piece);
                continue;
            }
            if (piece.length() == 0) {
                continue;
            }
            // Separate paragraphs, but never start the content with a line break.
            if (out.length() > 0) {
                out.append("\r\n");
            }
            out.append("\u3000\u3000").append(piece);
        }
    }
    return out.toString();
}

Source File: ContentExtractor.java From WordCount with GNU General Public License v2.0

6 votes

/**
 * Records a non-blank text node together with its xpath.
 * <p>
 * A node whose trimmed text is empty is skipped. For every other node the
 * method appends it to {@code tNodeList}, maps it to its xpath in
 * {@code xpathMap}, and adds a {@code CountInfo} for it to the per-xpath
 * list held in {@code countMap}.
 *
 * @param tNode the text node to record
 */
private void addTextNode(TextNode tNode) {

        final boolean blank = tNode.text().trim().isEmpty();
        if (blank) {
            return;
        }

        final String nodeXpath = JsoupHelper.getXpath(tNode);
        tNodeList.add(tNode);
        xpathMap.put(tNode, nodeXpath);

        final CountInfo entry = new CountInfo(tNode);
        ArrayList<CountInfo> entriesForXpath = countMap.get(nodeXpath);
        if (entriesForXpath == null) {
            // Lazily create the list the first time this xpath shows up.
            entriesForXpath = new ArrayList<CountInfo>();
            countMap.put(nodeXpath, entriesForXpath);
        }
        entriesForXpath.add(entry);
    }

Source File: HtmlToPlainText.java From lemminx with Eclipse Public License 2.0

6 votes

/**
 * Writes the plain-text representation for {@code node}.
 * <p>
 * A text node adds its text; {@code ul} increments {@code listNesting};
 * {@code li} begins a new line, indents by nesting level and adds a bullet;
 * {@code dt} adds two spaces; {@code p}, {@code h1}-{@code h5} and
 * {@code tr} add a line break.
 *
 * @param node  the node to render
 * @param depth not used by this method
 */
@Override
public void head(Node node, int depth) {
	final String nodeName = node.nodeName();

	if (node instanceof TextNode) {
		// TextNodes carry all user-readable text in the DOM.
		append(((TextNode) node).text());
		return;
	}
	if (nodeName.equals("ul")) {
		listNesting++;
		return;
	}
	if (nodeName.equals("li")) {
		append("\n ");
		// Indent nested items by two spaces per extra level.
		int level = 1;
		while (level < listNesting) {
			append("  ");
			level++;
		}
		// "*" marks first-level items, "-" marks deeper ones.
		append(listNesting == 1 ? "* " : "- ");
		return;
	}
	if (nodeName.equals("dt")) {
		append("  ");
		return;
	}
	if (StringUtil.in(nodeName, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
		append("\n");
	}
}

Source File: TruncateHtmlFilter.java From jinjava with Apache License 2.0

6 votes

/**
 * Truncates text nodes so that the accumulated text length does not exceed
 * {@code maxTextLen}.
 * <p>
 * Once the limit has been reached, further text nodes are emptied. The node
 * that crosses the limit is cut and suffixed with {@code ending}; unless
 * {@code killwords} is set, the cut position is moved using
 * {@code Functions.movePointerToJustBeforeLastWord}. Nodes that fit
 * entirely only increase the running length.
 *
 * @param node  the visited node; anything but a {@code TextNode} is ignored
 * @param depth not used by this method
 */
@Override
public void head(Node node, int depth) {
  if (!(node instanceof TextNode)) {
    return;
  }

  TextNode textNode = (TextNode) node;
  String content = textNode.text();

  // Limit already reached: drop the text altogether.
  if (textLen >= maxTextLen) {
    textNode.text("");
    return;
  }

  // Node fits completely: just account for its length.
  if (textLen + content.length() <= maxTextLen) {
    textLen += content.length();
    return;
  }

  // Node crosses the limit: keep only the leading part.
  int cut = maxTextLen - textLen;
  if (!killwords) {
    cut = Functions.movePointerToJustBeforeLastWord(cut, content) - 1;
  }
  textNode.text(content.substring(0, cut) + ending);
  textLen = maxTextLen;
}

Source File: WhenRubyExtensionGroupIsRegistered.java From asciidoctorj with Apache License 2.0

6 votes

/**
 * Registers the Ruby {@code ShellSessionTreeProcessor} through an extension
 * group, converts a two-command shell session, and checks that each command
 * ends up in an element with class {@code command}.
 */
@Test
public void ruby_treeprocessor_should_be_registered() {

    this.asciidoctor.createGroup()
        .loadRubyClass(getClass().getResourceAsStream("/ruby-extensions/shell-session-tree-processor.rb"))
        .rubyTreeprocessor("ShellSessionTreeProcessor")
        .register();

    final String html = this.asciidoctor.convert(
        " $ echo \"Hello, World!\"\n" +
            " > Hello, World!\n" +
            "\n" +
            " $ gem install asciidoctor",
        options().toFile(false).get());

    final Document parsed = Jsoup.parse(html);

    // First command of the session.
    final TextNode firstCommand = parsed.getElementsByClass("command").get(0).textNodes().get(0);
    assertThat(firstCommand.getWholeText(), is("echo \"Hello, World!\""));

    // Second command of the session.
    final TextNode secondCommand = parsed.getElementsByClass("command").get(1).textNodes().get(0);
    assertThat(secondCommand.getWholeText(), is("gem install asciidoctor"));
}

Source File: HtmlHelper.java From FairEmail with GNU General Public License v3.0

6 votes

/**
 * Returns the text of the document, either in full or as a preview.
 * <p>
 * The document is first passed to {@code truncate} (with the inverse of
 * {@code full}), and every {@code blockquote} is wrapped in {@code [} and
 * {@code ]} text nodes. Note that this modifies the given document.
 *
 * @param d    the document to read; modified in place
 * @param full {@code true} for the whole text, {@code false} for at most
 *             {@code PREVIEW_SIZE} characters followed by an ellipsis when
 *             text was cut off
 * @return the full text or its preview
 */
private static String _getText(Document d, boolean full) {
    truncate(d, !full);

    for (Element quote : d.select("blockquote")) {
        quote.prependChild(new TextNode("["));
        quote.appendChild(new TextNode("]"));
    }

    final String text = d.text();

    // Nothing to shorten: either everything was requested or it already fits.
    if (full || text.length() <= PREVIEW_SIZE)
        return text;

    return text.substring(0, PREVIEW_SIZE) + "…";
}

Source File: ElementOperator.java From xsoup with MIT License

6 votes

/**
 * Returns text taken from the direct text-node children of {@code element}.
 * <p>
 * With {@code group == 0} the text of all direct text nodes is concatenated.
 * Otherwise the text of the {@code group}-th direct text node (1-based) is
 * returned, or an empty string when there are fewer text nodes.
 *
 * @param element the element whose children are inspected
 * @return the selected text
 */
@Override
public String operate(Element element) {
    StringBuilder joined = new StringBuilder();
    int seen = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        TextNode textChild = (TextNode) child;
        if (group == 0) {
            joined.append(textChild.text());
            continue;
        }
        seen++;
        if (seen == group) {
            return textChild.text();
        }
    }
    return joined.toString();
}

Source File: WhenRubyExtensionIsRegistered.java From asciidoctorj with Apache License 2.0

6 votes

/**
 * Creates an Asciidoctor instance with the {@code ruby-extensions}
 * directory on its load path, registers the Ruby
 * {@code ShellSessionTreeProcessor}, converts a two-command shell session,
 * and checks the text of both {@code command} elements.
 */
@Test
public void ruby_treeprocessor_should_be_registered() {

    final String extensionDir = classpath.getResource("ruby-extensions").getAbsolutePath();
    final AsciidoctorJRuby jruby = AsciidoctorJRuby.Factory.create(singletonList(extensionDir));
    jruby.rubyExtensionRegistry()
        .requireLibrary("shell-session-tree-processor.rb")
        .treeprocessor("ShellSessionTreeProcessor");

    final String html = jruby.convert(
        " $ echo \"Hello, World!\"\n" +
            " > Hello, World!\n" +
            "\n" +
            " $ gem install asciidoctor",
            options().toFile(false).get());

    final Document parsed = Jsoup.parse(html);

    // First command of the session.
    final TextNode firstCommand = parsed.getElementsByClass("command").get(0).textNodes().get(0);
    assertThat(firstCommand.getWholeText(), is("echo \"Hello, World!\""));

    // Second command of the session.
    final TextNode secondCommand = parsed.getElementsByClass("command").get(1).textNodes().get(0);
    assertThat(secondCommand.getWholeText(), is("gem install asciidoctor"));
}

Source File: PageLoaderEpub.java From MyBookshelf with GNU General Public License v3.0

6 votes

/**
 * Reads the EPUB resource referenced by the chapter and turns its HTML
 * into plain text.
 * <p>
 * Every direct text node of every element is trimmed and passed through
 * {@code StringUtils.formatHtml}. If the parsed document contains more than
 * one element, non-empty texts are written one per line (CRLF separated),
 * each prefixed with two U+3000 characters; otherwise the text is appended
 * unchanged.
 *
 * @param chapter the chapter to load
 * @return the chapter's text content
 * @throws Exception if the resource data cannot be read or decoded
 */
@Override
protected String getChapterContent(BookChapterBean chapter) throws Exception {
    Resource source = epubBook.getResources().getByHref(chapter.getDurChapterUrl());
    StringBuilder buffer = new StringBuilder();
    Document html = Jsoup.parse(new String(source.getData(), mCharset));
    Elements everyElement = html.getAllElements();
    for (Element el : everyElement) {
        for (TextNode node : el.textNodes()) {
            String line = StringUtils.formatHtml(node.text().trim());
            if (everyElement.size() > 1) {
                if (line.length() == 0) {
                    continue;
                }
                // Break before every paragraph except the very first one.
                if (buffer.length() > 0) {
                    buffer.append("\r\n");
                }
                buffer.append("\u3000\u3000").append(line);
            } else {
                buffer.append(line);
            }
        }
    }
    return buffer.toString();
}

Source File: HtmlParser.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Walks the given nodes depth-first and collects the text of childless
 * {@code #text} nodes.
 * <p>
 * Nodes that have children are only descended into. For each childless
 * text node, all matches of the {@code newline} pattern are removed from
 * its whole text, and the result is added to {@code textList} unless it is
 * empty.
 *
 * @param nodeList nodes to visit
 * @param textList receives the collected text, in visiting order
 */
private static void readNodes(List<Node> nodeList, List<String> textList)
{
	for(Node current : nodeList)
	{
		if(current.childNodeSize()>0)
		{
			readNodes(current.childNodes(), textList);
			continue;
		}
		if(!current.nodeName().equals("#text"))
		{
			continue;
		}
		String cleaned=newline.matcher(((TextNode) current).getWholeText()).replaceAll("");
		if(!cleaned.isEmpty())
		{
			textList.add(cleaned);
		}
	}
}

Source File: HtmlParser.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Walks the given nodes depth-first and records the whole text of every
 * childless {@code #text} node together with the name of its enclosing
 * node.
 * <p>
 * When descending, the name of the node being entered becomes the tag for
 * its children. A tag equal to {@code body} (ignoring case) is recorded as
 * {@code p}.
 *
 * @param nodeList    nodes to visit
 * @param textListMap receives one (tag, text) entry per text node
 * @param tag         tag name to associate with text nodes found directly
 *                    in {@code nodeList}
 */
private static void readNodesWithTags(List<Node> nodeList, List<Map.Entry<String,String>> textListMap, String tag)
{
	for(Node current : nodeList)
	{
		if(current.childNodeSize()>0)
		{
			readNodesWithTags(current.childNodes(), textListMap, current.nodeName());
			continue;
		}
		if(!current.nodeName().equals("#text"))
		{
			continue;
		}
		// Text sitting directly in the body is treated as a paragraph.
		String label=tag.equalsIgnoreCase("body") ? "p" : tag;
		String wholeText=((TextNode) current).getWholeText();
		textListMap.add(new AbstractMap.SimpleEntry<String,String>(label, wholeText));
	}
}

Source File: OutputFormatter.java From Xndroid with GNU General Public License v3.0

6 votes

/**
 * Recursively appends the text below {@code e} to {@code accum}, skipping
 * every child for which {@code unlikely} returns {@code true}.
 * <p>
 * Before descending into a child element, a single space is added when the
 * element is a block element and the accumulated text is non-empty and does
 * not end in whitespace, or when the element is a {@code br}.
 *
 * @param e      element whose children are processed
 * @param accum  receives the text
 * @param indent current recursion depth; only passed on, incremented by one
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }

        Element nested = (Element) child;
        boolean blockNeedsGap = accum.length() > 0 && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        if (blockNeedsGap || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}

Source File: Cleaner.java From astor with GNU General Public License v2.0

6 votes

/**
 * Copies {@code source} into the destination tree if the whitelist allows
 * it, and counts what is dropped in {@code numDiscarded}.
 * <ul>
 * <li>Elements with a safe tag are re-created via {@code createSafeElement},
 * appended to the current destination, and become the new destination; the
 * number of attributes discarded for them is added to the counter.</li>
 * <li>Elements with an unsafe tag are not copied; they are counted unless
 * the element is {@code root}.</li>
 * <li>Text nodes are always copied.</li>
 * <li>Data nodes are copied when their parent's tag is safe.</li>
 * <li>Everything else is counted as discarded.</li>
 * </ul>
 *
 * @param source the node to copy or discard
 * @param depth  not used by this method
 */
public void head(Node source, int depth) {
    if (source instanceof Element) {
        Element sourceEl = (Element) source;

        if (!whitelist.isSafeTag(sourceEl.tagName())) {
            // Not a safe tag, so don't add. The root is not counted against discarded.
            if (source != root) {
                numDiscarded++;
            }
            return;
        }

        // Safe: clone and copy safe attributes, then descend into the copy.
        ElementMeta meta = createSafeElement(sourceEl);
        Element safeCopy = meta.el;
        destination.appendChild(safeCopy);

        numDiscarded += meta.numAttribsDiscarded;
        destination = safeCopy;
        return;
    }

    if (source instanceof TextNode) {
        TextNode original = (TextNode) source;
        destination.appendChild(new TextNode(original.getWholeText()));
        return;
    }

    if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        DataNode original = (DataNode) source;
        destination.appendChild(new DataNode(original.getWholeData()));
        return;
    }

    // Comments, XML processing instructions, etc. are dropped.
    numDiscarded++;
}

Source File: Paragraph.java From dkpro-c4corpus with Apache License 2.0

6 votes

/**
 * Recomputes {@code rawText} from the text nodes this object iterates over.
 * <p>
 * For every text node the tag name is set to the node's path, the node's
 * text is break-normalized, trimmed and appended, and, if
 * {@code NodeHelper.isLink} reports the node as a link, the untrimmed text
 * length is added to {@code charsCountInLinks}.
 */
public void initRawInfo()
{
    StringBuilder collected = new StringBuilder();
    for (Node current : this) {
        if (!(current instanceof TextNode)) {
            continue;
        }

        this.setTagName(getPath(current));

        String nodeRawText = ((TextNode) current).text();
        String normalized = Utils.normalizeBreaks(nodeRawText).trim();
        collected.append(normalized);

        // Link characters are counted on the raw, untrimmed text.
        if (NodeHelper.isLink(current)) {
            charsCountInLinks += nodeRawText.length();
        }
    }

    rawText = collected.toString();
}

Source File: Paragraph.java From dkpro-c4corpus with Apache License 2.0

6 votes

/**
 * Builds a dot-separated path of node names from {@code n} upwards,
 * outermost name first. Every name is followed by a {@code "."}, so a
 * non-empty result ends with a dot.
 * <p>
 * Text nodes, and nodes that {@code NodeHelper.isInnerText} accepts, are
 * replaced by their parent before their name is recorded. The climb ends
 * at the first node named {@code html} (ignoring case), which is included,
 * or when there is no further parent. A node that has no parent at the
 * point where one is needed ends the climb quietly instead of raising a
 * {@code NullPointerException}.
 *
 * @param n the node to start from; may be {@code null}
 * @return the path, or an empty string if no name could be recorded
 */
public String getPath(Node n)
{
    String nodePath = "";
    while (n != null) {
        if (n instanceof TextNode) {
            n = n.parent();
        }
        // A detached text node has no parent; do not hand null to the helper.
        if (n != null && NodeHelper.isInnerText(n)) {
            n = n.parent();
        }
        if (n == null) {
            // Ran out of ancestors while skipping text / inner-text nodes.
            break;
        }

        String parentNodeName = n.nodeName();
        nodePath = parentNodeName + "." + nodePath;

        if (parentNodeName.equalsIgnoreCase("html")) {
            break;
        }
        n = n.parent();
    }

    return nodePath;
}

Source File: HtmlToPlainText.java From eclipse.jdt.ls with Eclipse Public License 2.0

6 votes

/**
 * Produces the plain-text output for {@code node}.
 * <p>
 * Text nodes are appended verbatim via {@code text()}. A {@code ul} bumps
 * the nesting counter, an {@code li} opens a bullet line indented according
 * to that counter, a {@code dt} is indented by two spaces, and {@code p},
 * {@code h1}-{@code h5} and {@code tr} begin a new line.
 *
 * @param node  the node to render
 * @param depth not used by this method
 */
@Override
public void head(Node node, int depth) {
	final String elementName = node.nodeName();

	if (node instanceof TextNode) {
		// TextNodes carry all user-readable text in the DOM.
		append(((TextNode) node).text());
		return;
	}
	if (elementName.equals("ul")) {
		listNesting++;
		return;
	}
	if (elementName.equals("li")) {
		append("\n ");
		// Deeper lists get two more spaces per level.
		int level = 1;
		while (level < listNesting) {
			append("  ");
			level++;
		}
		// Bullet style depends on whether this is a first-level list.
		append(listNesting == 1 ? "* " : "- ");
		return;
	}
	if (elementName.equals("dt")) {
		append("  ");
		return;
	}
	if (StringUtil.in(elementName, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
		append("\n");
	}
}

Source File: HtmlRenderer.java From kafka-connect-couchbase with Apache License 2.0

6 votes

/**
 * Recursively renders {@code node} as plain text into {@code out}.
 * <p>
 * Text is appended as-is, except that it is passed through
 * {@code trimLeft} when the output is empty or already ends in whitespace.
 * On {@code p} and {@code br} elements the output is passed to
 * {@code trimRight} and, if not empty afterwards, {@code PARAGRAPH_SEPARATOR} is
 * appended before the element's children are rendered. Nodes that are
 * neither text nor elements are ignored.
 *
 * @param node the node to render
 * @param out  receives the rendered text
 */
private static void renderAsPlaintext(Node node, StringBuilder out) {
  if (node instanceof TextNode) {
    String text = ((TextNode) node).text();
    boolean atBoundary = out.length() == 0 || endsWithWhitespace(out);
    out.append(atBoundary ? trimLeft(text) : text);
    return;
  }

  if (!(node instanceof Element)) {
    return;
  }

  Element element = (Element) node;
  String tag = element.tagName();
  boolean startsParagraph = tag.equals("p") || tag.equals("br");
  if (startsParagraph) {
    trimRight(out);
    // No separator at the very beginning of the output.
    if (out.length() > 0) {
      out.append(PARAGRAPH_SEPARATOR);
    }
  }

  for (Node child : element.childNodes()) {
    renderAsPlaintext(child, out);
  }
}

Source File: Cleaner.java From astor with GNU General Public License v2.0

6 votes

/**
 * Mirrors {@code source} into the destination tree when permitted by the
 * whitelist; otherwise increments {@code numDiscarded}.
 * <p>
 * Safe elements are rebuilt with {@code createSafeElement}, attached to the
 * current destination and then used as the destination for subsequent
 * nodes. Unsafe elements are dropped and counted, except for {@code root}.
 * Text nodes are copied unconditionally, data nodes only if their parent's
 * tag is safe, and any other node type is counted as discarded.
 *
 * @param source the node to copy or discard
 * @param depth  not used by this method
 */
public void head(Node source, int depth) {
    if (source instanceof TextNode) {
        TextNode textSource = (TextNode) source;
        TextNode textCopy = new TextNode(textSource.getWholeText());
        destination.appendChild(textCopy);
        return;
    }

    if (source instanceof Element) {
        Element sourceEl = (Element) source;
        boolean safe = whitelist.isSafeTag(sourceEl.tagName());

        if (safe) {
            // Clone with safe attributes only and continue below the clone.
            ElementMeta meta = createSafeElement(sourceEl);
            Element clone = meta.el;
            destination.appendChild(clone);

            numDiscarded += meta.numAttribsDiscarded;
            destination = clone;
        } else if (source != root) {
            // Unsafe tag: skip it. The root itself is never counted.
            numDiscarded++;
        }
        return;
    }

    if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) {
        DataNode dataSource = (DataNode) source;
        DataNode dataCopy = new DataNode(dataSource.getWholeData());
        destination.appendChild(dataCopy);
        return;
    }

    // Remaining node kinds (comments, XML processing instructions, ...) are dropped.
    numDiscarded++;
}

Source File: ElementOperator.java From zongtui-webcrawler with GNU General Public License v2.0

6 votes

/**
 * Extracts text from the element's direct text-node children.
 * <p>
 * If {@code group} is {@code 0}, the texts of all direct text nodes are
 * joined. For any other value the text of the direct text node at that
 * 1-based position is returned; when no such node exists the result is an
 * empty string.
 *
 * @param element the element to read from
 * @return the extracted text
 */
@Override
public String operate(Element element) {
    final StringBuilder allText = new StringBuilder();
    int position = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        final TextNode textNode = (TextNode) child;
        if (group != 0) {
            position++;
            if (position == group) {
                return textNode.text();
            }
        } else {
            allText.append(textNode.text());
        }
    }
    return allText.toString();
}

Source File: DeepTextElementBuilder.java From Asqatasun with GNU Affero General Public License v3.0

6 votes

/**
 * Builds the text of an element from its {@code ALT_ATTR} attribute (when
 * present) and, recursively, from its child nodes.
 * <p>
 * Each contribution is preceded by {@code SPACER}. Blank text nodes are
 * skipped; other text nodes contribute their trimmed text; child elements
 * contribute the result of a recursive call. The final result is trimmed.
 *
 * @param element the element to extract text from
 * @return the trimmed, assembled text
 */
@Override
public String buildTextFromElement(Element element) {
    StringBuilder collected = new StringBuilder();

    if (element.hasAttr(ALT_ATTR)) {
        collected.append(SPACER)
                .append(altAttrTextBuilder.buildTextFromElement(element));
    }

    for (Node child : element.childNodes()) {
        if (child instanceof TextNode) {
            TextNode textChild = (TextNode) child;
            if (!textChild.isBlank()) {
                collected.append(SPACER)
                        .append(StringUtils.trim(textChild.text()));
            }
        } else if (child instanceof Element) {
            collected.append(SPACER)
                    .append(buildTextFromElement((Element) child));
        }
    }

    return StringUtils.trim(collected.toString());
}

Source File: ElementUtil.java From flow with Apache License 2.0

6 votes

/**
 * Creates a JSoup node tree that mirrors the given element and all of its
 * children.
 * <p>
 * A text element becomes a {@link TextNode}. Any other element becomes a
 * JSoup element with the same tag; its {@code innerHTML} property (if set)
 * is applied as HTML content, its attributes are copied (an empty value is
 * written as a boolean attribute), and its children are converted
 * recursively and appended.
 *
 * @param document
 *            A JSoup document, used to create elements and to supply the
 *            base URI for text nodes
 * @param element
 *            The element to convert
 * @return A JSoup node containing the converted element
 */
public static Node toJsoup(Document document, Element element) {
    if (element.isTextNode()) {
        return new TextNode(element.getText(), document.baseUri());
    }

    final org.jsoup.nodes.Element converted = document
            .createElement(element.getTag());

    if (element.hasProperty("innerHTML")) {
        String innerHtml = (String) element.getPropertyRaw("innerHTML");
        converted.html(innerHtml);
    }

    element.getAttributeNames().forEach(attributeName -> {
        String value = element.getAttribute(attributeName);
        if (!"".equals(value)) {
            converted.attr(attributeName, value);
        } else {
            // Empty value: emit the attribute in its boolean form.
            converted.attr(attributeName, true);
        }
    });

    element.getChildren().forEach(child -> {
        Node convertedChild = toJsoup(document, child);
        converted.appendChild(convertedChild);
    });

    return converted;
}

Source File: ElementUtil.java From flow with Apache License 2.0

6 votes

/**
 * Builds a {@link com.vaadin.flow.dom.Element} hierarchy that mirrors the
 * given JSoup {@link org.jsoup.nodes.Node} and its children.
 * <p>
 * A {@link org.jsoup.nodes.TextNode} becomes a text element and an
 * {@link org.jsoup.nodes.Element} becomes an element with the same tag,
 * attributes and (convertible) children. Any other node type is logged as
 * an error and yields an empty optional; such children are simply left out
 * of the converted parent.
 *
 * @param node
 *            JSoup node to convert
 * @return element with the matching hierarchy as the given node, or empty
 */
public static Optional<Element> fromJsoup(Node node) {
    if (node instanceof TextNode) {
        String text = ((TextNode) node).text();
        return Optional.of(Element.createText(text));
    }

    if (!(node instanceof org.jsoup.nodes.Element)) {
        LoggerFactory.getLogger(ElementUtil.class).error(
                "Could not convert a {}, '{}' into {}!",
                Node.class.getName(), node, Element.class.getName());
        return Optional.empty();
    }

    final Element converted = new Element(
            ((org.jsoup.nodes.Element) node).tagName());

    node.attributes().asList().forEach(attribute -> converted
            .setAttribute(attribute.getKey(), attribute.getValue()));

    // Children that cannot be converted produce an empty optional and are skipped.
    for (Node child : node.childNodes()) {
        fromJsoup(child).ifPresent(converted::appendChild);
    }

    return Optional.of(converted);
}

Source File: OutputFormatter.java From JumpGo with Mozilla Public License 2.0

6 votes

/**
 * Collects the text found below {@code e} into {@code accum}, ignoring all
 * children that {@code unlikely} flags.
 * <p>
 * Text nodes add their text. For child elements a space is inserted first
 * if the element is a block and the buffer is non-empty without trailing
 * whitespace, or else if the element is a {@code br}; the element is then
 * processed recursively.
 *
 * @param e      element whose children are processed
 * @param accum  buffer receiving the text
 * @param indent recursion depth, passed on as {@code indent + 1}
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }

        if (child instanceof Element) {
            Element childElement = (Element) child;
            boolean separateBlock = accum.length() > 0 && childElement.isBlock()
                    && !lastCharIsWhitespace(accum);
            if (separateBlock || childElement.tagName().equals("br")) {
                accum.append(' ');
            }
            appendTextSkipHidden(childElement, accum, indent + 1);
        } else if (child instanceof TextNode) {
            String txt = ((TextNode) child).text();
            accum.append(txt);
        }
    }
}

Source File: ParseUtil.java From zrlog with Apache License 2.0

6 votes

/**
 * Builds an HTML digest of the given markup.
 * <p>
 * The markup is parsed as a body fragment and its text nodes are gathered
 * with {@code getAllTextNode}. For each text node the outer HTML of its
 * parent is appended and the node's text length is added to a running
 * total; once the total exceeds {@code size}, {@code " ..."} is appended
 * and collection stops. If the markup contains a {@code video} element, the
 * first one is placed in front of the digest, followed by {@code <br/>}.
 *
 * @param str  the HTML to summarize
 * @param size text-length threshold after which the digest is cut
 * @return the trimmed digest HTML
 */
public static String autoDigest(String str, int size) {
    StringBuilder html = new StringBuilder();
    List<Node> collectedNodes = new ArrayList<>();
    getAllTextNode(Jsoup.parseBodyFragment(str).childNodes(), collectedNodes);

    int seenLength = 0;
    for (Node candidate : collectedNodes) {
        if (!(candidate instanceof TextNode)) {
            continue;
        }
        html.append(candidate.parent().outerHtml());
        seenLength += ((TextNode) candidate).text().length();
        if (seenLength > size) {
            html.append(" ...");
            break;
        }
    }

    String digest = html.toString();

    // A video, when present, always leads the digest.
    Elements videos = Jsoup.parse(str).body().select("video");
    boolean hasVideo = videos != null && !videos.isEmpty();
    if (hasVideo) {
        digest = videos.get(0).toString() + "<br/>" + digest;
    }
    return digest.trim();
}

Source File: ComMailingContentServiceImpl.java From openemm with GNU Affero General Public License v3.0

6 votes

/**
 * Recursively renders the given nodes as text into {@code sb}.
 * <p>
 * Text nodes contribute their whole text. An {@code a} element contributes
 * the result of {@code getTextLink}, a {@code br} element contributes a
 * line feed, and every other element is replaced by the rendering of its
 * children. Other node types are ignored.
 *
 * @param sb    receives the generated text
 * @param nodes nodes to render, in order
 */
private void generateTextContent(StringBuilder sb, List<Node> nodes) {
    for (Node node : nodes) {
        if (node instanceof TextNode) {
            sb.append(((TextNode) node).getWholeText());
            continue;
        }
        if (!(node instanceof Element)) {
            continue;
        }

        Element element = (Element) node;
        String tag = element.nodeName();
        if (tag.equals("a")) {
            sb.append(getTextLink(element));
        } else if (tag.equals("br")) {
            sb.append('\n');
        } else {
            generateTextContent(sb, element.childNodes());
        }
    }
}

Source File: TextExtractor.java From storm-crawler with Apache License 2.0

5 votes

/**
 * Appends the whole text of {@code textNode} to {@code accum}.
 * <p>
 * The text is appended unchanged when {@code preserveWhitespace} accepts
 * the node's parent or the node is a {@code CDataNode}; otherwise it is
 * appended through {@code StringUtil.appendNormalisedWhitespace}.
 *
 * @param accum    buffer receiving the text
 * @param textNode the node whose text is appended
 */
private static void appendNormalisedText(StringBuilder accum,
        TextNode textNode) {
    final String wholeText = textNode.getWholeText();
    final boolean keepVerbatim = preserveWhitespace(textNode.parent())
            || textNode instanceof CDataNode;

    if (keepVerbatim) {
        accum.append(wholeText);
        return;
    }

    StringUtil.appendNormalisedWhitespace(accum, wholeText,
            lastCharIsWhitespace(accum));
}

Source File: HtmlToPlainText.java From jsoup-learning with MIT License

5 votes

/**
 * Appends the plain-text output for {@code node}: the text of a text node,
 * or a new bullet line ({@code "\n * "}) for an {@code li} element. Other
 * nodes produce no output.
 *
 * @param node  the node to render
 * @param depth not used by this method
 */
public void head(Node node, int depth) {
    final String tag = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        append(((TextNode) node).text());
        return;
    }
    if (tag.equals("li")) {
        append("\n * ");
    }
}

org.jsoup.nodes.TextNode Java Examples