org.jsoup.nodes.Node Java Examples

Source File: HtmlToPlainText.java From intellij-quarkus with Eclipse Public License 2.0

7 votes

/**
 * Emits the plain-text output produced when the traversal enters {@code node}.
 * Text nodes contribute their text, {@code ul} deepens the list nesting,
 * {@code li} starts an indented bullet line, {@code dt} is indented and the
 * listed block-level tags start a new line.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used by this method)
 */
@Override
public void head(Node node, int depth) {
    String name = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        TextNode textNode = (TextNode) node;
        append(textNode.text());
        return;
    }
    switch (name) {
        case "ul":
            listNesting++;
            break;
        case "li":
            append("\n ");
            // One extra indentation step for every nesting level below the first.
            for (int level = listNesting; level > 1; level--) {
                append("  ");
            }
            // Top-level items use "*", nested items use "-".
            append(listNesting == 1 ? "* " : "- ");
            break;
        case "dt":
            append("  ");
            break;
        default:
            if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
                append("\n");
            }
            break;
    }
}

Source File: ComMailingContentServiceImpl.java From openemm with GNU Affero General Public License v3.0

6 votes

/**
 * Appends a plain-text rendering of the given nodes to {@code sb}.
 * Text nodes contribute their whole text, {@code a} elements are rendered via
 * {@code getTextLink}, {@code br} elements become a line feed and every other
 * element is processed recursively through its children.
 *
 * @param sb    the buffer receiving the text
 * @param nodes the nodes to render, in order
 */
private void generateTextContent(StringBuilder sb, List<Node> nodes) {
    for (Node current : nodes) {
        if (current instanceof TextNode) {
            sb.append(((TextNode) current).getWholeText());
            continue;
        }
        if (!(current instanceof Element)) {
            // Neither text nor element: contributes nothing.
            continue;
        }

        Element element = (Element) current;
        String tag = element.nodeName();
        if (tag.equals("a")) {
            sb.append(getTextLink(element));
        } else if (tag.equals("br")) {
            sb.append('\n');
        } else {
            generateTextContent(sb, element.childNodes());
        }
    }
}

Source File: SearchUtils.java From emotional_analysis with Apache License 2.0

6 votes

/**
 * Gets the name of a song.
 * <p>Title: getSongNameById</p>
 * <p>Description: fetches the mobile song page of music.163.com for the given id
 * and returns the string form of the first child node of the first element
 * with class {@code f-ff2}. When no such element exists, or it has no child
 * nodes, a fixed "not found" message is returned instead.</p>
 * @param songId id of the song, appended to the request URL
 * @return the song name as found in the page, or the fallback message
 * @throws Exception if the HTTP request or the parsing of the response fails
 */
public static String getSongNameById(long songId) throws Exception{
	Response response = Jsoup.connect("http://music.163.com/m/song?id=" + songId)
			.header("User-Agent",
					"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36")
			.header("Cache-Control", "no-cache").timeout(2000000000)
			.execute();
	Document page = response.parse();
	Elements titleElements = page.getElementsByClass("f-ff2");
	// Treat "no matching element" and "matching element without children" alike:
	// childNode(0) on an element without children would throw instead of
	// producing a usable name.
	if (titleElements.isEmpty() || titleElements.get(0).childNodeSize() == 0) {
		return "ES中歌曲在网易云音乐中找不到";
	}
	return titleElements.get(0).childNode(0).toString();
}

Source File: SongTest.java From emotional_analysis with Apache License 2.0

6 votes

/**
 * Parses the singer and the album out of a song page.
 * <p>Title: test4</p>
 * <p>Description: fetches a fixed song page and prints the first child node of
 * the second and third elements with class {@code s-fc7}.</p>
 * @throws Exception if the request, the parsing or the element lookup fails
 */
@Test
public void test4() throws Exception{
	Document page = Jsoup.connect("http://music.163.com/song?id=63650")
			.ignoreContentType(true)
			.execute()
			.parse();
	Elements candidates = page.getElementsByClass("s-fc7");
	// Singer
	String singer = candidates.get(1).childNode(0).toString();
	// Album
	String album = candidates.get(2).childNode(0).toString();
	System.out.println(singer+"--------"+album);
}

Source File: DeepTextElementBuilder.java From Asqatasun with GNU Affero General Public License v3.0

6 votes

/**
 * Builds the text of an element by concatenating, each preceded by
 * {@code SPACER}: the text built from its alt attribute (when present), the
 * trimmed text of every non-blank child text node and, recursively, the text
 * of every child element. The final result is trimmed.
 *
 * @param element the element to extract text from
 * @return the trimmed, concatenated text
 */
@Override
public String buildTextFromElement(Element element) {
    StringBuilder buffer = new StringBuilder();
    if (element.hasAttr(ALT_ATTR)) {
        buffer.append(SPACER)
              .append(altAttrTextBuilder.buildTextFromElement(element));
    }
    for (Node child : element.childNodes()) {
        if (child instanceof Element) {
            buffer.append(SPACER)
                  .append(buildTextFromElement((Element) child));
        } else if (child instanceof TextNode) {
            TextNode textChild = (TextNode) child;
            if (!textChild.isBlank()) {
                buffer.append(SPACER)
                      .append(StringUtils.trim(textChild.text()));
            }
        }
    }
    return StringUtils.trim(buffer.toString());
}

Source File: ParseUtil.java From zrlog with Apache License 2.0

6 votes

/**
 * Builds an HTML digest of the given markup.
 * <p>
 * The nodes collected by {@code getAllTextNode} are walked in order; for each
 * text node the outer HTML of its parent is appended, and once the accumulated
 * text length exceeds {@code size} the digest is closed with {@code " ..."}.
 * If the markup contains a {@code video} element, the first one is placed in
 * front of the digest followed by {@code <br/>}.
 *
 * @param str  the HTML to summarise
 * @param size the text-length threshold after which the digest is cut off
 * @return the trimmed digest HTML
 */
public static String autoDigest(String str, int size) {
    Document fragment = Jsoup.parseBodyFragment(str);
    List<Node> collected = new ArrayList<>();
    getAllTextNode(fragment.childNodes(), collected);

    StringBuilder html = new StringBuilder();
    int textLength = 0;
    for (Node candidate : collected) {
        if (!(candidate instanceof TextNode)) {
            continue;
        }
        html.append(candidate.parent().outerHtml());
        textLength += ((TextNode) candidate).text().length();
        if (textLength > size) {
            html.append(" ...");
            break;
        }
    }

    String digest = html.toString();
    Elements videos = Jsoup.parse(str).body().select("video");
    boolean hasVideo = videos != null && !videos.isEmpty();
    if (hasVideo) {
        digest = videos.get(0).toString() + "<br/>" + digest;
    }
    return digest.trim();
}

Source File: Paragraph.java From dkpro-c4corpus with Apache License 2.0

6 votes

/**
 * Computes the raw text of this paragraph from its text nodes.
 * For every text node the tag name is set from the node's path, the
 * break-normalised and trimmed text is appended to the raw text, and the
 * untrimmed text length is added to the link character count when the node
 * is inside a link.
 */
public void initRawInfo()
{
    StringBuilder collected = new StringBuilder();
    for (Node current : this) {
        if (!(current instanceof TextNode)) {
            continue;
        }
        this.setTagName(getPath(current));
        String text = ((TextNode) current).text();
        collected.append(Utils.normalizeBreaks(text).trim());

        if (NodeHelper.isLink(current)) {
            charsCountInLinks += text.length();
        }
    }

    rawText = collected.toString();
}

Source File: NodeTraversor.java From jsoup-learning with MIT License

6 votes

/**
 * Start a depth-first traverse of the root and all of its descendants.
 * <p>
 * {@code visitor.head} is called for a node before any of its children are visited, and
 * {@code visitor.tail} is called for it once there are no more children left to visit.
 * The traversal is iterative (no recursion) and stops after the root has been tailed.
 *
 * @param root the root node point to traverse.
 */
public void traverse(Node root) {
    Node node = root;
    int depth = 0;
    
    while (node != null) {
        visitor.head(node, depth);
        if (node.childNodeSize() > 0) {
            // Descend into the first child.
            node = node.childNode(0);
            depth++;
        } else {
            // Leaf reached: climb back up while there is no next sibling,
            // tailing every node that is being left.
            while (node.nextSibling() == null && depth > 0) {
                visitor.tail(node, depth);
                node = node.parent();
                depth--;
            }
            visitor.tail(node, depth);
            // Back at the starting node: the traversal is complete.
            if (node == root)
                break;
            node = node.nextSibling();
        }
    }
}

Source File: HtmlParser.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Recursively collects the text of all leaf nodes named {@code #text}.
 * Matches of the {@code newline} pattern are removed from each text and
 * texts that end up empty are skipped.
 *
 * @param nodeList the nodes to inspect
 * @param textList receives the collected texts in document order
 */
private static void readNodes(List<Node> nodeList, List<String> textList)
{
	for(Node current : nodeList)
	{
		if(current.childNodeSize()>0)
		{
			// Inner node: only its descendants can contribute text.
			readNodes(current.childNodes(), textList);
			continue;
		}
		if(!current.nodeName().equals("#text"))
		{
			continue;
		}
		String text=((TextNode) current).getWholeText();
		text=newline.matcher(text).replaceAll("");
		if(!text.isEmpty())
		{
			textList.add(text);
		}
	}
}

Source File: ElementOperator.java From xsoup with MIT License

6 votes

/**
 * Extracts text from the direct child text nodes of an element.
 * With {@code group == 0} the texts of all child text nodes are concatenated;
 * otherwise the text of the {@code group}-th child text node (1-based) is
 * returned, or an empty string when there is no such node.
 *
 * @param element the element whose child text nodes are read
 * @return the selected text
 */
@Override
public String operate(Element element) {
    if (group == 0) {
        StringBuilder joined = new StringBuilder();
        for (Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                joined.append(((TextNode) child).text());
            }
        }
        return joined.toString();
    }

    int seen = 0;
    for (Node child : element.childNodes()) {
        if (child instanceof TextNode && ++seen == group) {
            return ((TextNode) child).text();
        }
    }
    return "";
}

Source File: HtmlToPlainText.java From lemminx with Eclipse Public License 2.0

6 votes

/**
 * Produces the plain-text output for the start of {@code node}: text nodes
 * append their text, {@code ul} increases the list nesting, {@code li} starts
 * a new indented bullet line, {@code dt} appends an indent and the listed
 * block tags append a line break.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used by this method)
 */
@Override
public void head(Node node, int depth) {
	String name = node.nodeName();
	if (node instanceof TextNode) {
		// TextNodes carry all user-readable text in the DOM.
		append(((TextNode) node).text());
		return;
	}
	if (name.equals("ul")) {
		listNesting++;
		return;
	}
	if (name.equals("li")) {
		append("\n ");
		// Indent once per nesting level beyond the first.
		int pendingIndents = listNesting - 1;
		while (pendingIndents-- > 0) {
			append("  ");
		}
		boolean topLevel = listNesting == 1;
		append(topLevel ? "* " : "- ");
		return;
	}
	if (name.equals("dt")) {
		append("  ");
		return;
	}
	if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
		append("\n");
	}
}

Source File: HtmlParser.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Recursively collects (tag, text) pairs for all leaf nodes named
 * {@code #text}. The tag is the name of the node whose children are being
 * read; a tag equal to {@code body} (ignoring case) is reported as {@code p}.
 *
 * @param nodeList    the nodes to inspect
 * @param textListMap receives one entry per text leaf, in document order
 * @param tag         the tag name to associate with text leaves of {@code nodeList}
 */
private static void readNodesWithTags(List<Node> nodeList, List<Map.Entry<String,String>> textListMap, String tag)
{
	for(Node current : nodeList)
	{
		if(current.childNodeSize()>0)
		{
			// Descend, labelling the children with this node's own name.
			readNodesWithTags(current.childNodes(), textListMap, current.nodeName());
			continue;
		}
		if(!current.nodeName().equals("#text"))
		{
			continue;
		}
		String label = tag.equalsIgnoreCase("body") ? "p" : tag;
		String text = ((TextNode) current).getWholeText();
		textListMap.add(new AbstractMap.SimpleEntry<String,String>(label, text));
	}
}

Source File: ElementOperator.java From zongtui-webcrawler with GNU General Public License v2.0

6 votes

/**
 * Reads text from the direct child text nodes of {@code element}.
 * When {@code group} is 0 all of their texts are concatenated; otherwise the
 * text of the {@code group}-th child text node (counting from 1) is returned,
 * and an empty string results if no such node exists.
 *
 * @param element the element to read from
 * @return the concatenated or selected text
 */
@Override
public String operate(Element element) {
    StringBuilder collected = new StringBuilder();
    int textNodesSeen = 0;
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        String text = ((TextNode) child).text();
        if (group == 0) {
            collected.append(text);
            continue;
        }
        textNodesSeen++;
        if (textNodesSeen == group) {
            return text;
        }
    }
    return collected.toString();
}

Source File: OutputFormatter.java From Xndroid with GNU General Public License v3.0

6 votes

/**
 * Appends the text below {@code e} to {@code accum}, skipping every child for
 * which {@code unlikely} returns true. A single space is inserted before a
 * block element when the buffer is non-empty and does not already end in
 * whitespace, or otherwise before a {@code br} element.
 *
 * @param e      the element whose children are rendered
 * @param accum  the buffer receiving the text
 * @param indent the current recursion depth, passed on incremented by one
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof Element) {
            Element nested = (Element) child;
            boolean separatesBlock = accum.length() > 0
                    && nested.isBlock()
                    && !lastCharIsWhitespace(accum);
            if (separatesBlock || nested.tagName().equals("br")) {
                accum.append(' ');
            }
            appendTextSkipHidden(nested, accum, indent + 1);
        } else if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
        }
    }
}

Source File: ElementUtil.java From flow with Apache License 2.0

6 votes

/**
 * Converts the given element and its children to a JSoup node with
 * children.
 * <p>
 * A text node element becomes a JSoup {@code TextNode}. Any other element
 * becomes a JSoup element with the same tag: its {@code innerHTML} property
 * (if set) is applied as HTML content, its attributes are copied (an empty
 * value is written as a boolean attribute) and its children are converted
 * recursively and appended.
 *
 * @param document
 *            A JSoup document
 * @param element
 *            The element to convert
 * @return A JSoup node containing the converted element
 */
public static Node toJsoup(Document document, Element element) {
    if (element.isTextNode()) {
        return new TextNode(element.getText(), document.baseUri());
    }

    org.jsoup.nodes.Element converted = document
            .createElement(element.getTag());
    if (element.hasProperty("innerHTML")) {
        String innerHtml = (String) element.getPropertyRaw("innerHTML");
        converted.html(innerHtml);
    }

    element.getAttributeNames().forEach(attributeName -> {
        String value = element.getAttribute(attributeName);
        if (!"".equals(value)) {
            converted.attr(attributeName, value);
        } else {
            // An empty value is emitted as a boolean attribute.
            converted.attr(attributeName, true);
        }
    });

    element.getChildren().forEach(child -> {
        Node convertedChild = toJsoup(document, child);
        converted.appendChild(convertedChild);
    });

    return converted;
}

Source File: ParagraphsExplorer.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Handles the start of a node during traversal. Only leaf nodes are
 * processed: blank text leaves are ignored, every other leaf is merged into
 * the result and recorded in {@code nodes}.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used by this method)
 */
@Override
public void head(Node node, int depth)
{
    if (node.childNodeSize() != 0) {
        return;
    }
    boolean blankText = node instanceof TextNode && StringUtil.isBlank(node.outerHtml());
    if (blankText) {
        return;
    }
    mergeToResult(node);
    nodes.add(node);
}

Source File: CssSelector.java From zongtui-webcrawler with GNU General Public License v2.0

5 votes

/**
 * Concatenates the text of the direct child text nodes of an element.
 * Text nested inside child elements is not included.
 *
 * @param element the element to read from
 * @return the concatenated text of its own text nodes
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}

Source File: CommonParser.java From ZfsoftCampusAssit with Apache License 2.0

5 votes

/**
 * Reads the available terms from the element with id {@code xqd} in the given
 * HTML. Every child node carrying a {@code value} attribute adds that value to
 * {@code setting.ownTerms}; a child that additionally carries a
 * {@code selected} attribute makes its value the current term.
 * <p>
 * If the HTML contains no element with id {@code xqd}, {@code setting} is left
 * untouched.
 *
 * @param rawHtml the HTML of the page containing the term selector
 * @param setting the settings object that receives the terms
 */
public void parseCollegeTerms(String rawHtml, Setting setting) {
    Element termSelector = Jsoup.parse(rawHtml).getElementById("xqd");
    if (termSelector == null) {
        // No term selector in this page: nothing to record.
        return;
    }
    for (Node termNode : termSelector.childNodes()) {
        if (!termNode.hasAttr("value")) {
            continue;
        }
        String term = termNode.attr("value");
        setting.ownTerms.add(term);
        if (termNode.hasAttr("selected")) {
            // "selected" is only a marker; the term itself is the node's value.
            setting.currentTerm = term;
        }
    }
}

Source File: TruncateHtmlFilter.java From jinjava with Apache License 2.0

5 votes

/**
 * Marks elements whose text is blank with the {@code __deleteme} class when
 * the traversal leaves them. Non-element nodes are ignored.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used by this method)
 */
@Override
public void tail(Node node, int depth) {
  if (!(node instanceof Element)) {
    return;
  }
  Element element = (Element) node;
  boolean blank = StringUtils.isBlank(element.text());
  if (blank) {
    element.addClass("__deleteme");
  }
}

Source File: HtmlTreeBuilder.java From astor with GNU General Public License v2.0

5 votes

/**
 * Inserts a node at the current insertion point and, if it is a form-listed
 * element and a form element is currently set, registers it with that form.
 *
 * @param node the node to insert
 */
private void insertNode(Node node) {
    if (stack.size() == 0) {
        // if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
        doc.appendChild(node);
    } else if (isFosterInserts()) {
        insertInFosterParent(node);
    } else {
        currentElement().appendChild(node);
    }

    // connect form controls to their form element
    boolean formListed = node instanceof Element && ((Element) node).tag().isFormListed();
    if (formListed && formElement != null) {
        formElement.addElement((Element) node);
    }
}

Source File: HTMLJsoupCleanerImpl.java From Asqatasun with GNU Affero General Public License v3.0

5 votes

/**
 * Removes all comment nodes below the given node, recursively.
 *
 * @param node the node whose descendants are cleaned
 */
private void removeComments(Node node) {
    // Children are removed while walking them, so an index-based loop is used
    // instead of a foreach (which would fail with a concurrent modification).
    // The index is deliberately not advanced after a removal.
    for (int index = 0; index < node.childNodes().size(); ) {
        Node child = node.childNode(index);
        if (child.nodeName().equals("#comment")) {
            child.remove();
            continue;
        }
        removeComments(child);
        index++;
    }
}

Source File: ParagraphsExplorer.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Starts a new paragraph from the given node, initialises its raw
 * information and adds it to the paragraph list, even when its raw text is
 * empty.
 *
 * @param node the node the new paragraph is created from
 */
private void insertAsNewParagraph(Node node)
{
    Paragraph paragraph = new Paragraph(node);
    paragraph.initRawInfo();
    paragraphs.add(paragraph);
}

Source File: ParagraphsExplorer.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Appends a text node to the most recently added paragraph: its string form
 * is added to the paragraph's raw text after a single space, the link
 * character count is increased when the node is inside a link, and the node
 * is added to the paragraph. Nodes that are not text nodes are ignored.
 *
 * @param node the node to append
 */
private void appendToLastParagraph(Node node)
{
    if (!(node instanceof TextNode)) {
        return;
    }
    Paragraph last = paragraphs.getLast();
    String extendedText = last.getRawText() + " " + node;
    last.setRawText(extendedText);
    if (NodeHelper.isLink(node)) {
        last.charsCountInLinks += ((TextNode) node).text().length();
    }
    paragraphs.getLast().add(node);
}

Source File: ParagraphsExplorer.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Returns the node most recently recorded in {@code nodes}.
 *
 * @return the last recorded node, or {@code null} when none has been recorded
 */
private Node getLastAddedNode()
{
    return nodes.isEmpty() ? null : nodes.getLast();
}

Source File: JusTextBoilerplateRemoval.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Converts the tree below {@code node} into a list of blocks (paragraphs) by
 * traversing it with a fresh {@code ParagraphsExplorer}.
 *
 * @param node the node at which the traversal starts
 * @return the paragraphs collected by the explorer
 */
private LinkedList<Paragraph> makeParagraphs(Node node)
{
    ParagraphsExplorer explorer = new ParagraphsExplorer();
    // Walk the document; the explorer gathers paragraphs as it goes.
    node.traverse(explorer);
    LinkedList<Paragraph> collected = explorer.getParagraphs();
    return collected;
}

Source File: NodeHelper.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Returns true if the node itself or any of its ancestors is a link tag.
 *
 * @param node node to start the upward search from
 * @return {@code true} when a link tag is found on the path to the root
 */
public static boolean isLink(Node node)
{
    for (Node current = node; current != null; current = current.parent()) {
        if (isLinkTag(current)) {
            return true;
        }
    }
    return false;
}

Source File: JsoupHelper.java From WordCount with GNU General Public License v2.0

5 votes

/**
 * Builds a comma-separated path for a node by walking from the node up to the
 * root. The result lists the names outermost-first, each preceded by a comma.
 *
 * @param node the node to describe; {@code null} yields an empty string
 * @return the path string
 */
public static String getXpath(Node node) {
    StringBuilder path = new StringBuilder();
    for (Node current = node; current != null; current = current.parent()) {
        String name = getNodeName(current);
        // Prepend, because the walk goes from the node towards the root.
        path.insert(0, name).insert(0, ",");
    }
    return path.toString();
}

Source File: DocumentToJCasConverter.java From baleen with Apache License 2.0

5 votes

/**
 * Walk the HTML document node by node, creating annotations and text.
 *
 * <p>The text of a node (when captured) is added before its children are walked, so the
 * annotations created for an element span the offsets from before its own text to after the
 * text of all its descendants.
 *
 * @param builder the builder
 * @param root the root
 * @param depth the depth
 * @param captureText whether the text of the visited nodes is added to the builder
 */
private void walk(
    final JCasBuilder builder, final Node root, final int depth, final boolean captureText) {
  if (root == null) {
    return;
  }

  final int begin = builder.getCurrentOffset();
  if (captureText) {
    // Generate the text and the annotations
    final String text = mapToText(root);
    final boolean hasText = !Strings.isNullOrEmpty(text);
    if (hasText) {
      builder.addText(text);
    }
  }

  final List<Annotation> annotations =
      root instanceof Element
          ? mapElementToAnnotations(builder.getJCas(), (Element) root)
          : null;

  // BUG: With multiple mappers depth here is wrong! It puts all mappers at the same depth...
  // (though in fairness they are all the same begin-end and same element too)

  // Walk the children
  if (root.childNodeSize() > 0) {
    final int childDepth = depth + 1;
    for (final Node child : root.childNodes()) {
      walk(builder, child, childDepth, captureText);
    }
  }

  // Add annotations to the JCas
  final int end = builder.getCurrentOffset();
  if (annotations == null || annotations.isEmpty()) {
    return;
  }
  builder.addAnnotations(annotations, begin, end, depth);
}

Source File: Evaluator.java From astor with GNU General Public License v2.0

5 votes

/**
 * Matches elements whose child nodes are all comments, XML declarations or
 * document types. An element without child nodes matches as well.
 *
 * @param root    the root of the evaluation (not used by this method)
 * @param element the element to test
 * @return {@code true} when no other kind of child node is present
 */
@Override
public boolean matches(Element root, Element element) {
    List<Node> children = element.childNodes();
    for (Node child : children) {
        boolean ignorable = child instanceof Comment
                || child instanceof XmlDeclaration
                || child instanceof DocumentType;
        if (!ignorable) {
            return false;
        }
    }
    return true;
}

Source File: HtmlToPlainText.java From astor with GNU General Public License v2.0

5 votes

/**
 * Produces the plain-text output for the start of {@code node}: text nodes
 * append their text, {@code li} starts a bullet line, {@code dt} appends an
 * indent and the listed block tags append a line break.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used by this method)
 */
public void head(Node node, int depth) {
    String name = node.nodeName();
    if (node instanceof TextNode) {
        // TextNodes carry all user-readable text in the DOM.
        TextNode textNode = (TextNode) node;
        append(textNode.text());
        return;
    }
    switch (name) {
        case "li":
            append("\n * ");
            break;
        case "dt":
            append("  ");
            break;
        default:
            if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
                append("\n");
            }
            break;
    }
}

org.jsoup.nodes.Node Java Examples