org.htmlcleaner.TagNode Java Examples

The following examples show how to use org.htmlcleaner.TagNode. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Http.java    From BotLibre with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Convert the HTML input stream into DOM parsable XHTML.
 * <p>
 * The stream is handed to the cleaner returned by {@code getHtmlCleaner()}
 * with the charset name "UTF-8", and the resulting node tree is serialized
 * with a {@code SimpleXmlSerializer} built from that cleaner's properties.
 *
 * @param input the HTML to clean
 * @return a reader positioned at the start of the serialized XHTML text
 * @throws IOException declared for failures while cleaning or serializing
 */
public StringReader convertToXHTML(InputStream input) throws IOException {
	StringWriter xhtmlWriter = new StringWriter();

	TagNode root = getHtmlCleaner().clean(input, "UTF-8");
	SimpleXmlSerializer serializer = new SimpleXmlSerializer(getHtmlCleaner().getProperties());
	root.serialize(serializer, xhtmlWriter);
	xhtmlWriter.flush();

	return new StringReader(xhtmlWriter.toString());
}
 
Example #2
Source File: PreHandler.java    From SDHtmlTextView with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively appends the text found at or beneath {@code node} to
 * {@code buffer}.
 * <p>
 * A {@code ContentNode} contributes its content after it has been passed
 * through {@code TextUtil.replaceHtmlEntities(..., true)}; a {@code TagNode}
 * contributes whatever its children contribute. Any other object is ignored.
 *
 * @param buffer destination for the collected text
 * @param node   the node (or child object) to collect text from
 */
private void getPlainText(StringBuffer buffer, Object node) {
	if (node instanceof ContentNode) {
		String rawContent = ((ContentNode) node).getContent().toString();
		buffer.append(TextUtil.replaceHtmlEntities(rawContent, true));
		return;
	}

	if (node instanceof TagNode) {
		// A tag has no text of its own here; descend into every child.
		for (Object child : ((TagNode) node).getAllChildren()) {
			getPlainText(buffer, child);
		}
	}
}
 
Example #3
Source File: ListsHandler.java    From mvvm-template with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Computes the 1-based list position of {@code node} within its parent.
 * <p>
 * The counter starts at 1 and is advanced for every child tag named "li"
 * that is encountered before {@code node} itself (identity comparison).
 *
 * @param node the node whose position is wanted
 * @return the position, or -1 when the node has no parent or is not found
 *         among the parent's children
 */
private int getMyIndex(TagNode node) {
    TagNode parent = node.getParent();
    if (parent == null) {
        return -1;
    }

    int position = 1;
    for (Object sibling : parent.getChildren()) {
        if (sibling == node) {
            return position;
        }
        if (sibling instanceof TagNode && "li".equals(((TagNode) sibling).getName())) {
            position++;
        }
    }
    return -1;
}
 
Example #4
Source File: PreTagHandler.java    From mvvm-template with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Renders a code tag into {@code builder}.
 * <p>
 * When {@code isPre} is set, the node's plain text is wrapped in newlines,
 * appended, and covered by a {@code CodeBackgroundRoundedSpan}. Otherwise the
 * node's text is appended with one space of padding on each side and the text
 * between the padding receives a background colour, a monospace typeface and,
 * for the light theme, a red foreground.
 *
 * @param node    the tag being handled
 * @param builder the text being built
 * @param start   offset in {@code builder} where this tag's output begins
 * @param end     offset supplied by the caller; not used by this handler
 */
@Override public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end) {
    if (!isPre) {
        String rawText = node.getText().toString();
        builder.append(" ");
        builder.append(replace(rawText));
        builder.append(" ");

        // Exclude the padding spaces on either side from the styled range.
        final int spanFrom = start + 1;
        final int spanTo = builder.length() - 1;
        builder.setSpan(new BackgroundColorSpan(color), spanFrom, spanTo, SPAN_EXCLUSIVE_EXCLUSIVE);
        if (theme == PrefGetter.LIGHT) {
            builder.setSpan(new ForegroundColorSpan(Color.RED), spanFrom, spanTo, SPAN_EXCLUSIVE_EXCLUSIVE);
        }
        builder.setSpan(new TypefaceSpan("monospace"), spanFrom, spanTo, SPAN_EXCLUSIVE_EXCLUSIVE);
        return;
    }

    // Leading/trailing newlines fake vertical padding and keep the block on its own lines.
    StringBuffer preText = new StringBuffer();
    preText.append("\n");
    getPlainText(preText, node);
    preText.append("\n");

    builder.append(replace(preText.toString()));
    builder.append("\n");
    builder.setSpan(new CodeBackgroundRoundedSpan(color), start, builder.length(), SPAN_EXCLUSIVE_EXCLUSIVE);
    builder.append("\n");
    this.appendNewLine(builder);
    this.appendNewLine(builder);
}
 
Example #5
Source File: ListItemHandler.java    From SDHtmlTextView with Apache License 2.0 6 votes vote down vote up
/**
 * Finds the 1-based position of {@code node} among its parent's children,
 * counting only preceding child tags named "li".
 *
 * @param node the node to locate
 * @return the position, or -1 if the node has no parent or is not one of
 *         the parent's children (identity comparison)
 */
private int getMyIndex(TagNode node) {
	TagNode parent = node.getParent();

	if (parent != null) {
		int index = 1;

		for (Object sibling : parent.getAllChildren()) {
			if (sibling == node) {
				return index;
			}

			boolean isListItem = sibling instanceof TagNode
					&& "li".equals(((TagNode) sibling).getName());
			if (isListItem) {
				index++;
			}
		}
	}

	return -1;
}
 
Example #6
Source File: StyleNodeHandler.java    From SDHtmlTextView with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the text of a style tag to {@code parseCSSFromText}.
 * <p>
 * Nothing happens unless the spanner allows styling and the node has exactly
 * one child that is a {@code ContentNode}.
 *
 * @param node      the tag being handled
 * @param builder   the text being built (not modified here)
 * @param start     start offset supplied by the caller (unused)
 * @param end       end offset supplied by the caller (unused)
 * @param spanStack stack passed on to the CSS parser
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, SpanStack spanStack) {

    if ( !getSpanner().isAllowStyling() ) {
        return;
    }

    if ( node.getAllChildren().size() != 1 ) {
        return;
    }

    Object onlyChild = node.getAllChildren().get(0);
    if ( onlyChild instanceof ContentNode ) {
        ContentNode cssText = (ContentNode) onlyChild;
        parseCSSFromText( cssText.getContent(), spanStack );
    }
}
 
Example #7
Source File: HtmlUtil.java    From ispider with Apache License 2.0 6 votes vote down vote up
/**
 * Collects a list of URLs from the tags selected by an XPath expression.
 * <p>
 * For each selected tag the value of attribute {@code attr} is read and
 * prefixed with {@code "https:"} (presumably because the attribute values
 * are protocol-relative, e.g. {@code //host/path} - confirm against callers).
 * XPath results that are not tags, and tags that lack the attribute, are
 * skipped so the list never contains a bogus {@code "https:null"} entry and
 * a non-tag result cannot trigger a {@code ClassCastException}.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute holding the URL
 * @param xpath   XPath expression selecting the tags
 * @return the collected URLs in evaluation order (possibly empty), or
 *         {@code null} if the XPath expression could not be evaluated
 */
public static List<String> getListUrlByXpath(TagNode tagNode, String attr, String xpath) {
    List<String> urls = new ArrayList<>();
    try {
        Object[] objs = tagNode.evaluateXPath(xpath);
        if (objs != null) {
            for (Object obj : objs) {
                if (!(obj instanceof TagNode)) {
                    // XPath may yield non-tag results (e.g. attribute or text values).
                    continue;
                }
                String url = ((TagNode) obj).getAttributeByName(attr);
                if (url != null) {
                    urls.add("https:" + url);
                }
            }
        }
        return urls;
    } catch (XPatherException e) {
        e.printStackTrace();
    }
    // Kept as null (not an empty list) so callers can still tell failure from "no matches".
    return null;
}
 
Example #8
Source File: AlignmentAttributeHandler.java    From SDHtmlTextView with Apache License 2.0 6 votes vote down vote up
/**
 * Applies the tag's {@code align} attribute to the style before delegating
 * to the superclass.
 * <p>
 * The values "right", "center" and "left" are recognised case-insensitively;
 * for any other value (or no attribute) the style is passed on unchanged.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder,
		int start, int end, Style style, SpanStack spanStack) {

	String align = node.getAttributeByName("align");

	Style.TextAlignment alignment = null;
	if ( "right".equalsIgnoreCase(align) ) {
		alignment = Style.TextAlignment.RIGHT;
	} else if ( "center".equalsIgnoreCase(align) ) {
		alignment = Style.TextAlignment.CENTER;
	} else if ( "left".equalsIgnoreCase(align) ) {
		alignment = Style.TextAlignment.LEFT;
	}

	Style alignedStyle = style;
	if ( alignment != null ) {
		alignedStyle = style.setTextAlignment(alignment);
	}

	super.handleTagNode(node, builder, start, end, alignedStyle, spanStack);
}
 
Example #9
Source File: HrHandler.java    From mvvm-template with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Appends a horizontal rule to the output.
 * <p>
 * A newline is appended first, followed by a one-character placeholder
 * ("$") that carries an {@code HrSpan} and a {@code CenterSpan}, followed by
 * another newline.
 */
@Override public void handleTagNode(TagNode tagNode, SpannableStringBuilder spannableStringBuilder, int i, int i1) {
    spannableStringBuilder.append("\n");

    SpannableStringBuilder ruleLine = new SpannableStringBuilder("$");
    final int ruleLength = ruleLine.length();
    ruleLine.setSpan(new HrSpan(color, width), 0, ruleLength, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE);
    ruleLine.setSpan(new CenterSpan(), 0, ruleLength, Spanned.SPAN_EXCLUSIVE_EXCLUSIVE);
    ruleLine.append("\n");

    spannableStringBuilder.append(ruleLine);
}
 
Example #10
Source File: TagNodeEmptyContentCondition.java    From web-data-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether {@code tagNode} is an empty element that matches this
 * condition.
 * <p>
 * A node is rejected outright when its tag is unknown to the tag info
 * provider, it has an id attribute set, its display is {@code none}, it is
 * an empty tag, or it is listed in {@code unsafeBlockElements} and
 * {@code override} is false. Otherwise its text must be empty and either the
 * node itself is empty or every child is a satisfying tag (checked with
 * {@code override == true}) or a blank content node.
 *
 * @param tagNode  the node to test
 * @param override when true, membership in {@code unsafeBlockElements} does
 *                 not disqualify the node
 * @return true if the node satisfies the condition
 */
private boolean satisfy(TagNode tagNode, boolean override) {
    String name = tagNode.getName();
    TagInfo tagInfo = tagInfoProvider.getTagInfo(name);

    //Only _block_ elements can match.
    if (tagInfo == null
            || hasIdAttributeSet(tagNode)
            || none == tagInfo.getDisplay()
            || tagInfo.isEmptyTag()
            || (!override && unsafeBlockElements.contains(name))) {
        return false;
    }

    if (!isEmptyString(tagNode.getText())) {
        return false;
    }

    // even though there may be no text need to make sure all children are empty or can be pruned
    if (tagNode.isEmpty()) {
        return true;
    }

    for (Object child : tagNode.getAllChildren()) {
        // TODO : similar check as in tagNode.isEmpty() argues for a visitor pattern
        // but allow empty td, ths to be pruned.
        boolean childIsEmpty;
        if (child instanceof TagNode) {
            childIsEmpty = satisfy((TagNode) child, true);
        } else if (child instanceof ContentNode) {
            childIsEmpty = ((ContentNode) child).isBlank();
        } else {
            childIsEmpty = false;
        }

        if (!childIsEmpty) {
            return false;
        }
    }
    return true;
}
 
Example #11
Source File: HtmlUtil.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the value of an attribute on the first tag selected by an XPath
 * expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param attr    name of the attribute to read
 * @param xpath   XPath expression selecting the tag
 * @return the attribute value of the first result, or {@code null} when the
 *         expression selects nothing or cannot be evaluated
 */
public static String getAttrByXpath(TagNode tagNode, String attr, String xpath) {
    Object[] matches;
    try {
        matches = tagNode.evaluateXPath(xpath);
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }

    if (matches == null || matches.length == 0) {
        return null;
    }
    return ((TagNode) matches[0]).getAttributeByName(attr);
}
 
Example #12
Source File: TagNodeInsignificantBrCondition.java    From web-data-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether {@code tagNode} is a br node for which one side of its
 * parent's child list passes {@code checkSublist}.
 * <p>
 * The two ranges checked are the children before the br, and the children
 * from the br (inclusive) to the end of the list.
 *
 * @param tagNode the node to test
 * @return false if the node is not a br node; otherwise whether either
 *         range passes {@code checkSublist}
 */
public boolean satisfy(TagNode tagNode) {
    if (!isBrNode(tagNode)) {
        return false;
    }

    List siblings = tagNode.getParent().getAllChildren();
    int position = siblings.indexOf(tagNode);

    if (checkSublist(0, position, siblings)) {
        return true;
    }
    return checkSublist(position, siblings.size(), siblings);
}
 
Example #13
Source File: TagNodeAttNameValueRegexCondition.java    From web-data-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether any attribute of {@code tagNode} matches both configured
 * patterns.
 * <p>
 * A pattern that is {@code null} matches everything; otherwise the pattern
 * must be found somewhere in the attribute name (or value, respectively).
 *
 * @param tagNode the node to test; {@code null} never satisfies
 * @return true if at least one attribute matches
 */
public boolean satisfy(TagNode tagNode) {
    if (tagNode == null) {
        return false;
    }

    for (Map.Entry<String, String> attribute : tagNode.getAttributes().entrySet()) {
        if (attNameRegex != null && !attNameRegex.matcher(attribute.getKey()).find()) {
            continue;
        }
        // The value is only examined once the name has matched.
        if (attValueRegex != null && !attValueRegex.matcher(attribute.getValue()).find()) {
            continue;
        }
        return true;
    }
    return false;
}
 
Example #14
Source File: ListItemHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the tag name of {@code node}'s parent.
 *
 * @param node the node whose parent is inspected
 * @return the parent's name, or {@code null} when the node has no parent
 */
private String getParentName(TagNode node) {
	TagNode parent = node.getParent();
	return parent == null ? null : parent.getName();
}
 
Example #15
Source File: HtmlUtil.java    From ispider with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the trimmed text of the first tag selected by an XPath expression.
 *
 * @param tagNode node the XPath expression is evaluated against
 * @param xpath   XPath expression selecting the tag
 * @return the trimmed text of the first result, or {@code null} when the
 *         expression selects nothing or cannot be evaluated
 */
public static String getTextByXpath(TagNode tagNode, String xpath) {
    try {
        Object[] matches = tagNode.evaluateXPath(xpath);
        if (matches == null || matches.length == 0) {
            return null;
        }
        TagNode firstMatch = (TagNode) matches[0];
        return firstMatch.getText().toString().trim();
    } catch (XPatherException e) {
        e.printStackTrace();
        return null;
    }
}
 
Example #16
Source File: LinkHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes a {@code URLSpan} for the tag's {@code href} attribute onto the
 * span stack, covering the range {@code start}..{@code end}.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder,
		int start, int end, SpanStack spanStack) {

	URLSpan linkSpan = new URLSpan(node.getAttributeByName("href"));
	spanStack.pushSpan(linkSpan, start, end);
}
 
Example #17
Source File: LinkHandler.java    From mvvm-template with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Attaches a {@code LinkSpan} to the range {@code start}..{@code end}.
 * <p>
 * The link target is the tag's {@code href} attribute; when that is absent
 * and the node has text, the target is "https://github.com/" followed by
 * that text. If neither is available no span is set.
 */
@Override public void handleTagNode(TagNode node, SpannableStringBuilder spannableStringBuilder, int start, int end) {
    // 33 == 0x21; NOTE(review): this appears to be Spanned.SPAN_EXCLUSIVE_EXCLUSIVE written as a literal - confirm.
    final int spanFlags = 33;

    String target = node.getAttributeByName("href");
    if (target == null && node.getText() != null) {
        target = "https://github.com/" + node.getText().toString();
    }

    if (target != null) {
        spannableStringBuilder.setSpan(new LinkSpan(target, linkColor), start, end, spanFlags);
    }
}
 
Example #18
Source File: WrappingStyleHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Forwards the call unchanged to the wrapped handler, if one is set;
 * otherwise does nothing.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, Style useStyle,
    SpanStack spanStack ) {
    if ( wrappedHandler == null ) {
        return;
    }
    wrappedHandler.handleTagNode(node, builder, start, end, useStyle, spanStack);
}
 
Example #19
Source File: CSSCompiler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Matches a tag by its {@code class} attribute and, optionally, its name.
 *
 * @param tagNode the node to test; {@code null} never matches
 * @return true when the node's class attribute equals {@code className}
 *         and, if a non-empty {@code tagName} is configured, the node's
 *         name equals it
 */
@Override
public boolean matches(TagNode tagNode) {

    if ( tagNode == null ) {
        return false;
    }

    //If a tag name is given it should match
    boolean tagNameGiven = tagName != null && tagName.length() > 0;
    if ( tagNameGiven && !tagName.equals(tagNode.getName()) ) {
        return false;
    }

    String classAttribute = tagNode.getAttributeByName("class");
    if ( classAttribute == null ) {
        return false;
    }
    return classAttribute.equals(className);
}
 
Example #20
Source File: TagNodeInsignificantBrCondition.java    From web-data-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that every element of {@code list} in the range
 * [{@code start}, {@code end}) is a {@code TagNode} that is either a br node
 * (per {@code isBrNode}) or reports {@code isPruned()}.
 *
 * @param start first index of the range (inclusive)
 * @param end   last index of the range (exclusive)
 * @param list  the list to inspect
 * @return true if all elements in the range qualify (also for an empty range)
 */
private boolean checkSublist(int start, int end, List list) {
    for (Object element : list.subList(start, end)) {
        boolean qualifies = element instanceof TagNode
                && (isBrNode((TagNode) element) || ((TagNode) element).isPruned());
        if (!qualifies) {
            return false;
        }
    }
    return true;
}
 
Example #21
Source File: CSSCompiler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Matches a tag by its {@code id} attribute.
 *
 * @param tagNode the node to test; {@code null} never matches
 * @return true when the node has an id attribute equal to {@code id}
 */
@Override
public boolean matches(TagNode tagNode) {

    if ( tagNode == null ) {
        return false;
    }

    String idAttribute = tagNode.getAttributeByName("id");
    if ( idAttribute == null ) {
        return false;
    }
    return idAttribute.equals( id );
}
 
Example #22
Source File: CompiledRule.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether any of this rule's matcher chains matches the given node.
 *
 * @param tagNode the node to test
 * @return true as soon as one chain in {@code matchers} matches (per
 *         {@code matchesChain}); false if none does
 */
public boolean matches( TagNode tagNode ) {

    boolean anyChainMatches = false;

    for ( List<CSSCompiler.TagNodeMatcher> chain : matchers ) {
        if ( matchesChain(chain, tagNode) ) {
            anyChainMatches = true;
            break;
        }
    }

    return anyChainMatches;
}
 
Example #23
Source File: Http.java    From BotLibre with Eclipse Public License 1.0 5 votes vote down vote up
/**
 * Convert the HTML string into DOM parsable XHTML.
 * <p>
 * The text is cleaned with the cleaner returned by {@code getHtmlCleaner()}
 * and serialized with a {@code SimpleXmlSerializer} built from that
 * cleaner's properties.
 *
 * @param html the HTML text to clean
 * @return the serialized XHTML text
 * @throws IOException declared for failures while serializing
 */
public String convertToXHTML(String html) throws IOException {
	StringWriter xhtmlWriter = new StringWriter();

	TagNode root = getHtmlCleaner().clean(html);
	SimpleXmlSerializer serializer = new SimpleXmlSerializer(getHtmlCleaner().getProperties());
	root.serialize(serializer, xhtmlWriter);
	xhtmlWriter.flush();

	return xhtmlWriter.toString();
}
 
Example #24
Source File: HtmlSpanner.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Creates spanned text from a TagNode.
 * <p>
 * The node tree is first walked by {@code applySpan}, which fills the text
 * and the span stack; the collected spans are then applied to the text.
 *
 * @param node                 root of the tree to render
 * @param cancellationCallback callback passed through to {@code applySpan}
 * @return the rendered text with its spans applied
 */
public Spannable fromTagNode(TagNode node, CancellationCallback cancellationCallback) {
    final SpannableStringBuilder text = new SpannableStringBuilder();
    final SpanStack pendingSpans = new SpanStack();

    applySpan( text, node, pendingSpans, cancellationCallback );
    pendingSpans.applySpans(this, text);

    return text;
}
 
Example #25
Source File: HtmlSpanner.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Renders {@code node} and, unless its handler renders the content itself,
 * all of its children into {@code builder}.
 * <p>
 * The handler is looked up by tag name; when none is registered a fresh
 * {@code StyledTextHandler} bound to this spanner is used. The handler's
 * {@code beforeChildren} runs before the children are processed and its
 * {@code handleTagNode} afterwards, with the builder lengths measured before
 * and after as the range.
 *
 * @param builder              text being built
 * @param node                 tag to render
 * @param stack                span stack passed to the handlers
 * @param cancellationCallback checked on entry via {@code checkForCancellation}
 */
private void applySpan(SpannableStringBuilder builder, TagNode node, SpanStack stack,
                       CancellationCallback cancellationCallback) {

    checkForCancellation(cancellationCallback);

    TagNodeHandler tagHandler = this.handlers.get(node.getName());
    if ( tagHandler == null ) {
        tagHandler = new StyledTextHandler();
        tagHandler.setSpanner(this);
    }

    final int rangeStart = builder.length();
    tagHandler.beforeChildren(node, builder, stack);

    boolean processChildren = !tagHandler.rendersContent();
    if ( processChildren ) {
        for (Object child : node.getAllChildren()) {
            if ( child instanceof ContentNode ) {
                handleContent( builder, child, stack, cancellationCallback );
            } else if ( child instanceof TagNode ) {
                applySpan( builder, (TagNode) child, stack, cancellationCallback );
            }
        }
    }

    final int rangeEnd = builder.length();
    tagHandler.handleTagNode(node, builder, rangeStart, rangeEnd, stack);
}
 
Example #26
Source File: TagNodeAttValueCondition.java    From web-data-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Tests whether the node's attribute {@code attName} has the value
 * {@code attValue}, honouring {@code isCaseSensitive}.
 *
 * @param tagNode the node to test
 * @return false if the node, {@code attName} or {@code attValue} is
 *         {@code null}; otherwise whether the attribute value matches
 */
public boolean satisfy(TagNode tagNode) {
    if (tagNode == null || attName == null || attValue == null) {
        return false;
    }

    String actualValue = tagNode.getAttributeByName(attName);
    if (isCaseSensitive) {
        return attValue.equals(actualValue);
    }
    return attValue.equalsIgnoreCase(actualValue);
}
 
Example #27
Source File: HorizontalLineHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes a {@code HorizontalLineSpan} and appends a newline for an hr tag.
 * <p>
 * The range end is extended by one before it is logged, used for the span,
 * and passed on to the superclass.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end,
                          Style useStyle, SpanStack spanStack) {

    final int lineEnd = end + 1;

    Log.d("HorizontalLineHandler", "Draw hr from " + start + " to " + lineEnd);

    HorizontalLineSpan lineSpan = new HorizontalLineSpan(useStyle, start, lineEnd);
    spanStack.pushSpan(lineSpan, start, lineEnd);
    appendNewLine(builder);

    super.handleTagNode(node, builder, start, lineEnd, useStyle, spanStack);

}
 
Example #28
Source File: StyleAttributeHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass, first merging the tag's inline {@code style}
 * attribute into the style when the spanner allows styling and the
 * attribute is present; otherwise the incoming style is passed unchanged.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end, Style useStyle,
                          SpanStack spanStack) {

    String styleAttr = node.getAttributeByName("style");

    Style effectiveStyle = useStyle;
    if ( getSpanner().isAllowStyling() && styleAttr != null ) {
        effectiveStyle = parseStyleFromAttribute(useStyle, styleAttr);
    }

    super.handleTagNode(node, builder, start, end, effectiveStyle, spanStack);

}
 
Example #29
Source File: BorderAttributeHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Pushes a {@code BorderSpan} over {@code start}..{@code end} when the tag
 * carries a {@code border} attribute, then delegates to the superclass.
 */
@Override
public void handleTagNode(TagNode node, SpannableStringBuilder builder, int start, int end,
                          Style useStyle, SpanStack spanStack) {

    boolean hasBorder = node.getAttributeByName("border") != null;

    if ( hasBorder ) {
        Log.d("BorderAttributeHandler", "Adding BorderSpan from " + start + " to " + end);
        BorderSpan borderSpan = new BorderSpan(useStyle, start, end, getSpanner().isUseColoursFromStyle() );
        spanStack.pushSpan(borderSpan, start, end);
    }

    super.handleTagNode(node, builder, start, end, useStyle, spanStack);

}
 
Example #30
Source File: NewLineHandler.java    From SDHtmlTextView with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the superclass handling and then appends {@code numberOfNewLines}
 * newlines to the builder.
 */
public void handleTagNode(TagNode node, SpannableStringBuilder builder,
		int start, int end, SpanStack spanStack) {

	super.handleTagNode(node, builder, start, end, spanStack);

	int appended = 0;
	while (appended < numberOfNewLines) {
		appendNewLine(builder);
		appended++;
	}
}