org.jsoup.nodes.Element#remove

Source File: TextFilterManage.java From bbs with GNU Affero General Public License v3.0

6 votes

/**
 * Removes every {@code hide} tag from the given rich-text HTML, including the
 * content and child tags of each removed tag.
 *
 * @param html rich-text content
 * @return the body HTML without the hidden tags; when {@code html} is blank
 *         (as judged by {@code StringUtils.isBlank}) it is returned unchanged
 */
public String deleteHiddenTag(String html){
	if(StringUtils.isBlank(html)){
		return html;
	}

	Document parsed = Jsoup.parseBodyFragment(html);
	parsed.select("hide").remove();

	// Output settings: prettyPrint (whether to reformat), outline (whether to force every tag
	// onto its own line), indentAmount (indent width). Only pretty-printing is switched off here.
	parsed.outputSettings().prettyPrint(false);
	return parsed.body().html();
}

Source File: NewLineToNewParagraph.java From baleen with Apache License 2.0

6 votes

/**
 * Adds each new line (a run) to the document as a paragraph.
 *
 * <p>When {@code e} is a {@code p} element the runs become siblings placed directly after it,
 * in list order, and {@code e} itself is removed. Otherwise the element children of {@code e}
 * are removed and the runs are appended to {@code e}.
 *
 * <p>The {@code runs} list is only read; it is not reordered or otherwise modified.
 *
 * @param e the element at which to add the runs.
 * @param runs the runs
 */
private void addRunsToDom(Element e, List<Element> runs) {
  // Add these new spans into the DOM
  if ("p".equalsIgnoreCase(e.tagName())) {
    // If this is a p, then just add below it. Each run is inserted after the previously
    // inserted one, so the first run ends up closest to p without reversing (and thereby
    // mutating) the list owned by the caller.
    Element anchor = e;
    for (Element run : runs) {
      anchor.after(run);
      anchor = run;
    }
    // Delete the old paragraph
    e.remove();
  } else {
    // If we aren't in a p (eg in a li) then lets add paragraphs to this element
    // But first clear it out
    e.children().remove();
    runs.forEach(e::appendChild);
  }
}

Source File: ParsedResponse.java From mosmetro-android with GNU General Public License v3.0

6 votes

public ParsedResponse(@Nullable String url, @Nullable String html, int code,
                      @Nullable Map<String,List<String>> headers) {
    this.url = url;
    this.html = html;

    if (html != null && !html.isEmpty()) {
        document = Jsoup.parse(html, url);

        // Clean-up useless tags: <script> without src, <style>
        for (Element element : document.getElementsByTag("script")) {
            if (!element.hasAttr("src")) {
                element.remove();
            }
        }
        document.getElementsByTag("style").remove();
    }

    this.code = code;

    if (headers != null){
        this.headers.putAll(headers);
    }
}

Source File: HtmlHelper.java From FairEmail with GNU General Public License v3.0

5 votes

/**
 * Walks all elements of the document in selection order, summing the length of their direct
 * text nodes, and cuts the document once the limit is reached.
 *
 * <p>The text node that crosses the limit is shortened and gets {@code " ..."} appended; later
 * text nodes of that same element are blanked. Every other element visited once the limit has
 * been reached is removed.
 *
 * @param d        document to truncate in place
 * @param reformat selects {@code MAX_FORMAT_TEXT_SIZE} (true) or {@code MAX_FULL_TEXT_SIZE} (false) as the limit
 * @return true when the accumulated text length reached the limit
 */
static boolean truncate(Document d, boolean reformat) {
    final int limit = reformat ? MAX_FORMAT_TEXT_SIZE : MAX_FULL_TEXT_SIZE;

    int total = 0;
    int imageCount = 0;
    for (Element element : d.select("*")) {
        if ("img".equals(element.tagName())) {
            imageCount++;
        }

        // Set when the limit is crossed inside this very element
        boolean cutHere = false;
        for (Node node : element.childNodes()) {
            if (!(node instanceof TextNode)) {
                continue;
            }

            TextNode textNode = (TextNode) node;
            String content = textNode.getWholeText();

            if (total >= limit) {
                if (cutHere) {
                    textNode.text("");
                }
            } else if (total + content.length() >= limit) {
                content = content.substring(0, limit - total) + " ...";
                textNode.text(content);
                cutHere = true;
            }

            // For blanked nodes this still counts the original text length
            total += content.length();
        }

        if (!cutHere && total >= limit) {
            element.remove();
        }
    }

    Log.i("Message size=" + total + " images=" + imageCount);

    return total >= limit;
}

Source File: ArticleTextExtractor.java From JumpGo with Mozilla Public License 2.0

5 votes

/**
 * Removes unlikely candidates from HTML. Currently takes id and class name
 * and matches them against list of patterns
 *
 * <p>The class name and id are lower-cased with {@link java.util.Locale#ROOT} before being
 * matched against {@code NEGATIVE}, so the outcome does not vary with the JVM's default locale
 * (for example the Turkish dotless-i mapping of {@code "I"}).
 *
 * @param doc document to strip unlikely candidates from
 */
protected void stripUnlikelyCandidates(Document doc) {
    for (Element child : doc.select("body").select("*")) {
        String className = child.className().toLowerCase(java.util.Locale.ROOT);
        String id = child.id().toLowerCase(java.util.Locale.ROOT);

        if (NEGATIVE.matcher(className).find()
                || NEGATIVE.matcher(id).find()) {
            child.remove();
        }
    }
}

Source File: OutputFormatter.java From JumpGo with Mozilla Public License 2.0

5 votes

/**
 * Removes, from the elements under the top node that carry a {@code gravityScore} attribute,
 * those whose score is negative or whose text is shorter than the minimum paragraph length
 * for their paragraph index.
 */
private void removeNodesWithNegativeScores(Element topNode) {
    for (Element candidate : topNode.select("*[gravityScore]")) {
        final int gravity = getScore(candidate);
        final int index = getParagraphIndex(candidate);

        // Keep elements that are non-negative and long enough; everything else goes
        if (gravity >= 0 && candidate.text().length() >= getMinParagraph(index)) {
            continue;
        }
        candidate.remove();
    }
}

Source File: JsoupCssInliner.java From ogham with Apache License 2.0

5 votes

/**
 * Builds a stylesheet from an html document.
 *
 * <p>The data of every element selected by {@code STYLE_TAG} for which
 * {@code isInlineModeAllowed(e, InlineModes.STYLE_ATTR)} holds is concatenated in selection
 * order, and each such element is removed from the document.
 *
 * @param doc
 *            the html document
 * @return a string representing the stylesheet.
 */
private static String fetchStyles(Document doc) {
	StringBuilder stylesheet = new StringBuilder();
	for (Element styleElement : doc.select(STYLE_TAG)) {
		if (!isInlineModeAllowed(styleElement, InlineModes.STYLE_ATTR)) {
			continue;
		}
		stylesheet.append(styleElement.data());
		styleElement.remove();
	}
	return stylesheet.toString();
}

Source File: Rgaa3Extractor.java From Asqatasun with GNU Affero General Public License v3.0

5 votes

/**
 * Rewrites the HTML testcase files found in the sub-directories of {@code RGAA3_TESTCASE_PATH}.
 *
 * <p>For each testcase whose file name contains {@code "html"}, the first {@code .test-detail}
 * element is turned into an emptied {@code div}, gets {@code lang="fr"} when it has no
 * {@code lang} attribute, and receives the raw rule HTML looked up in {@code RGAA3} under the
 * key derived from the directory name. The document is then written back to the same file.
 * Documents without a {@code .test-detail} element are printed to standard output and left
 * untouched on disk.
 *
 * @throws IOException if the testcase root cannot be listed, or if reading or writing a
 *         testcase file fails
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() returns null when the path is not a directory or cannot be read
        throw new IOException("Cannot list testcase directory: " + srcDir);
    }
    for (File file : ruleDirs) {
        File[] testcases = file.listFiles();
        if (testcases == null) {
            // Not a directory (or unreadable): skip it before trying to parse its name
            continue;
        }
        String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // Parsing drops leading zeros, e.g. "01" -> 1
        String testKey = Integer.parseInt(theme) + "-" + Integer.parseInt(crit) + "-" + Integer.parseInt(test);
        String wrongKey = theme+"."+crit+"."+test;
        for (File testcase : testcases) {
            if (testcase.isFile() && testcase.getName().contains("html")) {
                Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
                Element detail = doc.select(".test-detail").first();
                if (detail == null) {
                    System.out.println(doc.outerHtml());
                } else {
                    detail.tagName("div");
                    // text("") empties the element first, so no child elements remain afterwards
                    detail.text("");
                    if (!detail.hasAttr("lang")) {
                        detail.attr("lang", "fr");
                    }
                    detail.append("\n"+RGAA3.get(testKey).ruleRawHtml+"\n");
                    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
                    doc.outputSettings().outline(false);
                    doc.outputSettings().indentAmount(4);
                    String outputHtml = doc.outerHtml();
                    if (outputHtml.contains(wrongKey)) {
                        // Literal replacement: wrongKey contains dots, which replaceAll would
                        // interpret as regex wildcards (and "$"/"\" in the replacement as group syntax)
                        outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
                    }
                    FileUtils.writeStringToFile(testcase, outputHtml);
                }
            }
        }
    }
}

Source File: JusTextBoilerplateRemoval.java From dkpro-c4corpus with Apache License 2.0

5 votes

/**
 * Strips unwanted parts from a jsoup document: everything matched by the selectors
 * {@code head}, {@code script}, {@code .hidden} and {@code embedded}.
 *
 * @param jsoupDoc document to clean in place
 * @return the same document instance that was passed in
 */
private Document cleanDom(Document jsoupDoc)
{
    final String[] unwantedSelectors = { "head", "script", ".hidden", "embedded" };

    for (int i = 0; i < unwantedSelectors.length; i++) {
        jsoupDoc.select(unwantedSelectors[i]).remove();
    }

    return jsoupDoc;
}

Source File: ArticleTextExtractor.java From Xndroid with GNU General Public License v3.0

5 votes

/**
 * Removes unlikely candidates from HTML. Currently takes id and class name
 * and matches them against list of patterns
 *
 * <p>Lower-casing of the class name and id uses {@link java.util.Locale#ROOT}, so matching
 * against {@code NEGATIVE} gives the same result regardless of the JVM's default locale
 * (a Turkish default locale, for instance, would otherwise map {@code "I"} to a dotless i).
 *
 * @param doc document to strip unlikely candidates from
 */
protected void stripUnlikelyCandidates(Document doc) {
    for (Element child : doc.select("body").select("*")) {
        String className = child.className().toLowerCase(java.util.Locale.ROOT);
        String id = child.id().toLowerCase(java.util.Locale.ROOT);

        if (NEGATIVE.matcher(className).find()
                || NEGATIVE.matcher(id).find()) {
            child.remove();
        }
    }
}

Source File: OutputFormatter.java From Xndroid with GNU General Public License v3.0

5 votes

/**
 * Goes through the elements inside the top node that have a {@code gravityScore} attribute
 * and removes each one whose score is below zero or whose text length is below the minimum
 * paragraph length for its paragraph index.
 */
private void removeNodesWithNegativeScores(Element topNode) {
    Elements scored = topNode.select("*[gravityScore]");
    for (Element node : scored) {
        int nodeScore = getScore(node);
        int nodeParagraphIndex = getParagraphIndex(node);

        boolean drop = nodeScore < 0
                || node.text().length() < getMinParagraph(nodeParagraphIndex);
        if (drop) {
            node.remove();
        }
    }
}

Source File: HtmlConverter.java From docx4j-template with Apache License 2.0

5 votes

/**
 * Fetches a page and returns it as a {@link org.jsoup.nodes.Document} configured for
 * XHTML output.
 *
 * <p>All {@code script} elements are removed, the {@code onclick} and {@code href} attributes
 * are stripped from every {@code a} element, and the {@code href} of every {@code link}
 * element is replaced by its absolute URL.
 *
 * @param url address of the page to fetch
 * @return the cleaned document with XML syntax and XHTML escape mode set
 * @throws Exception
 */
protected Document url2xhtml(String url) throws Exception {
    Document page = Jsoup.connect(url).get(); // fetch the page

    if (logger.isDebugEnabled()) {
        logger.debug("baseUri: {}", page.baseUri());
    }

    // Drop every script element
    page.getElementsByTag("script").remove();

    // Strip the onclick and href attributes from every anchor
    for (Element anchor : page.getElementsByTag("a")) {
        anchor.removeAttr("onclick");
        anchor.removeAttr("href");
    }

    // Replace the address of each link element with its absolute form
    for (Element link : page.getElementsByTag("link")) {
        String absoluteHref = link.absUrl("href");

        if (logger.isDebugEnabled()) {
            logger.debug("href: {} -> {}", link.attr("href"), absoluteHref);
        }

        link.attr("href", absoluteHref);
    }

    // Switch the output to xhtml
    Document.OutputSettings settings = page.outputSettings();
    settings.syntax(Document.OutputSettings.Syntax.xml);
    settings.escapeMode(Entities.EscapeMode.xhtml);

    if (logger.isDebugEnabled()) {
        int lineNumber = 1;
        for (String line : page.html().split("\n")) {
            logger.debug("line {}:\t{}", lineNumber, line);
            lineNumber++;
        }
    }
    return page;
}

Source File: HtmlHelper.java From FairEmail with GNU General Public License v3.0

5 votes

/**
 * Replaces every element with the class {@code Apple-converted-space} by a plain space.
 *
 * <p>When the element is directly followed by a text node, the space is prepended to that
 * node's text and the element is removed; otherwise the element is replaced by a new text node
 * holding a single space.
 *
 * @param d document to clean in place
 */
static void cleanup(Document d) {
    // https://www.chromestatus.com/feature/5756335865987072
    // Some messages contain 100 thousands of Apple spaces
    for (Element spacer : d.select(".Apple-converted-space")) {
        Node following = spacer.nextSibling();
        if (!(following instanceof TextNode)) {
            spacer.replaceWith(new TextNode(" "));
            continue;
        }

        TextNode followingText = (TextNode) following;
        followingText.text(" " + followingText.text());
        spacer.remove();
    }
}

Source File: QAKIS.java From NLIWOD with GNU Affero General Public License v3.0

4 votes

/**
 * Submits the question text stored for {@code language} to the QAKIS web form and records the
 * scraped results on {@code question}.
 *
 * <p>A first POST without a body fetches the form page; the {@code value} of its last
 * {@code input} element is sent back as {@code javax.faces.ViewState} with the second POST.
 * From the second response, the {@code href} of each matched answer link is collected into the
 * golden answers, and the text of the first {@code div#sparqlQuery pre} element (after removing
 * its child elements) is stored as the SPARQL query.
 *
 * <p>Does nothing when the question has no text for {@code language}. The HTTP client created
 * here is always closed before the method returns or throws.
 *
 * @param question question to read the text from and to store the results on
 * @param language key into the question's language-to-text map
 * @throws Exception if the first response has a status code of 400 or above, if the form page
 *         contains no {@code input} element, or if a request or the parsing fails
 */
public void search(IQuestion question, String language) throws Exception {
	String questionString;
	if (!question.getLanguageToQuestion().containsKey(language)) {
		return;
	}
	questionString = question.getLanguageToQuestion().get(language);
	log.debug(this.getClass().getSimpleName() + ": " + questionString);
	final HashSet<String> resultSet = new HashSet<String>();
	String url = "http://qakis.org/qakis/index.xhtml";

	RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(this.timeout).build();
	// Keep the closeable type so the client (and its connections) can be released in finally
	org.apache.http.impl.client.CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).build();
	try {
		HttpPost httppost = new HttpPost(url);
		HttpResponse ping = client.execute(httppost);
		//Test if error occured
		if(ping.getStatusLine().getStatusCode()>=400){
			throw new Exception("QAKIS Server could not answer due to: "+ping.getStatusLine());
		}

		Document vsdoc = Jsoup.parse(responseparser.responseToString(ping));
		Elements el = vsdoc.select("input");
		if (el.isEmpty()) {
			// Without this check the lookup below fails with an index error at position -1
			throw new IllegalStateException("QAKIS form page contains no input element to read the view state from");
		}
		String viewstate = (el.get(el.size() - 1).attr("value"));

		List<NameValuePair> formparams = new ArrayList<NameValuePair>();
		formparams.add(new BasicNameValuePair("index_form", "index_form"));
		formparams.add(new BasicNameValuePair("index_form:question",
				questionString));
		formparams.add(new BasicNameValuePair("index_form:eps", ""));
		formparams.add(new BasicNameValuePair("index_form:submitQuestion", ""));
		formparams.add(new BasicNameValuePair("javax.faces.ViewState",
				viewstate));
		if(this.setLangPar){
			formparams.add(new BasicNameValuePair("index_form:language", language));
		}

		UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams,
				Consts.UTF_8);
		httppost.setEntity(entity);
		HttpResponse response = client.execute(httppost);

		Document doc = Jsoup.parse(responseparser.responseToString(response));
		Elements answer = doc.select("div.global-presentation-details>h3>a");
		NodeVisitor nv = new NodeVisitor() {
			public void tail(Node node, int depth) {
				// Depth 0 is the matched link itself; its descendants are ignored
				if (depth == 0)
					resultSet.add(node.attr("href"));
			}

			public void head(Node arg0, int arg1) {
				// do nothing here
			}
		};
		answer.traverse(nv);
		question.setGoldenAnswers(resultSet);

		Elements codeElements = doc.select("div#sparqlQuery pre");
		if (codeElements.size() > 0) {
			Element sparqlElement = codeElements.get(0);
			// Drop the child elements so that only the element's own text remains
			sparqlElement.children().remove();
			question.setSparqlQuery(sparqlElement.text());
		}
	} finally {
		client.close();
	}
}

Source File: Action.java From templatespider with Apache License 2.0

4 votes

/**
	 * Replaces the dynamic tags in the template pages:
	 * 1. rewrites the text of the title tag according to the template type
	 * 2. removes the keywords and description meta tags
	 *
	 * Rows of the template page table without a name, or whose name has no entry in
	 * Global.templateMap, are skipped. If a page has neither a title nor a head tag, a
	 * message dialog is shown and the method returns without processing the remaining rows.
	 */
	public static void replaceDongtaiTag(){
		DefaultTableModel pageModel = Global.mainUI.getTemplatePageTableModel();
		int pageRowCount = pageModel.getRowCount();
		for (int i = 0; i < pageRowCount; i++) {
			// Name of the template page
			String name = (String) pageModel.getValueAt(i, 0);
			if(name == null || name.length() == 0){
				continue;
			}

			Template temp = Global.templateMap.get(name);
			if(temp == null){
				// No such template page
				continue;
			}
			Document doc = temp.getDoc();

			// Remove the keywords and description meta tags
			Elements metaEles = doc.getElementsByTag("meta");
			Iterator<Element> it = metaEles.iterator();
			while(it.hasNext()){
				Element metaEle = it.next();
				if(metaEle == null){
					continue;
				}
				String metaName = metaEle.attr("name");
				// Constant-first comparison stays safe even for a null attribute value
				if("keywords".equalsIgnoreCase(metaName) || "description".equalsIgnoreCase(metaName)){
					try {
						metaEle.remove();
						it.remove();
					} catch (Exception e) {
						// Best effort: report the failing tag and carry on with the next one
						e.printStackTrace();
						System.out.println(metaEle);
					}
				}
			}

			// Replace the title tag
			Elements titleEles = doc.getElementsByTag("title");
			Element titleEle = null;
			if(titleEles != null && titleEles.size() > 0){
				titleEle = titleEles.first();
			}else{
				// Without a title a new one would have to be added to the head
				Elements headElements = doc.getElementsByTag("head");
				if(headElements == null || headElements.size() == 0){
					UI.showMessageDialog("模版页面"+temp.getFile().getName()+"中无head标签！模版页估计不完整！请手动补上head标签");
					return;
				}
				// TODO: create the title element inside the existing head (not implemented yet)
			}
			if(titleEle != null){
				// Replace the title text with the dynamic tag for this template type
				String type = (String) pageModel.getValueAt(i, 1);
				if(type == null){
					// switch on a null String throws; treat a missing type like an unknown one
					type = "";
				}
				switch (type) {
				case "列表页模版":
					titleEle.text(siteColumn_name+"_"+site_name);
					break;
				case "详情页模版":
					titleEle.text(news_title+"_"+site_name);
					break;
				case "首页模版":
				default:
					titleEle.text(site_name);
					break;
				}
			}

			Global.templateMap.put(temp.getFile().getName(), temp);
		}
	}

Source File: Elements.java From astor with GNU General Public License v2.0

3 votes

/**
 * Detach every matched element from the DOM. The effect resembles setting each element's
 * outer HTML to nothing.
 * <p>
 * Example, given the HTML {@code <div><p>Hello</p> <p>there</p> <img /></div>}:<br>
 * <code>doc.select("p").remove();</code><br>
 * leaves the HTML as {@code <div> <img /></div>}
 * <p>
 * This method is not meant for cleaning user-submitted HTML; use
 * {@link org.jsoup.safety.Cleaner} for that purpose instead.
 * @return this, for chaining
 * @see Element#empty()
 * @see #empty()
 */
public Elements remove() {
    for (int i = 0; i < size(); i++) {
        get(i).remove();
    }
    return this;
}

Source File: Elements.java From astor with GNU General Public License v2.0

3 votes

/**
 * Takes each matched element out of the DOM, much like blanking the outer HTML of every
 * element in this collection.
 * <p>
 * For the HTML {@code <div><p>Hello</p> <p>there</p> <img /></div>}, calling<br>
 * <code>doc.select("p").remove();</code><br>
 * results in {@code <div> <img /></div>}
 * <p>
 * Do not rely on this method to clean user-submitted HTML; {@link org.jsoup.safety.Cleaner}
 * is the tool for cleaning HTML.
 * @return this, for chaining
 * @see Element#empty()
 * @see #empty()
 */
public Elements remove() {
    int index = 0;
    while (index < size()) {
        get(index).remove();
        index++;
    }
    return this;
}

Source File: Elements.java From astor with GNU General Public License v2.0

3 votes

/**
 * Removes all matched elements from the DOM; comparable to replacing the outer HTML of each
 * of them with nothing.
 * <p>
 * Starting from {@code <div><p>Hello</p> <p>there</p> <img /></div>},<br>
 * <code>doc.select("p").remove();</code><br>
 * produces {@code <div> <img /></div>}
 * <p>
 * User-submitted HTML should be cleaned with {@link org.jsoup.safety.Cleaner}, not with this
 * method.
 * @return this, for chaining
 * @see Element#empty()
 * @see #empty()
 */
public Elements remove() {
    for (int position = 0, count = size(); position < count; position++) {
        Element matched = get(position);
        matched.remove();
    }
    return this;
}

Source File: Elements.java From jsoup-learning with MIT License

3 votes

/**
 * Removes each element of the backing list from the DOM, which is similar to setting the
 * outer HTML of every element to nothing.
 * <p>
 * With the HTML {@code <div><p>Hello</p> <p>there</p> <img /></div>}, the call<br>
 * <code>doc.select("p").remove();</code><br>
 * yields {@code <div> <img /></div>}
 * <p>
 * To clean user-submitted HTML use {@link org.jsoup.safety.Cleaner}; this method is not
 * intended for it.
 * @return this, for chaining
 * @see Element#empty()
 * @see #empty()
 */
public Elements remove() {
    for (int i = 0, total = contents.size(); i < total; i++) {
        contents.get(i).remove();
    }
    return this;
}

Java Code Examples for org.jsoup.nodes.Element#remove()