org.jsoup.select.Elements#remove

Source File: CSSReverter.java From BlogManagePlatform with Apache License 2.0

6 votes

/**
 * Turns externally linked stylesheets into inline {@code <style>} blocks and removes the
 * stylesheet links that were inlined.
 * <p>
 * Only {@code <link>} elements whose {@code href} ends with {@code .css} are processed; the
 * stylesheet is read from the Freemarker loader path. Other links (icons, etc.) are left
 * in place. The inlined blocks keep the order of the original links so the CSS cascade is
 * unchanged.
 * @param html the html to convert, must not be null
 * @return the converted html, or the unmodified input if anything fails
 * @author Frodez
 * @date 2019-03-21
 */
@Override
public String revert(String html) {
	Assert.notNull(html, "html must not be null");
	try {
		Document document = Jsoup.parse(html);
		Elements links = document.select("link[href]");
		Elements htmlElement = document.select("html");
		// prepend() inserts at the front, so walk the links backwards: the first link's
		// stylesheet then ends up first and later stylesheets still override earlier ones.
		for (int i = links.size() - 1; i >= 0; i--) {
			Element link = links.get(i);
			String path = link.attr("href");
			if (!path.endsWith(".css")) {
				continue;
			}
			htmlElement.prepend(StrUtil.concat("<style type=\"text/css\">", FileUtil.readString(ResourceUtils
				.getFile(StrUtil.concat(FreemarkerRender.getLoaderPath(), path))), "</style>"));
			// drop only the link that has just been inlined
			link.remove();
		}
		return document.html();
	} catch (Exception e) {
		// best effort: on any failure fall back to the original html
		log.error("[frodez.util.renderer.reverter.CSSReverter.revert]", e);
		return html;
	}
}

Source File: WeiboHotProcessor.java From hot-crawler with MIT License

6 votes

/**
 * Builds the list of {@link Info} entries from the selected table rows.
 * <p>
 * The first two rows are discarded (they are removed from {@code elements}, as before).
 * For each remaining row the first {@code a} inside the first {@code td-02} element supplies
 * the title (its html) and the link (its {@code href}, prefixed with {@code this.prefix}).
 * Rows without such an anchor are skipped and do not consume an id.
 *
 * @param elements rows to convert; may be null or shorter than two rows
 * @return the parsed entries, never null; ids are consecutive and start at 1
 */
@Override
protected List<Info> getInfoDataByElements(Elements elements) {
    List<Info> list = new ArrayList<>();
    if (elements == null) {
        return list;
    }
    // remove two tr elements, but never more than are actually present
    for (int dropped = 0; dropped < 2 && !elements.isEmpty(); dropped++) {
        elements.remove(0);
    }
    int i = 0;
    for (Element element : elements) {
        Element cell = element.getElementsByClass("td-02").first();
        Element itemElement = cell == null ? null : cell.getElementsByTag("a").first();
        if (itemElement == null) {
            // row does not have the expected structure
            continue;
        }
        String id = String.valueOf(++i);
        String infoTitle = itemElement.html();
        String infoUrl = this.prefix + itemElement.attr("href");
        list.add(new Info(id, infoTitle, infoUrl));
    }
    return list;
}

Source File: MTGoldFishDashBoard.java From MtgDesktopCompanion with GNU General Public License v3.0

5 votes

/**
 * Downloads the "format staples" table for a format and converts each data row into a
 * {@link CardDominance}.
 * <p>
 * The first two table rows are skipped. Rows that cannot be parsed are logged and ignored.
 *
 * @param f      the format; COMMANDER is mapped to the {@code commander_1v1} page
 * @param filter page filter: spells, creatures, all or lands
 * @return the parsed rows, possibly empty
 * @throws IOException if the page cannot be fetched
 */
@Override
public List<CardDominance> getBestCards(MagicFormat.FORMATS f, String filter) throws IOException {

	// Locale.ROOT keeps the URL segment stable whatever the default locale is
	// (a plain toLowerCase() turns 'I' into a dotless i under a Turkish locale).
	String formatSegment = (f == MagicFormat.FORMATS.COMMANDER)
			? "commander_1v1"
			: f.name().toLowerCase(java.util.Locale.ROOT);
	String u = getString(WEBSITE) + "/format-staples/" + formatSegment + "/full/" + filter;

	logger.debug("get best cards : " + u);
	Document doc = URLTools.extractHtml(u);

	Elements trs = doc.select("table tr");
	List<CardDominance> ret = new ArrayList<>();

	// when filtering on lands the percentage columns are read one position earlier
	int correct = "lands".equalsIgnoreCase(filter) ? 1 : 0;

	// start at index 2 instead of removing the two leading rows: no exception when the
	// table has fewer than two rows
	for (int row = 2; row < trs.size(); row++) {
		Elements tds = trs.get(row).select(MTGConstants.HTML_TAG_TD);
		try {
			CardDominance d = new CardDominance();
			d.setPosition(Integer.parseInt(tds.get(0).text()));
			d.setCardName(tds.get(1).text());
			d.setDecksPercent(Double.parseDouble(tds.get(3 - correct).text().replace("%", "")));
			d.setPlayers(Double.parseDouble(tds.get(4 - correct).text().replace("%", "")));

			ret.add(d);
		} catch (Exception ex) {
			// best effort per row: one malformed row must not discard the others
			logger.error("Error parsing " + tds, ex);
		}
	}
	return ret;
}

Source File: TagServlet.java From firing-range with Apache License 2.0

5 votes

/**
 * Parses the {@code q} parameter as an HTML body fragment and hands the resulting elements
 * to {@code handleRequest}, together with the tag and attribute named in the path info.
 * <p>
 * Path info {@code /tag} selects a tag only, {@code /tag/attribute} selects both; anything
 * after the second segment is ignored. A missing {@code q} or input without any element
 * is answered with a 400 error.
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Document doc = Jsoup.parseBodyFragment(q);
  Element body = doc.body();
  Elements elements = body.getAllElements();
  // the synthetic body is dropped unless the input itself mentions "body"
  if (!(q.contains("body"))){
    elements.remove(body);
  }

  if (elements.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  String allowedTag = "";
  String allowedAttribute = "";
  if (request.getPathInfo() != null) {
    // String.split drops trailing empty strings, so "tag/" yields one segment and "/"
    // yields none; check the length instead of indexing blindly.
    String[] segments = request.getPathInfo().substring(1).split("/");
    if (segments.length > 0) {
      allowedTag = segments[0];
    }
    if (segments.length > 1) {
      allowedAttribute = segments[1];
    }
  }
  handleRequest(elements, response, allowedTag, allowedAttribute);
}

Source File: DescendantSelector.java From JsoupXpath with Apache License 2.0

5 votes

/**
 * Collects the descendants of every context element, without the context elements'
 * own "self" entry and without duplicates.
 *
 * @param context the elements whose descendants are wanted
 * @return an {@code XValue} wrapping the descendants in the order they were encountered
 */
@Override
public XValue apply(Elements context) {
    // LinkedHashSet de-duplicates (nested context elements share descendants) while
    // keeping a deterministic encounter order; a plain HashSet returned them shuffled.
    Set<Element> total = new java.util.LinkedHashSet<>();
    for (Element el : context) {
        for (Element candidate : el.getAllElements()) {
            //exclude self; compared by identity so a look-alike element is never dropped
            // NOTE(review): this also avoids Elements.remove(Object), which in newer jsoup
            // releases appears to detach the node from the DOM as well - confirm.
            if (candidate != el) {
                total.add(candidate);
            }
        }
    }
    Elements descendant = new Elements();
    descendant.addAll(total);
    return XValue.create(descendant);
}

Source File: StructuralHtml.java From baleen with Apache License 2.0

5 votes

/**
 * Fills {@code body} from the structure hierarchy of the CAS and then tidies the markup:
 * list paragraphs are wrapped in {@code li}, header cells become {@code th}, empty cells
 * get a non-breaking space and, unless {@code outputEmptyTags} is set, empty elements are
 * removed repeatedly until none are left.
 */
@Override
protected void writeBody(final JCas jCas, final Element body) {

  final Node<Structure> hierarchyRoot = StructureHierarchy.build(jCas, structuralClasses).getRoot();
  walk(body, hierarchyRoot);

  // Paragraphs placed directly inside a list need an enclosing li
  for (final String listParagraph : new String[] {"ul > p", "ol > p"}) {
    body.select(listParagraph).wrap("<li></li>");
  }

  // Cells inside the table header are th, not td
  final Elements headerCells = body.select("thead td");
  headerCells.tagName("th");

  // Empty td / th cells receive a non-breaking space
  final Elements blankCells = body.select("td:empty,th:empty");
  blankCells.html("&nbsp");

  if (outputEmptyTags) {
    return;
  }

  // Removing empty elements can leave their parents empty, so repeat until stable
  for (Elements empties = emptyElements(body); !empties.isEmpty(); empties = emptyElements(body)) {
    empties.remove();
  }

  // TODO: In accordance with HTML spec
  // - Captions for Table should be moved inside the table
  // - Captions for Figure should be moved inside the figure

}

Source File: RemoveEmptyText.java From baleen with Apache License 2.0

5 votes

/**
 * Removes every element that is currently empty (the body itself is never removed).
 *
 * @param document the element tree to prune
 * @return {@code true} when there was nothing to remove, {@code false} when elements were removed
 */
private boolean removeEmpty(Element document) {
  final Elements prunable = document.select(":empty").not("body");
  final boolean nothingToRemove = prunable.isEmpty();
  if (!nothingToRemove) {
    prunable.remove();
  }
  return nothingToRemove;
}

Source File: CourseParse.java From CourseScheduleDemo with MIT License

5 votes

/**
 * Parses a personal timetable page into a list of courses.
 * <p>
 * The timetable is the element with id {@code Table1}. Its first two rows are skipped;
 * in each remaining row every {@code td[align]} cell whose text is not exactly one
 * character long becomes a {@link Course}. The cell's position in the row gives the day,
 * the row position gives the class number and the {@code rowspan} attribute gives the
 * number of periods (1 when the attribute is missing or not a number).
 *
 * @param data html of the timetable page
 * @return the parsed courses; empty when the page contains no {@code Table1}
 */
public static List<Course> parsePersonal(String data){
    List<Course> courses = new ArrayList<>();
    Document doc = Jsoup.parse(data);
    // Locate the timetable first
    Element table = doc.getElementById("Table1");
    if (table == null) {
        // not a timetable page: nothing to parse
        return courses;
    }
    // All rows of the table
    Elements trs = table.select("tr");
    // The first two rows are not wanted; skip them by offset instead of removing them,
    // so a table with fewer than two rows does not throw.
    final int skippedRows = 2;
    // one generator for the whole parse instead of one per cell
    Random random = new Random();
    for (int i = 0; i + skippedRows < trs.size(); ++i){
        Element tr = trs.get(i + skippedRows);
        // Only cells carrying an align attribute are timetable cells
        Elements tds = tr.select("td[align]");
        for(int j=0; j<tds.size(); ++j){
            Element td = tds.get(j);
            String str = td.text();
            // A one-character text marks an empty cell (presumably a lone nbsp - confirm)
            if (str.length() != 1){
                // Extract the course name from the cell text
                str = parsePersonalCourse(str);
                Course course = new Course();
                course.setClsName(str);
                course.setDay(j+1);
                course.setClsCount(parseRowSpan(td.attr("rowspan")));
                course.setClsNum(i+1);
                int num = random.nextInt(COLOR.length);
                course.setColor(COLOR[num]);
                courses.add(course);
            }
        }
    }
    return courses;
}

/** Returns the numeric rowspan, or 1 (the HTML default) when it is absent or malformed. */
private static int parseRowSpan(String rowspan) {
    String value = rowspan == null ? "" : rowspan.trim();
    if (value.isEmpty()) {
        return 1;
    }
    try {
        return Integer.parseInt(value);
    } catch (NumberFormatException ignored) {
        // malformed attribute: treat the cell as a single period
        return 1;
    }
}

Source File: Comic.java From HHComicViewer with Apache License 2.0

4 votes

/**
 * Re-parses a comic detail page, refreshes the fields of this comic from it and reports
 * whether an update was detected.
 * <p>
 * {@code isUpdate} is set to {@code true} when the number of parsed chapters differs from
 * the stored {@code chapterCount}; this method never resets it to {@code false}.
 *
 * @param content html of the comic detail page
 * @return the current value of {@code isUpdate}
 */
public boolean checkUpdate(String content) {
        Document page = Jsoup.parse(content);
        Element infoRoot = page.select("div[id=permalink]").first();

        this.title = infoRoot.getElementsByTag("h1").first().text();

        // Info list; its first entry is discarded
        Elements infoItems = infoRoot.select("div[id=about_kit]").first().select("li");
        infoItems.remove(0);
        for (Element item : infoItems) {
            // the bold label decides which field the entry fills
            String label = item.getElementsByTag("b").first().text();
            if ("作者:".equals(label)) {
                this.author = item.text().split(":")[1];
            } else if ("状态:".equals(label)) {
                this.comicStatus = item.text();
            } else if ("集数:".equals(label)) {
                this.comicStatus += (" " + item.text().split("\\)")[0] + ")");
            } else if ("更新:".equals(label)) {
                this.comicUpdateTime = item.text();
            } else if ("收藏:".equals(label)) {
                this.comicFavorite = item.text();
            } else if ("评价:".equals(label)) {
                this.ratingNumber = Float.valueOf(item.getElementsByTag("span").first().text());
                this.ratingPeopleNum = Integer.valueOf(item.text().split("\\(")[1].split("人")[0]);
            } else if ("简介".equals(label)) {
                this.description = item.text();
            }
        }

        // Chapter catalogue
        Element volumeRoot = page.select("div[class=cVolList]").first();
        Elements volumeTags = volumeRoot.select("div[class=cVolTag]");
        Elements volumeLists = volumeRoot.select("ul[class=cVolUl]");

        this.chapterName = new ArrayList<>();
        this.chapterId = new ArrayList<>();
        for (int i = 0; i < volumeTags.size(); i++) {
            Elements links = volumeLists.get(i).select("a[class=l_s]");
            // The page lists chapters newest first; walking backwards stores them oldest first
            for (int j = links.size() - 1; j >= 0; j--) {
                Element link = links.get(j);
                chapterName.add(link.attr("title"));
                // The href is taken apart because another site's address is built from it
                String href = link.attr("href");
                // image server number
                String domainNum = href.split("=")[1];
                // chapter number
                String chapterNum = href.split("/")[1].substring(4);
                chapterId.add(Long.parseLong(chapterNum));
                if (i == 0) {
                    serverId = Integer.parseInt(domainNum);
                }
            }
        }

        int parsedCount = this.chapterName.size();
        if (this.chapterCount != parsedCount) {
            this.isUpdate = true;
        }
        this.chapterCount = parsedCount;
        return isUpdate;
    }

Source File: EchoMTGDashBoard.java From MtgDesktopCompanion with GNU General Public License v3.0

4 votes

/**
 * Downloads the set page of an edition and converts the rows of {@code table#set-table}
 * (without its first and last row) into card shakes.
 * <p>
 * The current price comes from the {@code data-price} attribute in the fifth cell. When the
 * fourth cell holds a percentage, last week's price is derived from it; otherwise it
 * equals the current price.
 *
 * @param ed the edition to look up
 * @return the shakes found for the edition
 * @throws IOException if the page cannot be fetched
 */
@Override
protected EditionsShakers getOnlineShakesForEdition(MagicEdition ed) throws IOException {

	EditionsShakers variations = new EditionsShakers();
	variations.setDate(new Date());
	variations.setEdition(ed);
	variations.setProviderName(getName());

	Document d = RequestBuilder.build().method(METHOD.GET).setClient(client)
	 .url(EchoMTGExport.BASE_URL+"/set/"+ed.getId().toUpperCase()+"/"+ed.getSet().replace(" ", "-").toLowerCase()+"/")
	 .addHeader(URLTools.HOST, WEBSITE)
	 .addHeader(URLTools.REFERER, EchoMTGExport.BASE_URL)
	 .toHtml();

	Elements trs = d.select("table#set-table tr");
	// drop the first and the last row of the table
	trs.remove(trs.first());
	trs.remove(trs.last());

	for (Element tr : trs) {
		Elements tds = tr.getElementsByTag("td");

		CardShake cs = new CardShake();
		cs.setEd(ed.getId());
		cs.setName(tds.get(2).getElementsByTag("a").first().text());

		double price = Double.parseDouble(tds.get(4).getElementsByTag("a").first().attr("data-price"));

		// percentage cell empty -> no change known, last week's price equals the current one
		String changeText = tds.get(3).text();
		double lastWeekPrice;
		if (changeText.isEmpty()) {
			lastWeekPrice = price;
		} else {
			double pc = Double.parseDouble(changeText.replace("%","")) / 100;
			lastWeekPrice = price - (price * pc);
		}

		cs.init(price, price, lastWeekPrice);
		cs.setCurrency(getCurrency());
		variations.addShake(cs);
	}
	return variations;
}

Source File: Comic.java From HHComicViewer with Apache License 2.0

4 votes

/**
 * Creates a comic from its id and the html of its detail page; all descriptive fields
 * and the chapter lists are filled from the page.
 *
 * @param cid     comic id
 * @param content html of the comic detail page
 */
public Comic(int cid, String content) {
        this.cid = cid;
        Document page = Jsoup.parse(content);
        Element infoRoot = page.select("div[class=product]").first();

        this.title = infoRoot.getElementsByTag("h1").first().text();
        Element coverBox = infoRoot.select("div[id=about_style]").first();
        this.thumbnailUrl = coverBox.getElementsByTag("img").first().attr("src");

        // Info list; its first entry is discarded
        Elements infoItems = infoRoot.select("div[id=about_kit]").first().select("li");
        infoItems.remove(0);
        for (Element item : infoItems) {
            // the bold label decides which field the entry fills
            String label = item.getElementsByTag("b").first().text();
            if ("作者:".equals(label)) {
                this.author = item.text().split(":")[1];
            } else if ("状态:".equals(label)) {
                this.comicStatus = item.text();
            } else if ("集数:".equals(label)) {
                this.comicStatus += (" " + item.text().split("\\)")[0] + ")");
            } else if ("更新:".equals(label)) {
                this.comicUpdateTime = item.text();
            } else if ("收藏:".equals(label)) {
                this.comicFavorite = item.text();
            } else if ("评价:".equals(label)) {
                this.ratingNumber = Float.valueOf(item.getElementsByTag("span").first().text());
                this.ratingPeopleNum = Integer.valueOf(item.text().split("\\(")[1].split("人")[0]);
            } else if ("简介".equals(label)) {
                this.description = item.text();
            }
        }

        // Chapter catalogue
        Element volumeRoot = page.select("div[class=cVolList]").first();
        Elements volumeTags = volumeRoot.select("div[class=cVolTag]");
        Elements volumeLists = volumeRoot.select("ul[class=cVolUl]");

        this.chapterName = new ArrayList<>();
        this.chapterId = new ArrayList<>();
        for (int i = 0; i < volumeTags.size(); i++) {
            Elements links = volumeLists.get(i).select("a[class=l_s]");
            // The page lists chapters newest first; walking backwards stores them oldest first
            for (int j = links.size() - 1; j >= 0; j--) {
                Element link = links.get(j);
                chapterName.add(link.attr("title"));
                // The href is taken apart because another site's address is built from it
                String href = link.attr("href");
                // image server number
                String domainNum = href.split("=")[1];
                // chapter number
                String chapterNum = href.split("/")[1].substring(4);
                chapterId.add(Long.parseLong(chapterNum));
                if (i == 0) {
                    serverId = Integer.parseInt(domainNum);
                }
            }
        }
        this.chapterCount = this.chapterName.size();
    }

Source File: MagicBazarShopper.java From MtgDesktopCompanion with GNU General Public License v3.0

4 votes

/**
 * Converts the rows of an order page ({@code div.table div.tr}) into order entries.
 * <p>
 * The first row is skipped and rows without a name are ignored. Rows carrying the
 * {@code filterElement} class are cards; all other rows are classified as full set,
 * booster, box or lot from their description. Every entry is passed to {@code notify}
 * before it is added to the result.
 *
 * @param doc  the order page
 * @param id   transaction id stored on every entry
 * @param date transaction date stored on every entry
 * @return the parsed entries, possibly empty
 */
private List<OrderEntry> parse(Document doc, String id, Date date) {
	List<OrderEntry> entries = new ArrayList<>();
	Elements table = doc.select("div.table div.tr");
	Currency euro = Currency.getInstance("EUR");

	// Start at 1 instead of removing row 0: an empty selection no longer throws and the
	// selection taken from the caller's document is not modified.
	for(int i=1;i<table.size();i++)
	{
		Element e = table.get(i);
		boolean iscard=e.hasClass("filterElement");
		String name = e.select("div.td.name").text();

		if(name.isEmpty())
			continue;

		OrderEntry entrie = new OrderEntry();
		entrie.setIdTransation(id);
		entrie.setSource(getName());
		entrie.setCurrency(euro);
		entrie.setSeller(getName());
		entrie.setTypeTransaction(TYPE_TRANSACTION.BUY);
		entrie.setTransactionDate(date);
		entrie.setDescription(name);
		if(iscard)
		{
			entrie.setType(TYPE_ITEM.CARD);
			entrie.setDescription(e.select("div.td.name.name_mobile").text());
			entrie.setItemPrice(UITools.parseDouble(e.attr("attribute_price")));
			String set = e.select("div.td.ext img").attr("title");
			try {
				entrie.setEdition(MTGControler.getInstance().getEnabled(MTGCardsProvider.class).getSetByName(set));
			}
			catch(Exception ex)
			{
				// the entry is kept without an edition; log the cause as well
				logger.error(set + " is not found", ex);
			}
		}
		else
		{
			// strip the trailing "&nbsp;<currency symbol>" (literal replacement, no regex needed)
			String price =e.select("div.new_price").html().replace("&nbsp;"+euro.getSymbol(), "").trim();
			entrie.setItemPrice(UITools.parseDouble(price));

			String description = entrie.getDescription();
			String lowerDescription = description.toLowerCase();
			if(description.contains("Set")||lowerDescription.contains("collection"))
				entrie.setType(TYPE_ITEM.FULLSET);
			else if(lowerDescription.contains("booster"))
				entrie.setType(TYPE_ITEM.BOOSTER);
			else if(lowerDescription.startsWith("boite de") || description.contains("Display") )
				entrie.setType(TYPE_ITEM.BOX);
			else
				entrie.setType(TYPE_ITEM.LOTS);
		}
		notify(entrie);
		entries.add(entrie);
	}

	return entries;
}

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

4 votes

/**
 * Converts the list items of a search result into books.
 * <p>
 * For every item the name and id come from {@code p[class=name]}, the classification
 * chain from the links in {@code p[class=info]} (the last link is the terminal class) and
 * author, publication date, subject terms and detailed classification from the item text.
 * Items without a name or id are reported on standard output and skipped.
 *
 * @param booksliNode list items, one per book
 * @return the books that could be parsed
 */
private Set<Book> queryBooks(Elements booksliNode) {
    // Compiled once per call instead of once per list item
    Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[:：](.*) 出版日期[:：](\\d+).*?(?:主题词[:：](.+))? 分类[:：](.*)");
    Pattern minPattern = Pattern.compile(".*(《.*》).*");

    Set<Book> books = new HashSet<>();
    for (Element element : booksliNode) {
        // Name and id
        String name, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        name = nameIdNode.text();
        Elements idNode = nameIdNode.select("a[onclick]");
        if (!idNode.isEmpty()) {
            // the id is the text between "(" and the last "," of the onclick handler
            String idOnClick = idNode.get(0).attr("onclick");
            int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
            if (start != 0 && end != -1) {
                id = idOnClick.substring(start, end);
            }
        }
        // Classification chain
        BookClass[] bookClasses = new BookClass[0];
        Elements bookInfos = element.select("p[class=info]").select("a");
        if (!bookInfos.isEmpty()) {
            // The last link is the terminal class. It is addressed by index rather than
            // removed from the selection, so the parsed item stays untouched.
            // NOTE(review): newer jsoup releases appear to detach removed list entries from
            // the DOM, which would have altered element.text() below - confirm.
            int lastIndex = bookInfos.size() - 1;
            List<BookClass> tmplist = bookInfos.stream()
                    .limit(lastIndex)
                    .map(bookInfo -> getBookCata(bookInfo, false))
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
            BookClass terminalBookClass = getBookCata(bookInfos.get(lastIndex), true);
            if (terminalBookClass != null) {
                tmplist.add(terminalBookClass);
            }
            bookClasses = tmplist.toArray(bookClasses);
        }
        bookBookClass = new RootBookClass().link(bookClasses);

        // Author, publication date, subject terms and classification
        String info = element.text();
        Matcher matcher = pattern.matcher(info);
        while (matcher.find()) {
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // a title in 《》 brackets takes precedence as the name
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // Assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息，将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}

Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0

4 votes

/**
 * Converts the list items of a search result into books, linking each book's
 * classification chain to this class.
 * <p>
 * For every item the name and id come from {@code p[class=name]}, the classification
 * chain from the links in {@code p[class=info]} (the last link is the terminal class) and
 * author, publication date, subject terms and detailed classification from the item text.
 * Items without a name or id are reported on standard output and skipped.
 *
 * @param booksliNode list items, one per book
 * @return the books that could be parsed
 */
private Set<Book> queryBooks(Elements booksliNode) {
    // Compiled once per call instead of once per list item
    Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[:：](.*) 出版日期[:：](\\d+).*?(?:主题词[:：](.+))? 分类[:：](.*)");
    Pattern minPattern = Pattern.compile(".*(《.*》).*");

    Set<Book> books = new HashSet<>();
    for (Element element : booksliNode) {
        // Name and id
        String name, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        name = nameIdNode.text();
        Elements idNode = nameIdNode.select("a[onclick]");
        if (!idNode.isEmpty()) {
            // the id is the text between "(" and the last "," of the onclick handler
            String idOnClick = idNode.get(0).attr("onclick");
            int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
            if (start != 0 && end != -1) {
                id = idOnClick.substring(start, end);
            }
        }
        // Classification chain
        BookClass[] bookClasses = new BookClass[0];
        Elements bookInfos = element.select("p[class=info]").select("a");
        if (!bookInfos.isEmpty()) {
            // The last link is the terminal class. It is addressed by index rather than
            // removed from the selection, so the parsed item stays untouched.
            // NOTE(review): newer jsoup releases appear to detach removed list entries from
            // the DOM, which would have altered element.text() below - confirm.
            int lastIndex = bookInfos.size() - 1;
            List<BookClass> tmplist = bookInfos.stream()
                    .limit(lastIndex)
                    .map(bookInfo -> getBookCata(bookInfo, false))
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
            BookClass terminalBookClass = getBookCata(bookInfos.get(lastIndex), true);
            if (terminalBookClass != null) {
                tmplist.add(terminalBookClass);
            }
            bookClasses = tmplist.toArray(bookClasses);
        }
        bookBookClass = this.link(bookClasses);

        // Author, publication date, subject terms and classification
        String info = element.text();
        Matcher matcher = pattern.matcher(info);
        while (matcher.find()) {
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // a title in 《》 brackets takes precedence as the name
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // Assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息，将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}

Source File: JsoupHelper.java From seed with Apache License 2.0

4 votes

/**
 * Scrapes the content of a Tianya BBS thread.
 * Only the floors written by the thread starter are saved. The text is written to a txt
 * file named after the thread URL, inside the directory returned by
 * {@code FileSystemView#getHomeDirectory()}; an existing file is emptied first.
 * @param bbsURL      thread address (the first page or any other page of the thread)
 * @param finalPageNo last page to fetch, inclusive (when it is larger than the real last page,
 *                    the job stops by itself after the real last page has been written)
 */
private static void getTianyaBBSTxt(String bbsURL, int finalPageNo) throws IOException {
    String txt;
    String author;
    String publishTime;
    Element atlInfo;
    Elements elements;
    Document document;
    // Strip any query parameters from the URL
    bbsURL = bbsURL.endsWith("shtml") ? bbsURL : bbsURL.substring(0, bbsURL.indexOf(".shtml")+6);
    // Work out the target txt file and empty it up front (in case it already exists)
    String filePath = FileSystemView.getFileSystemView().getHomeDirectory().getAbsolutePath();
    String fileName = bbsURL.substring(bbsURL.indexOf("post")).replace(".shtml", ".txt");
    File bbsFile = new File(filePath, fileName);
    FileUtils.writeStringToFile(bbsFile, "", StandardCharsets.UTF_8);
    // Page number the given URL points at
    int pageNo = Integer.parseInt(bbsURL.substring(bbsURL.lastIndexOf("-")+1, bbsURL.lastIndexOf(".")));
    // Process every floor of every page. The bound is inclusive: finalPageNo is documented
    // as the last page of the thread, and "<" skipped that page (and fetched nothing at
    // all for a single-page thread).
    for(int i=pageNo; i<=finalPageNo; i++){
        if(i == 1){
            /*
             * The first floor only exists on the first page and is handled separately
             */
            document = Jsoup.connect(bbsURL).get();
            // Author and publication time
            atlInfo = document.getElementById("post_head").select("div.atl-info").first();
            author = atlInfo.select("span").eq(0).select("a").first().text();
            publishTime = atlInfo.select("span").eq(1).text();
            // Every <div class="atl-item"></div> is one floor, the first floor included
            elements = document.getElementsByClass("atl-item");
            // The text of a floor is wrapped in <div class="bbs-content"></div>
            txt = elements.first().select("div.bbs-content").html().replaceAll("<br>", "");
            // Write to the txt file
            FileUtils.writeStringToFile(bbsFile, "楼主："+author+"，"+publishTime+"\r\n", StandardCharsets.UTF_8, true);
            FileUtils.writeStringToFile(bbsFile, txt+"\r\n\r\n", StandardCharsets.UTF_8, true);
            // The first floor has been handled; keep it out of the generic loop below
            elements.remove(0);
        }else{
            /*
             * Every other page needs its URL recomputed and its content fetched.
             * (When the start page is not 1 the first replace finds nothing to replace and
             * the URL already points at the start page.)
             */
            bbsURL = bbsURL.replace("-"+(i-1)+".shtml", "-"+i+".shtml");
            document = Jsoup.connect(bbsURL).get();
            // A request beyond the last page is redirected by Tianya to the last page
            if(!bbsURL.equals(document.location())){
                System.out.println("帖子抓取完毕");
                return;
            }
            // Floors of this page
            elements = document.getElementsByClass("atl-item");
        }
        /*
         * Both branches end with the floors of the current page in elements
         */
        for(Element obj: elements){
            atlInfo = obj.select("div.atl-info").first();
            String authorType = atlInfo.select("strong.host").text();
            // A blank author type means the floor was not written by the thread starter; skip it
            if(StringUtils.isNotBlank(authorType)){
                txt = obj.select("div.bbs-content").html().replaceAll("<br>", "");
                author = atlInfo.select("span").eq(0).select("a").first().text();
                publishTime = atlInfo.select("span").eq(1).text();
                FileUtils.writeStringToFile(bbsFile, authorType+"："+author+"，"+publishTime+"\r\n", StandardCharsets.UTF_8, true);
                FileUtils.writeStringToFile(bbsFile, txt+"\r\n\r\n", StandardCharsets.UTF_8, true);
            }
        }
    }
}

Source File: Expression.java From firing-range with Apache License 2.0

4 votes

/**
 * Parses the {@code q} parameter as an HTML body fragment and echoes back, through
 * {@code Responses.sendXssed}, every element that passes the attribute filter.
 * <p>
 * An element is dropped when one of its attributes has a name starting with {@code on},
 * is named {@code href} or {@code src}, or is a {@code style} attribute whose value
 * contains {@code expression} (all compared in lower case). A missing {@code q} or
 * input without any element is answered with a 400 error.
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) throws IOException {
  String q = request.getParameter("q");
  if (q == null) {
    Responses.sendError(response, "Missing q parameter", 400);
    return;
  }

  Element body = Jsoup.parseBodyFragment(q).body();
  Elements elements = body.getAllElements();
  // the synthetic body wrapper is not part of the user's input
  elements.remove(body);
  if (elements.isEmpty()) {
    Responses.sendError(response, "Invalid input, no tags", 400);
    return;
  }

  StringBuilder echoed = new StringBuilder();
  for (Element element : elements) {
    boolean rejected = false;

    for (Attribute attribute : element.attributes()) {
      String key = attribute.getKey().toLowerCase();
      boolean blockedName = key.startsWith("on") || key.equals("href") || key.equals("src");
      boolean expressionStyle = key.equals("style")
          && attribute.getValue().toLowerCase().contains("expression");
      if (blockedName || expressionStyle) {
        // one offending attribute is enough to drop the element
        rejected = true;
        break;
      }
    }

    if (!rejected) {
      echoed.append(element.toString());
    }
  }
  Responses.sendXssed(response, echoed.toString());
}

Source File: TtsHelper.java From coolreader with MIT License

4 votes

/**
 * Removes from {@code elements} every element returned by {@code el.getAllElements()},
 * one occurrence per element.
 *
 * @param el       root of the subtree to drop
 * @param elements list to remove the subtree's elements from
 */
private void removeAllChildren(Element el, Elements elements) {
	Elements subtree = el.getAllElements();
	for (int i = 0; i < subtree.size(); i++) {
		Element member = subtree.get(i);
		elements.remove(member);
	}
}

Source File: MTGGradeGrader.java From MtgDesktopCompanion with GNU General Public License v3.0

3 votes

/**
 * Loads the grading of a product from its page on the grader's web site.
 * <p>
 * The grade is read from the fourth {@code td} of {@code table.table-product} once the
 * table's first row has been dropped.
 *
 * @param identifier product identifier, appended to {@code /produit/}
 * @return the grading, or {@code null} when the page has no product table
 * @throws IOException if the page cannot be fetched
 */
@Override
public Grading loadGrading(String identifier) throws IOException {

	String url=getWebSite()+"/produit/"+identifier;

	Document page = RequestBuilder.build().method(METHOD.GET)
			   .setClient(URLTools.newClient())
			   .url(url)
			   .toHtml();

	Elements rows = page.select("table.table-product tr");
	if(rows.isEmpty()) {
		// no product table on the page
		return null;
	}

	Grading grad = new Grading();
	grad.setGraderName(getName());
	grad.setNumberID(identifier);
	grad.setUrlInfo(url);

	// the first row is dropped before the cells are read
	rows.remove(0);
	logger.debug("found " + rows.text());

	Elements cells = rows.select("td");
	grad.setGradeNote(Double.parseDouble(cells.get(3).text()));
	return grad;
}

Java Code Examples for org.jsoup.select.Elements#remove()