Java Code Examples for org.jsoup.nodes.Element#text()
The following examples show how to use
org.jsoup.nodes.Element#text().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ITClassFileFormatVersion.java From japicmp with Apache License 2.0 | 6 votes |
@Test public void testClassFileFormatVersionIsPresent() throws IOException { Path htmlPath = Paths.get(System.getProperty("user.dir"), "target", "japicmp", "class-file-format-version.html"); if (!Files.exists(htmlPath)) { return; //in JDK 1.7 case } Document document = Jsoup.parse(htmlPath.toFile(), Charset.forName("UTF-8").toString()); Elements classFileFormatElements = document.select(".class_fileFormatVersion"); assertThat(classFileFormatElements.isEmpty(), is(false)); Elements tdCells = classFileFormatElements.select("table > tbody > tr > td"); assertThat(tdCells.isEmpty(), is(false)); for (Element element : tdCells) { String text = element.text(); if (!"MODIFIED".equals(text) && !"50.0".equals(text) && !"52.0".equals(text)) { Assert.fail("text of HTML element does not equal 'MODIFIED' or 50.0 or 52.0: " + text); } } }
Example 2
Source File: IPUtils.java From superword with Apache License 2.0 | 6 votes |
/**
 * Fetches the ip138.com lookup page for an IP address and collects location fragments.
 * Every {@code ul li} item whose text splits on ":" into exactly two parts contributes
 * its second part. Any exception is logged and the entries gathered so far are returned.
 *
 * @param ip the IP address appended to the lookup URL
 * @return the collected fragments; empty when nothing matched or the lookup failed
 */
public static List<String> getIPLocation(String ip){
    List<String> found = new ArrayList<>();
    try {
        URL lookupUrl = new URL("http://ip138.com/ips138.asp?ip=" + ip);
        Elements items = Jsoup.parse(lookupUrl, 60000).select("ul li");
        for (Element item : items) {
            String line = item.text();
            if (!StringUtils.isNotBlank(line)) {
                continue;
            }
            String[] parts = line.split(":");
            if (parts == null || parts.length != 2) {
                continue;
            }
            found.add(parts[1]);
        }
    } catch (Exception e) {
        LOG.error("获取IP地址的地理位置", e);
    }
    return found;
}
Example 3
Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0 | 6 votes |
/**
 * Derives the category of a book from the matching HTML node.
 *
 * @param bookInfo   HTML node holding the book information; its text becomes the
 *                   category name and its {@code href} supplies the category id
 * @param isTerminal whether the category is a terminal category
 * @return the category of the book ({@code TerminalBookClass} when {@code isTerminal}
 *         is true), or {@code null} when the href is null or contains no '='
 */
private BookClass getBookCata(Element bookInfo, boolean isTerminal) {
    String cataName = bookInfo.text();
    String href = bookInfo.attr("href");
    if (href == null) {
        return null;
    }
    int equalsAt = href.indexOf('=');
    if (equalsAt == -1) {
        return null;
    }
    // The id is everything after the first '='.
    String cataId = href.substring(equalsAt + 1);
    BookClass category;
    if (isTerminal) {
        category = new TerminalBookClass(cataId);
    } else {
        category = new BookClass(cataId);
    }
    category.setName(cataName);
    return category;
}
Example 4
Source File: JsoupEx.java From FairEmail with GNU General Public License v3.0 | 6 votes |
/**
 * Parses an HTML string with jsoup after removing all NUL characters from it.
 * If parsing runs out of memory, the error is logged and a shell document is returned
 * whose body holds a {@code <strong>} element with the formatted error.
 *
 * @param html the HTML text to parse
 * @return the parsed document, or the fallback document described above
 */
static Document parse(String html) {
    try {
        /*
         Failure recorded by the original author next to this call:

         org.jsoup.UncheckedIOException: java.io.IOException: Input is binary and unsupported
                at org.jsoup.parser.CharacterReader.<init>(SourceFile:38)
                at org.jsoup.parser.CharacterReader.<init>(SourceFile:43)
                at org.jsoup.parser.TreeBuilder.initialiseParse(SourceFile:38)
                at org.jsoup.parser.HtmlTreeBuilder.initialiseParse(SourceFile:65)
                at org.jsoup.parser.TreeBuilder.parse(SourceFile:46)
                at org.jsoup.parser.Parser.parse(SourceFile:107)
                at org.jsoup.Jsoup.parse(SourceFile:58)
         */
        String withoutNul = html.replace("\0", "");
        return Jsoup.parse(withoutNul);
    } catch (OutOfMemoryError ex) {
        Log.e(ex);
        Document fallback = Document.createShell("");
        Element notice = fallback.createElement("strong");
        notice.text(Log.formatThrowable(ex));
        fallback.body().appendChild(notice);
        return fallback;
    }
}
Example 5
Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0 | 6 votes |
/**
 * Resolves the category a book belongs to from its HTML node.
 *
 * @param bookInfo   HTML node of the book information (text = category name,
 *                   {@code href} = source of the category id)
 * @param isTerminal true when the category is a terminal category
 * @return the resolved category, a {@code TerminalBookClass} for terminal categories;
 *         {@code null} when the href is null or has no '=' in it
 */
private BookClass getBookCata(Element bookInfo, boolean isTerminal) {
    final String label = bookInfo.text();
    final String link = bookInfo.attr("href");
    // -1 stands for "no usable id": either no href at all or no '=' inside it.
    final int separator = (link == null) ? -1 : link.indexOf('=');
    if (separator < 0) {
        return null;
    }
    final String id = link.substring(separator + 1, link.length());
    final BookClass resolved = isTerminal ? new TerminalBookClass(id) : new BookClass(id);
    resolved.setName(label);
    return resolved;
}
Example 6
Source File: AppsGamesCatalogApi.java From 4pdaClient-plus with Apache License 2.0 | 5 votes |
/**
 * Downloads the application catalog page and appends its categories to {@code res}.
 * For every {@code ol[type=1]>li} item that contains a link, three kinds of entries are
 * added in order: the category itself (id from the "p" query parameter), a companion
 * entry with the same id and the title suffix " @ темы", and one subcategory per
 * {@code ul>li>a} link (id from the "anchor" query parameter).
 *
 * @param client  HTTP client used to fetch the catalog page
 * @param catalog parent assigned to every top-level category
 * @param res     list receiving the created catalog entries
 * @throws IOException if the page has no post body matching the expected markup
 */
private static void loadCatalog(IHttpClient client, AppGameCatalog catalog, ArrayList<AppGameCatalog> res) throws IOException {
    String html = client.performGet(APPS_CATALOG_URL).getResponseBody();

    Pattern postBodyPattern = Pattern.compile(
            "<div class=\"[^\"]*post_body[^\"]*\"[^>]*?>([\\s\\S]*?)<a name=\"entry\\d+\"></a>",
            Pattern.CASE_INSENSITIVE);
    Matcher postBody = postBodyPattern.matcher(html);
    if (!postBody.find()) {
        throw new IOException("Не найден пост с содержанием каталога приложений");
    }

    Document doc = Jsoup.parse(postBody.group(1));
    for (Element item : doc.select("ol[type=1]>li")) {
        Elements anchors = item.select("a");
        if (anchors.size() == 0) {
            continue;
        }

        Element head = anchors.get(0);
        Uri headUri = Uri.parse(head.attr("href"));
        String headTitle = head.text();

        AppGameCatalog category = new AppGameCatalog(headUri.getQueryParameter("p"), headTitle)
                .setLevel(AppGameCatalog.LEVEL_CATEGORY);
        category.setParent(catalog);
        res.add(category);

        AppGameCatalog topics = new AppGameCatalog(category.getId().toString(), category.getTitle() + " @ темы")
                .setLevel(AppGameCatalog.LEVEL_CATEGORY);
        topics.setParent(category);
        res.add(topics);

        Elements childLinks = item.select("ul>li>a");
        for (int i = 0; i < childLinks.size(); i++) {
            Element childLink = childLinks.get(i);
            Uri childUri = Uri.parse(childLink.attr("href"));
            String childTitle = childLink.text();
            AppGameCatalog child = new AppGameCatalog(childUri.getQueryParameter("anchor"), childTitle)
                    .setLevel(AppGameCatalog.LEVEL_SUBCATEGORY);
            child.setParent(category);
            res.add(child);
        }
    }
}
Example 7
Source File: YZWBPaperCollector.java From search with Apache License 2.0 | 5 votes |
@Override public List<File> collect(Date date) { List<File> files = new ArrayList<>(); try { LOG.debug("url: "+url); String paper = url + sf.format(date) + start; LOG.debug("paper: "+paper); Document document = Jsoup.connect(paper).get(); LOG.debug("typeCssQuery: " + typeCssQuery); Elements elements = document.select(typeCssQuery); int i = 1; for(Element element : elements){ LOG.debug("处理子报"+(i++)); String href = element.attr("href"); LOG.debug("type href:"+href); if(href != null && href.endsWith(".htm")){ String type = element.text(); LOG.debug("type:"+type); href = href.replace("./", ""); href = url + sf.format(date) + href; LOG.debug("type href:"+href); //不同的子报的pdfCssQuery都一样 List<String> hrefs = collect(href, pdfCssQuery); files.addAll(downloadPaper(hrefs)); } } } catch (IOException ex) { LOG.error("采集出错",ex); } return files; }
Example 8
Source File: CityParser.java From zuihou-admin-cloud with Apache License 2.0 | 5 votes |
private List<Area> parseProvince(String url) { String htmlStr = HttpUtil.get(url, CHARSET); Document document = Jsoup.parse(htmlStr); // 获取 class='provincetr' 的元素 Elements elements = document.getElementsByClass("provincetr"); List<Area> provinces = new LinkedList<Area>(); int sort = 1; for (Element element : elements) { // 获取 elements 下属性是 href 的元素 Elements links = element.getElementsByAttribute("href"); for (Element link : links) { String provinceName = link.text(); String href = link.attr("href"); String provinceCode = href.substring(0, 2); Area provinceArea = Area.builder().code(provinceCode + "0000") .label(provinceName).source(url) .sortValue(sort++) .level(new RemoteData<>("PROVINCE")) .fullName(provinceName) .build(); provinceArea.setChildren(parseCity(provinceName, COMMON_URL + href)); StaticLog.info("省级数据: {} ", provinceArea); provinces.add(provinceArea); } } return provinces; }
Example 9
Source File: SparkUtil.java From yanagishima with Apache License 2.0 | 5 votes |
public static List<SparkSqlJob> getSparkSqlJobFromSqlserver(String resourceManagerUrl, String sparkJdbcApplicationId) { try { List<SparkSqlJob> sparkSqlJobs = new ArrayList<>(); Document document = Jsoup.connect(resourceManagerUrl + "/proxy/" + sparkJdbcApplicationId + "/sqlserver").get(); // SQL Statistics // User JobID GroupID Start Time Finish Time Duration Statement State Detail Element table = document.getElementsByTag("tbody").last(); if (table == null) { return sparkSqlJobs; } for (Element row : table.getElementsByTag("tr")) { SparkSqlJob sparkSqlJob = new SparkSqlJob(); Elements td = row.getElementsByTag("td"); sparkSqlJob.setUser(td.get(0).text()); Element jobIds = td.get(1); List<Integer> jobIdList = new ArrayList<>(); if (jobIds.childNodeSize() > 1) { for (Element a : jobIds.getElementsByTag("a")) { String str = a.text(); jobIdList.add(Integer.parseInt(str.substring(1, str.length() - 1))); } } sparkSqlJob.setJobIds(jobIdList); sparkSqlJob.setGroupId(td.get(2).text()); sparkSqlJob.setStartTime(td.get(3).text()); sparkSqlJob.setFinishTime(td.get(4).text()); sparkSqlJob.setDuration(td.get(5).text()); sparkSqlJob.setStatement(td.get(6).text()); sparkSqlJob.setState(td.get(7).text()); sparkSqlJob.setDetail(td.get(8).text()); sparkSqlJobs.add(sparkSqlJob); } return sparkSqlJobs; } catch (IOException e) { throw new RuntimeException(e); } }
Example 10
Source File: SynonymDiscriminationExtractor.java From superword with Apache License 2.0 | 5 votes |
/**
 * Parses synonym discrimination entries out of an HTML page.
 * Each element matching {@code SYNONYM_DISCRIMINATION_CSS_PATH} yields one entry with a
 * title, a description (taken from the first of exactly two {@code DES} elements) and
 * the words whose text splits on ":" into exactly two parts. Entries without exactly
 * two description elements are skipped. Any exception is logged and the entries parsed
 * so far are returned.
 *
 * @param html the HTML to parse
 * @return the parsed entries, possibly empty
 */
public static Set<SynonymDiscrimination> parseSynonymDiscrimination(String html){
    Set<SynonymDiscrimination> parsed = new HashSet<>();
    try {
        Elements blocks = Jsoup.parse(html).select(SYNONYM_DISCRIMINATION_CSS_PATH);
        for (Element block : blocks) {
            String title = block.select(TITLE).text().trim();

            Elements descriptions = block.select(DES);
            if (descriptions.size() != 2) {
                LOGGER.error("解析描述信息出错,elements.size="+descriptions.size());
                continue;
            }
            String des = descriptions.get(0).text()
                    .replace("“ ”", "")
                    .replace("“ ", "“")
                    .trim();

            SynonymDiscrimination entry = new SynonymDiscrimination();
            entry.setTitle(title);
            entry.setDes(des);

            for (Element wordElement : block.select(WORDS)) {
                String word = wordElement.text();
                String[] attr = word.split(":");
                if (attr == null || attr.length != 2) {
                    LOGGER.error("解析词义信息出错:"+word);
                    continue;
                }
                entry.addWord(new Word(attr[0].trim(), attr[1].trim()));
            }

            parsed.add(entry);
            LOGGER.info("解析出同义词辨析:" + entry);
        }
    } catch (Exception e) {
        LOGGER.error("解析同义词辨析出错", e);
    }
    return parsed;
}
Example 11
Source File: ParseHelper.java From 4pdaClient-plus with Apache License 2.0 | 5 votes |
private void parseComments(Element main) { String comment, link, userName, date, ratingNum, ratingText; List<CommentsModel> cache = new ArrayList<>(); ArrayList<String> dr = new ArrayList<>(); for (Element element1 : main.select("#comments .reviews li")) { if (!element1.select(".text-box").text().isEmpty()) { /** * Тут короче если текст бокс не нуль, то и все остальное не нуль. */ Element element = element1.select(".text-box .w-toggle").first(); if(element==null) element = element1.select(".text-box").first(); comment = element.text(); element = element1.select("div.name a").first(); link = element.attr("href"); userName = element.attr("title"); date = element1.select("div.date").first().text(); ratingNum = element1.select("span.num").first().text(); ratingText = element1.select("span.text").first().text(); // for detail dialog Elements elements1 = element1.getElementsByClass("reviews-list"); if (elements1 != null) for (Element element2 : elements1) dr.add(element2.select("div.line").text()); cache.add(new CommentsModel(date, ratingNum, ratingText, comment, link, userName, dr)); } } parsed.setCommentsModels(new Gson().toJson(cache)); }
Example 12
Source File: GetYAnswersPropertiesFromQid.java From LiveQAServerDemo with MIT License | 4 votes |
/**
 * Returns the text of the given element as reported by {@code Element.text()}.
 *
 * @param e the element to read
 * @return the element's text
 */
@Override
public String getText(Element e) {
    final String content = e.text();
    return content;
}
Example 13
Source File: ScheduleNew.java From AcgClub with MIT License | 4 votes |
/**
 * Appends the text of one matched link, followed by a single space, to {@code type}.
 *
 * @param element the matched link element
 * @param index   position of the match; not used here
 */
@ForEach("span:containsOwn(类型) a")
void labels(Element element, int index) {
    String label = element.text();
    type = type + label + " ";
}
Example 14
Source File: BookClass.java From nju-lib-downloader with GNU General Public License v3.0 | 4 votes |
/**
 * Builds {@code Book} objects from a collection of book nodes.
 * For every node the method reads the name and id from {@code p[class=name]}, the
 * category chain from the links inside {@code p[class=info]}, and the author,
 * publish date, theme and detailed category from the node's full text via regular
 * expressions. A book is only created when both a name and an id were found;
 * otherwise the node text is printed with an "error: " prefix.
 *
 * @param booksliNode the nodes to scan (presumably one per book entry — confirm with caller)
 * @return the books that could be built
 */
private Set<Book> queryBooks(Elements booksliNode) {
    Set<Book> books = new HashSet<>();
    for (Element element : booksliNode) {
        // Read the book name and id.
        String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        if (nameIdNode != null) {
            name = nameIdNode.text();
            Elements idNode = nameIdNode.select("a[onclick]");
            if (idNode != null && idNode.size() > 0) {
                String idOnClick = idNode.get(0).attr("onclick");
                // The id is the text between the first "(" and the last "," of the onclick value.
                int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
                if (start != 0 && end != -1) {
                    id = idOnClick.substring(start, end);
                }
            }
        }
        // Read the categories.
        BookClass[] bookClasses = new BookClass[0];
        Elements infoNode = element.select("p[class=info]");
        if (infoNode != null) {
            Elements bookInfos = infoNode.select("a");
            if (bookInfos != null && bookInfos.size() > 0) {
                // The last link is handled as the terminal category; it is removed from the
                // list so the remaining links are mapped as non-terminal categories.
                Element terminalCataNode = bookInfos.last();
                bookInfos.remove(terminalCataNode);
                List<BookClass> tmplist = bookInfos.stream()
                        .map(bookInfo -> getBookCata(bookInfo, false))
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList());
                BookClass terminalBookClass = getBookCata(terminalCataNode, true);
                if (terminalBookClass != null) {
                    tmplist.add(terminalBookClass);
                }
                bookClasses = tmplist.toArray(bookClasses);
            }
        }
        // NOTE(review): link(...) is defined elsewhere; presumably it chains the categories
        // and returns the one the book belongs to — confirm.
        bookBookClass = this.link(bookClasses);
        // Read the author, publish date, theme words and category.
        String info = element.text();
        // NOTE(review): in this view the pattern literal below is broken across two physical
        // lines, which is not a valid Java string literal. The character originally between
        // "? " and the following group cannot be determined from here — restore it from the
        // upstream source before compiling.
        Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 
分类[::](.*)");
        Matcher matcher = pattern.matcher(info);
        // Every match overwrites the fields, so the last match wins.
        while (matcher.find()) {
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // A title wrapped in 《》 anywhere in the text replaces any name found so far.
        Pattern minPattern = Pattern.compile(".*(《.*》).*");
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }
        // Assemble the book.
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                // Printed when the resolved category is not terminal; the book is still returned.
                System.out.println("未获取到分类信息,将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}
Example 15
Source File: XXSBPaperCollector.java From search with Apache License 2.0 | 4 votes |
@Override public List<File> collect(Date date) { List<String> hrefs = new ArrayList<>(); try { LOG.debug("url: "+url); String paper = url + sf.format(date) + start; LOG.debug("paper: "+paper); Document document = Jsoup.connect(paper).get(); //1、找到子报纸 LOG.debug("subCssQuery: " + subCssQuery); Elements elements = document.select(subCssQuery); for(Element element : elements){ String text = element.text(); String href = element.attr("href"); if(text != null && text.contains(":") && href != null && href.endsWith(".htm")){ String subPaperURL = url + sf.format(date) + href; LOG.debug("子报纸文本:"+text+" , "+href); LOG.debug("subPaperURL:"+subPaperURL); //2、找到内容页面 LOG.debug("contentCssQuery: " + contentCssQuery); Elements contentElements = Jsoup.connect(subPaperURL).get().select(contentCssQuery); for(Element contentElement : contentElements){ String h = contentElement.attr("href"); if(h != null && h.startsWith("content_") && h.endsWith(".htm")){ String contentURL = url + sf.format(date) + h; LOG.debug("contentURL:"+contentURL); //3、找PDF LOG.debug("pdfCssQuery: " + pdfCssQuery); Elements pdfElements = Jsoup.connect(contentURL).get().select(pdfCssQuery); for(Element pdfElement : pdfElements){ String pdf = pdfElement.attr("href"); if(pdf != null && pdf.endsWith(".pdf")){ LOG.debug("报纸链接:"+pdf); pdf = pdf.replace("../../../", ""); LOG.debug("报纸链接:"+pdf); hrefs.add(host+pdf); }else{ LOG.debug("不是报纸链接:"+pdf); } } //有多个content,选择一个即可 break; } } }else{ LOG.debug("不是子报纸文本:"+text+" , "+href); } } } catch (IOException ex) { LOG.error("采集出错",ex); } return downloadPaper(hrefs); }
Example 16
Source File: JsoupUtil.java From materialup with Apache License 2.0 | 4 votes |
/**
 * Null-tolerant accessor for an element's text.
 *
 * @param e the element to read, may be {@code null}
 * @return {@code e.text()}, or {@code null} when {@code e} is {@code null}
 */
private static String text(Element e) {
    return (e == null) ? null : e.text();
}
Example 17
Source File: ElementOperator.java From xsoup with MIT License | 4 votes |
/**
 * Yields the text of the element as reported by {@code Element.text()}.
 *
 * @param element the element to operate on
 * @return the element's text
 */
@Override
public String operate(Element element) {
    final String content = element.text();
    return content;
}
Example 18
Source File: DefaultParser.java From rank with Apache License 2.0 | 4 votes |
@Override public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) { List<Article> articles = new ArrayList<>(); try{ Document document = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("User-Agent", USER_AGENT) .get(); Elements elements = document.select(titleCssQuery); for(Element element : elements){ String title = element.text(); String href = element.attr("href"); if(!StringUtils.isBlank(title) && !StringUtils.isBlank(href)){ href = UrlTools.normalizeUrl(url, href); Article article = new Article(); article.setTitle(title); article.setUrl(href); articles.add(article); }else{ LOGGER.info("解析列表页出错:"+url+" title:"+title+", href:"+href); } } //获取下一页链接地址 String nextPageUrl = getNextPageUrl(document, nextPageCssQuery, nextPageText); LOGGER.debug("下一页链接:"+nextPageUrl); if(nextPageUrl != null){ nextPageUrl = UrlTools.normalizeUrl(url, nextPageUrl); LOGGER.debug("规范化后的下一页链接:"+nextPageUrl); //解析下一页 List<Article> result = parse(nextPageUrl, nextPageCssQuery, nextPageText, titleCssQuery); articles.addAll(result); }else{ LOGGER.info("列表页解析完毕,最后一页:"+url); } }catch(Exception e){ LOGGER.error("解析列表页出错:"+url, e); } return articles; }
Example 19
Source File: PlumbleService.java From Plumble with GNU General Public License v3.0 | 4 votes |
@Override public void onMessageLogged(IMessage message) { // Split on / strip all HTML tags. Document parsedMessage = Jsoup.parseBodyFragment(message.getMessage()); String strippedMessage = parsedMessage.text(); String ttsMessage; if(mShortTtsMessagesEnabled) { for (Element anchor : parsedMessage.getElementsByTag("A")) { // Get just the domain portion of links String href = anchor.attr("href"); // Only shorten anchors without custom text if (href != null && href.equals(anchor.text())) { String urlHostname = HtmlUtils.getHostnameFromLink(href); if (urlHostname != null) { anchor.text(getString(R.string.chat_message_tts_short_link, urlHostname)); } } } ttsMessage = parsedMessage.text(); } else { ttsMessage = strippedMessage; } String formattedTtsMessage = getString(R.string.notification_message, message.getActorName(), ttsMessage); // Read if TTS is enabled, the message is less than threshold, is a text message, and not deafened if(mSettings.isTextToSpeechEnabled() && mTTS != null && formattedTtsMessage.length() <= TTS_THRESHOLD && getSessionUser() != null && !getSessionUser().isSelfDeafened()) { mTTS.speak(formattedTtsMessage, TextToSpeech.QUEUE_ADD, null); } // TODO: create a customizable notification sieve if (mSettings.isChatNotifyEnabled()) { mMessageNotification.show(message); } mMessageLog.add(new IChatMessage.TextMessage(message)); }
Example 20
Source File: JsoupPropertyTableExtractor.java From wandora with GNU General Public License v3.0 | 3 votes |
/**
 * Turns one HTML table into an association.
 * The first {@code td} of the first row supplies the value for the master topic; an
 * association is created for that topic and every following row is passed to
 * {@code handleAssoc}. A failure in a single player row is logged and does not stop
 * the remaining rows.
 *
 * @param table the table element to parse
 * @return always {@code true} when the master row could be read
 * @throws Exception with the message "No master row!" when the table has no rows or the
 *                   first row has no {@code td} cell, or when creating the topic or
 *                   association fails
 */
private boolean parseTable(Element table) throws Exception{
    Elements rows = table.select("tr");

    // Elements.first() returns null for an empty selection. Without this guard a table
    // with no rows would fail with a NullPointerException on the next line instead of
    // the intended "No master row!" error.
    Element masterRow = rows.first();
    if(masterRow == null) throw new Exception("No master row!");

    Element masterCell = masterRow.select("td").first();
    if(masterCell == null) throw new Exception("No master row!");

    String masterValue = masterCell.text();
    Topic masterTopic = getOrCreateTopic(tm, null, masterValue);
    Association assoc = tm.createAssociation(masterTopic);

    // Every row after the master row describes one player of the association.
    List<Element> playerRows = rows.subList(1, rows.size());
    for(Element playerRow: playerRows) {
        try {
            handleAssoc(assoc, playerRow);
        } catch (Exception e) {
            log(e);
        }
    }

    return true;
}