Java Code Examples for org.jsoup.nodes.Element#getElementsByTag()
The following examples show how to use
org.jsoup.nodes.Element#getElementsByTag().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CityStats.java From zuihou-admin-boot with Apache License 2.0 | 6 votes |
/**
 * Parses the city rows of a province page and stores them as children of the province.
 *
 * <p>Fetches {@code url}, walks every element with class {@code citytr}, builds one
 * {@code Area} per row from the row's two anchors (first anchor: code, second anchor:
 * name), descends into the county page referenced by the first anchor's {@code href},
 * and finally sets the collected cities as the children of {@code provinceArea}.
 *
 * <p>Rows that do not contain at least two anchors are skipped instead of aborting the
 * whole parse with an {@code IndexOutOfBoundsException}.
 *
 * @param url          address of the page listing the cities of the province
 * @param provinceArea province whose full name prefixes each city and whose children are set
 */
public static void parseCity(String url, Area provinceArea) {
    String htmlStr = HttpUtil.get(url, CHARSET);
    Document document = Jsoup.parse(htmlStr);
    Elements trs = document.getElementsByClass("citytr");
    List<Area> cities = new LinkedList<Area>();
    int sort = 1;
    for (Element tr : trs) {
        Elements links = tr.getElementsByTag("a");
        if (links.size() < 2) {
            // Not a data row (code link + name link); nothing to extract.
            continue;
        }
        Element codeLink = links.get(0);
        String href = codeLink.attr("href");
        // Only the first four characters of the code link text are used as the city code.
        String cityCode = codeLink.text().substring(0, 4);
        String cityName = links.get(1).text();
        Area cityArea = Area.builder().label(cityName).code(cityCode).source(url)
                .sortValue(sort++).level(new RemoteData<>("CITY")).fullName(provinceArea.getFullName() + cityName)
                .build();
        StaticLog.info("	市级数据:  {}  ", cityArea);
        parseCounty(COMMON_URL + href, cityArea);
        cities.add(cityArea);
    }
    provinceArea.setChildren(cities);
}
Example 2
Source File: SpiderService.java From Doctor with Apache License 2.0 | 6 votes |
/** * 获取科室 * 传入科室ID */ public Department queryAllDepar(int i){ Document document = getDocument(depar_Url); Element element = document.select(div_class).first(); Elements elements = element.getElementsByTag(a_tag); Department department = null; if (i < elements.size()) { Element e = elements.get(i); logger.info("***** 第"+ i +"科 ********"); department = new Department(); department.setId(-1); department.setName(e.text());//获取科室名称 logger.info("正在爬取科室:" + department.getName()); department.setDiseases(queryDiseases(e.attr(a_href)));//单独获取疾病 } return department; }
Example 3
Source File: NpmTemplateParserTest.java From flow with Apache License 2.0 | 6 votes |
/**
 * Checks that the template content resolved for {@code NoHtmlTemplateContent} yields a
 * non-null template element containing exactly one {@code div} whose text is
 * "No Template".
 */
@Test
public void getTemplateContent_polymer2TemplateStyleInsertion_contentParsedCorrectly() {
    // Every string property lookup on the mocked configuration resolves to the stats file.
    Mockito.when(configuration.getStringProperty(Mockito.anyString(), Mockito.anyString()))
            .thenReturn(VAADIN_SERVLET_RESOURCES + "config/no-html-template.json");

    TemplateParser templateParser = NpmTemplateParser.getInstance();
    TemplateData templateData = templateParser.getTemplateContent(
            NoHtmlTemplateContent.class, "no-html-template", service);

    Element root = templateData.getTemplateElement();
    Assert.assertNotNull(root);

    Elements divElements = root.getElementsByTag("div");
    Assert.assertEquals(1, divElements.size());
    Assert.assertEquals("No Template", divElements.get(0).text());
}
Example 4
Source File: CityStats.java From zuihou-admin-boot with Apache License 2.0 | 6 votes |
/**
 * Parses the village rows of a page and stores them as children of the given area.
 *
 * <p>Fetches {@code url}, walks every element with class {@code villagetr}, and for each
 * row with exactly three {@code td} cells builds an {@code Area} whose code is the text
 * of the first cell and whose label is the text of the third cell. Rows with any other
 * cell count are ignored.
 *
 * @param url        address of the page listing the villages
 * @param countyArea area whose full name prefixes each village and whose children are set
 */
public static void parseVillagetr(String url, Area countyArea) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));
    List<Area> villages = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("villagetr")) {
        Elements cells = row.getElementsByTag("td");
        boolean isDataRow = cells != null && cells.size() == 3;
        if (!isDataRow) {
            continue;
        }
        String code = cells.get(0).text();
        String name = cells.get(2).text();
        Area village = Area.builder()
                .code(code)
                .label(name)
                .source(url)
                .sortValue(order++)
                .level(new RemoteData<>("VILLAGETR"))
                .fullName(countyArea.getFullName() + name)
                .build();
        StaticLog.info("	村级数据:  {}  ", village);
        villages.add(village);
    }
    countyArea.setChildren(villages);
}
Example 5
Source File: CityParser.java From zuihou-admin-boot with Apache License 2.0 | 5 votes |
/**
 * Parses village data from a page.
 *
 * <p>Every element with class {@code villagetr} that has exactly three {@code td} cells
 * becomes one {@code Area}: the first cell's text is the code, the third cell's text is
 * the label. Rows with any other cell count are ignored.
 *
 * @param fullName prefix prepended to each village name to form its full name
 * @param url      address of the page to fetch; also recorded as each area's source
 * @return the parsed villages in page order, with sort values counting up from 1
 */
public List<Area> parseVillagetr(String fullName, String url) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));
    List<Area> villages = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("villagetr")) {
        Elements cells = row.getElementsByTag("td");
        boolean isDataRow = cells != null && cells.size() == 3;
        if (!isDataRow) {
            continue;
        }
        String code = cells.get(0).text();
        String name = cells.get(2).text();
        Area village = Area.builder()
                .code(code)
                .label(name)
                .fullName(fullName + name)
                .sortValue(order++)
                .source(url)
                .build();
        StaticLog.info("	村级数据:  {}  ", village);
        villages.add(village);
    }
    return villages;
}
Example 6
Source File: TianLaiReadUtil.java From MissZzzReader with Apache License 2.0 | 5 votes |
/**
 * Extracts the chapter list from a book's HTML page.
 *
 * <p>Reads the first {@code dl} inside the element with id {@code list} and creates one
 * {@code Chapter} per {@code dd} that contains an anchor. A {@code dd} whose anchor title
 * equals the title of the previously accepted chapter is skipped. Chapter numbers count
 * up from 0 in page order.
 *
 * <p>The chapter URL is the anchor's {@code href}, prefixed according to the book source:
 * with {@code URLCONST.nameSpace_tianlai} when the source is empty or tianlai, with
 * {@code book.getChapterUrl()} when the source is biquge, and left unchanged otherwise.
 *
 * @param html page markup to parse
 * @param book book whose source decides how chapter URLs are prefixed
 * @return the chapters found; empty when the page has no {@code #list} element or no
 *         {@code dl} inside it
 */
public static ArrayList<Chapter> getChaptersFromHtml(String html, Book book) {
    ArrayList<Chapter> chapters = new ArrayList<>();
    Document doc = Jsoup.parse(html);
    Element divList = doc.getElementById("list");
    if (divList == null) {
        // Unexpected page layout: report "no chapters" instead of failing with an NPE.
        return chapters;
    }
    Element dl = divList.getElementsByTag("dl").first();
    if (dl == null) {
        return chapters;
    }
    String lastTitle = null;
    int number = 0;
    for (Element dd : dl.getElementsByTag("dd")) {
        Element a = dd.getElementsByTag("a").first();
        if (a == null) {
            continue;
        }
        String title = a.html();
        // Skip an entry that repeats the title of the chapter accepted just before it.
        if (!StringHelper.isEmpty(lastTitle) && title.equals(lastTitle)) {
            continue;
        }
        Chapter chapter = new Chapter();
        chapter.setNumber(number++);
        chapter.setTitle(title);
        String url = a.attr("href");
        if (StringHelper.isEmpty(book.getSource()) || BookSource.tianlai.toString().equals(book.getSource())) {
            url = URLCONST.nameSpace_tianlai + url;
        } else if (BookSource.biquge.toString().equals(book.getSource())) {
            url = book.getChapterUrl() + url;
        }
        chapter.setUrl(url);
        chapters.add(chapter);
        lastTitle = title;
    }
    return chapters;
}
Example 7
Source File: RepositoriesPresenter.java From OpenHub with GNU General Public License v3.0 | 5 votes |
private Repository parseCollectionsRepositoryData(Element element) throws Exception{ String fullName = element.select("div > h1 > a").attr("href"); fullName = fullName.substring(1); String owner = fullName.substring(0, fullName.lastIndexOf("/")); String repoName = fullName.substring(fullName.lastIndexOf("/") + 1); // String ownerAvatar = element.select("div > div > a > img").attr("src"); String ownerAvatar = ""; Elements articleElements = element.getElementsByTag("div"); Element descElement = articleElements.get(articleElements.size() - 2); StringBuilder desc = new StringBuilder(""); for(TextNode textNode : descElement.textNodes()){ desc.append(textNode.getWholeText()); } Element numElement = articleElements.last(); String starNumStr = numElement.select("a").get(0).textNodes().get(1).toString(); String forkNumStr = numElement.select("a").get(1).textNodes().get(1).toString(); String language = ""; Elements languageElements = numElement.select("span > span > span"); if(languageElements.size() > 0){ language = numElement.select("span > span > span").get(1).textNodes().get(0).toString(); } Repository repo = new Repository(); repo.setFullName(fullName); repo.setName(repoName); User user = new User(); user.setLogin(owner); user.setAvatarUrl(ownerAvatar); repo.setOwner(user); repo.setDescription(desc.toString()); repo.setStargazersCount(Integer.parseInt(starNumStr.replaceAll(" ", ""))); repo.setForksCount(Integer.parseInt(forkNumStr.replaceAll(" ", ""))); repo.setLanguage(language); return repo; }
Example 8
Source File: SparkUtil.java From yanagishima with Apache License 2.0 | 5 votes |
public static List<SparkSqlJob> getSparkSqlJobFromSqlserver(String resourceManagerUrl, String sparkJdbcApplicationId) { try { List<SparkSqlJob> sparkSqlJobs = new ArrayList<>(); Document document = Jsoup.connect(resourceManagerUrl + "/proxy/" + sparkJdbcApplicationId + "/sqlserver").get(); // SQL Statistics // User JobID GroupID Start Time Finish Time Duration Statement State Detail Element table = document.getElementsByTag("tbody").last(); if (table == null) { return sparkSqlJobs; } for (Element row : table.getElementsByTag("tr")) { SparkSqlJob sparkSqlJob = new SparkSqlJob(); Elements td = row.getElementsByTag("td"); sparkSqlJob.setUser(td.get(0).text()); Element jobIds = td.get(1); List<Integer> jobIdList = new ArrayList<>(); if (jobIds.childNodeSize() > 1) { for (Element a : jobIds.getElementsByTag("a")) { String str = a.text(); jobIdList.add(Integer.parseInt(str.substring(1, str.length() - 1))); } } sparkSqlJob.setJobIds(jobIdList); sparkSqlJob.setGroupId(td.get(2).text()); sparkSqlJob.setStartTime(td.get(3).text()); sparkSqlJob.setFinishTime(td.get(4).text()); sparkSqlJob.setDuration(td.get(5).text()); sparkSqlJob.setStatement(td.get(6).text()); sparkSqlJob.setState(td.get(7).text()); sparkSqlJob.setDetail(td.get(8).text()); sparkSqlJobs.add(sparkSqlJob); } return sparkSqlJobs; } catch (IOException e) { throw new RuntimeException(e); } }
Example 9
Source File: CityParser.java From zuihou-admin-cloud with Apache License 2.0 | 5 votes |
/**
 * Parses village data from a page.
 *
 * <p>Every element with class {@code villagetr} that has exactly three {@code td} cells
 * becomes one {@code Area}: the first cell's text is the code, the third cell's text is
 * the label. Rows with any other cell count are ignored.
 *
 * @param fullName prefix prepended to each village name to form its full name
 * @param url      address of the page to fetch; also recorded as each area's source
 * @return the parsed villages in page order, with sort values counting up from 1
 */
public List<Area> parseVillagetr(String fullName, String url) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));
    List<Area> villages = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("villagetr")) {
        Elements cells = row.getElementsByTag("td");
        boolean isDataRow = cells != null && cells.size() == 3;
        if (!isDataRow) {
            continue;
        }
        String code = cells.get(0).text();
        String name = cells.get(2).text();
        Area village = Area.builder()
                .code(code)
                .label(name)
                .fullName(fullName + name)
                .sortValue(order++)
                .source(url)
                .build();
        StaticLog.info("	村级数据:  {}  ", village);
        villages.add(village);
    }
    return villages;
}
Example 10
Source File: JsoupTest.java From java-tutorial with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
/** * 使用DOM方法来遍历一个文档 */ @Test public void test01() { // 遍历一个Document对象中所有的链接 Element content = docFromFile.body(); Elements links = content.getElementsByTag("a"); for (Element link : links) { System.out.println("linkHref: " + link.attr("href")); System.out.println("linkText: " + link.text()); } }
Example 11
Source File: GoogleCodeImporter.java From scava with Eclipse Public License 2.0 | 5 votes |
private List<GoogleIssue> getGoogleIssueList(Platform platform, String url) { List<GoogleIssue> result = new ArrayList<GoogleIssue>(); org.jsoup.nodes.Document doc; org.jsoup.nodes.Element content; String URL_PROJECT = url; try { doc = Jsoup.connect(URL_PROJECT).timeout(10000).get(); //Pagination Element pagination = doc.getElementsByClass("pagination").first(); Integer totPagination = Integer.parseInt(pagination.text().split(" ")[4]); Integer numPag = (totPagination % 100) == 0 ? totPagination/100:totPagination/100+1; //End pagination Element e = doc.getElementById("resultstable"); e = e.getElementsByTag("tbody").first(); Elements tableRows = e.getElementsByTag("tr"); for (Element iterable_element : tableRows) { String urlIssue = url.substring(0,url.length()-4) + "detail?id="+ iterable_element.getElementsByTag("td").get(1).getElementsByTag("a").first().text(); GoogleIssue gi = getGoogleIssue(platform, urlIssue); result.add(gi); break; } } catch (IOException e1) { // TODO Auto-generated catch block logger.error("Google code importer error for retirve Issue list" + e1.getMessage()); } return result; }
Example 12
Source File: Test2.java From Doctor with Apache License 2.0 | 5 votes |
/**
 * Fetches a fixed disease page and prints the text of every anchor inside the first
 * element matching {@code .ul-ss-3.jb-xx-zz}; does nothing when no element matches.
 */
@Test
public void test2() {
    Document page = SpiderUtil.getDocument("http://jb39.com/jibing/JiXingSheBianTaoTiYan269516.htm");
    Element container = page.select(".ul-ss-3.jb-xx-zz").first();
    if (container != null) {
        Elements anchors = container.getElementsByTag("a");
        for (Element anchor : anchors) {
            System.out.println(anchor.text());
        }
    }
}
Example 13
Source File: CityParser.java From zuihou-admin-boot with Apache License 2.0 | 5 votes |
private List<Area> parseCounty(String fullName, String url) { String htmlStr = HttpUtil.get(url, CHARSET); Document document = Jsoup.parse(htmlStr); Elements trs = document.getElementsByClass("countytr"); List<Area> counties = new LinkedList<Area>(); int sort = 1; for (Element tr : trs) { Elements links = tr.getElementsByTag("a"); if (links == null || links.size() != 2) { continue; } String href = links.get(0).attr("href"); String countyCode = links.get(0).text(); // String countyCode = links.get(0).text().substring(0, 6); String countyName = links.get(1).text(); Area countyArea = Area.builder().code(countyCode) .label(countyName) .source(url) .fullName(fullName + countyName) .sortValue(sort++) .level(new RemoteData<>("COUNTY")) // .nodes(parseTowntr(fullName + countyName, COMMON_URL + href.subSequence(2, 5).toString() + "/" + href)) .build(); StaticLog.info(" 县级数据: {} ", countyArea); counties.add(countyArea); } return counties; }
Example 14
Source File: HackTool.java From AndroidAnimationExercise with Apache License 2.0 | 5 votes |
public static HtmlBean getInfoFromUrl(String url) { HtmlBean htmlBean = null; try { //获取指定网址的页面内容 Document document = Jsoup.connect(url).timeout(50000).get(); String title = document.getElementsByClass("title").get(0).text(); String username = document.getElementsByClass("name").get(0).getElementsByTag("a").get(0).text(); String userImg = document.getElementsByClass("avatar").get(0).getElementsByTag("img").get(0).attr("src"); String publishTime = document.getElementsByClass("publish-time").text(); String words = document.getElementsByClass("wordage").text(); Elements content = document.getElementsByClass("show-content"); Element element = content.get(0); Elements imgs = element.getElementsByTag("img"); // for (Element ele_img : imgs) { // ele_img.attr("style", "max-width:100%;height:auto;"); // } String contentStr = JIAN_SHU_CSS + content.toString(); htmlBean = new HtmlBean(); htmlBean.setContent(contentStr); htmlBean.setUsername(username); htmlBean.setTitle(title); htmlBean.setUserImg(userImg); htmlBean.setPublishTime(publishTime.split(" ")[0]); htmlBean.setWords(words.split(" ")[1]); } catch (IOException e) { e.printStackTrace(); } return htmlBean; }
Example 15
Source File: Twitter.java From xGetter with Apache License 2.0 | 5 votes |
/**
 * Finds the size label among the {@code td} cells of the given element.
 *
 * <p>Returns the inner HTML of the first cell that does not start with {@code <} and
 * contains the letter {@code x}, with the separator characters matched by the literal
 * below removed.
 *
 * @param input element whose {@code td} descendants are scanned in document order
 * @return the cleaned size text, or {@code null} when no cell qualifies
 */
private static String getSize(Element input) {
    for (Element cell : input.getElementsByTag("td")) {
        String markup = cell.html();
        boolean looksLikeSize = !markup.startsWith("<") && markup.contains("x");
        if (!looksLikeSize) {
            continue;
        }
        return markup.contains(" ") ? markup.replace(" ", "") : markup;
    }
    return null;
}
Example 16
Source File: FresherArticlesParser.java From AndroidWeekly with Apache License 2.0 | 4 votes |
@Override public List<Object> parse(String issue) throws IOException { Document doc = DocumentProvider.get(issue); List<Object> articles = new ArrayList<>(); Elements tables = doc.getElementsByTag("table"); String currentSection = null; for (Element e : tables) { Elements h2 = e.getElementsByTag("h2"); Elements h5 = e.getElementsByTag("h5");// 兼容issue-226 SPONSORED 在 h5 标签里面 if (!h2.isEmpty() || !h5.isEmpty()) { currentSection = h2.size() > 0 ? h2.get(0).text() : h5.get(0).text(); if (!articles.contains(currentSection)) { articles.add(currentSection); } } else { Elements tds = e.getElementsByTag("td"); Element td = tds.get(tds.size() - 2); String imageUrl = null; if (tds.size() == 4) { imageUrl = tds.get(0).getElementsByTag("img").get(0).attr("src"); } String title = td.getElementsByClass("article-headline").get(0).text(); String brief = td.getElementsByTag("p").get(0).text(); String link = td.getElementsByClass("article-headline").get(0).attr("href"); String domain = td.getElementsByTag("span").get(0).text().replace("(", "").replace(")", ""); if (issue == null) { String number = doc.getElementsByClass("issue-header").get(0).getElementsByTag("span").get(0).text(); issue = "/issues/issue-" + number.replace("#", ""); } Article article = new Article(); article.setTitle(title); article.setBrief(brief); article.setLink(link); article.setDomain(domain); article.setIssue(issue); article.setImageUrl(imageUrl); article.setSection(currentSection); articles.add(article); } } return articles; }
Example 17
Source File: ParsePicUrlList.java From HHComicViewer with Apache License 2.0 | 4 votes |
private static int getPicCount(Document doc) { Element pageHtm = doc.getElementById("iPageHtm"); Elements pageLink = pageHtm.getElementsByTag("a"); // 注意,页码是以1开始的,所以最后一页的页码就是总页数 return Integer.valueOf(pageLink.get(pageLink.size() - 1).text()); }
Example 18
Source File: ParseCourse.java From ClassSchedule with Apache License 2.0 | 4 votes |
/** * @param html * @return 解析失败返回空 */ public static ArrayList<CourseV2> parse(String html) { Document doc = org.jsoup.Jsoup.parse(html); Element table1 = doc.getElementById("Table1"); Elements trs = table1.getElementsByTag("tr"); ArrayList<CourseV2> courses = new ArrayList<>(); int node = 0; for (Element tr : trs) { Elements tds = tr.getElementsByTag("td"); for (Element td : tds) { String courseSource = td.text().trim(); if (courseSource.length() <= 6) { //null data continue; } if (Pattern.matches(pattern, courseSource)) { //node number try { node = Integer.decode(courseSource.substring(1, courseSource.length() - 1)); } catch (Exception e) { node = 0; e.printStackTrace(); } continue; } if (inArray(other, courseSource)) { //other data continue; } courses.addAll(ParseCourse.parseTextInfo(courseSource, node)); } } return mergeSameClass(courses); }
Example 19
Source File: SearchParser.java From WordPressHelper with MIT License | 4 votes |
@Override protected Object doInBackground(Object[] params) { try { Document document = Jsoup.connect(URL_WORDPRESS + "/?s=" + searchClear + "&feed=rss2") .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22") .timeout(60000).ignoreContentType(true).get(); Elements elements = document.getElementsByTag("item"); for (Element element : elements) { FeedItem feedItem = new FeedItem(); //get all simple information feedItem.setTitle(element.getElementsByTag("title").first().text()); feedItem.setPubDate(element.getElementsByTag("pubDate").first().text()); feedItem.setCreator(element.getElementsByTag("dc:creator").first().text()); feedItem.setDescription(element.getElementsByTag("description").first().text()); feedItem.setContent(element.getElementsByTag("content:encoded").first().text()); feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text()); feedItem.setComments(element.getElementsByTag("slash:comments").first().text()); feedItem.setLink(element.select("link").first().nextSibling().toString().trim()); feedItem.setGuid(element.getElementsByTag("guid").first().text()); //get first image Document document1 = Jsoup.parse(element.getElementsByTag("content:encoded").first().text()); Elements elements1 = document1.select("img"); feedItem.setImage(elements1.attr("src")); //get all category Elements elements2 = element.getElementsByTag("category"); ArrayList<String> category = new ArrayList<>(); for (int i = 0; i < elements2.size(); i++) { category.add(element.getElementsByTag("category").get(i).text()); } feedItem.setCategory(category); //get id String idPost[] = element.getElementsByTag("guid").first().text().split("p="); if (idPost.length > 1) { feedItem.setId(idPost[1]); //add feeditem to arraylist feedItems.add(feedItem); } } } catch (IOException e) { e.printStackTrace(); } return null; }
Example 20
Source File: AuthorParser.java From WordPressHelper with MIT License | 4 votes |
@Override protected Object doInBackground(Object[] params) { try { Document document = Jsoup.connect(URL_WORDPRESS + "/author/" + authorName + "/feed/") .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22") .timeout(60000).ignoreContentType(true).get(); Elements elements = document.getElementsByTag("item"); for (Element element : elements) { FeedItem feedItem = new FeedItem(); //get all simple information feedItem.setTitle(element.getElementsByTag("title").first().text()); feedItem.setPubDate(element.getElementsByTag("pubDate").first().text()); feedItem.setCreator(element.getElementsByTag("dc:creator").first().text()); feedItem.setDescription(element.getElementsByTag("description").first().text()); feedItem.setContent(element.getElementsByTag("content:encoded").first().text()); feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text()); feedItem.setComments(element.getElementsByTag("slash:comments").first().text()); feedItem.setLink(element.select("link").first().nextSibling().toString().trim()); feedItem.setGuid(element.getElementsByTag("guid").first().text()); //get first image Document document1 = Jsoup.parse(element.getElementsByTag("content:encoded").first().text()); Elements elements1 = document1.select("img"); feedItem.setImage(elements1.attr("src")); //get all category Elements elements2 = element.getElementsByTag("category"); ArrayList<String> category = new ArrayList<>(); for (int i = 0; i < elements2.size(); i++) { category.add(element.getElementsByTag("category").get(i).text()); } feedItem.setCategory(category); String idPost[] = element.getElementsByTag("guid").first().text().split("p="); if (idPost.length > 1) { feedItem.setId(idPost[1]); //add feeditem to arraylist feedItems.add(feedItem); } } } catch (IOException e) { e.printStackTrace(); } return null; }