Java Code Examples for org.jsoup.nodes.Element#getElementsByTag()

The following examples show how to use org.jsoup.nodes.Element#getElementsByTag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CityStats.java    From zuihou-admin-boot with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the city-level rows of a province page and attaches them to the province.
 *
 * <p>Downloads {@code url}, reads every element with class {@code citytr} and builds one
 * {@code Area} per row from the row's first two links: the first link supplies the href of
 * the county page and the code (its first four characters), the second the city name. Each
 * city is then expanded through {@code parseCounty}. Rows without both links, or with a
 * code text shorter than four characters, are skipped so that a single malformed row does
 * not abort the whole crawl.
 *
 * @param url          address of the province page to download; recorded as each city's source
 * @param provinceArea province node that receives the parsed cities as its children
 */
public static void parseCity(String url, Area provinceArea) {
    String htmlStr = HttpUtil.get(url, CHARSET);
    Document document = Jsoup.parse(htmlStr);
    Elements trs = document.getElementsByClass("citytr");
    List<Area> cities = new LinkedList<>();
    int sort = 1;
    for (Element tr : trs) {
        Elements links = tr.getElementsByTag("a");
        if (links.size() < 2) {
            // Not a data row: the code link and the name link are both required.
            continue;
        }
        String href = links.get(0).attr("href");
        String codeText = links.get(0).text();
        if (codeText.length() < 4) {
            // The city code is the four-character prefix; shorter text cannot provide it.
            continue;
        }
        String cityCode = codeText.substring(0, 4);
        String cityName = links.get(1).text();

        Area cityArea = Area.builder().label(cityName).code(cityCode).source(url)
                .sortValue(sort++).level(new RemoteData<>("CITY")).fullName(provinceArea.getFullName() + cityName)
                .build();

        StaticLog.info("	市级数据:  {}  ", cityArea);

        // Descend into the county page before the city is added to the result.
        parseCounty(COMMON_URL + href, cityArea);
        cities.add(cityArea);
    }
    provinceArea.setChildren(cities);
}
 
Example 2
Source File: SpiderService.java    From Doctor with Apache License 2.0 6 votes vote down vote up
/**
 * Fetches one department from the department index page.
 *
 * <p>Loads the page at {@code depar_Url}, takes the first element matching
 * {@code div_class} and treats the {@code i}-th {@code a_tag} element inside it as the
 * department entry. The department's diseases are loaded through {@code queryDiseases}
 * using the link's {@code a_href} attribute.
 *
 * @param i zero-based position of the department link inside the container
 * @return the populated department, or {@code null} when the container is missing or
 *         {@code i} is outside the range of available links
 */
public Department queryAllDepar(int i){
    Document document = getDocument(depar_Url);
    Element element = document.select(div_class).first();
    if (element == null) {
        // first() yields null when nothing matches; report "no department" instead of an NPE.
        return null;
    }
    Elements elements = element.getElementsByTag(a_tag);
    if (i < 0 || i >= elements.size()) {
        return null;
    }
    Element e = elements.get(i);
    logger.info("***** 第"+ i +"科 ********");
    Department department = new Department();
    // The id is set to -1 here; presumably the real id is assigned later — TODO confirm.
    department.setId(-1);
    department.setName(e.text()); // department name
    logger.info("正在爬取科室:" + department.getName());
    department.setDiseases(queryDiseases(e.attr(a_href))); // diseases are fetched separately
    return department;
}
 
Example 3
Source File: NpmTemplateParserTest.java    From flow with Apache License 2.0 6 votes vote down vote up
@Test
public void getTemplateContent_polymer2TemplateStyleInsertion_contentParsedCorrectly() {
    // Every string property lookup on the stubbed configuration answers with this JSON resource.
    String stubbedResource = VAADIN_SERVLET_RESOURCES
            + "config/no-html-template.json";
    Mockito.when(configuration.getStringProperty(Mockito.anyString(), Mockito.anyString()))
            .thenReturn(stubbedResource);

    TemplateData templateData = NpmTemplateParser.getInstance()
            .getTemplateContent(NoHtmlTemplateContent.class, "no-html-template", service);

    Element root = templateData.getTemplateElement();
    Assert.assertNotNull(root);

    // The parsed template is expected to contain exactly one div with the text below.
    Elements divElements = root.getElementsByTag("div");
    Assert.assertEquals(1, divElements.size());
    Assert.assertEquals("No Template", divElements.get(0).text());
}
 
Example 4
Source File: CityStats.java    From zuihou-admin-boot with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the village-level rows of a page and attaches them to the given county.
 *
 * <p>Every element with class {@code villagetr} that holds exactly three {@code td} cells
 * becomes one {@code Area}: the first cell is the code, the third cell the label.
 *
 * @param url        address of the page to download; recorded as each area's source
 * @param countyArea parent node whose full name prefixes each village name and which
 *                   receives the parsed areas as its children
 */
public static void parseVillagetr(String url, Area countyArea) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));

    List<Area> villages = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("villagetr")) {
        Elements cells = row.getElementsByTag("td");
        // Only rows with exactly three cells are treated as data rows.
        if (cells != null && cells.size() == 3) {
            String code = cells.get(0).text();
            String name = cells.get(2).text();

            Area village = Area.builder()
                    .code(code)
                    .label(name)
                    .source(url)
                    .sortValue(order++)
                    .level(new RemoteData<>("VILLAGETR"))
                    .fullName(countyArea.getFullName() + name)
                    .build();
            StaticLog.info("		村级数据:  {}  ", village);

            villages.add(village);
        }
    }
    countyArea.setChildren(villages);
}
 
Example 5
Source File: CityParser.java    From zuihou-admin-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Parses village-level data.
 *
 * <p>Downloads {@code url} and turns every {@code villagetr} row that has exactly three
 * {@code td} cells into an {@code Area}: the first cell supplies the code, the third the
 * label.
 *
 * @param fullName prefix prepended to each village name to form the area's full name
 * @param url      address of the page to download; also recorded as each area's source
 * @return the parsed areas in page order, with {@code sortValue} counting up from 1
 */
public List<Area> parseVillagetr(String fullName, String url) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));

    List<Area> villages = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("villagetr")) {
        Elements cells = row.getElementsByTag("td");
        boolean isDataRow = cells != null && cells.size() == 3;
        if (!isDataRow) {
            continue;
        }
        String code = cells.get(0).text();
        String name = cells.get(2).text();

        Area village = Area.builder()
                .code(code)
                .label(name)
                .fullName(fullName + name)
                .sortValue(order++)
                .source(url)
                .build();
        StaticLog.info("				村级数据:  {}  ", village);

        villages.add(village);
    }
    return villages;
}
 
Example 6
Source File: TianLaiReadUtil.java    From MissZzzReader with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the chapter list from a book's table-of-contents page.
 *
 * <p>Reads the first {@code dl} inside the element with id {@code list} and creates one
 * {@code Chapter} per {@code dd} that contains a link. A link whose title equals the
 * previously accepted title is skipped. Chapter numbers count up from 0 over the accepted
 * chapters only. The link's href is prefixed according to the book's source.
 *
 * @param html markup of the table-of-contents page
 * @param book book whose source decides which URL prefix is applied to chapter links
 * @return the chapters in page order; empty when the page has no {@code list} container
 *         or the container has no {@code dl}
 */
public static ArrayList<Chapter> getChaptersFromHtml(String html,Book book) {
    ArrayList<Chapter> chapters = new ArrayList<>();
    Document doc = Jsoup.parse(html);
    Element divList = doc.getElementById("list");
    if (divList == null) {
        // Unexpected page layout: report "no chapters" instead of failing with an NPE.
        return chapters;
    }
    Element dl = divList.getElementsByTag("dl").first();
    if (dl == null) {
        return chapters;
    }

    String lastTitle = null;
    int i = 0;
    for (Element dd : dl.getElementsByTag("dd")) {
        Element a = dd.getElementsByTag("a").first();
        if (a == null) {
            continue;
        }
        String title = a.html();
        if (!StringHelper.isEmpty(lastTitle) && title.equals(lastTitle)) {
            // Same title as the previously accepted chapter: treat it as a repeat.
            continue;
        }
        Chapter chapter = new Chapter();
        chapter.setNumber(i++);
        chapter.setTitle(title);
        String url = a.attr("href");
        if (StringHelper.isEmpty(book.getSource()) || BookSource.tianlai.toString().equals(book.getSource())) {
            url = URLCONST.nameSpace_tianlai + url;
        } else if (BookSource.biquge.toString().equals(book.getSource())) {
            url = book.getChapterUrl() + url;
        }
        chapter.setUrl(url);
        chapters.add(chapter);
        lastTitle = title;
    }

    return chapters;
}
 
Example 7
Source File: RepositoriesPresenter.java    From OpenHub with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Builds a {@code Repository} from one repository entry of a scraped collections page.
 *
 * <p>The owner and repository name come from the href of the {@code div > h1 > a} link.
 * The description is the concatenated text nodes of the second-to-last {@code div}. The
 * star and fork counts are read from the first two links of the last {@code div}. The
 * language is read from the second {@code span > span > span} match and stays empty when
 * fewer than two such spans exist.
 *
 * @param element root element of a single repository entry
 * @return the populated repository
 * @throws Exception when the markup does not have the expected structure, for example
 *                   missing links or counts that are not numeric
 */
private Repository parseCollectionsRepositoryData(Element element) throws Exception{
    String fullName = element.select("div > h1 > a").attr("href");
    fullName = fullName.substring(1); // drop the first character of the href
    int slash = fullName.lastIndexOf("/");
    String owner = fullName.substring(0, slash);
    String repoName = fullName.substring(slash + 1);
//        String ownerAvatar = element.select("div > div > a > img").attr("src");
    String ownerAvatar = "";

    Elements articleElements = element.getElementsByTag("div");
    Element descElement = articleElements.get(articleElements.size() - 2);
    StringBuilder desc = new StringBuilder();
    for (TextNode textNode : descElement.textNodes()) {
        desc.append(textNode.getWholeText());
    }

    Element numElement = articleElements.last();
    Elements countLinks = numElement.select("a");
    String starNumStr = countLinks.get(0).textNodes().get(1).toString();
    String forkNumStr = countLinks.get(1).textNodes().get(1).toString();
    String language = "";
    Elements languageElements = numElement.select("span > span > span");
    // Index 1 is read below, so at least two matches are required (the old check was > 0).
    if (languageElements.size() > 1) {
        language = languageElements.get(1).textNodes().get(0).toString();
    }

    Repository repo = new Repository();
    repo.setFullName(fullName);
    repo.setName(repoName);
    User user = new User();
    user.setLogin(owner);
    user.setAvatarUrl(ownerAvatar);
    repo.setOwner(user);

    repo.setDescription(desc.toString());
    // The counts may contain spaces; remove them before parsing.
    repo.setStargazersCount(Integer.parseInt(starNumStr.replace(" ", "")));
    repo.setForksCount(Integer.parseInt(forkNumStr.replace(" ", "")));
    repo.setLanguage(language);

    return repo;
}
 
Example 8
Source File: SparkUtil.java    From yanagishima with Apache License 2.0 5 votes vote down vote up
/**
 * Scrapes the SQL statistics table of a Spark JDBC application.
 *
 * <p>Requests {@code <resourceManagerUrl>/proxy/<sparkJdbcApplicationId>/sqlserver} and reads
 * the rows of the last {@code tbody} on the page. Cells are read by position in this order:
 * User, JobID, GroupID, Start Time, Finish Time, Duration, Statement, State, Detail.
 *
 * @param resourceManagerUrl     base URL of the resource manager
 * @param sparkJdbcApplicationId application id inserted into the proxy path
 * @return one entry per table row; empty when the page has no {@code tbody}
 * @throws RuntimeException wrapping the {@link IOException} when the page cannot be fetched
 */
public static List<SparkSqlJob> getSparkSqlJobFromSqlserver(String resourceManagerUrl, String sparkJdbcApplicationId) {
    List<SparkSqlJob> jobs = new ArrayList<>();
    Document page;
    try {
        page = Jsoup.connect(resourceManagerUrl + "/proxy/" + sparkJdbcApplicationId + "/sqlserver").get();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    Element statisticsBody = page.getElementsByTag("tbody").last();
    if (statisticsBody != null) {
        for (Element row : statisticsBody.getElementsByTag("tr")) {
            SparkSqlJob job = new SparkSqlJob();
            Elements cells = row.getElementsByTag("td");
            job.setUser(cells.get(0).text());

            Element jobIdCell = cells.get(1);
            List<Integer> jobIds = new ArrayList<>();
            if (jobIdCell.childNodeSize() > 1) {
                for (Element link : jobIdCell.getElementsByTag("a")) {
                    String label = link.text();
                    // The first and last characters of the link text are dropped before parsing
                    // (presumably surrounding brackets — TODO confirm).
                    String digits = label.substring(1, label.length() - 1);
                    jobIds.add(Integer.parseInt(digits));
                }
            }
            job.setJobIds(jobIds);

            job.setGroupId(cells.get(2).text());
            job.setStartTime(cells.get(3).text());
            job.setFinishTime(cells.get(4).text());
            job.setDuration(cells.get(5).text());
            job.setStatement(cells.get(6).text());
            job.setState(cells.get(7).text());
            job.setDetail(cells.get(8).text());
            jobs.add(job);
        }
    }
    return jobs;
}
 
Example 9
Source File: CityParser.java    From zuihou-admin-cloud with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the village-level entries from a page.
 *
 * <p>Each element with class {@code villagetr} is inspected. Rows holding exactly three
 * {@code td} cells produce an {@code Area} whose code is the text of the first cell and
 * whose label is the text of the third cell. All other rows are ignored.
 *
 * @param fullName text placed in front of each village name to build the area's full name
 * @param url      page address; it is downloaded and stored as the source of every area
 * @return the areas found, in document order, with {@code sortValue} starting at 1
 */
public List<Area> parseVillagetr(String fullName, String url) {
    String markup = HttpUtil.get(url, CHARSET);
    Elements rows = Jsoup.parse(markup).getElementsByClass("villagetr");

    List<Area> result = new LinkedList<Area>();
    int position = 1;
    for (Element row : rows) {
        Elements cells = row.getElementsByTag("td");
        if (cells != null && cells.size() == 3) {
            String villageCode = cells.get(0).text();
            String villageName = cells.get(2).text();

            Area area = Area.builder()
                    .code(villageCode)
                    .label(villageName)
                    .fullName(fullName + villageName)
                    .sortValue(position++)
                    .source(url)
                    .build();
            StaticLog.info("				村级数据:  {}  ", area);

            result.add(area);
        }
    }
    return result;
}
 
Example 10
Source File: JsoupTest.java    From java-tutorial with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
/**
 * Walks a document using the DOM-style accessor methods.
 */
@Test
public void test01() {
	// Visit every link below the body of the document and print its href and text.
	Elements anchors = docFromFile.body().getElementsByTag("a");
	for (int index = 0; index < anchors.size(); index++) {
		Element anchor = anchors.get(index);
		System.out.println("linkHref: " + anchor.attr("href"));
		System.out.println("linkText: " + anchor.text());
	}
}
 
Example 11
Source File: GoogleCodeImporter.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Loads issues listed on a Google Code issue-list page.
 *
 * <p>Fetches {@code url} with a 10 second timeout, then reads the rows of the first
 * {@code tbody} inside the element with id {@code resultstable}. The issue id is the text
 * of the first link in a row's second cell. Only the first usable row is imported (the
 * loop stops after one issue) and pagination is not followed.
 *
 * @param platform platform handle passed through to {@code getGoogleIssue}
 * @param url      address of the issue list; its last four characters are replaced by
 *                 {@code detail?id=<id>} to build the detail URL (presumably a trailing
 *                 "list" — TODO confirm)
 * @return the imported issues; empty when the page cannot be fetched or has no usable row
 */
private List<GoogleIssue> getGoogleIssueList(Platform platform, String url) {
	List<GoogleIssue> result = new ArrayList<GoogleIssue>();
	try
	{
		org.jsoup.nodes.Document doc = Jsoup.connect(url).timeout(10000).get();

		Element table = doc.getElementById("resultstable");
		Element body = (table == null) ? null : table.getElementsByTag("tbody").first();
		if (body == null) {
			// No results table on the page: nothing to import.
			return result;
		}

		for (Element row : body.getElementsByTag("tr"))
		{
			Elements cells = row.getElementsByTag("td");
			if (cells.size() < 2) {
				continue;
			}
			Element idLink = cells.get(1).getElementsByTag("a").first();
			if (idLink == null) {
				continue;
			}
			String urlIssue = url.substring(0, url.length() - 4) + "detail?id=" + idLink.text();
			result.add(getGoogleIssue(platform, urlIssue));

			// Kept from the original: stop after the first imported issue.
			break;
		}
	} catch (IOException e1) {
		// Pass the exception itself so the stack trace is not lost.
		logger.error("Google code importer error for retirve Issue list" + e1.getMessage(), e1);
	}
	return result;
}
 
Example 12
Source File: Test2.java    From Doctor with Apache License 2.0 5 votes vote down vote up
@Test
public void test2(){
    Document page = SpiderUtil.getDocument("http://jb39.com/jibing/JiXingSheBianTaoTiYan269516.htm");
    Element container = page.select(".ul-ss-3.jb-xx-zz").first();
    // Nothing is printed when the page has no matching container.
    if (container != null) {
        for (Element link : container.getElementsByTag("a")) {
            System.out.println(link.text());
        }
    }
}
 
Example 13
Source File: CityParser.java    From zuihou-admin-boot with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the county-level rows of a page.
 *
 * <p>Each element with class {@code countytr} that contains exactly two links becomes an
 * {@code Area}: the first link's text is the code, the second link's text is the label.
 *
 * @param fullName prefix prepended to each county name to form the area's full name
 * @param url      address of the page to download; also recorded as each area's source
 * @return the parsed areas in page order, with {@code sortValue} counting up from 1
 */
private List<Area> parseCounty(String fullName, String url) {
    Document page = Jsoup.parse(HttpUtil.get(url, CHARSET));

    List<Area> result = new LinkedList<Area>();
    int order = 1;
    for (Element row : page.getElementsByClass("countytr")) {
        Elements anchors = row.getElementsByTag("a");
        if (anchors != null && anchors.size() == 2) {
            Element codeLink = anchors.get(0);
            // Read but currently unused; only the disabled town-level parse referenced it.
            String href = codeLink.attr("href");
            String code = codeLink.text();
            String name = anchors.get(1).text();

            Area county = Area.builder()
                    .code(code)
                    .label(name)
                    .source(url)
                    .fullName(fullName + name)
                    .sortValue(order++)
                    .level(new RemoteData<>("COUNTY"))
                    .build();
            StaticLog.info("		县级数据:  {}  ", county);

            result.add(county);
        }
    }
    return result;
}
 
Example 14
Source File: HackTool.java    From AndroidAnimationExercise with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads an article page and extracts its title, author, avatar, publish time, word
 * count and content markup.
 *
 * <p>The page is fetched with a 50 second timeout. The content is the markup of all
 * {@code show-content} elements prefixed with {@code JIAN_SHU_CSS}. The publish time is
 * the first space-separated token of the {@code publish-time} text. The word count is the
 * second space-separated token of the {@code wordage} text, or the whole text when it has
 * no second token.
 *
 * @param url address of the article page
 * @return the extracted data, or {@code null} when the page cannot be fetched or lacks the
 *         title, author link, avatar image or content elements
 */
public static HtmlBean getInfoFromUrl(String url) {
    HtmlBean htmlBean = null;

    try {
        // Fetch the content of the given page.
        Document document = Jsoup.connect(url).timeout(50000).get();

        Element titleElement = document.getElementsByClass("title").first();
        Element nameElement = document.getElementsByClass("name").first();
        Element avatarElement = document.getElementsByClass("avatar").first();
        Elements content = document.getElementsByClass("show-content");
        if (titleElement == null || nameElement == null || avatarElement == null || content.isEmpty()) {
            // Unexpected layout: report "no data", the same as for a failed download.
            return null;
        }
        Element nameLink = nameElement.getElementsByTag("a").first();
        Element avatarImage = avatarElement.getElementsByTag("img").first();
        if (nameLink == null || avatarImage == null) {
            return null;
        }

        String publishTime = document.getElementsByClass("publish-time").text();
        String words = document.getElementsByClass("wordage").text();
        // Indexing [1] directly would throw when the text holds a single token.
        String[] wordTokens = words.split(" ");
        String wordCount = wordTokens.length > 1 ? wordTokens[1] : words;

        htmlBean = new HtmlBean();
        htmlBean.setContent(JIAN_SHU_CSS + content.toString());
        htmlBean.setUsername(nameLink.text());
        htmlBean.setTitle(titleElement.text());
        htmlBean.setUserImg(avatarImage.attr("src"));
        htmlBean.setPublishTime(publishTime.split(" ")[0]);
        htmlBean.setWords(wordCount);
    } catch (IOException e) {
        e.printStackTrace();
    }

    return htmlBean;
}
 
Example 15
Source File: Twitter.java    From xGetter with Apache License 2.0 5 votes vote down vote up
/**
 * Finds the first {@code td} under {@code input} whose inner HTML does not start with
 * {@code <} and contains the letter {@code x}, and returns that HTML with spaces removed.
 *
 * @param input element whose {@code td} descendants are searched
 * @return the matching cell content without spaces, or {@code null} when no cell matches
 */
private static String getSize(Element input){
    for (Element cell : input.getElementsByTag("td")) {
        String markup = cell.html();
        boolean startsWithTag = markup.startsWith("<");
        if (!startsWithTag && markup.contains("x")) {
            return markup.replace(" ", "");
        }
    }
    return null;
}
 
Example 16
Source File: FresherArticlesParser.java    From AndroidWeekly with Apache License 2.0 4 votes vote down vote up
/**
 * Parses an issue page into a flat list of section titles and articles.
 *
 * <p>Every {@code table} in the document is visited in order. A table containing an
 * {@code h2} or {@code h5} starts a new section: its title (the {@code h2} text when
 * present, otherwise the {@code h5} text) is added to the result unless already there.
 * Any other table is read as an article whose details sit in its second-to-last
 * {@code td}. A table with exactly four cells also provides an image in its first cell.
 *
 * @param issue issue identifier passed to {@code DocumentProvider}; when {@code null} it
 *              is derived from the page's {@code issue-header} for the articles
 * @return section titles ({@code String}) interleaved with {@code Article} objects
 * @throws IOException declared by the interface; presumably raised while obtaining the
 *                     document — TODO confirm
 */
@Override
public List<Object> parse(String issue) throws IOException {
    Document page = DocumentProvider.get(issue);
    List<Object> result = new ArrayList<>();
    String section = null;
    for (Element table : page.getElementsByTag("table")) {
        Elements h2 = table.getElementsByTag("h2");
        // Some issues (e.g. issue-226) put the SPONSORED heading in an h5 tag.
        Elements h5 = table.getElementsByTag("h5");

        boolean isSectionHeader = !(h2.isEmpty() && h5.isEmpty());
        if (isSectionHeader) {
            Elements heading = h2.size() > 0 ? h2 : h5;
            section = heading.get(0).text();
            if (!result.contains(section)) {
                result.add(section);
            }
            continue;
        }

        Elements cells = table.getElementsByTag("td");
        Element detailCell = cells.get(cells.size() - 2);
        String imageUrl = null;
        if (cells.size() == 4) {
            imageUrl = cells.get(0).getElementsByTag("img").get(0).attr("src");
        }
        String title = detailCell.getElementsByClass("article-headline").get(0).text();
        String brief = detailCell.getElementsByTag("p").get(0).text();
        String link = detailCell.getElementsByClass("article-headline").get(0).attr("href");
        String domain = detailCell.getElementsByTag("span").get(0).text()
                .replace("(", "")
                .replace(")", "");
        if (issue == null) {
            // Derive the issue path from the header once; later articles reuse it.
            String number = page.getElementsByClass("issue-header").get(0)
                    .getElementsByTag("span").get(0).text();
            issue = "/issues/issue-" + number.replace("#", "");
        }

        Article article = new Article();
        article.setTitle(title);
        article.setBrief(brief);
        article.setLink(link);
        article.setDomain(domain);
        article.setIssue(issue);
        article.setImageUrl(imageUrl);
        article.setSection(section);
        result.add(article);
    }
    return result;
}
 
Example 17
Source File: ParsePicUrlList.java    From HHComicViewer with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the number of pictures from the pager element with id {@code iPageHtm}.
 *
 * @param doc parsed page containing the pager
 * @return the numeric text of the last link inside the pager
 */
private static int getPicCount(Document doc) {
    Elements pagerLinks = doc.getElementById("iPageHtm").getElementsByTag("a");
    // Page numbers start at 1, so the label of the last page link equals the page total.
    int lastIndex = pagerLinks.size() - 1;
    String lastPageLabel = pagerLinks.get(lastIndex).text();
    return Integer.parseInt(lastPageLabel);
}
 
Example 18
Source File: ParseCourse.java    From ClassSchedule with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a timetable page into courses.
 *
 * <p>Every {@code td} of every {@code tr} inside the element with id {@code Table1} is
 * examined. Cells whose trimmed text has six characters or fewer are ignored. A cell
 * matching {@code pattern} updates the current node number. A cell listed in {@code other}
 * is ignored. Every remaining cell is handed to {@code parseTextInfo} together with the
 * current node number.
 *
 * @param html markup of the timetable page
 * @return the result of {@code mergeSameClass} over all parsed courses; the original note
 *         says an empty result is returned when parsing fails. NOTE(review): no guard is
 *         visible here for a page without a {@code Table1} element — confirm how that
 *         case is meant to be handled.
 */
public static ArrayList<CourseV2> parse(String html) {
    Document page = org.jsoup.Jsoup.parse(html);
    Elements rows = page.getElementById("Table1").getElementsByTag("tr");

    ArrayList<CourseV2> parsed = new ArrayList<>();
    int node = 0;

    for (Element row : rows) {
        for (Element cell : row.getElementsByTag("td")) {
            String cellText = cell.text().trim();

            // Six characters or fewer: treated as an empty cell.
            if (cellText.length() <= 6) {
                continue;
            }

            // A node-number cell: the number sits between the first and last character.
            if (Pattern.matches(pattern, cellText)) {
                try {
                    node = Integer.decode(cellText.substring(1, cellText.length() - 1));
                } catch (Exception e) {
                    node = 0;
                    e.printStackTrace();
                }
                continue;
            }

            // Anything not listed in "other" is course text.
            if (!inArray(other, cellText)) {
                parsed.addAll(ParseCourse.parseTextInfo(cellText, node));
            }
        }
    }

    return mergeSameClass(parsed);
}
 
Example 19
Source File: SearchParser.java    From WordPressHelper with MIT License 4 votes vote down vote up
/**
 * Downloads the RSS feed of a WordPress search and turns each {@code item} into a
 * {@code FeedItem}.
 *
 * <p>The feed is requested from {@code URL_WORDPRESS + "/?s=" + searchClear + "&feed=rss2"}.
 * An item is appended to {@code feedItems} only when its guid contains {@code p=}; the
 * text after that marker becomes the item's id.
 * NOTE(review): every tag lookup assumes the tag exists in the item — confirm that the
 * feed always provides them.
 *
 * @param params not read by this method
 * @return always {@code null}; results are delivered through {@code feedItems}
 */
@Override
protected Object doInBackground(Object[] params) {
    try {
        Document document = Jsoup.connect(URL_WORDPRESS + "/?s=" + searchClear + "&feed=rss2")
                .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22")
                .timeout(60000).ignoreContentType(true).get();
        Elements elements = document.getElementsByTag("item");
        for (Element element : elements) {
            FeedItem feedItem = new FeedItem();

            // Simple text fields. The encoded content and guid are looked up once and reused.
            feedItem.setTitle(element.getElementsByTag("title").first().text());
            feedItem.setPubDate(element.getElementsByTag("pubDate").first().text());
            feedItem.setCreator(element.getElementsByTag("dc:creator").first().text());
            feedItem.setDescription(element.getElementsByTag("description").first().text());
            String encodedContent = element.getElementsByTag("content:encoded").first().text();
            feedItem.setContent(encodedContent);
            feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text());
            feedItem.setComments(element.getElementsByTag("slash:comments").first().text());
            // The link value is read from the node following the <link> element.
            feedItem.setLink(element.select("link").first().nextSibling().toString().trim());
            String guid = element.getElementsByTag("guid").first().text();
            feedItem.setGuid(guid);

            // First image inside the encoded content.
            Elements images = Jsoup.parse(encodedContent).select("img");
            feedItem.setImage(images.attr("src"));

            // All categories, queried once instead of once per category.
            ArrayList<String> category = new ArrayList<>();
            for (Element categoryElement : element.getElementsByTag("category")) {
                category.add(categoryElement.text());
            }
            feedItem.setCategory(category);

            // The post id is whatever follows "p=" in the guid.
            String[] idPost = guid.split("p=");
            if (idPost.length > 1) {
                feedItem.setId(idPost[1]);
                feedItems.add(feedItem);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
 
Example 20
Source File: AuthorParser.java    From WordPressHelper with MIT License 4 votes vote down vote up
/**
 * Downloads the RSS feed of a WordPress author and turns each {@code item} into a
 * {@code FeedItem}.
 *
 * <p>The feed is requested from
 * {@code URL_WORDPRESS + "/author/" + authorName + "/feed/"}. An item is appended to
 * {@code feedItems} only when its guid contains {@code p=}; the text after that marker
 * becomes the item's id.
 * NOTE(review): every tag lookup assumes the tag exists in the item — confirm that the
 * feed always provides them.
 *
 * @param params not read by this method
 * @return always {@code null}; results are delivered through {@code feedItems}
 */
@Override
protected Object doInBackground(Object[] params) {
    try {
        Document document = Jsoup.connect(URL_WORDPRESS + "/author/" + authorName + "/feed/")
                .userAgent("Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22")
                .timeout(60000).ignoreContentType(true).get();
        Elements elements = document.getElementsByTag("item");
        for (Element element : elements) {
            FeedItem feedItem = new FeedItem();

            // Simple text fields. The encoded content and guid are looked up once and reused.
            feedItem.setTitle(element.getElementsByTag("title").first().text());
            feedItem.setPubDate(element.getElementsByTag("pubDate").first().text());
            feedItem.setCreator(element.getElementsByTag("dc:creator").first().text());
            feedItem.setDescription(element.getElementsByTag("description").first().text());
            String encodedContent = element.getElementsByTag("content:encoded").first().text();
            feedItem.setContent(encodedContent);
            feedItem.setCommentRss(element.getElementsByTag("wfw:commentRss").first().text());
            feedItem.setComments(element.getElementsByTag("slash:comments").first().text());
            // The link value is read from the node following the <link> element.
            feedItem.setLink(element.select("link").first().nextSibling().toString().trim());
            String guid = element.getElementsByTag("guid").first().text();
            feedItem.setGuid(guid);

            // First image inside the encoded content.
            Elements images = Jsoup.parse(encodedContent).select("img");
            feedItem.setImage(images.attr("src"));

            // All categories, queried once instead of once per category.
            ArrayList<String> category = new ArrayList<>();
            for (Element categoryElement : element.getElementsByTag("category")) {
                category.add(categoryElement.text());
            }
            feedItem.setCategory(category);

            // The post id is whatever follows "p=" in the guid.
            String[] idPost = guid.split("p=");
            if (idPost.length > 1) {
                feedItem.setId(idPost[1]);
                feedItems.add(feedItem);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}