Java Code Examples for org.jsoup.nodes.Element#text()

The following examples show how to use org.jsoup.nodes.Element#text(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ITClassFileFormatVersion.java    From japicmp with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the generated HTML report contains a {@code .class_fileFormatVersion}
 * section whose table cells only hold the values {@code MODIFIED}, {@code 50.0} or {@code 52.0}.
 * The test returns without asserting anything when the report file does not exist.
 *
 * @throws IOException if the report file cannot be read
 */
@Test
public void testClassFileFormatVersionIsPresent() throws IOException {
	Path reportFile = Paths.get(System.getProperty("user.dir"), "target", "japicmp", "class-file-format-version.html");
	boolean reportMissing = !Files.exists(reportFile);
	if (reportMissing) {
		// in JDK 1.7 case
		return;
	}

	Document report = Jsoup.parse(reportFile.toFile(), Charset.forName("UTF-8").toString());

	Elements versionSections = report.select(".class_fileFormatVersion");
	assertThat(versionSections.isEmpty(), is(false));

	Elements cells = versionSections.select("table > tbody > tr > td");
	assertThat(cells.isEmpty(), is(false));

	for (Element cell : cells) {
		String cellText = cell.text();
		boolean expectedValue = "MODIFIED".equals(cellText) || "50.0".equals(cellText) || "52.0".equals(cellText);
		if (!expectedValue) {
			Assert.fail("text of HTML element does not equal 'MODIFIED' or 50.0 or 52.0: " + cellText);
		}
	}
}
 
Example 2
Source File: IPUtils.java    From superword with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up an IP address on ip138.com and collects the values found in the result list.
 *
 * Each non-blank {@code ul li} entry of the response page is split on ':'; when the split
 * yields exactly two parts, the second part is added to the result.
 * Any exception is logged and whatever was collected so far is returned.
 *
 * @param ip the IP address appended to the lookup URL
 * @return the collected values, possibly empty, never null
 */
public static List<String> getIPLocation(String ip){
    List<String> result = new ArrayList<>();
    try {
        URL lookupUrl = new URL("http://ip138.com/ips138.asp?ip=" + ip);
        Elements items = Jsoup.parse(lookupUrl, 60000).select("ul li");
        for (Element item : items) {
            String line = item.text();
            if (!StringUtils.isNotBlank(line)) {
                continue;
            }
            // String.split never returns null, so only the length needs checking.
            String[] parts = line.split(":");
            if (parts.length == 2) {
                result.add(parts[1]);
            }
        }
    } catch (Exception e) {
        LOG.error("获取IP地址的地理位置", e);
    }
    return result;
}
 
Example 3
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Builds the category a book belongs to from the corresponding HTML node.
 * The node's text becomes the category name and everything after the first
 * {@code '='} in its {@code href} attribute becomes the category id.
 *
 * @param bookInfo   HTML node carrying the category information
 * @param isTerminal whether the category is a terminal category
 * @return the category ({@code TerminalBookClass} when {@code isTerminal} is true),
 *         or {@code null} when the href is null or contains no {@code '='}
 */
private BookClass getBookCata(Element bookInfo, boolean isTerminal) {
    String name = bookInfo.text();
    String link = bookInfo.attr("href");
    if (link == null) {
        return null;
    }

    int equalsAt = link.indexOf('=');
    if (equalsAt == -1) {
        return null;
    }

    String id = link.substring(equalsAt + 1);
    BookClass result;
    if (isTerminal) {
        result = new TerminalBookClass(id);
    } else {
        result = new BookClass(id);
    }
    result.setName(name);
    return result;
}
 
Example 4
Source File: JsoupEx.java    From FairEmail with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Parses HTML into a jsoup document after removing all NUL characters from the input.
 * If parsing runs out of memory, the error is logged and a shell document is returned
 * whose body contains the formatted error inside a {@code <strong>} element.
 */
static Document parse(String html) {
        // Stack trace recorded by the original author; presumably the failure that
        // removing NUL characters is meant to avoid:
        //   org.jsoup.UncheckedIOException: java.io.IOException: Input is binary and unsupported
        //           at org.jsoup.parser.CharacterReader.<init>(SourceFile:38)
        //           at org.jsoup.parser.CharacterReader.<init>(SourceFile:43)
        //           at org.jsoup.parser.TreeBuilder.initialiseParse(SourceFile:38)
        //           at org.jsoup.parser.HtmlTreeBuilder.initialiseParse(SourceFile:65)
        //           at org.jsoup.parser.TreeBuilder.parse(SourceFile:46)
        //           at org.jsoup.parser.Parser.parse(SourceFile:107)
        //           at org.jsoup.Jsoup.parse(SourceFile:58)
        try {
            String withoutNul = html.replace("\0", "");
            return Jsoup.parse(withoutNul);
        } catch (OutOfMemoryError ex) {
            Log.e(ex);
            Document fallback = Document.createShell("");
            Element message = fallback.createElement("strong");
            message.text(Log.formatThrowable(ex));
            fallback.body().appendChild(message);
            return fallback;
        }
    }
 
Example 5
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Derives a book category from its HTML node: the node text is used as the name,
 * the part of the {@code href} attribute after the first {@code '='} as the id.
 *
 * @param bookInfo   HTML node describing the category
 * @param isTerminal whether a terminal category should be created
 * @return a {@code TerminalBookClass} when {@code isTerminal} is true, otherwise a plain
 *         {@code BookClass}; {@code null} when the href is null or has no {@code '='}
 */
private BookClass getBookCata(Element bookInfo, boolean isTerminal) {
    final String label = bookInfo.text();
    final String target = bookInfo.attr("href");

    // 0 means "no usable id": either no href at all or no '=' inside it.
    final int idOffset = (target == null) ? 0 : target.indexOf('=') + 1;
    if (idOffset == 0) {
        return null;
    }

    final String categoryId = target.substring(idOffset);
    final BookClass category = isTerminal ? new TerminalBookClass(categoryId) : new BookClass(categoryId);
    category.setName(label);
    return category;
}
 
Example 6
Source File: AppsGamesCatalogApi.java    From 4pdaClient-plus with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads the application catalog page and appends the categories found in it to {@code res}.
 *
 * For every {@code ol[type=1]>li} item that contains at least one link, three kinds of
 * entries are appended in this order: the category itself (child of {@code catalog}),
 * a second category-level entry titled "&lt;title&gt; @ темы" (child of the category), and one
 * subcategory-level entry per {@code ul>li>a} link (children of the category).
 *
 * @param client  HTTP client used to fetch the catalog page
 * @param catalog parent assigned to every top-level category
 * @param res     list receiving the created entries
 * @throws IOException if the post body holding the catalog cannot be located in the page
 */
private static void loadCatalog(IHttpClient client, AppGameCatalog catalog, ArrayList<AppGameCatalog> res) throws IOException {
    String html = client.performGet(APPS_CATALOG_URL).getResponseBody();

    Pattern postBodyPattern = Pattern.compile("<div class=\"[^\"]*post_body[^\"]*\"[^>]*?>([\\s\\S]*?)<a name=\"entry\\d+\"></a>",
            Pattern.CASE_INSENSITIVE);
    Matcher postBody = postBodyPattern.matcher(html);
    boolean postFound = postBody.find();
    if (!postFound) {
        throw new IOException("Не найден пост с содержанием каталога приложений");
    }

    Document content = Jsoup.parse(postBody.group(1));

    for (Element item : content.select("ol[type=1]>li")) {
        Elements links = item.select("a");
        if (links.isEmpty()) {
            continue;
        }

        // The first link of the item describes the category itself.
        Element categoryLink = links.get(0);
        Uri categoryUri = Uri.parse(categoryLink.attr("href"));
        String categoryTitle = categoryLink.text();
        AppGameCatalog category = new AppGameCatalog(categoryUri.getQueryParameter("p"), categoryTitle)
                .setLevel(AppGameCatalog.LEVEL_CATEGORY);
        category.setParent(catalog);
        res.add(category);

        // Extra category-level entry that reuses the category id, nested under the category.
        AppGameCatalog topicsEntry = new AppGameCatalog(category.getId().toString(), category.getTitle() + " @ темы")
                .setLevel(AppGameCatalog.LEVEL_CATEGORY);
        topicsEntry.setParent(category);
        res.add(topicsEntry);

        for (Element childLink : item.select("ul>li>a")) {
            Uri childUri = Uri.parse(childLink.attr("href"));
            String childTitle = childLink.text();
            AppGameCatalog child = new AppGameCatalog(childUri.getQueryParameter("anchor"), childTitle)
                    .setLevel(AppGameCatalog.LEVEL_SUBCATEGORY);
            child.setParent(category);
            res.add(child);
        }
    }
}
 
Example 7
Source File: YZWBPaperCollector.java    From search with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the files of the paper published on the given date.
 *
 * The entry page URL is built from {@code url}, the formatted date and {@code start}.
 * Every element matched by {@code typeCssQuery} whose href ends with ".htm" is treated as
 * a sub-paper: its page is scanned with {@code pdfCssQuery} and the resulting links are downloaded.
 * An {@link IOException} is logged and the files gathered so far are returned.
 *
 * @param date publication date of the paper
 * @return the downloaded files, possibly empty
 */
@Override
public List<File> collect(Date date) {
    List<File> downloaded = new ArrayList<>();
    try {
        LOG.debug("url: "+url);
        String paper = url + sf.format(date) + start;
        LOG.debug("paper: "+paper);
        Document frontPage = Jsoup.connect(paper).get();

        LOG.debug("typeCssQuery: " + typeCssQuery);
        Elements typeLinks = frontPage.select(typeCssQuery);
        int subPaperNo = 1;
        for (Element typeLink : typeLinks) {
            LOG.debug("处理子报" + subPaperNo);
            subPaperNo++;

            String href = typeLink.attr("href");
            LOG.debug("type href:"+href);
            if (href == null || !href.endsWith(".htm")) {
                continue;
            }

            String type = typeLink.text();
            LOG.debug("type:"+type);
            String relative = href.replace("./", "");
            String subPaperUrl = url + sf.format(date) + relative;
            LOG.debug("type href:"+subPaperUrl);
            // All sub-papers share the same pdfCssQuery.
            List<String> pdfLinks = collect(subPaperUrl, pdfCssQuery);
            downloaded.addAll(downloadPaper(pdfLinks));
        }
    } catch (IOException ex) {
        LOG.error("采集出错",ex);
    }
    return downloaded;
}
 
Example 8
Source File: CityParser.java    From zuihou-admin-cloud with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the province list page at {@code url}.
 *
 * Every href-carrying element inside an element of class {@code provincetr} yields one
 * province: its text is the name, the first two characters of the href plus "0000" form the
 * code, and its children are loaded through {@code parseCity}.
 *
 * @param url address of the province list page
 * @return the provinces in document order
 */
private List<Area> parseProvince(String url) {

        String html = HttpUtil.get(url, CHARSET);
        Document page = Jsoup.parse(html);

        // Elements with class='provincetr'
        Elements provinceRows = page.getElementsByClass("provincetr");
        List<Area> result = new LinkedList<Area>();
        int order = 1;
        for (Element provinceRow : provinceRows) {
            // Descendants of the row that carry an href attribute
            for (Element anchor : provinceRow.getElementsByAttribute("href")) {
                String name = anchor.text();
                String href = anchor.attr("href");
                String codePrefix = href.substring(0, 2);

                Area province = Area.builder()
                        .code(codePrefix + "0000")
                        .label(name)
                        .source(url)
                        .sortValue(order++)
                        .level(new RemoteData<>("PROVINCE"))
                        .fullName(name)
                        .build();
                province.setChildren(parseCity(name, COMMON_URL + href));

                StaticLog.info("省级数据:  {}  ", province);

                result.add(province);
            }
        }
        return result;
    }
 
Example 9
Source File: SparkUtil.java    From yanagishima with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the SQL statistics table from the "sqlserver" page of a Spark application
 * reached through the resource manager proxy.
 *
 * The last {@code tbody} of the page is used; when there is none, an empty list is returned.
 * Each row is read positionally from its {@code td} cells.
 *
 * @param resourceManagerUrl     base URL of the resource manager
 * @param sparkJdbcApplicationId application id inserted into the proxy path
 * @return one entry per table row
 * @throws RuntimeException wrapping the {@link IOException} when the page cannot be fetched
 */
public static List<SparkSqlJob> getSparkSqlJobFromSqlserver(String resourceManagerUrl, String sparkJdbcApplicationId) {
    try {
        List<SparkSqlJob> jobs = new ArrayList<>();
        String statisticsUrl = resourceManagerUrl + "/proxy/" + sparkJdbcApplicationId + "/sqlserver";
        Document page = Jsoup.connect(statisticsUrl).get();

        // SQL Statistics
        // Columns: User, JobID, GroupID, Start Time, Finish Time, Duration, Statement, State, Detail
        Element lastBody = page.getElementsByTag("tbody").last();
        if (lastBody == null) {
            return jobs;
        }

        for (Element row : lastBody.getElementsByTag("tr")) {
            SparkSqlJob job = new SparkSqlJob();
            Elements cells = row.getElementsByTag("td");
            job.setUser(cells.get(0).text());

            Element jobIdCell = cells.get(1);
            List<Integer> jobIds = new ArrayList<>();
            if (jobIdCell.childNodeSize() > 1) {
                for (Element jobLink : jobIdCell.getElementsByTag("a")) {
                    String label = jobLink.text();
                    // The link text wraps the number in one leading and one trailing character.
                    String digits = label.substring(1, label.length() - 1);
                    jobIds.add(Integer.valueOf(digits));
                }
            }
            job.setJobIds(jobIds);

            job.setGroupId(cells.get(2).text());
            job.setStartTime(cells.get(3).text());
            job.setFinishTime(cells.get(4).text());
            job.setDuration(cells.get(5).text());
            job.setStatement(cells.get(6).text());
            job.setState(cells.get(7).text());
            job.setDetail(cells.get(8).text());
            jobs.add(job);
        }
        return jobs;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
 
Example 10
Source File: SynonymDiscriminationExtractor.java    From superword with Apache License 2.0 5 votes vote down vote up
/**
 * Parses synonym discrimination entries out of an HTML page.
 *
 * An entry is skipped (and an error logged) unless it contains exactly two description
 * elements. Each word element of an entry is split on ':' into a word and its meaning;
 * word elements that do not split into exactly two parts are logged and ignored.
 * Any exception is logged and the entries collected so far are returned.
 *
 * @param html the HTML to parse
 * @return the parsed entries, possibly empty, never null
 */
public static Set<SynonymDiscrimination> parseSynonymDiscrimination(String html){
    Set<SynonymDiscrimination> result = new HashSet<>();
    try {
        Elements entries = Jsoup.parse(html).select(SYNONYM_DISCRIMINATION_CSS_PATH);
        for (Element entry : entries) {
            String title = entry.select(TITLE).text().trim();

            Elements descriptions = entry.select(DES);
            if (descriptions.size() != 2) {
                LOGGER.error("解析描述信息出错,elements.size="+descriptions.size());
                continue;
            }
            String firstDescription = descriptions.get(0).text();
            String des = firstDescription
                    .replace("“ ”", "")
                    .replace("“ ", "“")
                    .trim();

            SynonymDiscrimination item = new SynonymDiscrimination();
            item.setTitle(title);
            item.setDes(des);

            for (Element wordElement : entry.select(WORDS)) {
                String raw = wordElement.text();
                // String.split never returns null, so only the length needs checking.
                String[] parts = raw.split(":");
                if (parts.length != 2) {
                    LOGGER.error("解析词义信息出错:"+raw);
                    continue;
                }
                item.addWord(new Word(parts[0].trim(), parts[1].trim()));
            }

            result.add(item);
            LOGGER.info("解析出同义词辨析:" + item);
        }
    } catch (Exception e) {
        LOGGER.error("解析同义词辨析出错", e);
    }
    return result;
}
 
Example 11
Source File: ParseHelper.java    From 4pdaClient-plus with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the user reviews found under {@code #comments .reviews li} inside {@code main}
 * and stores them, serialized to JSON with Gson, through {@code parsed.setCommentsModels}.
 *
 * List items whose {@code .text-box} has no text are skipped. For each remaining item the
 * comment text, author link and name, date, rating and the detail lines of its
 * {@code reviews-list} elements are collected.
 *
 * @param main element that contains the comments section
 */
private void parseComments(Element main) {
    List<CommentsModel> cache = new ArrayList<>();
    for (Element element1 : main.select("#comments .reviews li")) {
        if (element1.select(".text-box").text().isEmpty()) {
            continue;
        }
        // Original author's note: if the text box is not empty, then everything
        // else in the item is present as well, so the lookups below are not null-checked.
        Element element = element1.select(".text-box .w-toggle").first();
        if (element == null) {
            element = element1.select(".text-box").first();
        }
        String comment = element.text();

        element = element1.select("div.name a").first();
        String link = element.attr("href");
        String userName = element.attr("title");

        String date = element1.select("div.date").first().text();
        String ratingNum = element1.select("span.num").first().text();
        String ratingText = element1.select("span.text").first().text();

        // for detail dialog
        // A fresh list per review: previously a single list was created before the loop and
        // handed to every CommentsModel, so each review accumulated the details of all reviews.
        ArrayList<String> dr = new ArrayList<>();
        for (Element element2 : element1.getElementsByClass("reviews-list")) {
            dr.add(element2.select("div.line").text());
        }

        cache.add(new CommentsModel(date, ratingNum, ratingText, comment, link, userName, dr));
    }
    parsed.setCommentsModels(new Gson().toJson(cache));
}
 
Example 12
Source File: GetYAnswersPropertiesFromQid.java    From LiveQAServerDemo with MIT License 4 votes vote down vote up
/**
 * Returns the text of the given element as reported by {@code Element#text()}.
 *
 * @param e element to read
 * @return the element's text
 */
@Override
public String getText(Element e) {
    final String content = e.text();
    return content;
}
 
Example 13
Source File: ScheduleNew.java    From AcgClub with MIT License 4 votes vote down vote up
/**
 * Appends the text of one matched element, followed by a single space, to {@code type}.
 *
 * @param element the matched element
 * @param index   position of the element among the matches (unused here)
 */
@ForEach("span:containsOwn(类型) a")
void labels(Element element, int index) {
  String label = element.text();
  type = type + label + " ";
}
 
Example 14
Source File: BookClass.java    From nju-lib-downloader with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Builds {@code Book} objects from a list of book HTML nodes.
 *
 * For every node the title and id, the category chain, and the author / publish date /
 * theme / detailed category are extracted. A book is only created when both a name and an
 * id were found; otherwise the node text is printed as an error.
 *
 * @param booksliNode the HTML nodes, one per book
 * @return the set of books that could be built
 */
private Set<Book> queryBooks(Elements booksliNode) {
    Set<Book> books = new HashSet<>();
    for (Element element : booksliNode) {
        // Get the book title and id
        String name = null, id = null, author = null, publishDate = null, theme = null, detailBookClass = null;
        BookClass bookBookClass;
        Elements nameIdNode = element.select("p[class=name]");
        if (nameIdNode != null) {
            name = nameIdNode.text();
            Elements idNode = nameIdNode.select("a[onclick]");
            if (idNode != null && idNode.size() > 0) {
                // The id is the text between the first '(' and the last ',' of the onclick attribute.
                String idOnClick = idNode.get(0).attr("onclick");
                int start = idOnClick.indexOf("(") + 1, end = idOnClick.lastIndexOf(",");
                if (start != 0 && end != -1) {
                    id = idOnClick.substring(start, end);
                }
            }
        }
        // Get the categories
        BookClass[] bookClasses = new BookClass[0];
        Elements infoNode = element.select("p[class=info]");
        if (infoNode != null) {
            Elements bookInfos = infoNode.select("a");
            if (bookInfos != null && bookInfos.size() > 0) {
                // The last link is handled as the terminal category; it is removed from the
                // list first so that the remaining links are mapped as non-terminal categories.
                Element terminalCataNode = bookInfos.last();
                bookInfos.remove(terminalCataNode);
                List<BookClass> tmplist = bookInfos.stream()
                        .map(bookInfo -> getBookCata(bookInfo, false))
                        .filter(Objects::nonNull)
                        .collect(Collectors.toList());
                BookClass terminalBookClass = getBookCata(terminalCataNode, true);
                if (terminalBookClass != null) {
                    tmplist.add(terminalBookClass);
                }
                bookClasses = tmplist.toArray(bookClasses);
            }
        }
        bookBookClass = this.link(bookClasses);

        // Get the author, publish date, theme words and category
        String info = element.text();
        Pattern pattern = Pattern.compile("\\d+\\. (.*) 作者[::](.*) 出版日期[::](\\d+).*?(?:主题词[::](.+))? 分类[::](.*)");
        Matcher matcher = pattern.matcher(info);
        // When the pattern matches more than once, the values of the last match win.
        // Group 4 (theme) is optional in the pattern and may therefore be null.
        while (matcher.find()) {
            name = matcher.group(1);
            author = matcher.group(2);
            publishDate = matcher.group(3);
            theme = matcher.group(4);
            detailBookClass = matcher.group(5);
        }
        // A title enclosed in 《》 anywhere in the text overrides the name found so far.
        Pattern minPattern = Pattern.compile(".*(《.*》).*");
        Matcher minMatcher = minPattern.matcher(info);
        while (minMatcher.find()) {
            name = minMatcher.group(1);
        }

        // Assemble the book
        if (name != null && id != null) {
            Book book = new Book(id, name, author, publishDate, theme, bookBookClass, detailBookClass);
            book.setCookie(cookie);
            books.add(book);
            // Only a terminal category receives the book; otherwise a notice is printed.
            if (bookBookClass.isTerminal()) {
                ((TerminalBookClass) bookBookClass).addBook(book);
            } else {
                System.out.println("未获取到分类信息,将不被归档 " + book);
            }
        } else {
            System.out.println("error: " + info);
        }
    }
    return books;
}
 
Example 15
Source File: XXSBPaperCollector.java    From search with Apache License 2.0 4 votes vote down vote up
/**
 * Collects the PDF files of the paper published on the given date.
 *
 * Three page levels are visited: the entry page (links matched by {@code subCssQuery}),
 * each sub-paper page (links matched by {@code contentCssQuery}) and, for the first
 * matching content page of each sub-paper, the PDF links matched by {@code pdfCssQuery}.
 * An {@link IOException} is logged; the links gathered up to that point are still downloaded.
 *
 * @param date publication date of the paper
 * @return the result of {@code downloadPaper} for the collected PDF links
 */
@Override
public List<File> collect(Date date) {
    List<String> pdfLinks = new ArrayList<>();
    try {
        LOG.debug("url: "+url);
        String paper = url + sf.format(date) + start;
        LOG.debug("paper: "+paper);
        Document frontPage = Jsoup.connect(paper).get();

        // 1. Locate the sub-papers
        LOG.debug("subCssQuery: " + subCssQuery);
        Elements subPaperLinks = frontPage.select(subCssQuery);
        for (Element subPaperLink : subPaperLinks) {
            String text = subPaperLink.text();
            String href = subPaperLink.attr("href");
            boolean isSubPaper = text != null && text.contains(":") && href != null && href.endsWith(".htm");
            if (!isSubPaper) {
                LOG.debug("不是子报纸文本:"+text+" , "+href);
                continue;
            }

            String subPaperURL = url + sf.format(date) + href;
            LOG.debug("子报纸文本:"+text+" , "+href);
            LOG.debug("subPaperURL:"+subPaperURL);

            // 2. Locate the content page
            LOG.debug("contentCssQuery: " + contentCssQuery);
            Elements contentLinks = Jsoup.connect(subPaperURL).get().select(contentCssQuery);
            for (Element contentLink : contentLinks) {
                String contentHref = contentLink.attr("href");
                boolean isContentPage = contentHref != null
                        && contentHref.startsWith("content_")
                        && contentHref.endsWith(".htm");
                if (!isContentPage) {
                    continue;
                }

                String contentURL = url + sf.format(date) + contentHref;
                LOG.debug("contentURL:"+contentURL);

                // 3. Locate the PDF
                LOG.debug("pdfCssQuery: " + pdfCssQuery);
                Elements pdfAnchors = Jsoup.connect(contentURL).get().select(pdfCssQuery);
                for (Element pdfAnchor : pdfAnchors) {
                    String pdf = pdfAnchor.attr("href");
                    if (pdf == null || !pdf.endsWith(".pdf")) {
                        LOG.debug("不是报纸链接:"+pdf);
                        continue;
                    }
                    LOG.debug("报纸链接:"+pdf);
                    pdf = pdf.replace("../../../", "");
                    LOG.debug("报纸链接:"+pdf);
                    pdfLinks.add(host+pdf);
                }

                // There are several content pages; one of them is enough.
                break;
            }
        }
    } catch (IOException ex) {
        LOG.error("采集出错",ex);
    }
    return downloadPaper(pdfLinks);
}
 
Example 16
Source File: JsoupUtil.java    From materialup with Apache License 2.0 4 votes vote down vote up
/**
 * Null-tolerant wrapper around {@code Element#text()}.
 *
 * @param e element to read, may be null
 * @return the element's text, or {@code null} when {@code e} is null
 */
private static String text(Element e) {
    return (e == null) ? null : e.text();
}
 
Example 17
Source File: ElementOperator.java    From xsoup with MIT License 4 votes vote down vote up
/**
 * Produces the text of the given element via {@code Element#text()}.
 *
 * @param element element to read
 * @return the element's text
 */
@Override
public String operate(Element element) {
    final String content = element.text();
    return content;
}
 
Example 18
Source File: DefaultParser.java    From rank with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a list page into articles and follows the "next page" link recursively.
 *
 * Every element matched by {@code titleCssQuery} with a non-blank text and a non-blank href
 * becomes an article whose URL is normalized against {@code url}. When a next-page URL is
 * found, that page is parsed with the same arguments and its articles are appended.
 * Any exception is logged and the articles collected so far are returned.
 *
 * @param url              address of the list page
 * @param nextPageCssQuery CSS query used to locate the next-page link
 * @param nextPageText     text used to identify the next-page link
 * @param titleCssQuery    CSS query selecting the article title links
 * @return the articles of this page followed by those of all following pages
 */
@Override
public List<Article> parse(String url, String nextPageCssQuery, String nextPageText, String titleCssQuery) {
    List<Article> collected = new ArrayList<>();
    try{
        Document page = Jsoup.connect(url)
                    .header("Accept", ACCEPT)
                    .header("Accept-Encoding", ENCODING)
                    .header("Accept-Language", LANGUAGE)
                    .header("Connection", CONNECTION)
                    .header("User-Agent", USER_AGENT)
                    .get();

        Elements titleLinks = page.select(titleCssQuery);
        for (Element titleLink : titleLinks) {
            String title = titleLink.text();
            String href = titleLink.attr("href");
            if (StringUtils.isBlank(title) || StringUtils.isBlank(href)) {
                LOGGER.info("解析列表页出错:"+url+" title:"+title+", href:"+href);
                continue;
            }
            Article article = new Article();
            article.setTitle(title);
            article.setUrl(UrlTools.normalizeUrl(url, href));
            collected.add(article);
        }

        // Look up the address of the next page
        String nextPageUrl = getNextPageUrl(page, nextPageCssQuery, nextPageText);
        LOGGER.debug("下一页链接:"+nextPageUrl);
        if (nextPageUrl == null) {
            LOGGER.info("列表页解析完毕,最后一页:"+url);
        } else {
            nextPageUrl = UrlTools.normalizeUrl(url, nextPageUrl);
            LOGGER.debug("规范化后的下一页链接:"+nextPageUrl);
            // Parse the next page
            collected.addAll(parse(nextPageUrl, nextPageCssQuery, nextPageText, titleCssQuery));
        }
    }catch(Exception e){
        LOGGER.error("解析列表页出错:"+url, e);
    }
    return collected;
}
 
Example 19
Source File: PlumbleService.java    From Plumble with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Handles a logged chat message: optionally speaks it through text-to-speech,
 * optionally shows a notification, and appends it to the message log.
 *
 * When short TTS messages are enabled, anchors whose text equals their href are
 * replaced by a short-link string built from the link's hostname before the text is read.
 *
 * @param message the logged message
 */
@Override
public void onMessageLogged(IMessage message) {
    // Split on / strip all HTML tags.
    Document body = Jsoup.parseBodyFragment(message.getMessage());
    String ttsMessage = body.text();

    if (mShortTtsMessagesEnabled) {
        for (Element anchor : body.getElementsByTag("A")) {
            String href = anchor.attr("href");
            // Only shorten anchors without custom text
            if (href == null || !href.equals(anchor.text())) {
                continue;
            }
            // Get just the domain portion of links
            String urlHostname = HtmlUtils.getHostnameFromLink(href);
            if (urlHostname != null) {
                anchor.text(getString(R.string.chat_message_tts_short_link, urlHostname));
            }
        }
        // Re-read the text now that anchors may have been rewritten.
        ttsMessage = body.text();
    }

    String spokenText = getString(R.string.notification_message,
            message.getActorName(), ttsMessage);

    // Read if TTS is enabled, the message is less than threshold, is a text message, and not deafened
    boolean shouldSpeak = mSettings.isTextToSpeechEnabled()
            && mTTS != null
            && spokenText.length() <= TTS_THRESHOLD
            && getSessionUser() != null
            && !getSessionUser().isSelfDeafened();
    if (shouldSpeak) {
        mTTS.speak(spokenText, TextToSpeech.QUEUE_ADD, null);
    }

    // TODO: create a customizable notification sieve
    if (mSettings.isChatNotifyEnabled()) {
        mMessageNotification.show(message);
    }

    mMessageLog.add(new IChatMessage.TextMessage(message));
}
 
Example 20
Source File: JsoupPropertyTableExtractor.java    From wandora with GNU General Public License v3.0 3 votes vote down vote up
/**
 * Turns an HTML table into one association: the first cell of the first row names the
 * master topic, and every following row is passed to {@code handleAssoc} as a player row.
 * A failure while handling a single player row is logged and does not stop the others.
 *
 * @param table the table element to parse
 * @return always {@code true} when the table has a master cell
 * @throws Exception with message "No master row!" when the table has no row or the first
 *         row has no {@code td} cell; also propagates failures from topic or association creation
 */
private boolean parseTable(Element table) throws Exception{

    Elements rows = table.select("tr");

    // Elements.first() returns null for an empty selection, so the row itself has to be
    // checked before it is dereferenced; otherwise a table without rows would fail with a
    // NullPointerException instead of the intended "No master row!" error.
    Element masterRow = rows.first();
    Element masterCell = (masterRow == null) ? null : masterRow.select("td").first();

    if(masterCell == null) throw new Exception("No master row!");

    String masterValue = masterCell.text();

    Topic masterTopic = getOrCreateTopic(tm, null, masterValue);
    Association assoc = tm.createAssociation(masterTopic);

    // Everything after the master row is treated as a player row.
    List<Element> playerRows = rows.subList(1, rows.size());

    for(Element playerRow: playerRows) {
        try {
            handleAssoc(assoc, playerRow);
        } catch (Exception e) {
            log(e);
        }
    }

    return true;

}