Java Code Examples for org.jsoup.nodes.Element#attr()
The following examples show how to use
org.jsoup.nodes.Element#attr().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ImgurRipper.java From ripme with MIT License | 6 votes |
/**
 * Walks an imgur user's account page and rips every album linked from it.
 *
 * @param url
 *            address of the imgur user account (http://username.imgur.com)
 * @throws IOException
 *             propagated from the calls made outside the per-album try block
 */
private void ripUserAccount(URL url) throws IOException {
    LOGGER.info("Retrieving " + url);
    sendUpdate(STATUS.LOADING_RESOURCE, url.toExternalForm());
    Document accountPage = Http.url(url).get();
    for (Element cover : accountPage.select("div.cover a")) {
        stopCheck();
        // Only anchors whose href points at an album are of interest.
        boolean isAlbumLink = cover.hasAttr("href") && cover.attr("href").contains("imgur.com/a/");
        if (isAlbumLink) {
            String href = cover.attr("href");
            // The album id is whatever follows the last slash of the link.
            String albumID = href.substring(href.lastIndexOf('/') + 1);
            URL albumURL = new URL("http:" + href + "/noscript");
            try {
                ripAlbum(albumURL, albumID);
                Thread.sleep(SLEEP_BETWEEN_ALBUMS * 1000);
            } catch (Exception e) {
                // A failing album is logged and the loop moves on to the next one.
                LOGGER.error("Error while ripping album: " + e.getMessage(), e);
            }
        }
    }
}
Example 2
Source File: SyncFragment.java From SteamGifts with MIT License | 6 votes |
@Override protected String[] doInBackground(Void... params) { Log.d(TAG, "Fetching sync details"); try { // Fetch the Giveaway page Connection jsoup = Jsoup.connect("https://www.steamgifts.com/account/profile/sync") .userAgent(Constants.JSOUP_USER_AGENT) .timeout(Constants.JSOUP_TIMEOUT) .cookie("PHPSESSID", SteamGiftsUserData.getCurrent(fragment.getContext()).getSessionId()); Document document = jsoup.get(); SteamGiftsUserData.extract(fragment.getContext(), document); // Fetch the xsrf token Element xsrfToken = document.select("input[name=xsrf_token]").first(); Element lastSyncTime = document.select(".form__sync-data .notification").first(); if (xsrfToken != null) { return new String[]{xsrfToken.attr("value"), lastSyncTime == null ? null : lastSyncTime.text()}; } } catch (Exception e) { Log.e(TAG, "Error fetching URL", e); } return null; }
Example 3
Source File: ArticleTextExtractor.java From JumpGo with Mozilla Public License 2.0 | 5 votes |
/**
 * Scores an element from its class name, id, inline style and itemprop
 * attribute using the POSITIVE, UNLIKELY, NEGATIVE and NEGATIVE_STYLE patterns.
 *
 * @param e element to score
 * @return the accumulated weight (may be negative)
 */
private int calcWeight(Element e) {
    String className = e.className();
    String id = e.id();
    int score = 0;

    if (POSITIVE.matcher(className).find()) {
        score += 35;
    }
    if (POSITIVE.matcher(id).find()) {
        score += 45;
    }

    if (UNLIKELY.matcher(className).find()) {
        score -= 20;
    }
    if (UNLIKELY.matcher(id).find()) {
        score -= 20;
    }

    if (NEGATIVE.matcher(className).find()) {
        score -= 50;
    }
    if (NEGATIVE.matcher(id).find()) {
        score -= 50;
    }

    String inlineStyle = e.attr("style");
    boolean hasStyle = inlineStyle != null && !inlineStyle.isEmpty();
    if (hasStyle && NEGATIVE_STYLE.matcher(inlineStyle).find()) {
        score -= 50;
    }

    String itemProp = e.attr("itemprop");
    boolean hasItemProp = itemProp != null && !itemProp.isEmpty();
    if (hasItemProp && POSITIVE.matcher(itemProp).find()) {
        score += 100;
    }

    return score;
}
Example 4
Source File: TextAttributeOfElementBuilderTest.java From Asqatasun with GNU Affero General Public License v3.0 | 5 votes |
/** * Test of buildTextFromElement method, of class TextAttributeOfElementBuilder. */ public void testBuildTextFromElementWithTargettedAttributeNotSet() { LOGGER.debug("buildTextFromElementWithTargettedAttributeNotSet"); Element element = new Element(Tag.valueOf("div"), ""); element.attr(AttributeStore.ALT_ATTR, "test"); TextAttributeOfElementBuilder instance = new TextAttributeOfElementBuilder(); String result = instance.buildTextFromElement(element); assertNull(result); // assertNull(instance.getAttributeName()); }
Example 5
Source File: HtmlView.java From JavaRushTasks with MIT License | 5 votes |
/**
 * Rebuilds the vacancies page: removes the previously rendered vacancy rows
 * and inserts one row per vacancy, each cloned from the element with class
 * "template".
 *
 * @param vacancies vacancies to render, in output order
 * @return the HTML of the updated document, or the text
 *         "Some exception occurred" when the document cannot be loaded
 */
private String getUpdatedFileContent(List<Vacancy> vacancies) {
    Document document = null;
    try {
        document = getDocument();

        Element templateOriginal = document.getElementsByClass("template").first();

        // Work on a visible, non-template copy so the original template stays in the page.
        Element copyTemplate = templateOriginal.clone();
        copyTemplate.removeAttr("style");
        copyTemplate.removeClass("template");

        // [class=vacancy] matches the whole attribute value, so a row whose class
        // attribute also lists "template" is not selected here. The former trailing
        // .not("tr[class=vacancy template") call had an unterminated selector and its
        // result was discarded, so it has been dropped.
        document.select("tr[class=vacancy]").remove();

        for (Vacancy vacancy : vacancies) {
            Element localClone = copyTemplate.clone();
            localClone.getElementsByClass("city").first().text(vacancy.getCity());
            localClone.getElementsByClass("companyName").first().text(vacancy.getCompanyName());
            localClone.getElementsByClass("salary").first().text(vacancy.getSalary());

            Element link = localClone.getElementsByTag("a").first();
            link.text(vacancy.getTitle());
            link.attr("href", vacancy.getUrl());

            // Each new row is inserted directly before the template element.
            templateOriginal.before(localClone.outerHtml());
        }
    } catch (IOException e) {
        e.printStackTrace();
        return "Some exception occurred";
    }
    return document.html();
}
Example 6
Source File: IfanrHotProcessor.java From hot-crawler with MIT License | 5 votes |
/**
 * Builds an Info from the first descendant with class "js-title-transform":
 * its inner HTML becomes the title and its href attribute the URL.
 *
 * @param element element containing the title link
 * @return the extracted title/URL pair
 */
@Override
protected Info getInfoByElement(Element element) {
    Element titleLink = element.getElementsByClass("js-title-transform").get(0);
    String url = titleLink.attr("href");
    String title = titleLink.html();
    return new Info(title, url);
}
Example 7
Source File: DefaultYoutubeTrackDetails.java From lavaplayer with Apache License 2.0 | 5 votes |
/**
 * Converts the Representation elements of a DASH document into track formats.
 * Representations whose BaseURL has no "/clen/.../" segment are skipped.
 *
 * @param document parsed DASH manifest
 * @return the formats found, in document order
 */
private List<YoutubeTrackFormat> loadTrackFormatsFromDashDocument(Document document) {
    List<YoutubeTrackFormat> formats = new ArrayList<>();

    for (Element adaptationSet : document.select("AdaptationSet")) {
        String mimeType = adaptationSet.attr("mimeType");

        for (Element representation : adaptationSet.select("Representation")) {
            String baseUrl = representation.select("BaseURL").first().text();
            // The content length is read from the URL path itself.
            String lengthText = DataFormatTools.extractBetween(baseUrl, "/clen/", "/");
            String contentType = mimeType + "; codecs=" + representation.attr("codecs");

            if (lengthText != null) {
                formats.add(new YoutubeTrackFormat(
                    ContentType.parse(contentType),
                    Long.parseLong(representation.attr("bandwidth")),
                    Long.parseLong(lengthText),
                    baseUrl,
                    null,
                    DEFAULT_SIGNATURE_KEY
                ));
            } else {
                log.debug("Skipping format {} because the content length is missing", contentType);
            }
        }
    }

    return formats;
}
Example 8
Source File: SankakuComplexRipper.java From ripme with MIT License | 5 votes |
@Override public Document getNextPage(Document doc) throws IOException { Element pagination = doc.select("div.pagination").first(); if (pagination.hasAttr("next-page-url")) { String nextPage = pagination.attr("abs:next-page-url"); // Only logged in users can see past page 25 // Trying to rip page 26 will throw a no images found error if (!nextPage.contains("page=26")) { LOGGER.info("Getting next page: " + pagination.attr("abs:next-page-url")); return Http.url(pagination.attr("abs:next-page-url")).cookies(cookies).get(); } } throw new IOException("No more pages"); }
Example 9
Source File: OnnmyoujiSpider.java From SpringBootUnity with MIT License | 5 votes |
/**
 * Collects the links to the Mitama detail pages: downloads the page at URL,
 * takes the first element matching ".heroList-1" and returns the href of every
 * anchor inside it.
 */
private static List<String> getMitamaDetailInfoUrl() {
    Document page = Jsoup.parse(HttpUtil.get(URL));
    Element heroList = page.select(".heroList-1").get(0);
    Elements anchors = heroList.select("a");

    List<String> links = new ArrayList<>();
    for (Element anchor : anchors) {
        links.add(anchor.attr("href"));
    }
    return links;
}
Example 10
Source File: HiParser.java From hipda with GNU General Public License v2.0 | 5 votes |
private static SimpleListItemBean parseNotifyThread(Element root) { SimpleListItemBean item = new SimpleListItemBean(); String info = ""; Elements aES = root.select("a"); for (Element a : aES) { String href = a.attr("href"); if (href.contains("space.php")) { // get replied usernames info += a.text() + " "; } else if (href.contains("redirect.php?")) { // Thread Name and TID and PID item.setTitle(a.text()); item.setTid(Utils.getMiddleString(a.attr("href"), "ptid=", "&")); item.setPid(Utils.getMiddleString(a.attr("href"), "pid=", "&")); break; } } // time Elements emES = root.select("em"); if (emES.size() == 0) { return null; } item.setTime(emES.first().text()); if (root.text().contains("回复了您关注的主题")) info += "回复了您关注的主题"; else info += "回复了您的帖子 "; item.setNew(true); item.setInfo(info); return item; }
Example 11
Source File: FUN_CSSPath.java From sparql-generate with Apache License 2.0 | 5 votes |
/**
 * Evaluates a CSS selector against an element and returns the named attribute
 * of the first match as a string node value.
 *
 * @param element       root of the selection
 * @param selectPath    CSS selector
 * @param attributeName attribute to read from the first match
 * @return the attribute value wrapped in a NodeValueString
 * @throws ExprEvalException when nothing matches or the first match lacks the attribute
 */
private NodeValue selectAttribute(Element element, String selectPath, String attributeName) {
    Element match = element.select(selectPath).first();
    if (match == null) {
        throw new ExprEvalException("No evaluation of " + element + ", " + selectPath);
    }
    if (match.hasAttr(attributeName)) {
        return new NodeValueString(match.attr(attributeName));
    }
    throw new ExprEvalException("The evaluation of " + element + ", " + selectPath + " is an element that does not have attribute " + attributeName);
}
Example 12
Source File: CommonParser.java From movienow with GNU General Public License v3.0 | 5 votes |
private static String getTextWithoutOr(Element element, String lastRule) { String[] rules = lastRule.split("!"); String text; if (rules.length > 1) { if (rules[0].equals("Text")) { text = element.text(); } else if (rules[0].contains("Attr")) { text = element.attr(rules[0].replace("Attr", "")); } else { text = element.select(rules[0]).first().toString(); } text = StringUtil.replaceBlank(text); for (int i = 1; i < rules.length; i++) { text = text.replace(rules[i], ""); } return text; } else { if (lastRule.equals("Text")) { text = element.text(); } else if (lastRule.contains("Attr")) { text = element.attr(lastRule.replace("Attr", "")); } else { text = element.attr(lastRule); // text = element.select(lastRule).first().toString(); } return StringUtil.replaceBlank(text); } }
Example 13
Source File: SteamGiftsUserData.java From SteamGifts with MIT License | 5 votes |
public static void extract(@Nullable Context context, @Nullable Document document) { if (getCurrent(context) == null) return; if (document == null) return; Elements navbar = document.select(".nav__button-container"); Element userContainer = navbar.last().select("a").first(); String link = userContainer.attr("href"); if (link.startsWith("/user/")) { current.setName(link.substring(6)); // fetch the image String style = userContainer.select("div").first().attr("style"); style = Utils.extractAvatar(style); current.setImageUrl(style); // points Element accountContainer = navbar.select("a[href=/account]").first(); current.setPoints(Utils.parseInt(accountContainer.select(".nav__points").text())); // Level float level = Float.parseFloat(accountContainer.select("span").last().attr("title")); current.setLevel((int) level); // Notifications Elements notifications = navbar.select(".nav__button-container--notification"); current.setCreatedNotification(getInt(notifications.select("a[href=/giveaways/created]").first().text())); current.setWonNotification(getInt(notifications.select("a[href=/giveaways/won]").first().text())); current.setMessageNotification(getInt(notifications.select("a[href=/messages]").first().text())); } else if (link.startsWith("/?login") && current.isLoggedIn()) { current = new SteamGiftsUserData(); if (context != null) current.save(context); } }
Example 14
Source File: ImagearnRipper.java From ripme with MIT License | 5 votes |
/**
 * Resolves every gallery thumbnail on the page to its image URL by loading the
 * linked image page and reading the href of its first "a.thickbox" anchor.
 * Pages that fail to download are logged and skipped.
 *
 * @param doc gallery page
 * @return the image URLs that could be resolved
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> resolved = new ArrayList<>();
    for (Element thumbLink : doc.select("div#gallery > div > a")) {
        String pagePath = thumbLink.attr("href");
        try {
            Document imagePage = new Http("http://imagearn.com/" + pagePath).get();
            Element fullSizeLink = imagePage.select("a.thickbox").first();
            resolved.add(fullSizeLink.attr("href"));
        } catch (IOException e) {
            LOGGER.warn("Was unable to download page: " + pagePath);
        }
    }
    return resolved;
}
Example 15
Source File: ResourceQuote.java From templatespider with Apache License 2.0 | 5 votes |
/**
 * Rewrites the src attribute of every img tag using hierarchyReplace with this
 * object's base URI; the attribute is only written when the value changes.
 *
 * @param doc document to update in place
 * @return the same document
 */
public Document imgTag(Document doc){
    for (Element img : doc.getElementsByTag("img")) {
        String src = img.attr("src");
        String replaced = hierarchyReplace(this.baseUri, src);
        boolean changed = !src.equals(replaced);
        if (changed) {
            img.attr("src", replaced);
        }
    }
    return doc;
}
Example 16
Source File: ParseSection.java From schedge with MIT License | 5 votes |
public static SectionAttribute parse(@NotNull String rawData) { logger.debug("parsing raw catalog section data into SectionAttribute..."); rawData = rawData.trim(); if (rawData.equals("")) { logger.warn("Got bad data: empty string"); return null; // the course doesn't exist } Document doc = Jsoup.parse(rawData); Element failed = doc.selectFirst("div.alert.alert-info"); if (failed != null) { logger.warn("Got bad data: " + failed.text()); return null; // the course doesn't exist } Elements elements = doc.select("a"); String link = null; for (Element element : elements) { String el = element.attr("href"); if (el.contains("mapBuilding")) { link = el; } } doc.select("a").unwrap(); doc.select("i").unwrap(); doc.select("b").unwrap(); Element outerDataSection = doc.selectFirst("body > section.main"); Element innerDataSection = outerDataSection.selectFirst("> section"); Element courseNameDiv = innerDataSection.selectFirst("> div.primary-head"); String courseName = courseNameDiv.text(); Elements dataDivs = innerDataSection.select("> div.section-content.clearfix"); Map<String, String> secData = parseSectionAttributes(dataDivs); return parsingElements(secData, courseName, link); }
Example 17
Source File: JSoupBaiduSearcher.java From search with Apache License 2.0 | 4 votes |
@Override public SearchResult search(String keyword, int page) { int pageSize = 10; //百度搜索结果每页大小为10,pn参数代表的不是页数,而是返回结果的开始数 //如获取第一页则pn=0,第二页则pn=10,第三页则pn=20,以此类推,抽象出模式:(page-1)*pageSize String url = "http://www.baidu.com/s?pn="+(page-1)*pageSize+"&wd="+keyword; SearchResult searchResult = new SearchResult(); searchResult.setPage(page); List<Webpage> webpages = new ArrayList<>(); try { Document document = Jsoup.connect(url).get(); //获取搜索结果数目 int total = getBaiduSearchResultCount(document); searchResult.setTotal(total); int len = 10; if (total < 1) { return null; } //如果搜索到的结果不足一页 if (total < 10) { len = total; } for (int i = 0; i < len; i++) { String titleCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container h3.t a"; String summaryCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container div.c-abstract"; LOG.debug("titleCssQuery:" + titleCssQuery); LOG.debug("summaryCssQuery:" + summaryCssQuery); Element titleElement = document.select(titleCssQuery).first(); String href = ""; String titleText = ""; if(titleElement != null){ titleText = titleElement.text(); href = titleElement.attr("href"); }else{ //处理百度百科 titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a"; summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p"; LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery); LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery); titleElement = document.select(titleCssQuery).first(); if(titleElement != null){ titleText = titleElement.text(); href = titleElement.attr("href"); } } LOG.debug(titleText); Element summaryElement = document.select(summaryCssQuery).first(); //处理百度知道 if(summaryElement == null){ summaryCssQuery = summaryCssQuery.replace("div.c-abstract","font"); LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery); summaryElement = 
document.select(summaryCssQuery).first(); } String summaryText = ""; if(summaryElement != null){ summaryText = summaryElement.text(); } LOG.debug(summaryText); if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) { Webpage webpage = new Webpage(); webpage.setTitle(titleText); webpage.setUrl(href); webpage.setSummary(summaryText); if (href != null) { String content = Tools.getHTMLContent(href); webpage.setContent(content); } else { LOG.info("页面正确提取失败"); } webpages.add(webpage); } else { LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText); } } } catch (IOException ex) { LOG.error("搜索出错",ex); } searchResult.setWebpages(webpages);; return searchResult; }
Example 18
Source File: GetYAnswersPropertiesFromQid.java From LiveQAServerDemo with MIT License | 4 votes |
/**
 * Returns the value of the element's "content" attribute.
 *
 * @param e element to read
 * @return the attribute value as returned by {@code Element.attr}
 */
@Override
public String getText(Element e) {
    final String content = e.attr("content");
    return content;
}
Example 19
Source File: GetReviewerInfo.java From customer-review-crawler with The Unlicense | 4 votes |
/**
 * Scrapes an Amazon reviewer profile and the reviewer's most recent reviews.
 *
 * @param reviewerID Amazon reviewer id
 * @return a list holding, in order: reviewer id, total reviews, reviewer
 *         ranking, total helpful votes, location and the space-joined recent
 *         ratings (at most 10); or null when either page cannot be fetched
 */
public ArrayList<String> reviewer_info(String reviewerID) {
    System.out.println("Reviewer: " + reviewerID);
    String url = "http://www.amazon.com/gp/pdp/profile/" + reviewerID;
    String url2 = "http://www.amazon.com/gp/cdp/member-reviews/" + reviewerID + "/?sort_by=MostRecentReview";
    Document doc = null;
    ArrayList<String> attributes = new ArrayList<String>();
    String Reviewer_ranking = "";
    String Total_helpful_votes = "";
    String Total_reviews = "1";
    String Location = "";
    List<String> Recent_rating = new ArrayList<>();
    try {
        doc = Jsoup.connect(url).header("User-Agent", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2").get();

        // reviewer ranking
        // NOTE(review): the selector says "Reviewer Ranking" while the regex below
        // says "Reviewer ranking"; the regex is case-sensitive - confirm against the page.
        Elements Reviewer_ranking_e = doc.select("span.a-size-small:contains(Reviewer Ranking: #)");
        System.out.println(Reviewer_ranking_e);
        Pattern pattern = Pattern.compile("(Reviewer ranking: #)(\\S+)");
        Matcher matcher = pattern.matcher(Reviewer_ranking_e.text());
        if (matcher.find()) {
            Reviewer_ranking = matcher.group(2);
        }

        // review helpful votes
        Element total_vote = doc.select("span.a-size-small:contains(votes received on reviews)").first();
        if (total_vote != null) {
            Element vote_parent = total_vote.parent();
            String votes_string = vote_parent.select("span:contains( of )").text();
            pattern = Pattern.compile("([(])(\\S+)( of )(\\S+)([)])");
            matcher = pattern.matcher(votes_string);
            if (matcher.find()) {
                Total_helpful_votes = matcher.group(2) + " of " + matcher.group(4);
            }
        }

        // total number of reviews
        Element Total_reviews_e = doc.select("div.reviews-link").first();
        if (Total_reviews_e != null) {
            pattern = Pattern.compile("(Reviews [(])((\\S+))([)])");
            matcher = pattern.matcher(Total_reviews_e.text());
            if (matcher.find()) {
                Total_reviews = matcher.group(2);
            }
        }

        // location of the reviewer (if listed)
        // first() returns null when the container is absent, so the element itself
        // is checked before its parent, like the other lookups above.
        Element Location_e = doc.select("div.profile-name-container").first();
        if (Location_e != null && Location_e.parent() != null) {
            Location = Location_e.parent().text();
        }

        // recent 10 ratings
        doc = Jsoup.connect(url2).get();
        Elements images = doc.select("img");
        for (Element image : images) {
            String imagealt = image.attr("alt");
            if (imagealt.contains("out of 5 stars")) {
                // the first character of the alt text is kept as the rating
                Recent_rating.add(imagealt.substring(0, 1));
            }
        }
    } catch (IOException e) {
        System.out.println(e);
        System.out.println(reviewerID + " Removed");
        return (null);
    }

    if (Recent_rating.size() > 10) {
        Recent_rating = Recent_rating.subList(0, 10);
    } else {
        // with 10 or fewer ratings found, their count replaces the scraped total
        Total_reviews = Integer.toString(Recent_rating.size());
    }
    String Recent_rating_joined = org.apache.commons.lang.StringUtils.join(
            Recent_rating, " ");
    attributes.addAll(Arrays.asList(reviewerID, Total_reviews,
            Reviewer_ranking, Total_helpful_votes, Location,
            Recent_rating_joined));
    return (attributes);
}
Example 20
Source File: LotusNoirDecks.java From MtgDesktopCompanion with GNU General Public License v3.0 | 4 votes |
/**
 * Lists the decks found on the configured number of result pages.
 *
 * For every deck thumbnail the deck page is loaded as well, and the deck
 * colour string is built from the basic-land names among its card titles.
 *
 * @return the decks found, in page order
 * @throws IOException declared by this method; presumably raised when a page
 *         cannot be loaded
 */
@Override
public List<RetrievableDeck> getDeckList() throws IOException {
    String decksUrl = getString(URL) + "?dpage=" + getString(MAX_PAGE) + "&action=" + getString(FORMAT);
    logger.debug("snif decks : " + decksUrl);

    int nbPage = getInt(MAX_PAGE);
    List<RetrievableDeck> list = new ArrayList<>();

    for (int i = 1; i <= nbPage; i++) {
        Document d = URLTools.extractHtml(getString(URL) + "?dpage=" + i + "&action=" + getString(FORMAT));
        Elements thumbs = d.select("div.thumb_page");

        for (Element cont : thumbs) {
            RetrievableDeck deck = new RetrievableDeck();
            Element info = cont.select("a").get(0);

            String name = info.attr("title").replace("Lien vers ", "").trim();
            String url = info.attr("href");
            String auteur = cont.select("small").select("a").text();

            Elements value = URLTools.extractHtml(url).select("span.card_title_us");
            StringBuilder deckColor = new StringBuilder();
            for (Element element : value) {
                // The land name is the second space-separated token of the title;
                // titles with a single token carry no land name and are skipped
                // instead of raising ArrayIndexOutOfBoundsException.
                String[] words = element.text().split(" ");
                if (words.length < 2) {
                    continue;
                }
                String land = words[1];
                switch (land) {
                case "Plain":
                case "Plains":
                    deckColor.append("{W}");
                    break;
                case "Island":
                case "Islands":
                    deckColor.append("{U}");
                    break;
                case "Swamp":
                case "Swamps":
                    deckColor.append("{B}");
                    break;
                case "Mountain":
                case "Mountains":
                    deckColor.append("{R}");
                    break;
                case "Forest":
                case "Forests":
                    deckColor.append("{G}");
                    break;
                default:
                    break;
                }
            }

            deck.setName(name);
            try {
                deck.setUrl(new URI(url));
            } catch (URISyntaxException e1) {
                // an unparsable link leaves the deck without a URL
                deck.setUrl(null);
            }
            deck.setAuthor(auteur);
            deck.setColor(deckColor.toString());
            list.add(deck);
        }
    }
    return list;
}