org.jsoup.nodes.Element#attr

Source File: ImgurRipper.java From ripme with MIT License

6 votes

/**
 * Rips all albums in an imgur user's account.
 * <p>
 * Loads the account page, looks at every link matched by {@code div.cover a}
 * and rips each one whose {@code href} contains {@code imgur.com/a/}. A failure
 * while ripping one album is logged and the remaining albums are still tried.
 * If the thread is interrupted, the interrupt flag is restored and the method
 * returns without ripping further albums.
 *
 * @param url
 *      URL to imgur user account (http://username.imgur.com)
 * @throws IOException
 *      if the account page cannot be retrieved
 */
private void ripUserAccount(URL url) throws IOException {
    LOGGER.info("Retrieving " + url);
    sendUpdate(STATUS.LOADING_RESOURCE, url.toExternalForm());
    Document doc = Http.url(url).get();
    for (Element album : doc.select("div.cover a")) {
        stopCheck();
        // attr() yields "" for a missing attribute, so links without an href are skipped too.
        String href = album.attr("href");
        if (!href.contains("imgur.com/a/")) {
            continue;
        }
        String albumID = href.substring(href.lastIndexOf('/') + 1);
        // The href is concatenated after "http:", i.e. it is expected to be protocol-relative.
        URL albumURL = new URL("http:" + href + "/noscript");
        try {
            ripAlbum(albumURL, albumID);
            Thread.sleep(SLEEP_BETWEEN_ALBUMS * 1000);
        } catch (InterruptedException e) {
            // Do not swallow the interruption: restore the flag for the caller and stop.
            Thread.currentThread().interrupt();
            LOGGER.error("Interrupted while ripping albums: " + e.getMessage(), e);
            return;
        } catch (Exception e) {
            LOGGER.error("Error while ripping album: " + e.getMessage(), e);
        }
    }
}

Source File: SyncFragment.java From SteamGifts with MIT License

6 votes

/**
 * Loads the SteamGifts sync page with the current user's session cookie,
 * refreshes the cached user data from it and extracts the sync form details.
 *
 * @param params unused
 * @return a two-element array holding the xsrf token value and the text of the
 *         sync notification (or {@code null} when that notification is absent);
 *         {@code null} when the token input is missing or any exception occurs
 */
@Override
protected String[] doInBackground(Void... params) {
    Log.d(TAG, "Fetching sync details");

    try {
        // Build the request for the sync page step by step.
        Connection request = Jsoup.connect("https://www.steamgifts.com/account/profile/sync");
        request.userAgent(Constants.JSOUP_USER_AGENT);
        request.timeout(Constants.JSOUP_TIMEOUT);
        request.cookie("PHPSESSID", SteamGiftsUserData.getCurrent(fragment.getContext()).getSessionId());
        Document page = request.get();

        SteamGiftsUserData.extract(fragment.getContext(), page);

        // Locate the xsrf token input and the "last synced" notification.
        Element tokenInput = page.select("input[name=xsrf_token]").first();
        Element syncNotice = page.select(".form__sync-data .notification").first();
        if (tokenInput == null) {
            return null;
        }

        String tokenValue = tokenInput.attr("value");
        String lastSync = null;
        if (syncNotice != null) {
            lastSync = syncNotice.text();
        }
        return new String[]{tokenValue, lastSync};
    } catch (Exception e) {
        Log.e(TAG, "Error fetching URL", e);
    }
    return null;
}

Source File: ArticleTextExtractor.java From JumpGo with Mozilla Public License 2.0

5 votes

/**
 * Scores an element from its class name, id, inline style and itemprop
 * attribute: matches against POSITIVE raise the score, matches against
 * UNLIKELY, NEGATIVE and NEGATIVE_STYLE lower it.
 *
 * @param e the element to score
 * @return the accumulated weight (may be negative)
 */
private int calcWeight(Element e) {
    final String classNames = e.className();
    final String elementId = e.id();
    int score = 0;

    if (POSITIVE.matcher(classNames).find()) {
        score += 35;
    }
    if (POSITIVE.matcher(elementId).find()) {
        score += 45;
    }

    if (UNLIKELY.matcher(classNames).find()) {
        score -= 20;
    }
    if (UNLIKELY.matcher(elementId).find()) {
        score -= 20;
    }

    if (NEGATIVE.matcher(classNames).find()) {
        score -= 50;
    }
    if (NEGATIVE.matcher(elementId).find()) {
        score -= 50;
    }

    final String inlineStyle = e.attr("style");
    boolean hasStyle = inlineStyle != null && !inlineStyle.isEmpty();
    if (hasStyle && NEGATIVE_STYLE.matcher(inlineStyle).find()) {
        score -= 50;
    }

    final String itemProperty = e.attr("itemprop");
    boolean hasItemProperty = itemProperty != null && !itemProperty.isEmpty();
    if (hasItemProperty && POSITIVE.matcher(itemProperty).find()) {
        score += 100;
    }

    return score;
}

Source File: TextAttributeOfElementBuilderTest.java From Asqatasun with GNU Affero General Public License v3.0

5 votes

/**
     * Checks that buildTextFromElement returns null for a div carrying only
     * the AttributeStore.ALT_ATTR attribute when no attribute name has been
     * configured on a freshly created TextAttributeOfElementBuilder.
     */
    public void testBuildTextFromElementWithTargettedAttributeNotSet() {
        LOGGER.debug("buildTextFromElementWithTargettedAttributeNotSet");

        // Element under test: <div> with only the alt attribute populated.
        Element div = new Element(Tag.valueOf("div"), "");
        div.attr(AttributeStore.ALT_ATTR, "test");

        TextAttributeOfElementBuilder builder = new TextAttributeOfElementBuilder();
        String builtText = builder.buildTextFromElement(div);

        assertNull(builtText);
    }

Source File: HtmlView.java From JavaRushTasks with MIT License

5 votes

/**
 * Rebuilds the vacancy rows of the document returned by getDocument().
 * <p>
 * The element with class {@code template} is cloned (without its
 * {@code style} attribute and {@code template} class), the existing
 * {@code tr[class=vacancy]} rows are removed, and one filled-in clone per
 * vacancy is inserted before the template element.
 *
 * @param vacancies vacancies to render, in the order they should appear
 * @return the HTML of the updated document, or the text
 *         {@code "Some exception occurred"} when the document cannot be loaded
 */
private String getUpdatedFileContent(List<Vacancy> vacancies) {

        Document document = null;
        try {
            document = getDocument();

            Element templateOriginal = document.getElementsByClass("template").first();
            Element copyTemplate = templateOriginal.clone();
            copyTemplate.removeAttr("style");
            copyTemplate.removeClass("template");
            // [class=vacancy] is an exact attribute match, so the template row
            // (class "vacancy template") is not selected and survives the removal.
            // The former trailing .not(...) call used an unbalanced selector and
            // discarded its result, so it has been dropped.
            document.select("tr[class=vacancy]").remove();

            for (Vacancy vacancy : vacancies) {
                Element localClone = copyTemplate.clone();
                localClone.getElementsByClass("city").first().text(vacancy.getCity());
                localClone.getElementsByClass("companyName").first().text(vacancy.getCompanyName());
                localClone.getElementsByClass("salary").first().text(vacancy.getSalary());
                Element link = localClone.getElementsByTag("a").first();
                link.text(vacancy.getTitle());
                link.attr("href", vacancy.getUrl());

                // Insert each rendered row ahead of the template element.
                templateOriginal.before(localClone.outerHtml());
            }
        } catch (IOException e) {
            e.printStackTrace();
            return "Some exception occurred";
        }
        return document.html();
    }

Source File: IfanrHotProcessor.java From hot-crawler with MIT License

5 votes

/**
 * Builds an Info from the first descendant carrying the class
 * {@code js-title-transform}: its inner HTML is the title and its
 * {@code href} attribute is the URL.
 *
 * @param element element containing the title link
 * @return the Info built from that link
 */
@Override
protected Info getInfoByElement(Element element) {
    Element titleLink = element.getElementsByClass("js-title-transform").get(0);
    String linkTarget = titleLink.attr("href");
    String linkMarkup = titleLink.html();
    return new Info(linkMarkup, linkTarget);
}

Source File: DefaultYoutubeTrackDetails.java From lavaplayer with Apache License 2.0

5 votes

/**
 * Collects one track format per {@code Representation} found inside the
 * {@code AdaptationSet} elements of the given document.
 * <p>
 * The content length is taken from the part of the {@code BaseURL} text that
 * lies between {@code /clen/} and the next {@code /}; representations without
 * it are skipped.
 *
 * @param document parsed DASH document
 * @return the formats that could be built, in document order
 */
private List<YoutubeTrackFormat> loadTrackFormatsFromDashDocument(Document document) {
  List<YoutubeTrackFormat> formats = new ArrayList<>();

  for (Element adaptationSet : document.select("AdaptationSet")) {
    String setMimeType = adaptationSet.attr("mimeType");

    for (Element rep : adaptationSet.select("Representation")) {
      String baseUrl = rep.select("BaseURL").first().text();
      String lengthText = DataFormatTools.extractBetween(baseUrl, "/clen/", "/");
      String fullType = setMimeType + "; codecs=" + rep.attr("codecs");

      if (lengthText != null) {
        long bitrate = Long.parseLong(rep.attr("bandwidth"));
        long length = Long.parseLong(lengthText);
        formats.add(new YoutubeTrackFormat(
            ContentType.parse(fullType),
            bitrate,
            length,
            baseUrl,
            null,
            DEFAULT_SIGNATURE_KEY
        ));
      } else {
        log.debug("Skipping format {} because the content length is missing", fullType);
      }
    }
  }

  return formats;
}

Source File: SankakuComplexRipper.java From ripme with MIT License

5 votes

/**
 * Fetches the page referenced by the {@code next-page-url} attribute of the
 * current page's {@code div.pagination} element.
 *
 * @param doc the page currently being processed
 * @return the next page
 * @throws IOException with message "No more pages" when there is no pagination
 *         element, no next-page attribute, or the next page is page 26; also
 *         when fetching the next page fails
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    Element pagination = doc.select("div.pagination").first();
    // first() is null when the page has no pagination block; treat that as
    // "no more pages" instead of failing with a NullPointerException.
    if (pagination != null && pagination.hasAttr("next-page-url")) {
        String nextPage = pagination.attr("abs:next-page-url");
        // Only logged in users can see past page 25
        // Trying to rip page 26 will throw a no images found error
        if (!nextPage.contains("page=26")) {
            LOGGER.info("Getting next page: " + nextPage);
            return Http.url(nextPage).cookies(cookies).get();
        }
    }
    throw new IOException("No more pages");
}

Source File: OnnmyoujiSpider.java From SpringBootUnity with MIT License

5 votes

/**
 * Gets the links to the Mitama (soul) detail pages: the {@code href} of every
 * anchor inside the first {@code .heroList-1} element of the page at URL.
 */
private static List<String> getMitamaDetailInfoUrl() {
    Document page = Jsoup.parse(HttpUtil.get(URL));
    Element heroList = page.select(".heroList-1").get(0);

    List<String> detailLinks = new ArrayList<>();
    for (Element anchor : heroList.select("a")) {
        detailLinks.add(anchor.attr("href"));
    }
    return detailLinks;
}

Source File: HiParser.java From hipda with GNU General Public License v2.0

5 votes

/**
 * Parses a thread notification into a list item.
 * <p>
 * Links whose href contains {@code space.php} contribute their text to the
 * info line; the first link whose href contains {@code redirect.php?} supplies
 * the title, thread id and post id and ends the scan. The time is the text of
 * the first {@code em} element.
 *
 * @param root element holding the notification
 * @return the populated item, or {@code null} when there is no {@code em} element
 */
private static SimpleListItemBean parseNotifyThread(Element root) {
    SimpleListItemBean bean = new SimpleListItemBean();
    StringBuilder summary = new StringBuilder();

    for (Element anchor : root.select("a")) {
        String target = anchor.attr("href");
        if (target.contains("space.php")) {
            // names of the users who replied
            summary.append(anchor.text()).append(" ");
            continue;
        }
        if (target.contains("redirect.php?")) {
            // thread title plus the thread and post ids encoded in the link
            bean.setTitle(anchor.text());
            bean.setTid(Utils.getMiddleString(target, "ptid=", "&"));
            bean.setPid(Utils.getMiddleString(target, "pid=", "&"));
            break;
        }
    }

    // notification time
    Elements timeNodes = root.select("em");
    if (timeNodes.isEmpty()) {
        return null;
    }
    bean.setTime(timeNodes.first().text());

    boolean followedThread = root.text().contains("回复了您关注的主题");
    summary.append(followedThread ? "回复了您关注的主题" : "回复了您的帖子 ");

    bean.setNew(true);
    bean.setInfo(summary.toString());
    return bean;
}

Source File: FUN_CSSPath.java From sparql-generate with Apache License 2.0

5 votes

/**
 * Returns, as a string node value, the given attribute of the first element
 * matched by {@code selectPath} under {@code element}.
 *
 * @param element       root of the selection
 * @param selectPath    CSS selector to evaluate
 * @param attributeName attribute to read from the first match
 * @return the attribute value wrapped in a NodeValueString
 * @throws ExprEvalException if nothing matches or the match lacks the attribute
 */
private NodeValue selectAttribute(Element element, String selectPath, String attributeName) {
    Element match = element.select(selectPath).first();
    if (match == null) {
        throw new ExprEvalException("No evaluation of " + element + ", " + selectPath);
    }
    if (match.hasAttr(attributeName)) {
        return new NodeValueString(match.attr(attributeName));
    }
    throw new ExprEvalException("The evaluation of " + element + ", " + selectPath + " is an element that does not have attribute " + attributeName);
}

Source File: CommonParser.java From movienow with GNU General Public License v3.0

5 votes

/**
 * Extracts text from an element according to a rule of the form
 * {@code source[!remove1!remove2...]}.
 * <p>
 * The source is {@code Text} (element text), a name containing {@code Attr}
 * (attribute named by the rule with "Attr" removed), or anything else. With
 * removal parts, "anything else" is a selector whose first match is rendered
 * with toString(); without them it is used directly as an attribute name.
 * The value is passed through StringUtil.replaceBlank, and every removal part
 * is then deleted from it.
 *
 * @param element  element to read from
 * @param lastRule the rule string
 * @return the extracted text
 */
private static String getTextWithoutOr(Element element, String lastRule) {
        String[] parts = lastRule.split("!");

        // Single rule, nothing to strip afterwards.
        if (parts.length <= 1) {
            String raw;
            if (lastRule.equals("Text")) {
                raw = element.text();
            } else if (lastRule.contains("Attr")) {
                raw = element.attr(lastRule.replace("Attr", ""));
            } else {
                // An unrecognised rule is treated as an attribute name here.
                raw = element.attr(lastRule);
            }
            return StringUtil.replaceBlank(raw);
        }

        // Source rule followed by one or more fragments to remove.
        String source = parts[0];
        String raw;
        if (source.equals("Text")) {
            raw = element.text();
        } else if (source.contains("Attr")) {
            raw = element.attr(source.replace("Attr", ""));
        } else {
            raw = element.select(source).first().toString();
        }

        String cleaned = StringUtil.replaceBlank(raw);
        for (int idx = 1; idx < parts.length; idx++) {
            cleaned = cleaned.replace(parts[idx], "");
        }
        return cleaned;
    }

Source File: SteamGiftsUserData.java From SteamGifts with MIT License

5 votes

/**
 * Updates the current user data from the navigation bar of a page.
 * <p>
 * When the first link of the last {@code .nav__button-container} points to
 * {@code /user/...}, the name, avatar, points, level and notification counts
 * are read from the navigation bar. When it points to {@code /?login} while
 * the current data says the user is logged in, the data is reset and, given a
 * context, saved.
 *
 * @param context  context used to look up and save the user data, may be null
 * @param document page to read from; nothing happens when null
 */
public static void extract(@Nullable Context context, @Nullable Document document) {
    if (getCurrent(context) == null || document == null) {
        return;
    }

    Elements navButtons = document.select(".nav__button-container");
    Element userLink = navButtons.last().select("a").first();
    String target = userLink.attr("href");

    if (target.startsWith("/user/")) {
        // everything after "/user/" is the name
        current.setName(target.substring("/user/".length()));

        // avatar, taken from the inline style of the first div
        String avatarStyle = userLink.select("div").first().attr("style");
        current.setImageUrl(Utils.extractAvatar(avatarStyle));

        // points
        Element accountLink = navButtons.select("a[href=/account]").first();
        String pointsText = accountLink.select(".nav__points").text();
        current.setPoints(Utils.parseInt(pointsText));

        // level, read as a float from the title and truncated to an int
        String levelTitle = accountLink.select("span").last().attr("title");
        float parsedLevel = Float.parseFloat(levelTitle);
        current.setLevel((int) parsedLevel);

        // notification counters
        Elements notificationButtons = navButtons.select(".nav__button-container--notification");
        current.setCreatedNotification(getInt(notificationButtons.select("a[href=/giveaways/created]").first().text()));
        current.setWonNotification(getInt(notificationButtons.select("a[href=/giveaways/won]").first().text()));
        current.setMessageNotification(getInt(notificationButtons.select("a[href=/messages]").first().text()));
        return;
    }

    if (target.startsWith("/?login") && current.isLoggedIn()) {
        current = new SteamGiftsUserData();
        if (context != null) {
            current.save(context);
        }
    }
}

Source File: ImagearnRipper.java From ripme with MIT License

5 votes

/**
 * Resolves every gallery thumbnail on the page to the URL of its full image.
 * <p>
 * For each {@code div#gallery > div > a} link the linked page is downloaded
 * and the {@code href} of its first {@code a.thickbox} element is collected.
 * Thumbnails whose page cannot be downloaded, or whose page has no such
 * element, are logged and skipped.
 *
 * @param doc gallery page
 * @return the collected image URLs, in page order
 */
@Override
public List<String> getURLsFromPage(Document doc) {
    List<String> imageURLs = new ArrayList<>();
    for (Element thumb : doc.select("div#gallery > div > a")) {
        String imageURL = thumb.attr("href");
        try {
            Document imagedoc = new Http("http://imagearn.com/" + imageURL).get();
            // first() is null when the page has no a.thickbox link; skip that
            // thumbnail rather than aborting the whole page with an NPE.
            Element fullImageLink = imagedoc.select("a.thickbox").first();
            if (fullImageLink == null) {
                LOGGER.warn("No full-size image link found on page: " + imageURL);
                continue;
            }
            imageURLs.add(fullImageLink.attr("href"));
        } catch (IOException e) {
            LOGGER.warn("Was unable to download page: " + imageURL, e);
        }
    }
    return imageURLs;
}

Source File: ResourceQuote.java From templatespider with Apache License 2.0

5 votes

/**
 * Replaces the {@code src} of img tags: each value is passed through
 * hierarchyReplace together with this object's baseUri, and the attribute is
 * rewritten only when the result differs from the original value.
 * @param doc document whose img tags are processed
 * @return the same document instance
 */
public Document imgTag(Document doc){
	for (Element img : doc.getElementsByTag("img")) {
		String src = img.attr("src");
		String resolved = hierarchyReplace(this.baseUri, src);
		if (src.equals(resolved)) {
			continue;
		}
		img.attr("src", resolved);
	}
	return doc;
}

Source File: ParseSection.java From schedge with MIT License

5 votes

/**
 * Parses the raw HTML of a catalog section into a SectionAttribute.
 * <p>
 * The link kept is the href of the last anchor containing {@code mapBuilding};
 * {@code a}, {@code i} and {@code b} tags are then unwrapped before the
 * course name and data divs are read from {@code body > section.main > section}.
 *
 * @param rawData raw HTML of the section
 * @return the parsed section, or {@code null} when the data is blank or the
 *         page contains a {@code div.alert.alert-info} element
 */
public static SectionAttribute parse(@NotNull String rawData) {
  logger.debug("parsing raw catalog section data into SectionAttribute...");

  String trimmed = rawData.trim();
  if (trimmed.isEmpty()) {
    logger.warn("Got bad data: empty string");
    return null; // the course doesn't exist
  }

  Document page = Jsoup.parse(trimmed);
  Element alert = page.selectFirst("div.alert.alert-info");
  if (alert != null) {
    logger.warn("Got bad data: " + alert.text());
    return null; // the course doesn't exist
  }

  // Remember the last anchor pointing at the building map.
  String mapLink = null;
  for (Element anchor : page.select("a")) {
    String href = anchor.attr("href");
    if (href.contains("mapBuilding")) {
      mapLink = href;
    }
  }

  // Strip inline wrappers, keeping their children in place.
  for (String inlineTag : new String[] {"a", "i", "b"}) {
    page.select(inlineTag).unwrap();
  }

  Element mainSection = page.selectFirst("body > section.main");
  Element contentSection = mainSection.selectFirst("> section");
  String courseName = contentSection.selectFirst("> div.primary-head").text();
  Map<String, String> attributes =
      parseSectionAttributes(contentSection.select("> div.section-content.clearfix"));

  return parsingElements(attributes, courseName, mapLink);
}

Source File: JSoupBaiduSearcher.java From search with Apache License 2.0

4 votes

/**
 * Runs a Baidu web search and scrapes one page of results.
 * <p>
 * For every result slot on the page the title, link and summary are read with
 * CSS selectors (with fallbacks for alternative result layouts); entries with
 * both a title and a summary are added to the result together with the
 * content returned by Tools.getHTMLContent for the link.
 *
 * @param keyword search term; it is URL-encoded (UTF-8) before being sent
 * @param page    1-based page number
 * @return the search result; {@code null} when Baidu reports fewer than one
 *         hit; a result with an empty page list when the request fails
 */
@Override
public SearchResult search(String keyword, int page) {
    int pageSize = 10;

    SearchResult searchResult = new SearchResult();
    searchResult.setPage(page);
    List<Webpage> webpages = new ArrayList<>();
    try {
        // Baidu returns 10 results per page; the pn parameter is not the page
        // number but the offset of the first result: page 1 is pn=0, page 2 is
        // pn=10, page 3 is pn=20, and so on, i.e. pn = (page-1)*pageSize.
        // The keyword is encoded so that characters such as '&', '#', '+' or '%'
        // cannot corrupt the query string.
        String url = "http://www.baidu.com/s?pn=" + (page - 1) * pageSize
                + "&wd=" + java.net.URLEncoder.encode(keyword, "UTF-8");
        Document document = Jsoup.connect(url).get();

        // Read the total number of search results
        int total = getBaiduSearchResultCount(document);
        searchResult.setTotal(total);
        int len = 10;
        if (total < 1) {
            return null;
        }
        // Fewer results than one full page
        if (total < 10) {
            len = total;
        }
        for (int i = 0; i < len; i++) {
            String titleCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container h3.t a";
            String summaryCssQuery = "html body div div div div#content_left div#" + (i + 1 + (page-1)*pageSize) + ".result.c-container div.c-abstract";
            LOG.debug("titleCssQuery:" + titleCssQuery);
            LOG.debug("summaryCssQuery:" + summaryCssQuery);
            Element titleElement = document.select(titleCssQuery).first();
            String href = "";
            String titleText = "";
            if(titleElement != null){
                titleText = titleElement.text();
                href = titleElement.attr("href");
            }else{
                // Fall back to the Baidu Baike (encyclopedia) result layout
                titleCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op h3.t a";
                summaryCssQuery = "html body div#out div#in div#wrapper div#container div#content_left div#1.result-op div p";
                LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);
                LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);
                titleElement = document.select(titleCssQuery).first();
                if(titleElement != null){
                    titleText = titleElement.text();
                    href = titleElement.attr("href");
                }
            }
            LOG.debug(titleText);
            Element summaryElement = document.select(summaryCssQuery).first();
            // Fall back to the Baidu Zhidao (Q&A) result layout
            if(summaryElement == null){
                summaryCssQuery = summaryCssQuery.replace("div.c-abstract","font");
                LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);
                summaryElement = document.select(summaryCssQuery).first();
            }
            String summaryText = "";
            if(summaryElement != null){
                summaryText = summaryElement.text();
            }
            LOG.debug(summaryText);

            if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) {
                Webpage webpage = new Webpage();
                webpage.setTitle(titleText);
                webpage.setUrl(href);
                webpage.setSummary(summaryText);
                if (href != null) {
                    String content = Tools.getHTMLContent(href);
                    webpage.setContent(content);
                } else {
                    LOG.info("页面正确提取失败");
                }
                webpages.add(webpage);
            } else {
                LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);
            }
        }
    } catch (IOException ex) {
        LOG.error("搜索出错",ex);
    }
    searchResult.setWebpages(webpages);
    return searchResult;
}

Source File: GetYAnswersPropertiesFromQid.java From LiveQAServerDemo with MIT License

4 votes

/**
 * Returns the value of the element's {@code content} attribute.
 *
 * @param e element to read from
 * @return the attribute value as returned by {@code attr("content")}
 */
@Override
public String getText(Element e) {
    final String contentValue = e.attr("content");
    return contentValue;
}

Source File: GetReviewerInfo.java From customer-review-crawler with The Unlicense

4 votes

/**
 * Scrapes an Amazon reviewer's profile page and recent-reviews page.
 *
 * @param reviewerID the reviewer id used to build both page URLs
 * @return a list holding, in order: the reviewer id, the total review count,
 *         the reviewer ranking, the helpful votes as "x of y", the location
 *         text and the most recent ratings (at most 10) joined by spaces;
 *         {@code null} when either page cannot be downloaded
 */
public ArrayList<String> reviewer_info(String reviewerID) {
	System.out.println("Reviewer: " + reviewerID);
	String url = "http://www.amazon.com/gp/pdp/profile/" + reviewerID;
	String url2 = "http://www.amazon.com/gp/cdp/member-reviews/"
			+ reviewerID + "/?sort_by=MostRecentReview";
	Document doc = null;
	ArrayList<String> attributes = new ArrayList<String>();
	String Reviewer_ranking = "";
	String Total_helpful_votes = "";
	String Total_reviews = "1";
	String Location = "";
	List<String> Recent_rating = new ArrayList<>();
	try {
		doc = Jsoup.connect(url).header("User-Agent",
				"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4; en-US; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2").get();

		// reviewer ranking
		// NOTE(review): the selector text says "Reviewer Ranking" while the regex
		// below says "Reviewer ranking"; the regex is case-sensitive, so confirm
		// which casing the page actually uses.
		Elements Reviewer_ranking_e = doc.select("span.a-size-small:contains(Reviewer Ranking: #)");
		System.out.println(Reviewer_ranking_e);
		Pattern pattern = Pattern.compile("(Reviewer ranking: #)(\\S+)");
		Matcher matcher = pattern.matcher(Reviewer_ranking_e.text());
		if (matcher.find()) {
			Reviewer_ranking = matcher.group(2);
		}

		// review helpful votes
		Element total_vote = doc.select("span.a-size-small:contains(votes received on reviews)").first();
		if (total_vote != null) {
			Element vote_parent = total_vote.parent();
			String votes_string = vote_parent.select("span:contains( of )").text();
			pattern = Pattern
					.compile("([(])(\\S+)( of )(\\S+)([)])");
			matcher = pattern.matcher(votes_string);
			if (matcher.find()) {
				Total_helpful_votes = matcher.group(2) + " of "
						+ matcher.group(4);
			}
		}

		// total number of reviews
		Element Total_reviews_e = doc.select("div.reviews-link").first();
		if (Total_reviews_e != null) {
			pattern = Pattern.compile("(Reviews [(])((\\S+))([)])");
			matcher = pattern.matcher(Total_reviews_e.text());
			if (matcher.find()) {
				Total_reviews = matcher.group(2);
			}
		}

		// location of the reviewer (if listed)
		// first() is null when the profile has no name container; guard it like
		// the other optional sections instead of failing with an NPE.
		Element Location_e = doc.select("div.profile-name-container").first();
		if (Location_e != null && Location_e.parent() != null) {
			Location = Location_e.parent().text();
		}

		// recent 10 ratings
		doc = Jsoup.connect(url2).get();
		Elements images = doc.select("img");
		for (Element image : images) {
			String imagealt = image.attr("alt");
			if (imagealt.contains("out of 5 stars")) {
				// the first character of the alt text is the star rating
				Recent_rating.add(imagealt.substring(0, 1));
			}
		}

	} catch (IOException e) {
		System.out.println(e);
		System.out.println(reviewerID + " Removed");
		return null;
	}

	if (Recent_rating.size() > 10) {
		Recent_rating = Recent_rating.subList(0, 10);
	} else {
		// with 10 or fewer ratings found, their count replaces the scraped total
		Total_reviews = Integer.toString(Recent_rating.size());
	}
	String Recent_rating_joined = org.apache.commons.lang.StringUtils.join(
			Recent_rating, " ");
	attributes.addAll(Arrays.asList(reviewerID, Total_reviews,
			Reviewer_ranking, Total_helpful_votes, Location,
			Recent_rating_joined));
	return attributes;
}

Source File: LotusNoirDecks.java From MtgDesktopCompanion with GNU General Public License v3.0

4 votes

/**
 * Lists the decks found on the configured number of result pages.
 * <p>
 * Each {@code div.thumb_page} block yields one deck: name and URL come from
 * its first link, the author from the link inside {@code small}. The deck
 * page is then downloaded and the second word of every
 * {@code span.card_title_us} text is mapped from a basic land name to a mana
 * symbol to build the colour string.
 *
 * @return the decks in page order
 * @throws IOException if a page cannot be downloaded
 */
@Override
public List<RetrievableDeck> getDeckList() throws IOException {

	String decksUrl = getString(URL) + "?dpage=" + getString(MAX_PAGE) + "&action=" + getString(FORMAT);

	logger.debug("snif decks : " + decksUrl);

	int nbPage = getInt(MAX_PAGE);
	List<RetrievableDeck> list = new ArrayList<>();

	for (int i = 1; i <= nbPage; i++) {
		Document d = URLTools.extractHtml(getString(URL) + "?dpage=" + i + "&action=" + getString(FORMAT));

		Elements e = d.select("div.thumb_page");

		for (Element cont : e) {
			RetrievableDeck deck = new RetrievableDeck();
			Element info = cont.select("a").get(0);

			String name = info.attr("title").replace("Lien vers ", "").trim();
			String url = info.attr("href");
			String auteur = cont.select("small").select("a").text();
			Elements value = URLTools.extractHtml(url).select("span.card_title_us");
			StringBuilder deckColor = new StringBuilder();
			for (Element element : value)
			{
				// presumably "<quantity> <card name>" — verify against the site.
				// A title without a second word cannot name a land; skip it
				// rather than failing with ArrayIndexOutOfBoundsException.
				String[] words = element.text().split(" ");
				if (words.length < 2) {
					continue;
				}
				String land = words[1];
				switch (land)
				{
					case "Plain":
					case "Plains":
						deckColor.append("{W}");
						break;
					case "Island":
					case "Islands":
						deckColor.append("{U}");
						break;
					case "Swamp":
					case "Swamps":
						deckColor.append("{B}");
						break;
					case "Mountain":
					case "Mountains":
						deckColor.append("{R}");
						break;
					case "Forest":
					case "Forests":
						deckColor.append("{G}");
						break;
					default:
						break;
				}
			}
			deck.setName(name);
			try {
				deck.setUrl(new URI(url));
			} catch (URISyntaxException e1) {
				// an unparsable link leaves the deck without a URL
				deck.setUrl(null);
			}
			deck.setAuthor(auteur);
			deck.setColor(deckColor.toString());

			list.add(deck);
		}
	}
	return list;
}

Java Code Examples for org.jsoup.nodes.Element#attr()