org.jsoup.select.Elements#first

Source File: JsoupAssociationRowTableExtractor.java From wandora with GNU General Public License v3.0

6 votes

/**
 * Reads one HTML table: the {@code th} cells of the first row become role topics,
 * and every following row is passed to {@code handlePlayerRow} together with those roles.
 * A failure while handling a single row is logged and does not stop the remaining rows.
 *
 * @param table the table element to read
 * @throws Exception if a role topic cannot be looked up or created
 */
private void parseTable(Element table) throws Exception{
    
    Elements rows = table.select("tr");
    
    // Elements.first() yields null on an empty selection and subList(1, 0) would throw,
    // so a table without any rows is skipped instead of crashing.
    if(rows.isEmpty()) return;
    
    Element headerRow = rows.first();
    
    ArrayList<Topic> roles = new ArrayList<Topic>();
    
    for(Element headerCell: headerRow.select("th")){
        String roleValue = headerCell.text().trim();
        // Header cells without text do not define a role
        if(roleValue.length() == 0) continue;
        
        Topic role = getOrCreateTopic(tm, null, roleValue);
        roles.add(role);
    }
    
    // Everything after the header row is treated as a player row
    List<Element> playerRows = rows.subList(1,rows.size());
    
    for(Element playerRow: playerRows){
        try {
            handlePlayerRow(playerRow, roles);
        } catch (Exception e) {
            log(e.getMessage());
        }
    }
}

Source File: BlacklistHelper.java From hipda with GNU General Public License v2.0

6 votes

/**
 * Posts an "add to blacklist" request for {@code username} and interprets the response page.
 *
 * @param formhash value sent as the {@code formhash} form parameter
 * @param username value sent as the {@code user} form parameter
 * @return the text of the first {@code div.alert_error} element (with its links removed)
 *         when the response contains one; otherwise an empty string, after the user
 *         has also been added to the local blacklist settings
 * @throws Exception if the request fails
 */
public static String addBlacklist2(String formhash, String username) throws Exception {
    ParamsMap postParams = new ParamsMap();
    postParams.put("formhash", formhash);
    postParams.put("user", username);

    String body = OkHttpHelper.getInstance().post(HiUtils.AddBlackUrl, postParams);
    Elements alerts = Jsoup.parse(body).select("div.alert_error");

    if (alerts.isEmpty()) {
        // No error block in the response: mirror the change locally
        HiSettingsHelper.getInstance().addToBlacklist(username);
        return "";
    }

    Element alert = alerts.first();
    // Drop the links so only the message text is returned
    alert.select("a").remove();
    return alert.text();
}

Source File: ElementTest.java From astor with GNU General Public License v2.0

6 votes

/**
 * Removes every element whose sibling index is lower than that of the "XXX" paragraph
 * and checks that only the marker and the paragraphs after it remain.
 */
@Test
public void testRemoveBeforeIndex() {
    String html = "<html><body><div><p>before1</p><p>before2</p><p>XXX</p><p>after1</p><p>after2</p></div></body></html>";
    Document doc = Jsoup.parse(html, "");

    Element body = doc.select("body").first();
    Element marker = body.select("p:matchesOwn(XXX)").first();
    Elements beforeMarker = marker.parent().getElementsByIndexLessThan(marker.elementSiblingIndex());

    for (Element preceding : beforeMarker) {
        preceding.remove();
    }

    String expected = "<body><div><p>XXX</p><p>after1</p><p>after2</p></div></body>";
    assertEquals(expected, TextUtil.stripNewlines(body.outerHtml()));
}

Source File: NexusParser.java From Hentoid with Apache License 2.0

6 votes

/**
 * Collects one image URL per page by opening each reader page in turn.
 * Pages that cannot be loaded, or that contain no {@code section a img} element,
 * contribute nothing to the result; progress is advanced for every page regardless.
 *
 * @param content the content whose reader pages are visited
 * @return the {@code src} values found, in page order
 * @throws IOException declared by the overridden method
 */
@Override
protected List<String> parseImages(@NonNull Content content) throws IOException {
    List<String> imageUrls = new ArrayList<>();

    progressStart(content.getQtyPages());

    for (int pageIndex = 0; pageIndex < content.getQtyPages(); pageIndex++) {
        // The reader URL contains "001"; it is swapped for the 3-digit, 1-based page number
        String urlTemplate = content.getReaderUrl();
        String pageUrl = urlTemplate.replace("001", Helper.formatIntAsStr(pageIndex + 1, 3));

        Document page = getOnlineDocument(pageUrl);
        if (page != null) {
            Elements images = page.select("section a img");
            if (images != null && !images.isEmpty()) {
                imageUrls.add(images.first().attr("src"));
            }
        }
        progressPlus();
    }

    progressComplete();

    return imageUrls;
}

Source File: JokeBean.java From Study_Android_Demo with Apache License 2.0

6 votes

/**
 * Builds a joke entry from its HTML element.
 * Reads the text of the first "content" element, the image address from the first
 * "thumb" element (if any) and the link address from the first "contentHerf" element.
 * A missing "content" or "contentHerf" element yields an empty string instead of a
 * {@code NullPointerException}.
 *
 * @param element the element that wraps one joke
 */
public JokeBean(Element element) {
    // Content: getElementsByClass returns a collection; first() is null when it is empty
    Element tmpContent = element.getElementsByClass("content").first();
    this.content = (tmpContent != null) ? tmpContent.text() : "";

    // Image: may or may not be present
    Elements tmpThumb = element.getElementsByClass("thumb");
    if(tmpThumb !=null && tmpThumb.size()>0){
        // An image exists: take the first "thumb" element
        Element tmpImg = tmpThumb.first();
        // The src attribute of the img tag inside it is the image address
        this.img = tmpImg.select("img").attr("src");

    }

    // Link: the href of the <a> inside the first class='contentHerf' element
    Element tmpHerf = element.getElementsByClass("contentHerf").first();
    this.contentHerf = (tmpHerf != null) ? tmpHerf.select("a").attr("href") : "";

}

Source File: HentaifoundryRipper.java From ripme with MIT License

6 votes

/**
 * Loads the page that follows {@code doc}.
 * The next page is taken from the {@code li.next > a} link. The end of the listing is
 * signalled either by a {@code li.next.hidden} element or by the absence of that link.
 *
 * @param doc the current page
 * @return the next page, fetched with this ripper's referrer and cookies
 * @throws IOException with message "No more pages" when there is no next page,
 *                     or if the request itself fails
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    if (!doc.select("li.next.hidden").isEmpty()) {
        // Last page
        throw new IOException("No more pages");
    }

    // first() returns null when nothing matched; test for it explicitly rather than
    // catching a NullPointerException, which could also hide unrelated bugs.
    Element nextLink = doc.select("li.next > a").first();
    if (nextLink == null) {
        throw new IOException("No more pages");
    }

    String nextURL = "https://www.hentai-foundry.com" + nextLink.attr("href");
    return Http.url(nextURL)
            .referrer(url)
            .cookies(cookies)
            .get();
}

Source File: BakaTsukiParserAlternative.java From coolreader with MIT License

6 votes

/**
 * Turns a list item into a chapter page model, based on the first link it contains.
 * 
 * @param li list item element to inspect
 * @param parent passed through to {@code processA}
 * @param chapterOrder passed through to {@code processA}
 * @param language passed through to {@code processA}
 * @return the result of {@code processA}, or {@code null} when the item has no link
 *         or its first link points to a "User_talk:" page
 */
private static PageModel processLI(Element li, String parent, int chapterOrder, String language) {
	Elements anchors = li.select("a");
	if (anchors == null || anchors.size() == 0) {
		return null;
	}

	// TODO: need to handle multiple link in one list item
	Element firstAnchor = anchors.first();

	// skip if User_talk:
	boolean isUserTalk = firstAnchor.attr("href").contains("User_talk:");
	if (isUserTalk) {
		return null;
	}

	return processA(li.text(), parent, chapterOrder, firstAnchor, language);
}

Source File: Mf2Parser.java From indigenous-android with GNU General Public License v3.0

6 votes

/**
 * Determines the implied (possibly relative) URL of a microformat root element.
 *
 * @param elem the root element
 * @return the element's own {@code href} when it is an {@code a} or {@code area} with one;
 *         otherwise the {@code href} of its single {@code a} child (checked first) or single
 *         {@code area} child, provided that child has no root class; {@code null} if none applies
 */
private String parseImpliedUrlRelative(Element elem) {
    // if a.h-x[href] or area.h-x[href] then use that [href] for url
    boolean isLinkTag = "a".equals(elem.tagName()) || "area".equals(elem.tagName());
    if (isLinkTag && elem.hasAttr("href")) {
        return elem.attr("href");
    }

    // else if .h-x>a[href]:only-of-type:not[.h-*] then use that [href] for url
    // else if .h-x>area[href]:only-of-type:not[.h-*] then use that [href] for url
    String[] candidateTags = {"a", "area"};
    for (String candidateTag : candidateTags) {
        Elements sameTagChildren = filterByTag(elem.children(), candidateTag);
        if (sameTagChildren.size() != 1) {
            continue;
        }
        Element onlyChild = sameTagChildren.first();
        if (hasRootClass(onlyChild) || !onlyChild.hasAttr("href")) {
            continue;
        }
        return onlyChild.attr("href");
    }

    return null;
}

Source File: SelectorFetcher.java From stevia with BSD 3-Clause "New" or "Revised" License

5 votes

/**
 * Checks whether a locator matches exactly one element in the document of {@code e}
 * and describes the outcome as text. Locators starting with "//" are evaluated as XPath,
 * all others as CSS selectors. A unique locator is remembered in {@code uniqueLocators}
 * unless one is already stored for {@code e}.
 *
 * @param e element whose topmost ancestor is used as the search root
 * @param locator CSS selector or XPath expression to verify
 * @return the locator followed by a UNIQUE / NON-UNIQUE / NOT FOUND description
 * @throws Exception declared for callers; not thrown explicitly here
 */
private static String verifyLocator( Element e, String locator) throws Exception {
	Element searchRoot = e.parents().last();

	if (locator.startsWith("//")) { //xpath
		XElements xpathResult = Xsoup.select(searchRoot, locator);
		int hitCount = xpathResult.getElements().size();
		if (hitCount > 1) {
			return locator + " NON-UNIQUE!!! ";
		}
		if (hitCount == 0) {
			return locator +" NOT FOUND - PROBLEM";
		}
		if (!uniqueLocators.containsKey(e)) {
			uniqueLocators.put(e, locator);
		}
		return locator + " UNIQUE = "+ xpathResult.getElements().get(0);
	}

	// CSS selector
	Elements cssMatches = searchRoot.select(locator);
	int matchCount = cssMatches.size();
	if (matchCount == 0) {
		return locator +" NOT FOUND - PROBLEM";
	}
	if (matchCount > 1) {
		return locator + " NON-UNIQUE = "+cssMatches;
	}
	if (!uniqueLocators.containsKey(e)) {
		uniqueLocators.put(e, locator);
	}
	return locator + " UNIQUE = "+cssMatches.first();
}

Source File: EHentaiParser.java From Hentoid with Apache License 2.0

5 votes

/**
 * Returns the {@code src} of the first {@code img#img} element in the page,
 * or an empty string when the page has no such element.
 */
private String getDisplayedImageUrl(@Nonnull Document doc) {
    // first() is null when the selection is empty
    Element displayedImage = doc.select("img#img").first();
    return (displayedImage == null) ? "" : displayedImage.attr("src");
}

Source File: AbstractSpiderServer.java From Doctor with Apache License 2.0

5 votes

/**
     * Scrapes a detail page (symptoms, complications and similar sections that contain
     * linked terms) and returns its full text plus the linked terms.
     * The text is taken from the first {@code div.spider} element, or from the first
     * {@code div.jb-body} element when there is no {@code div.spider}. Every linked term
     * and its href is also appended to the local word_link.txt file.
     *
     * @param href link of the page; when {@code word} is given only the part from the
     *             last "/" onwards is used
     * @param word optional path segment inserted between the index URL and the last
     *             segment of {@code href}; may be {@code null}
     * @return a map holding the page text under {@code ALL} and, when links exist, the
     *         list of link texts under {@code WORD}; an empty map when the page has
     *         neither container
     * @throws Exception if the page cannot be fetched or the word file cannot be written
     */
    protected Map<String, Object> getBrief(String href, String word) throws Exception {
        String url = (word == null) ? (index + href) : (index + word + href.substring(href.lastIndexOf("/")));
        // Detail page
        Document document = SpiderUtil.getDocument(url);
        Elements select = document.select("div.spider");
        if (select.isEmpty()) {
            // Fall back to the alternative container
            select = document.select("div.jb-body");
        }
        Map<String, Object> map = new HashMap<>();
        Element first = select.first();
        if (first == null) {
            // Neither container exists: report it and return no data instead of
            // dereferencing null below.
            logger.error("异常：详情页无详情 "+url);
            return map;
        }
        // Full description text
        map.put(ALL, first.text());
        // Check whether the description contains linked terms
        Elements elements = first.getElementsByTag("a");
        if (elements.isEmpty()) {
            // A description without links is a normal case
            return map;
        }
        // Collect every linked term
        List<String> symptomList = new ArrayList<>();
        for (Element element1 : elements) {
            symptomList.add(element1.text());
            // Link of the newly found term
            String href1 = element1.attr("href");
            // Save the new term and its link to the local txt file
            TexUtil.write(element1.text()+"\r\n"+href1+"\r\n",ProjectPath.getRootPath("/word_link.txt"));
        }
        map.put(WORD, symptomList);
        return map;
    }

Source File: JsoupProcessor.java From AcgClub with MIT License

5 votes

/**
 * Looks up the first element inside {@code container} that matches {@code query}.
 *
 * @return the first match, or {@code null} when nothing matches
 */
private static Element element(Element container, String query) {
  Elements matches = container.select(query);
  return matches.isEmpty() ? null : matches.first();
}

Source File: MyJsoup.java From frameworkAggregate with Apache License 2.0

5 votes

/**
 * Downloads the encyclopedia index page and extracts the flower categories listed in
 * its first "catelist" element. For each child node that has children of its own, a
 * category is built from the nested {@code a} node (url and image path) and
 * {@code h2} node (name).
 *
 * @return the categories found; empty when the page cannot be fetched or contains no
 *         "catelist" element
 */
private static List<FlowerCategory> getCategoryList() {

		List<FlowerCategory> categories = new ArrayList<FlowerCategory>();

		try {
			Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
			Elements catelist = doc.getElementsByClass("catelist");
			Element cates = catelist.first();
			if (cates == null) {
				// first() is null when the page has no "catelist" element
				return categories;
			}
			List<Node> childNodes = cates.childNodes();
			for (int i = 0; i < childNodes.size(); i++) {
				Node node = childNodes.get(i);
				List<Node> childs = node.childNodes();
				if (childs != null && childs.size() > 0) {
					FlowerCategory category = new FlowerCategory();
					for (int j = 0; j < childs.size(); j++) {
						Node child = childs.get(j);
						if ("a".equals(child.nodeName())) {
							category.setUrl(child.attr("href"));
							// The image path is read from the link's second child node;
							// skip it when the link has fewer than two children.
							if (child.childNodeSize() > 1) {
								category.setImgPath(child.childNode(1).attr("src"));
							}
						} else if ("h2".equals(child.nodeName())) {
							category.setName(child.attr("title"));
						}
					}
					categories.add(category);
				}
			}
		} catch (IOException e) {
			// Best effort: a failed download yields whatever was collected so far
			e.printStackTrace();
		}

		return categories;
	}

Source File: TestJsoup.java From frameworkAggregate with Apache License 2.0

5 votes

/**
 * Downloads the encyclopedia index page and extracts the flower categories listed in
 * its first "catelist" element. For each child node that has children of its own, a
 * category is built from the nested {@code a} node (url and image path) and
 * {@code h2} node (name).
 *
 * @return the categories found; empty when the page cannot be fetched or contains no
 *         "catelist" element
 */
private static List<FlowerCategory> getCategoryList() {

		List<FlowerCategory> categories = new ArrayList<FlowerCategory>();

		try {
			Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
			Elements catelist = doc.getElementsByClass("catelist");
			Element cates = catelist.first();
			if (cates == null) {
				// first() is null when the page has no "catelist" element
				return categories;
			}
			List<Node> childNodes = cates.childNodes();
			for (int i = 0; i < childNodes.size(); i++) {
				Node node = childNodes.get(i);
				List<Node> childs = node.childNodes();
				if (childs != null && childs.size() > 0) {
					FlowerCategory category = new FlowerCategory();
					for (int j = 0; j < childs.size(); j++) {
						Node child = childs.get(j);
						if ("a".equals(child.nodeName())) {
							category.setUrl(child.attr("href"));
							// The image path is read from the link's second child node;
							// skip it when the link has fewer than two children.
							if (child.childNodeSize() > 1) {
								category.setImgPath(child.childNode(1).attr("src"));
							}
						} else if ("h2".equals(child.nodeName())) {
							category.setName(child.attr("title"));
						}
					}
					categories.add(category);
				}
			}
		} catch (IOException e) {
			// Best effort: a failed download yields whatever was collected so far
			e.printStackTrace();
		}

		return categories;
	}

Source File: BaseTask.java From guanggoo-android with Apache License 2.0

5 votes

/**
 * Checks whether the page contains a user card and, if it does, stores the user name
 * and avatar address found in it with the {@code AuthInfoManager}.
 *
 * @param doc the page to inspect
 * @return {@code true} when a {@code div.usercard} element exists, {@code false} otherwise
 */
protected boolean checkAuth(Document doc) {
    Elements usercards = doc.select("div.usercard");
    if (usercards.isEmpty()) {
        return false;
    }

    Element card = usercards.first();
    AuthInfoManager.getInstance().setUsername(card.select("div.username").first().text());
    AuthInfoManager.getInstance().setAvatar(card.select("img.avatar").first().attr("src"));
    return true;
}

Source File: JsoupParserIntegrationTest.java From tutorials with MIT License

5 votes

/**
 * Demonstrates the traversal helpers available on a selection and on a single element:
 * first/last/indexed access, parents, children and siblings.
 */
@Test
public void examplesTraversing() {
    Elements sections = doc.select("section");

    Element firstSection = sections.first();
    Element lastSection = sections.last();
    // Indexing is zero-based, so the second section is at index 1
    Element secondSection = sections.get(1);
    Elements allParents = firstSection.parents();
    Element parent = firstSection.parent();
    Elements children = firstSection.children();
    Elements siblings = firstSection.siblingElements();

    sections.forEach(el -> System.out.println("section: " + el));
}

Source File: ModifySamlResponseStepBuilder.java From keycloak with Apache License 2.0

4 votes

/**
 * Handles a SAML POST-binding response page: extracts the single SAMLResponse or
 * SAMLRequest form field, decodes and transforms the SAML document, and builds the
 * follow-up request that submits the transformed document.
 *
 * @param currentResponse the HTTP response holding the auto-submit HTML form
 * @return the request posting the transformed document, or {@code null} when the
 *         transformer returns {@code null}
 * @throws Exception if the page cannot be read or the document cannot be decoded or transformed
 */
private HttpUriRequest handlePostBinding(CloseableHttpResponse currentResponse) throws Exception {
    assertThat(currentResponse, statusCodeIsHC(Status.OK));

    final String htmlBody = EntityUtils.toString(currentResponse.getEntity());
    assertThat(htmlBody, Matchers.containsString("SAML"));
    org.jsoup.nodes.Document theResponsePage = Jsoup.parse(htmlBody);
    Elements samlResponses = theResponsePage.select("input[name=SAMLResponse]");
    Elements samlRequests = theResponsePage.select("input[name=SAMLRequest]");
    Elements forms = theResponsePage.select("form");
    Elements relayStates = theResponsePage.select("input[name=RelayState]");
    int size = samlResponses.size() + samlRequests.size();
    assertThat("Checking uniqueness of SAMLResponse/SAMLRequest input field in the page", size, is(1));
    assertThat("Checking uniqueness of forms in the page", forms, hasSize(1));

    // Exactly one of the two selections is non-empty (asserted above)
    Element respElement = samlResponses.isEmpty() ? samlRequests.first() : samlResponses.first();
    Element form = forms.first();

    String base64EncodedSamlDoc = respElement.val();
    String samlDoc;
    // try-with-resources closes the stream even when reading it fails
    try (InputStream decoded = PostBindingUtil.base64DecodeAsStream(base64EncodedSamlDoc)) {
        samlDoc = IOUtils.toString(decoded, GeneralConstants.SAML_CHARSET);
    }

    String transformed = getTransformer().transform(samlDoc);
    if (transformed == null) {
        return null;
    }

    // An explicitly configured target attribute wins over the name of the form field
    final String attributeName = this.targetAttribute != null
      ? this.targetAttribute
      : respElement.attr("name");
    List<NameValuePair> parameters = new LinkedList<>();

    if (! relayStates.isEmpty()) {
        parameters.add(new BasicNameValuePair(GeneralConstants.RELAY_STATE, relayStates.first().val()));
    }
    // An explicitly configured target URI wins over the form's action
    URI locationUri = this.targetUri != null
      ? this.targetUri
      : URI.create(form.attr("action"));

    return createRequest(locationUri, attributeName, transformed, parameters);
}

Source File: EHentaiParser.java From Hentoid with Apache License 2.0

4 votes

/**
 * Builds the list of image files of a gallery.
 * <ol>
 * <li>Detects the number of gallery pages from the {@code table.ptt} pager</li>
 * <li>Browses every gallery page and collects the URL of each picture page</li>
 * <li>Opens every picture page and records the displayed image URL, plus a backup URL
 * derived from the "Click here if the image fails loading" link when one is present</li>
 * </ol>
 * This object is registered on the default event bus for the duration of the call.
 *
 * @param content the gallery to parse
 * @return the cover followed by one entry per picture page; an empty list when the
 *         gallery page has no pager links
 * @throws Exception if the bandwidth limit image is detected, if the process has been
 *                   halted, or if a page cannot be processed
 */
public List<ImageFile> parseImageList(@NonNull Content content) throws Exception {
    EventBus.getDefault().register(this);

    try {
        List<ImageFile> result = new ArrayList<>();
        boolean useHentoidAgent = Site.EHENTAI.canKnowHentoidAgent();
        Map<String, String> downloadParams = new HashMap<>();
        int order = 1;

        // 1- Detect the number of pages of the gallery
        List<Pair<String, String>> headers = new ArrayList<>();
        headers.add(new Pair<>(HttpHelper.HEADER_COOKIE_KEY, "nw=1")); // nw=1 (always) avoids the Offensive Content popup (equivalent to clicking the "Never warn me again" link)
        Document doc = getOnlineDocument(content.getGalleryUrl(), headers, useHentoidAgent);
        if (doc != null) {
            Elements elements = doc.select("table.ptt a");
            if (null == elements || elements.isEmpty()) return result;

            // With a single pager link it is used as-is; otherwise the second-to-last link is read
            int tabId = (1 == elements.size()) ? 0 : elements.size() - 2;
            int nbGalleryPages = Integer.parseInt(elements.get(tabId).text());

            progress.start(nbGalleryPages + content.getQtyPages());

            // 2- Browse the gallery and fetch the URL for every page (since all of them have a different temporary key...)
            List<String> pageUrls = new ArrayList<>();

            fetchPageUrls(doc, pageUrls);

            if (nbGalleryPages > 1) {
                for (int i = 1; i < nbGalleryPages && !processHalted; i++) {
                    doc = getOnlineDocument(content.getGalleryUrl() + "/?p=" + i, headers, useHentoidAgent);
                    if (doc != null) fetchPageUrls(doc, pageUrls);
                    progress.advance();
                }
            }

            // 3- Open all pages and
            //    - grab the URL of the displayed image
            //    - grab the alternate URL of the "Click here if the image fails loading" link
            result.add(ImageFile.newCover(content.getCoverImageUrl(), StatusContent.SAVED));
            ImageFile img;
            for (String pageUrl : pageUrls) {
                if (processHalted) break;
                doc = getOnlineDocument(pageUrl, headers, useHentoidAgent);
                if (doc != null) {
                    // Displayed image
                    String imageUrl = getDisplayedImageUrl(doc).toLowerCase();
                    if (!imageUrl.isEmpty()) {
                        // If we have the 509.gif picture, it means the bandwidth limit for e-h has been reached
                        if (imageUrl.contains("/509.gif"))
                            throw new LimitReachedException("E-hentai download points regenerate over time or can be bought on e-hentai if you're in a hurry");
                        img = ParseHelper.urlToImageFile(imageUrl, order++, pageUrls.size(), StatusContent.SAVED);
                        result.add(img);

                        // "Click here if the image fails loading" link
                        elements = doc.select("#loadfail");
                        if (!elements.isEmpty()) {
                            String arg = elements.first().attr("onclick");
                            // Get the argument between 's
                            int quoteBegin = arg.indexOf('\'');
                            int quoteEnd = arg.indexOf('\'', quoteBegin + 1);
                            // Without a properly quoted argument there is no backup URL to build;
                            // substring would throw on such input
                            if (quoteBegin > -1 && quoteEnd > quoteBegin) {
                                arg = arg.substring(quoteBegin + 1, quoteEnd);
                                // Get the query URL
                                String backupUrl = pageUrl + (pageUrl.contains("?") ? "&" : "?") + "nl=" + arg;
                                // Get the final URL
                                if (URLUtil.isValidUrl(backupUrl)) {
                                    downloadParams.put("backupUrl", backupUrl);
                                    String downloadParamsStr = JsonHelper.serializeToJson(downloadParams, JsonHelper.MAP_STRINGS);
                                    img.setDownloadParams(downloadParamsStr);
                                }
                            }
                        }
                    }
                }
                progress.advance();
            }
        }
        progress.complete();

        // If the process has been halted manually, the result is incomplete and should not be returned as is
        if (processHalted) throw new PreparationInterruptedException();
        return result;
    } finally {
        EventBus.getDefault().unregister(this);
    }
}

Source File: Mf2Parser.java From indigenous-android with GNU General Public License v3.0

4 votes

/**
 * Determines the implied name of a microformat root element.
 * Candidates are tried in this order: the element's own {@code alt} (img/area) or
 * {@code title} (abbr); the same attributes on its only child; the same attributes on
 * the only child of that only child; and finally the element's trimmed text content.
 *
 * @param elem the root element
 * @return the implied name
 */
private String parseImpliedName(Element elem) {
    // if img.h-x[alt] or area.h-x[alt] then use that alt for name;
    // the alt check applies to both tags
    if (("img".equals(elem.tagName()) || "area".equals(elem.tagName())) && elem.hasAttr("alt")) {
        return elem.attr("alt");
    }
    if ("abbr".equals(elem.tagName()) && elem.hasAttr("title")) {
        return elem.attr("title");
    }

    Elements children = elem.children();
    if (children.size() == 1) {
        Element child = children.first();
        // else if .h-x>img:only-child[alt]:not[.h-*] then use that img alt for name
        // else if .h-x>area:only-child[alt]:not[.h-*] then use that area alt for name
        if (!hasRootClass(child)
                && ("img".equals(child.tagName()) || "area".equals(child.tagName()))
                && child.hasAttr("alt")) {
            return child.attr("alt");
        }
        // else if .h-x>abbr:only-child[title] then use that abbr title for name
        if ("abbr".equals(child.tagName()) && child.hasAttr("title")) {
            return child.attr("title");
        }

        Elements grandChildren = child.children();
        if (grandChildren.size() == 1) {
            Element grandChild = grandChildren.first();
            // else if .h-x>:only-child>img:only-child[alt]:not[.h-*] then use that img alt for name
            // else if .h-x>:only-child>area:only-child[alt]:not[.h-*] then use that area alt for name
            if (!hasRootClass(grandChild)
                    && ("img".equals(grandChild.tagName()) || "area".equals(grandChild.tagName()))
                    && grandChild.hasAttr("alt")) {
                return grandChild.attr("alt");
            }
            // else if .h-x>:only-child>abbr:only-child[title] use that abbr title for name
            if ("abbr".equals(grandChild.tagName()) && grandChild.hasAttr("title")) {
                return grandChild.attr("title");
            }
        }
    }

    // else use the textContent of the .h-x for name
    // drop leading & trailing white-space from name
    return elem.text().trim();
}

Source File: HiParser.java From hipda with GNU General Public License v2.0

4 votes

/**
 * Parses a search result page into a list bean.
 * The search id and the highest page number are read from the pager. Each
 * {@code tbody} that has both a subject link and an author link becomes one item.
 *
 * @param doc the search result page; may be {@code null}
 * @return the parsed list, or {@code null} when {@code doc} is {@code null}
 */
private static SimpleListBean parseSearch(Document doc) {
    if (doc == null) {
        return null;
    }

    SimpleListBean results = new SimpleListBean();

    // if this is the last page, page number is in <strong>
    Elements pagerEntries = doc.select("div.pages_btns div.pages a");
    pagerEntries.addAll(doc.select("div.pages_btns div.pages strong"));

    int highestPage = 1;
    if (!pagerEntries.isEmpty()) {
        String firstEntryHref = pagerEntries.first().attr("href");
        results.setSearchId(Utils.getMiddleString(firstEntryHref, "searchid=", "&"));
        for (Element pagerEntry : pagerEntries) {
            int pageNumber = Utils.getIntFromString(pagerEntry.text());
            if (pageNumber > highestPage) {
                highestPage = pageNumber;
            }
        }
    }
    results.setMaxPage(highestPage);

    for (Element tbody : doc.select("tbody")) {
        SimpleListItemBean entry = new SimpleListItemBean();

        // Rows without a subject link are not search hits
        Element subjectLink = tbody.select("tr th.subject a").first();
        if (subjectLink == null) {
            continue;
        }
        entry.setTid(Utils.getMiddleString(subjectLink.attr("href"), "tid=", "&"));
        entry.setTitle(subjectLink.text());

        // Rows without an author link are skipped as well
        Element authorLink = tbody.select("tr td.author cite a").first();
        if (authorLink == null) {
            continue;
        }
        entry.setAuthor(authorLink.text());

        String authorSpaceUrl = authorLink.attr("href");
        if (!TextUtils.isEmpty(authorSpaceUrl)) {
            String authorUid = Utils.getMiddleString(authorSpaceUrl, "uid=", "&");
            entry.setAvatarUrl(HiUtils.getAvatarUrlByUid(authorUid));
        }

        Element postTime = tbody.select("tr td.author em").first();
        if (postTime != null) {
            entry.setTime(postTime.text());
        }

        Element forumCell = tbody.select("tr td.forum").first();
        if (forumCell != null) {
            entry.setForum(forumCell.text());
        }

        results.add(entry);
    }

    return results;
}

Java Code Examples for org.jsoup.select.Elements#first()