org.jsoup.nodes.Element#toString

Source File: JsoupUtil.java From xxl-crawler with GNU General Public License v3.0

6 votes

/**
 * Extracts a single piece of data from an element according to the requested selection type.
 *
 * @param fieldElement the element to read from
 * @param selectType   which aspect of the element to extract
 * @param selectVal    attribute name for {@code ATTR}, class name for {@code HAS_CLASS}; not read otherwise
 * @return the result of {@code html()}, {@code val()}, {@code text()}, {@code attr(selectVal)},
 *         the string form of {@code hasClass(selectVal)}, or {@code toString()} of the element
 *         when the selection type matches none of the handled constants (including {@code null})
 */
public static String parseElement(Element fieldElement, XxlCrawlerConf.SelectType selectType, String selectVal) {
    if (selectType == XxlCrawlerConf.SelectType.HTML) {
        return fieldElement.html();
    }
    if (selectType == XxlCrawlerConf.SelectType.VAL) {
        return fieldElement.val();
    }
    if (selectType == XxlCrawlerConf.SelectType.TEXT) {
        return fieldElement.text();
    }
    if (selectType == XxlCrawlerConf.SelectType.ATTR) {
        return fieldElement.attr(selectVal);
    }
    if (selectType == XxlCrawlerConf.SelectType.HAS_CLASS) {
        return String.valueOf(fieldElement.hasClass(selectVal));
    }
    // Any other (or null) selection type falls back to the element's string form.
    return fieldElement.toString();
}

Source File: HbrowseParser.java From Hentoid with Apache License 2.0

6 votes

/**
 * Builds the image URLs of a chapter from the first script that declares a {@code list = [...]} array.
 *
 * <p>Each non-blank entry of the array other than {@code zzz} (case-insensitive) is appended to
 * {@code <site url>data/<unique site id>/<chapter>/}. The chapter is the second {@code /}-separated
 * segment of the content URL, or empty when the URL has no such segment.
 *
 * @param content content whose URL and unique site id are used to build the image URLs;
 *                {@code populateUniqueSiteId()} is invoked on it
 * @param scripts candidate script elements, scanned in order
 * @return the image URLs, or an empty list when no script holds a complete {@code list = [ ... ];} declaration
 */
public static List<String> parseImages(@NonNull Content content, @NonNull List<Element> scripts) {
    content.populateUniqueSiteId();
    List<String> result = new ArrayList<>();

    String chapter = "";
    String[] parts = content.getUrl().split("/");
    if (parts.length > 1) chapter = parts[1];

    final String listStart = "list = [";
    for (Element e : scripts) {
        String scriptContent = e.toString();

        // Require the full declaration marker: a script that merely mentions "list"
        // would otherwise yield a bogus start offset or a failing substring call.
        int markerIndex = scriptContent.indexOf(listStart);
        if (markerIndex < 0) continue;

        int beginIndex = markerIndex + listStart.length();
        int endIndex = scriptContent.indexOf("];", beginIndex);
        if (endIndex < 0) continue; // unterminated array: try the next script

        String urlPrefix = Site.HBROWSE.getUrl() + "data/" + content.getUniqueSiteId() + "/" + chapter + "/";
        String[] list = scriptContent.substring(beginIndex, endIndex).replace("\"", "").split(",");
        for (String s : list) {
            // Trim before comparing and appending so "a.jpg, zzz" neither lets the padded
            // placeholder through nor produces URLs with embedded whitespace.
            String fileName = s.trim();
            if (!fileName.isEmpty() && !fileName.equalsIgnoreCase("zzz")) {
                result.add(urlPrefix + fileName);
            }
        }
        break;
    }

    return result;
}

Source File: Parse99Mm.java From v9porn with MIT License

5 votes

/**
 * Extracts the list of image URLs from a picture page.
 *
 * <p>The host is taken from the {@code src} of the first {@code img} inside {@code #picbox}
 * (falling back to a fixed host when that URL cannot be parsed). The image ids come from the
 * comma-separated data found in the first {@code script} of the body: the entry at index 1 is
 * used as the path prefix and the entries from index 6 onwards are the image ids.
 *
 * <p>NOTE(review): a page without {@code #picbox}, without an {@code img} inside it, or without a
 * {@code script} in the body still fails with a {@link NullPointerException}, as before — confirm
 * that callers handle this.
 *
 * @param html raw HTML of the page
 * @return the image URLs; empty when the script data holds no image ids
 */
public static List<String> parse99MmImageList(String html) {

    Document doc = Jsoup.parse(html);

    Element elementBox = doc.getElementById("picbox");
    String imgUrl = elementBox.selectFirst("img").attr("src").trim();
    HttpUrl httpUrl = HttpUrl.parse(imgUrl);
    Element element = doc.body().select("script").first();
    String javaScript = element.toString();
    String data = StringUtils.subString(javaScript, javaScript.indexOf("[") + 1, javaScript.lastIndexOf(";") - 1);
    String[] dataArray = data.replace("\"", "").split(",");

    List<String> stringImageList = new ArrayList<>();

    // The first six entries are not image ids; with fewer entries there is nothing to build
    // (and allocating an array of negative size would throw).
    int imgIdArrayLength = dataArray.length - 6;
    if (imgIdArrayLength < 0) {
        Logger.t(TAG).d(dataArray);
        return stringImageList;
    }

    String[] imgIdArray = new String[imgIdArrayLength];
    System.arraycopy(dataArray, 6, imgIdArray, 0, imgIdArrayLength);
    Logger.t(TAG).d(dataArray);
    Logger.t(TAG).d(imgIdArray);

    // No trailing slash on the fallback: the separator is appended when the URL is assembled,
    // so a trailing slash here would produce "//" in every generated URL.
    String host;
    if (httpUrl == null) {
        host = "http://fj.kanmengmei.com";
    } else {
        host = httpUrl.scheme() + "://" + httpUrl.host();
    }

    for (int i = 0; i < imgIdArrayLength; i++) {
        String tmpImgUrl = host + "/" + dataArray[1] + (i + 1) + "-" + imgIdArray[i] + ".jpg";
        Logger.t(TAG).d(tmpImgUrl);
        stringImageList.add(tmpImgUrl);
    }
    return stringImageList;

}

Source File: Parse99Mm.java From v9porn with MIT License

5 votes

/**
 * Extracts the list of image URLs from a picture page.
 *
 * <p>The host is taken from the {@code src} of the first {@code img} inside {@code #picbox}
 * (falling back to a fixed host when that URL cannot be parsed). The image ids come from the
 * comma-separated data found in the first {@code script} of the body: the entry at index 1 is
 * used as the path prefix and the entries from index 6 onwards are the image ids.
 *
 * <p>NOTE(review): a page without {@code #picbox}, without an {@code img} inside it, or without a
 * {@code script} in the body still fails with a {@link NullPointerException}, as before — confirm
 * that callers handle this.
 *
 * @param html raw HTML of the page
 * @return the image URLs; empty when the script data holds no image ids
 */
public static List<String> parse99MmImageList(String html) {

    Document doc = Jsoup.parse(html);

    Element elementBox = doc.getElementById("picbox");
    String imgUrl = elementBox.selectFirst("img").attr("src").trim();
    HttpUrl httpUrl = HttpUrl.parse(imgUrl);
    Element element = doc.body().select("script").first();
    String javaScript = element.toString();
    String data = StringUtils.subString(javaScript, javaScript.indexOf("[") + 1, javaScript.lastIndexOf(";") - 1);
    String[] dataArray = data.replace("\"", "").split(",");

    List<String> stringImageList = new ArrayList<>();

    // The first six entries are not image ids; with fewer entries there is nothing to build
    // (and allocating an array of negative size would throw).
    int imgIdArrayLength = dataArray.length - 6;
    if (imgIdArrayLength < 0) {
        Logger.t(TAG).d(dataArray);
        return stringImageList;
    }

    String[] imgIdArray = new String[imgIdArrayLength];
    System.arraycopy(dataArray, 6, imgIdArray, 0, imgIdArrayLength);
    Logger.t(TAG).d(dataArray);
    Logger.t(TAG).d(imgIdArray);

    // No trailing slash on the fallback: the separator is appended when the URL is assembled,
    // so a trailing slash here would produce "//" in every generated URL.
    String host;
    if (httpUrl == null) {
        host = "http://fj.kanmengmei.com";
    } else {
        host = httpUrl.scheme() + "://" + httpUrl.host();
    }

    for (int i = 0; i < imgIdArrayLength; i++) {
        String tmpImgUrl = host + "/" + dataArray[1] + (i + 1) + "-" + imgIdArray[i] + ".jpg";
        Logger.t(TAG).d(tmpImgUrl);
        stringImageList.add(tmpImgUrl);
    }
    return stringImageList;

}

Source File: VideoUrlParser.java From v9porn with MIT License

5 votes

/**
 * Parses the video id and the (still encoded) video URL expression out of a video page.
 *
 * <p>The video id is cut from the {@code poster} attribute of the {@code #player_one} element:
 * the text starting six characters after the first occurrence of {@code "thumb"} and ending before
 * the last {@code '.'}. The video URL is the text of the first {@code script} inside the player,
 * starting at {@code "strencode"} and ending before the next {@code ");"}.
 * The remaining fields are filled in by {@code parserOtherInfo}.
 *
 * <p>NOTE(review): a page without {@code #player_one} or without a script inside it fails with a
 * {@link NullPointerException}, and missing markers fail with a
 * {@link StringIndexOutOfBoundsException}, as before — confirm callers handle this.
 *
 * @param html raw HTML of the video page
 * @param user passed through to {@code parserOtherInfo}
 * @return the populated result
 */
@Override
public VideoResult parseVideoPlayUrl(String html, User user) {
    VideoResult videoResult = new VideoResult();
    Document document = Jsoup.parse(html);
    Element element = document.getElementById("player_one");

    String imgUrl = element.attr("poster");
    String videoId = imgUrl.substring(imgUrl.indexOf("thumb") + 6, imgUrl.lastIndexOf("."));
    videoResult.setVideoId(videoId);
    Logger.t(TAG).d("视频Id：" + videoId);

    Element jsElement = element.select("script").first();
    String jsTagString = jsElement.toString();
    // Look for the closing ");" only after the call starts, so an earlier ");" in the
    // same script cannot produce an end offset that lies before the start offset.
    int callStart = jsTagString.indexOf("strencode");
    int callEnd = jsTagString.indexOf(");", callStart);
    String jsScriptVideoUrl = jsTagString.substring(callStart, callEnd);

    videoResult.setVideoUrl(jsScriptVideoUrl);
    parserOtherInfo(document, videoResult, user);
    return videoResult;
}

Source File: HtmlUtil.java From V2EX with GNU General Public License v3.0

5 votes

/**
 * Builds a topic, including its replies, from the HTML of a topic page.
 *
 * <p>Most fields are read from the first {@code #Main > .box} element, either through CSS
 * selectors or by applying patterns to its markup; creation time, id and node come from the
 * page's {@code meta} tags. Last-touched time and reply count are only set when the
 * {@code #Main > .box > .cell > span} element exists.
 *
 * @param html raw HTML of the topic page
 * @return the populated topic
 */
public static Topic getTopicAndReplies(String html){

    Topic topic = new Topic();
    Document page = Jsoup.parse(html);

    Element headerBox = page.selectFirst("#Main > .box");
    String headerMarkup = headerBox.toString();
    Element contentBox = headerBox.selectFirst(".topic_content");
    Element subtleBox = headerBox.selectFirst(".subtle");

    // Creation time: the 'T' and 'Z' of the published timestamp are replaced by spaces before parsing.
    String publishedRaw = page.selectFirst("meta[property=article:published_time]").attr("content");
    String publishedTime = publishedRaw.replaceAll("[TZ]", " ");
    topic.setCreated(TimeUtil.strToTimestamp(publishedTime,null));

    // Id: first run of two or more digits in the og:url meta content.
    String topicUrl = page.selectFirst("meta[property=og:url]").attr("content");
    topic.setId(matcherGroup1Int(Pattern.compile("(\\d{2,})"), topicUrl));

    topic.setTitle(headerBox.selectFirst(".header > h1").text());
    topic.setClicks(matcherGroup1Int(PATTERN_TOPIC_CLICK, headerMarkup));

    String smallMarkup = headerBox.selectFirst(".header > small").toString();
    topic.setAgo(matcherGroup1(Pattern.compile("· ([^·]+) ·"), smallMarkup));

    topic.setFavors(matcherGroup1Int(PATTERN_TOPIC_FAVORS, headerMarkup));

    // Rendered content: body (or a line break when absent) followed by the appended notes, if any.
    String bodyMarkup = contentBox == null ? "<br>" : contentBox.toString();
    String subtleMarkup = subtleBox == null ? " " : subtleBox.toString();
    topic.setContent_rendered("\n" + bodyMarkup + subtleMarkup + "\n\t---");

    String authorName = matcherGroup1(PATTERN_TOPIC_USERNAME, headerMarkup);
    String authorAvatar = matcherGroup1(PATTERN_TOPIC_USER_AVATAR, headerMarkup);
    topic.setMember(new Member(authorName, authorAvatar));

    String nodeTag = page.selectFirst("meta[property=article:tag]").attr("content");
    String nodeSection = page.selectFirst("meta[property=article:section]").attr("content");
    topic.setNode(new Node(nodeTag, nodeSection));

    Element middleEle = page.selectFirst("#Main > .box > .cell > span");
    if (middleEle != null){
        String middleMarkup = middleEle.toString();
        // Captures the text that follows the literal "直到" ("until") up to the first '+'.
        String lastTouched = matcherGroup1(Pattern.compile("直到 ([^+]+)"), middleMarkup);
        topic.setLast_touched(lastTouched.isEmpty() ? 0 : TimeUtil.strToTimestamp(lastTouched,null));
        topic.setReplies(matcherGroup1Int(PATTERN_TOPIC_REPLY_COUNT, middleMarkup));
    }

    topic.setReplyList(getReplies(page, topic.getMember().getUsername()));
    return topic;
}

Source File: HtmlUtil.java From V2EX with GNU General Public License v3.0

5 votes

/**
 * Reads every reply cell ({@code #Main > .box > .cell[id]}) of a topic page.
 *
 * <p>Images inside a reply get {@code width="100%"} and {@code height="auto"} before the reply's
 * HTML is captured. Floors are numbered from 0 in document order.
 *
 * @param document parsed topic page
 * @param poster   username of the topic author; when non-null each reply is flagged with whether
 *                 its author equals this name
 * @return the replies in document order
 * @throws V2exException when a reply cell has no {@code .reply_content} element
 */
private static List<Reply> getReplies(Document document, String poster){

    Elements cells = document.select("#Main > .box > .cell[id]");
    List<Reply> replies = new ArrayList<>(cells.size());

    int floor = 0;
    for (Element cellElement : cells) {
        Reply reply = new Reply();

        Element content = cellElement.selectFirst(".reply_content");
        if (content == null) {
            throw new V2exException("This post seems to have been blocked\nEmpty reply content");
        }
        for (Element img : content.select("img")) {
            img.attr("width","100%");
            img.attr("height","auto");
        }
        reply.setContent(content.html());

        // Serialised after the image attributes were rewritten, as the patterns run on this markup.
        String cellMarkup = cellElement.toString();
        String username = matcherGroup1(PATTERN_REPLY_USERNAME, cellMarkup);
        String avatarNormal = matcherGroup1(PATTERN_REPLY_AVATAR, cellMarkup);

        reply.setId(matcherGroup1Int(PATTERN_REPLY_ID, cellMarkup));
        reply.setMember(new Member(username, avatarNormal));
        if (poster != null) {
            reply.setPoster(username.equals(poster));
        }
        reply.setAgo(matcherGroup1(PATTERN_REPLY_AGO, cellMarkup));
        reply.setVia(matcherGroup1(PATTERN_REPLY_VIA, cellMarkup));
        reply.setLike(matcherGroup1Int(PATTERN_REPLY_LIKE, cellMarkup));
        reply.setFloor(floor);

        replies.add(reply);
        floor++;
    }
    return replies;
}

Source File: Handian.java From ankihelper with GNU General Public License v3.0

5 votes

/**
 * Looks up a word by fetching {@code wordUrl + key} and wrapping the first matching
 * {@code div.cdnr} / {@code div.tagContent} element as a single definition.
 *
 * <p>In the element's markup, root-relative image sources are rewritten to point at
 * {@code http://www.zdic.net/} and {@code &amp;} is unescaped to {@code &}.
 *
 * @param key the word to look up
 * @return a list holding at most one definition; empty when nothing matched or the request
 *         failed with an {@link IOException} (which is logged)
 */
public List<Definition> wordLookup(String key) {
    try {
        Request request = new Request.Builder()
                .url(wordUrl + key)
                .header("User-Agent", Constant.UA)
                .build();
        String rawhtml = MyApplication.getOkHttpClient().newCall(request).execute().body().string();

        Elements entrys = Jsoup.parse(rawhtml).select("div.cdnr, div.tagContent");
        ArrayList<Definition> defList = new ArrayList<>();
        if (entrys.isEmpty()) {
            return defList;
        }

        // Only the first matching element is used.
        String meaning = entrys.get(0).toString()
                .replace("<img src=\"/", "<img src=\"http://www.zdic.net/")
                .replace("&amp;", "&");

        HashMap<String, String> defMap = new HashMap<>();
        defMap.put(EXP_ELE[0], key);
        defMap.put(EXP_ELE[1], meaning);
        defList.add(new Definition(defMap, meaning));
        return defList;
    } catch (IOException ioe) {
        Log.d("time out", Log.getStackTraceString(ioe));
        return new ArrayList<Definition>();
    }

}

Source File: WendaDetailPresenter.java From Toutiao with Apache License 2.0

5 votes

/**
 * Wraps the first {@code con-words} element of the response in a standalone HTML page that links
 * the light or dark stylesheet from the app assets, depending on the night-mode setting.
 *
 * @param response raw HTML returned by the server
 * @return the complete HTML page, or {@code null} when the response has no {@code con-words} element
 */
private String getHTML(String response) {
    // Jsoup.parse(String, String) takes a base URI as its second argument, not a charset name;
    // the input is already a decoded String, so the single-argument overload is the right one.
    Document doc = Jsoup.parse(response);
    Element first = doc.getElementsByClass("con-words").first();
    if (first == null) {
        return null;
    }
    String content = first.toString();

    String css = "<link rel=\"stylesheet\" href=\"file:///android_asset/toutiao_light.css\" type=\"text/css\">";
    if (SettingUtil.getInstance().getIsNightMode()) {
        css = css.replace("toutiao_light", "toutiao_dark");
    }

    // The head is now closed explicitly; the template previously went straight from <head> content to <body>.
    return "<!DOCTYPE html>\n" +
            "<html lang=\"en\">\n" +
            "<head>\n" +
            "    <meta charset=\"UTF-8\">" +
            css +
            "</head>\n" +
            "<body>\n" +
            "<article class=\"article-container\">\n" +
            "    <div class=\"article__content article-content\">" +
            content +
            "    </div>\n" +
            "</article>\n" +
            "</body>\n" +
            "</html>";
}

Source File: PhotoContentPresenter.java From Toutiao with Apache License 2.0

5 votes

/**
 * Scans every {@code script} tag of the page for the gallery data and, for each tag that holds
 * it, deserialises the embedded JSON into {@code bean}.
 *
 * @param HTML raw HTML of the page
 * @return {@code true} when at least one script yielded gallery data, {@code false} otherwise
 */
private Boolean parseHTML(String HTML) {
    boolean parsed = false;
    // Collect all script tags.
    Elements scriptTags = Jsoup.parse(HTML).getElementsByTag("script");
    for (Element tag : scriptTags) {
        // Only tags whose markup mentions the gallery info are of interest.
        if (!tag.toString().contains("BASE_DATA.galleryInfo")) {
            continue;
        }
        // Work on the script's content only (its first child node), not the surrounding tag.
        String scriptBody = tag.childNode(0).toString();

        // The pattern has exactly one capturing group, so a successful find() is all that is
        // needed; only the first match per script is processed.
        Matcher matcher = Pattern.compile("(JSON.parse\\(\\\".+\\))").matcher(scriptBody);
        if (!matcher.find()) {
            continue;
        }

        int open = scriptBody.indexOf("(");
        int close = scriptBody.indexOf("),");
        String json = scriptBody.substring(open + 2, close - 1);

        // Handle special characters: decode unicode escapes, then drop backslashes.
        json = ChineseUtil.UnicodeToChs(json);
        json = json.replace("\\", "");

        JsonReader reader = new JsonReader(new StringReader(json));
        reader.setLenient(true);
        bean = new Gson().fromJson(reader, PhotoGalleryBean.class);
        Log.d(TAG, "parseHTML: " + bean.toString());
        parsed = true;
    }
    return parsed;
}

Source File: LoadTags.java From NClientV2 with Apache License 2.0

4 votes

/**
 * Returns the part of the element's string form that starts at the first {@code '['} and ends
 * just before the first {@code ';'} that follows it.
 *
 * @param e element whose string form contains the array literal
 * @return the text from {@code '['} (inclusive) up to the terminating {@code ';'} (exclusive)
 * @throws StringIndexOutOfBoundsException when there is no {@code '['} or no {@code ';'} after it
 */
private String extractArray(Element e) {
    String t = e.toString();
    int start = t.indexOf('[');
    // Search for the terminator from the opening bracket onwards, so a ';' that appears
    // earlier in the markup cannot yield an end offset in front of the start offset.
    int end = t.indexOf(';', start);
    return t.substring(start, end);
}

Source File: AnnouncementListFragment.java From PKUCourses with GNU General Public License v3.0

4 votes

/**
 * Returns the string form of the first {@code vtbegenerated} element inside the first
 * {@code details} element of {@code nNode}.
 *
 * @return the element's string form, or an empty string when either element is missing
 */
public String getContents() {
    // first() returns null for an empty selection, so the outer lookup needs the same
    // null handling the inner one already had.
    Element details = nNode.getElementsByClass("details").first();
    if (details == null) {
        return "";
    }
    Element tmp = details.getElementsByClass("vtbegenerated").first();
    return tmp == null ? "" : tmp.toString();
}

Source File: HTMLExtensions.java From Android-WYSIWYG-Editor with Apache License 2.0

4 votes

/**
 * Re-wraps an element's inner HTML in a new {@code span} that carries the element's
 * {@code style} attribute.
 *
 * @param element source of the {@code style} attribute and the inner HTML
 * @return the string form of the new {@code span}
 */
public String getHtmlSpan(Element element) {
    String inlineStyle = element.attr("style");
    Element span = new Element(Tag.valueOf("span"), "");
    span.attributes().put("style", inlineStyle);
    String innerHtml = element.html();
    span.html(innerHtml);
    return span.toString();
}

Source File: BootstrapHandlerDependenciesTest.java From flow with Apache License 2.0

4 votes

/**
 * Walks the children of the page head in order and checks that the elements up to and including
 * the one that references the client engine file carry no user-dependency load-mode markers,
 * that the elements after it do, and that no element has an {@code async} attribute.
 */
@Test
public void flowDependenciesShouldBeImportedBeforeUserDependenciesWithCorrectAttributes() {
    Consumer<Document> uiPageTestingMethod = page -> {
        // Set once an element containing the client engine file name has been seen;
        // it takes effect from the following element onwards.
        boolean foundClientEngine = false;
        // Index of the last element visited before the client engine was found.
        int flowDependencyMaxIndex = Integer.MAX_VALUE;
        // Index of the first element visited after the client engine was found.
        int userDependencyMinIndex = Integer.MAX_VALUE;

        Elements children = page.head().children();
        for (int i = 0; i < children.size(); i++) {
            Element element = children.get(i);
            String elementString = element.toString();
            if (foundClientEngine) {
                if (userDependencyMinIndex > i) {
                    userDependencyMinIndex = i;
                }
                // Elements mentioning dndConnector.js are exempt from the load-mode check;
                // note that this also skips the 'async' assertion at the end of the loop body.
                if (elementString.contains("dndConnector.js")) {
                    continue;
                }
                assertThat(
                        "Expected to have here dependencies added with Flow public api",
                        elementString,
                        either(containsString("eager"))
                                .or(containsString("lazy"))
                                .or(containsString("inline")));
            } else {
                flowDependencyMaxIndex = i;
                // skip element with uidl that contains lazy dependencies
                if (!elementString.contains(BOOTSTRAP_SCRIPT_CONTENTS)) {
                    assertThat(
                            "Flow dependencies should not contain user dependencies",
                            elementString,
                            both(not(containsString("eager")))
                                    .and(not(containsString("lazy")))
                                    .and(not(containsString("inline"))));
                    if (elementString.contains(
                            BootstrapHandler.clientEngineFile.get())) {
                        foundClientEngine = true;
                    }
                } else {
                    // The bootstrap script element may mention "lazy", but not the other two modes.
                    assertThat(
                            "uidl should not contain eager and inline dependencies",
                            elementString,
                            both(not(containsString("eager")))
                                    .and(not(containsString("inline"))));
                }
            }

            // Every element that reaches this point must have an empty 'async' attribute value.
            assertThat(String.format(
                    "All javascript dependencies should be loaded without 'async' attribute. Dependency with url %s has this attribute",
                    element.attr("src")), element.attr("async"), is(""));
        }

        assertThat(
                "Flow dependencies should be imported before user dependencies",
                flowDependencyMaxIndex,
                is(lessThan(userDependencyMinIndex)));

    };

    // Run the same checks against both UI variants.
    testUis(uiPageTestingMethod, new UIAnnotated_LoadingOrderTest(),
            new UIWithMethods_LoadingOrderTest());
}

Java Code Examples for org.jsoup.nodes.Element#toString()