us.codecraft.webmagic.selector.Html Java Examples
The following examples show how to use
us.codecraft.webmagic.selector.Html.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HtmlResolver.java From blog-hunter with MIT License | 6 votes |
/**
 * Stores the article fields selected by the configured expressions and, when the
 * crawl is not restricted to a single page, queues further links to visit.
 *
 * <p>Fields are stored when the config is in single-page mode, or when the title is
 * non-empty, is not the literal text {@code "null"}, and the current URL is not one
 * of the configured entry URLs.
 *
 * @param page  the page currently being processed
 * @param model the extraction expressions and crawl settings
 */
@Override
public void process(Page page, HunterConfig model) {
    Html pageHtml = page.getHtml();
    String title = pageHtml.xpath(model.getTitleRegex()).get();
    String source = page.getRequest().getUrl();

    boolean shouldExtract = model.isSingle();
    if (!shouldExtract) {
        shouldExtract = !StringUtils.isEmpty(title)
                && !"null".equals(title)
                && !model.getEntryUrls().contains(source);
    }

    if (shouldExtract) {
        page.putField("title", title);
        page.putField("source", source);
        this.put(page, pageHtml, "releaseDate", model.getReleaseDateRegex());
        this.put(page, pageHtml, "author", model.getAuthorRegex());
        this.put(page, pageHtml, "content", model.getContentRegex());
        this.put(page, pageHtml, "tags", model.getTagRegex());
        this.put(page, pageHtml, "description", model.getDescriptionRegex());
        this.put(page, pageHtml, "keywords", model.getKeywordsRegex());
    }

    // Single-page mode never follows links.
    if (model.isSingle()) {
        return;
    }
    String targetLinksRegex = model.getTargetLinksRegex();
    if (StringUtils.isNotEmpty(targetLinksRegex)) {
        page.addTargetRequests(page.getHtml().links().regex(targetLinksRegex).all());
    }
}
Example #2
Source File: CommonUtil.java From blog-hunter with MIT License | 6 votes |
/**
 * Rewrites each {@code <img>} tag found in the given HTML as a tag that carries only
 * {@code src}, {@code title} and {@code alt}.
 *
 * <p>An image is only rewritten when the serialized form of its node occurs verbatim
 * in the input, because the substitution is a plain string replacement. A missing
 * {@code title} or {@code alt} attribute is written as an empty value instead of the
 * literal text {@code "null"}.
 *
 * @param html the original blog content
 * @return the content with its image tags rewritten, or {@code null} when
 *         {@code StringUtils.isEmpty(html)} is true
 */
public static String formatHtml(String html) {
    if (StringUtils.isEmpty(html)) {
        return null;
    }
    String lazyloadFormat = "<img src=\"%s\" title=\"%s\" alt=\"%s\">";
    Html pageHtml = getHtml(html);
    List<Selectable> imgSelectables = pageHtml.$("img").nodes();
    for (Selectable imgSelectable : imgSelectables) {
        String oldImg = imgSelectable.get();
        String src = getRealImgUrl(imgSelectable);
        // get() yields null when the attribute is absent; "%s" would print that as "null".
        String title = imgSelectable.xpath("//img/@title").get();
        String alt = imgSelectable.xpath("//img/@alt").get();
        String newImg = String.format(lazyloadFormat,
                src,
                title == null ? "" : title,
                alt == null ? "" : alt);
        html = html.replace(oldImg, newImg);
    }
    return html;
}
Example #3
Source File: CommonUtil.java From blog-hunter with MIT License | 6 votes |
/**
 * Collects the source link of every image tag in the given HTML.
 *
 * @param html the original blog content
 * @return one {@code ImageLink} per distinct image URL, or {@code null} when
 *         {@code StringUtils.isEmpty(html)} is true
 */
public static Set<ImageLink> getAllImageLink(String html) {
    if (StringUtils.isEmpty(html)) {
        return null;
    }
    Set<ImageLink> collected = new HashSet<>();
    for (Selectable img : getHtml(html).$("img").nodes()) {
        collected.add(new ImageLink(getRealImgUrl(img)));
    }
    return collected;
}
Example #4
Source File: ProcessorBenchmark.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Rough benchmark: runs the model processor 1000 times over one prepared page,
 * twice in a row, and prints the elapsed milliseconds of each round.
 */
@Ignore
@Test
public void test() {
    final String blogUrl = "http://my.oschina.net/flashsword/blog";
    ModelPageProcessor processor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    Page page = new Page();
    page.setRequest(new Request(blogUrl));
    page.setUrl(new PlainText(blogUrl));
    page.setHtml(new Html(html));

    // Two identical timed rounds over the same page.
    for (int round = 0; round < 2; round++) {
        long startedAt = System.currentTimeMillis();
        for (int i = 0; i < 1000; i++) {
            processor.process(page);
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }
}
Example #5
Source File: ZhihuPageProcessor.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Queues the question links found on the page and stores the vote count, content
 * and author link of answers whose vote count is at least {@code voteNum}.
 *
 * <p>Answers whose vote count is missing or not a plain integer are skipped rather
 * than aborting the whole page. When several answers qualify, later ones overwrite
 * the fields stored by earlier ones. The page is marked as skipped when no answer
 * qualifies.
 *
 * @param page the page being processed
 */
@Override
public void process(Page page) {
    List<String> relativeUrl = page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml().xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for every selector below.
        Html answerHtml = new Html(answer);
        String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
        if (vote == null) {
            continue;
        }
        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // The count is not a plain integer, so this answer cannot be compared.
            continue;
        }
        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }
    if (!exist) {
        page.setSkip(true);
    }
}
Example #6
Source File: HtmlResolver.java From blog-hunter with MIT License | 5 votes |
/**
 * Stores the value selected by {@code regex} under {@code key}; does nothing when
 * {@code regex} is empty.
 *
 * <p>The {@code "tags"} key stores every match as a list; any other key stores only
 * the first match.
 *
 * @param page     the page receiving the field
 * @param pageHtml the parsed HTML to select from
 * @param key      the field name
 * @param regex    the selection expression, evaluated with {@code xpath}
 */
private void put(Page page, Html pageHtml, String key, String regex) {
    if (StringUtils.isEmpty(regex)) {
        return;
    }
    if (key.equals("tags")) {
        page.putField(key, pageHtml.xpath(regex).all());
    } else {
        page.putField(key, pageHtml.xpath(regex).get());
    }
}
Example #7
Source File: AmanzonPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
public void process(Page page) { Html html = page.getHtml(); List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); if(questionList != null && questionList.size() > 1) { //i=0是列名称,所以i从1开始 for( int i = 1 ; i < questionList.size(); i++) { System.out.println(questionList.get(i)); Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>"); String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); System.out.println(comment); String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); System.out.println(answerNum); String createTime = tempHtml.xpath("//td[3]/text()").toString(); System.out.println(createTime); /* Document doc = Jsoup.parse(questionList.get(i)); Html hmt = Html.create(questionList.get(i)) ; String str = hmt.links().toString(); String content = doc.getElementsByTag("a").text(); String ss = doc.text();*/ } } }
Example #8
Source File: HtmlTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Checks that the href of the first anchor can be read both from the whole
 * document and from the node selected with {@code //a[1]}.
 */
@Test
public void testNthNodesGet() {
    String markup = "<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>";
    Html document = new Html(markup);

    String hrefFromDocument = document.xpath("//a[1]/@href").get();
    assertThat(hrefFromDocument).isEqualTo("/xx/xx");

    Selectable firstAnchor = document.xpath("//a[1]").nodes().get(0);
    String hrefFromNode = firstAnchor.xpath("/a/@href").get();
    assertThat(hrefFromNode).isEqualTo("/xx/xx");
}
Example #9
Source File: HtmlTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Checks that an ampersand in the input can still be matched by a regex when
 * {@code Html.DISABLE_HTML_ENTITY_ESCAPE} is switched on.
 *
 * <p>The flag is a global static, so its previous value is restored afterwards to
 * keep it from leaking into other tests.
 */
@Ignore("not work in jsoup 1.8.x")
@Test
public void testDisableJsoupHtmlEntityEscape() throws Exception {
    boolean previous = Html.DISABLE_HTML_ENTITY_ESCAPE;
    Html.DISABLE_HTML_ENTITY_ESCAPE = true;
    try {
        Html html = new Html("aaaaaaa&b");
        assertThat(html.regex("(aaaaaaa&b)").toString()).isEqualTo("aaaaaaa&b");
    } finally {
        Html.DISABLE_HTML_ENTITY_ESCAPE = previous;
    }
}
Example #10
Source File: HtmlTest.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Checks that a regex selection followed by a regex replacement with a group
 * reference produces the expected text.
 */
@Test
public void testRegexSelector() {
    Html source = new Html("aaaaaaab");
    String replaced = source.regex("(a+b)").replace("aa(a)", "$1bb").toString();
    assertThat(replaced).isEqualTo("abbabbab");
}
Example #11
Source File: HtmlTest.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Matches text containing an ampersand with a regex, without touching
 * {@code Html.DISABLE_HTML_ENTITY_ESCAPE}.
 */
@Test
public void testEnableJsoupHtmlEntityEscape() throws Exception {
    // NOTE(review): pattern and expected value are identical to the disable-escape
    // test; an escaped entity may have been decoded when this listing was extracted.
    // Confirm against the upstream test before relying on these literals.
    Html document = new Html("aaaaaaa&b");
    String matched = document.regex("(aaaaaaa&b)").toString();
    assertThat(matched).isEqualTo("aaaaaaa&b");
}
Example #12
Source File: HtmlTest.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Checks that link extraction still returns the href of an anchor whose other
 * attribute value contains {@code $} characters.
 */
@Test
public void testAHrefExtract() {
    String markup = "<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>";
    Html document = new Html(markup);
    assertThat(document.links().all()).contains("/xx/xx");
}
Example #13
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Downloads a live page and checks that its first source text is not empty.
 */
@Test
public void testDownloader() {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Html fetched = downloader.download("https://www.baidu.com/");
    String firstSourceText = fetched.getFirstSourceText();
    assertTrue(!firstSourceText.isEmpty());
}
Example #14
Source File: CommonUtil.java From blog-hunter with MIT License | 4 votes |
/**
 * Wraps raw markup in a {@code Page} with an empty-URL request and returns the
 * page's {@code Html} view of it.
 *
 * @param html the raw markup
 * @return the {@code Html} obtained from the page
 */
private static Html getHtml(String html) {
    Request emptyRequest = new Request("");
    Page holder = new Page();
    holder.setRequest(emptyRequest);
    holder.setRawText(html);
    return holder.getHtml();
}
Example #15
Source File: AbstractDownloader.java From webmagic with Apache License 2.0 | 2 votes |
/**
 * Downloads a single URL with a default site configured for the given charset.
 *
 * @param url     the address to fetch
 * @param charset the charset set on the site configuration
 * @return the downloaded page's HTML
 */
public Html download(String url, String charset) {
    Site site = Site.me().setCharset(charset);
    Page fetched = download(new Request(url), site.toTask());
    return (Html) fetched.getHtml();
}
Example #16
Source File: AbstractDownloader.java From webmagic with Apache License 2.0 | 2 votes |
/**
 * Downloads a single URL without specifying a charset.
 *
 * @param url the address to fetch
 * @return the downloaded page's HTML
 */
public Html download(String url) {
    final String unspecifiedCharset = null;
    return download(url, unspecifiedCharset);
}
Example #17
Source File: Page.java From webmagic with Apache License 2.0 | 2 votes |
/**
 * Replaces the page's HTML with the given instance.
 *
 * @param html the HTML to store on this page
 * @deprecated since 0.4.0. The HTML is parsed the first time {@link #getHtml()} is
 *             called, so use {@link #setRawText(String)} instead.
 */
@Deprecated
public void setHtml(Html html) {
    this.html = html;
}