Java Code Examples for us.codecraft.webmagic.Page#setSkip()
The following examples show how to use
us.codecraft.webmagic.Page#setSkip().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BlogProcesser.java From mogu_blog_v2 with Apache License 2.0 | 6 votes |
private void saveBlogInfo(Page page) { //2、获取我们需要的内容: title和content String title = page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString(); String content = page.getHtml().xpath("//*[@id=\"article_content\"]").toString(); if (title != null) { page.putField("title", title); page.putField("content", content); } else { //跳过爬取 page.setSkip(true); } }
Example 2
Source File: ZhihuPageProcessor.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Passes the item links and related-question links found on the page to
 * {@code page.addTargetRequests}, then scans the answers on the page. For every answer whose
 * vote count is at least {@code voteNum}, the {@code "vote"}, {@code "content"} and
 * {@code "userid"} fields are written (a later qualifying answer overwrites an earlier one).
 * If no answer qualifies, {@code page.setSkip(true)} is called.
 *
 * <p>Answers whose vote text is missing or is not a plain integer are ignored rather than
 * aborting the whole page with a {@link NumberFormatException}.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    List<String> relativeUrl =
            page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all();
    page.addTargetRequests(relativeUrl);
    relativeUrl = page.getHtml()
            .xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href")
            .all();
    page.addTargetRequests(relativeUrl);

    List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
    boolean exist = false;
    for (String answer : answers) {
        // Parse the answer fragment once and reuse it for all three selectors.
        Html answerHtml = new Html(answer);
        String vote = answerHtml
                .xpath("//div[@class='zm-votebar']//span[@class='count']/text()")
                .toString();
        if (vote == null) {
            // No vote counter in this fragment; nothing to compare.
            continue;
        }

        int voteCount;
        try {
            voteCount = Integer.parseInt(vote.trim());
        } catch (NumberFormatException ignored) {
            // Vote text is not a plain integer; treat the answer as non-qualifying.
            continue;
        }

        if (voteCount >= voteNum) {
            page.putField("vote", vote);
            page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
            page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
            exist = true;
        }
    }

    if (!exist) {
        page.setSkip(true);
    }
}
Example 3
Source File: ConfigurablePageProcessor.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Applies every rule in {@code extractRules} to the page. A multi-valued rule selects a list
 * and a single-valued rule selects one string; the selection is stored in the page's result
 * items under the rule's field name. When a rule is flagged not-null and its selection is
 * empty (list) or {@code null} (single value), {@code page.setSkip(true)} is called instead
 * of storing it. Remaining rules are still evaluated after a skip.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (!rule.isMulti()) {
            // Single-valued rule: a null selection counts as "missing".
            String value = page.getHtml().selectDocument(rule.getSelector());
            if (!rule.isNotNull() || value != null) {
                page.getResultItems().put(rule.getFieldName(), value);
            } else {
                page.setSkip(true);
            }
            continue;
        }

        // Multi-valued rule: an empty list counts as "missing".
        List<String> values = page.getHtml().selectDocumentForList(rule.getSelector());
        if (!rule.isNotNull() || values.size() != 0) {
            page.getResultItems().put(rule.getFieldName(), values);
        } else {
            page.setSkip(true);
        }
    }
}
Example 4
Source File: GithubRepoPageProcessor.java From SmartEducation with Apache License 2.0 | 5 votes |
@Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { // 部分二:定义如何抽取页面信息,并保存下来 String author=page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString(); page.putField("author",author ); String name=page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString(); page.putField("name", name); if (page.getResultItems().get("name") == null) { //skip this page page.setSkip(true); } String readme=page.getHtml().xpath("//div[@id='readme']/tidyText()").toString(); page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); // if(name==null){ // Test test=new Test(); // test.setAuthor(author); // test.setName(name); // test.setReadme(readme); // testService.save(test); // } // 部分三:从页面发现后续的url地址来抓取 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); }
Example 5
Source File: GithubRepoPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); }
Example 6
Source File: ZhihuPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all()); page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString()); page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString()); page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString()); if (page.getResultItems().get("title")==null){ //skip this page page.setSkip(true); } }
Example 7
Source File: MamacnPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Builds a text summary of the thumbnail list on the page: for each list item, one
 * {@code img:} line with the link target and one {@code title:} line with the image's alt
 * text. The summary is stored under the empty field key; when it is empty,
 * {@code page.setSkip(true)} is called. Finally the photo page links found on the page are
 * passed to {@code page.addTargetRequests}.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    final StringBuilder summary = new StringBuilder();
    final List<Selectable> items = page.getHtml().xpath("//ul[@id=ma-thumb-list]/li").nodes();
    for (Selectable item : items) {
        final String href = item.xpath("//a/@href").get();
        summary.append("img:").append(href).append("\n");
        final String alt = item.xpath("//img/@alt").get();
        summary.append("title:").append(alt).append("\n");
    }

    final String text = summary.toString();
    page.putField("", text);
    if (text.isEmpty()) {
        page.setSkip(true);
    }

    final List<String> photoLinks =
            page.getHtml().links().regex("http://www\\.mama\\.cn/photo/.*\\.html").all();
    page.addTargetRequests(photoLinks);
}
Example 8
Source File: GithubRepoPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); GithubRepo githubRepo = new GithubRepo(); githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString()); githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); if (githubRepo.getName() == null) { //skip this page page.setSkip(true); } else { page.putField("repo", githubRepo); } }
Example 9
Source File: AlexanderMcqueenGoodsProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * For pages whose URL matches {@code URL_POST}, extracts the product fields
 * ({@code goodsName}, {@code currency}, {@code goodsPrice}, {@code description},
 * {@code material}, {@code goodsCode}, {@code goodsSize}, {@code goodsColors}) and stores
 * them on the page; {@code page.setSkip(true)} is called when no goods name text was
 * extracted. For every other page, the links matching {@code URL_POST} and {@code URL_LIST}
 * are passed to {@code page.addTargetRequests} with the values 1000 and 1 respectively.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    if (page.getUrl().regex(URL_POST).match()) {
        // The field stores the selector result object itself, so a null check on the stored
        // field does not reveal an empty match. Test the extracted text instead.
        // NOTE(review): assumes toString() on the selector result yields null when nothing
        // matched, as the other processors' null checks on xpath(...).toString() imply.
        Object goodsName = page.getHtml().xpath("//div[@id='description']/h1/tidyText()");
        page.putField("goodsName", goodsName);
        if (goodsName == null || goodsName.toString() == null) {
            page.setSkip(true);
        }

        page.putField("currency", page.getHtml()
                .xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()"));
        page.putField("goodsPrice", page.getHtml()
                .xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()"));
        page.putField("description", page.getHtml()
                .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()"));
        page.putField("material", page.getHtml()
                .xpath("//div[@id='tabbedDescription']"
                        + "//div[@class='tabbedDescription']"
                        + "//ul[@id='tabs']"
                        + "//li[@id='tab_description']"
                        + "//div[@class='productProperty']"
                        + "//div[@class='productPropertyRow']/span[2]/tidyText()"));
        page.putField("goodsCode", page.getHtml()
                .xpath("//div[@id='tabbedDescription']"
                        + "//div[@class='tabbedDescription']"
                        + "//ul[@id='tabs']"
                        + "//li[@id='tab_description']"
                        + "//div[@class='productProperty']"
                        + "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()"));
        page.putField("goodsSize", page.getHtml()
                .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
        page.putField("goodsColors", page.getHtml()
                .xpath("//div[@id='colors']/ul/html()"));
    } else {
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
    }
}
Example 10
Source File: GithubRepoPageMapper.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Passes repository and owner links found on the page to {@code page.addTargetRequests},
 * then asks {@code githubRepoPageMapper} for a {@code GithubRepo} built from the page. A
 * non-null result is stored under the {@code "repo"} field; otherwise
 * {@code page.setSkip(true)} is called.
 *
 * @param page the page to process
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(
            page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    final GithubRepo mapped = githubRepoPageMapper.get(page);
    if (mapped != null) {
        page.putField("repo", mapped);
        return;
    }
    page.setSkip(true);
}