us.codecraft.webmagic.Page Java Examples
The following examples show how to use
us.codecraft.webmagic.Page.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a request flagged as binary content yields a page whose raw
 * text is null and whose bytes equal the body served by the stub server.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer stubServer = httpServer(13423);
    stubServer.response("binary");
    Runner.running(stubServer, new Runnable() {
        @Override
        public void run() throws Exception {
            final HttpClientDownloader downloader = new HttpClientDownloader();
            Request binaryRequest = new Request();
            binaryRequest.setBinaryContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");
            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());
            // Binary downloads must not be decoded into text.
            assertThat(downloaded.getRawText()).isNull();
            assertThat(downloaded.getBytes()).isEqualTo("binary".getBytes());
        }
    });
}
Example #2
Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Downloads a Baidu Wenku document through the Selenium downloader and prints
 * the contents of every {@code div.inner} element with markup tags and
 * {@code &nbsp;} entities stripped.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(
            new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"),
            new Task() {
                @Override
                public String getUUID() {
                    return "huaban.com";
                }

                @Override
                public Site getSite() {
                    return Site.me();
                }
            });
    // The entity name was previously misspelled as "&nsbp;", so the
    // non-breaking-space entities were never removed from the output.
    System.out.println(page.getHtml().$("div.inner")
            .replace("<[^<>]+>", "")
            .replace("&nbsp;", "")
            .all());
}
Example #3
Source File: ProcessorBenchmark.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Crude benchmark: builds one in-memory page and prints the elapsed
 * milliseconds for two consecutive batches of 1000 {@code process} calls.
 */
@Ignore
@Test
public void test() {
    ModelPageProcessor processor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    Page blogPage = new Page();
    blogPage.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
    blogPage.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
    blogPage.setHtml(new Html(html));

    // Two identical timed batches; each prints its own duration.
    for (int batch = 0; batch < 2; batch++) {
        long startedAt = System.currentTimeMillis();
        for (int iteration = 0; iteration < 1000; iteration++) {
            processor.process(blogPage);
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }
}
Example #4
Source File: BlogProcesser.java From mogu_blog_v2 with Apache License 2.0 | 6 votes |
private void saveBlogInfo(Page page) { //2、获取我们需要的内容: title和content String title = page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString(); String content = page.getHtml().xpath("//*[@id=\"article_content\"]").toString(); if (title != null) { page.putField("title", title); page.putField("content", content); } else { //跳过爬取 page.setSkip(true); } }
Example #5
Source File: HttpClientDownloader.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>The body bytes are always stored. Unless the request is flagged as
 * binary content, the bytes are also decoded to text using {@code charset},
 * or the charset returned by {@code getHtmlCharset} when {@code charset} is null.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode with, or null to detect it
 * @param httpResponse the raw HTTP response
 * @param task         the task the download belongs to (not read here)
 * @return the populated page, marked as successfully downloaded
 * @throws IOException if reading the response body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType;
    if (httpResponse.getEntity().getContentType() == null) {
        contentType = "";
    } else {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page result = new Page();
    result.setBytes(body);

    boolean textual = !request.isBinaryContent();
    if (textual) {
        if (charset == null) {
            charset = getHtmlCharset(contentType, body);
        }
        result.setCharset(charset);
        result.setRawText(new String(body, charset));
    }

    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);
    if (responseHeader) {
        // Headers are only copied when the downloader is configured to keep them.
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
Example #6
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a cookie added to the request is sent to the server: the stub
 * only answers "ok" when the expected cookie value is present.
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer stubServer = httpServer(13423);
    stubServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(stubServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie", "cookie-webmagic");
            Page downloaded = downloader.download(cookieRequest, Site.me().toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
Example #7
Source File: HttpClientDownloader.java From plumemo with Apache License 2.0 | 6 votes |
/**
 * Builds a {@link Page} from an HTTP response: stores the body bytes, decodes
 * them to text for non-binary requests, and copies URL, request, status code
 * and (when enabled) response headers.
 *
 * @param request      the originating request
 * @param charset      charset used for decoding; detected via
 *                     {@code getHtmlCharset} when null
 * @param httpResponse the HTTP response to convert
 * @param task         the owning task (not read here)
 * @return the populated page with the download-success flag set
 * @throws IOException if the response body cannot be read
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final byte[] payload = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    final String contentType = httpResponse.getEntity().getContentType() != null
            ? httpResponse.getEntity().getContentType().getValue()
            : "";

    final Page page = new Page();
    page.setBytes(payload);
    if (!request.isBinaryContent()) {
        // Fall back to charset detection only when none was supplied.
        String decodeCharset = charset;
        if (decodeCharset == null) {
            decodeCharset = getHtmlCharset(contentType, payload);
        }
        page.setCharset(decodeCharset);
        page.setRawText(new String(payload, decodeCharset));
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
Example #8
Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Downloads the huaban.com front page 100 times through Selenium, printing
 * the "pins" links found under {@code #waterfall} each time, then prints the
 * total elapsed milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    final long startedAt = System.currentTimeMillis();
    int round = 0;
    while (round < 100) {
        Page fetched = downloader.download(new Request("http://huaban.com/"), new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        });
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}
Example #9
Source File: DownloadPicture.java From Gather-Platform with GNU General Public License v3.0 | 6 votes |
/**
 * Parses the page's raw text as JSON, reads the {@code imgs} array and
 * collects each entry's {@code objURL} and {@code fromPageTitleEnc} values.
 * Entries without a URL are ignored. The collected lists are handed to
 * {@code setUrls} and {@code setNames}; both lists are empty when the payload
 * is missing or has no {@code imgs} array.
 *
 * @param page the downloaded page whose raw text is the JSON payload
 */
@Override
public void process(Page page) {
    List<String> urlList = new ArrayList<>();
    List<String> nameList = new ArrayList<>();

    JSONObject jsonObject = (JSONObject) JSONObject.parse(page.getRawText());
    // Guard against an empty body or a payload without "imgs" instead of
    // failing with a NullPointerException.
    JSONArray data = jsonObject == null ? null : (JSONArray) jsonObject.get("imgs");
    if (data != null) {
        for (int i = 0; i < data.size(); i++) {
            JSONObject image = data.getJSONObject(i);
            if (image == null) {
                continue;
            }
            String url = (String) image.get("objURL");
            String name = (String) image.get("fromPageTitleEnc");
            if (url != null) {
                // Names are stored at the same index as their URL.
                urlList.add(url);
                nameList.add(name);
            }
        }
    }

    setUrls(urlList);
    setNames(nameList);
}
Example #10
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a cookie configured on the {@code Site} is sent with the
 * request: the stub only answers "ok" when the expected cookie is present.
 */
@Test
public void test_set_site_cookie() throws Exception {
    HttpServer stubServer = httpServer(13423);
    stubServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(stubServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");
            Site cookieSite = Site.me()
                    .addCookie("cookie", "cookie-webmagic")
                    .setDomain("127.0.0.1");
            Page downloaded = downloader.download(plainRequest, cookieSite.toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
Example #11
Source File: ProfessionTypeSpider.java From SmartEducation with Apache License 2.0 | 6 votes |
@Override public void process(Page page) { // <li><a href="/category/01">哲学</a></li> // 筛选名称 List<String> professionTypeNameList = page.getHtml() .xpath("//ul[@class='category']/li/a/html()").all(); page.putField("professionName", professionTypeNameList); // 筛选url List<String> professionTypeUrlList = page.getHtml().xpath("//ul[@class='category']/li/a/@href").all(); page.putField("professionName", professionTypeUrlList); if(professionTypeNameList.size()>0){ for(int i=0;i<professionTypeNameList.size();i++){ SpiderProfessionType model=new SpiderProfessionType(professionTypeNameList.get(i).toString(), professionTypeUrlList.get(i)); spiderProfessionTypeService.save(model); } } }
Example #12
Source File: CasperjsDownloader.java From spider with GNU General Public License v3.0 | 6 votes |
/**
 * Downloads the request's URL through CasperJS and wraps the HTML in a
 * {@link Page}.
 *
 * <p>On failure the request is re-queued via {@code addToCycleRetry} when the
 * task's site allows cycle retries; otherwise the exception is attached to
 * the request under {@code "EXCEPTION"}, {@code onError} is invoked and
 * {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be null, in which case no retry is attempted
 * @return the downloaded page, the result of {@code addToCycleRetry}, or null on error
 */
@Override
public Page download(Request request, Task task) {
    Site site = task != null ? task.getSite() : null;

    String html;
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task was supplied; the original dereferenced it
        // unconditionally and masked the real failure with a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }

    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
Example #13
Source File: ContentImageProcessor.java From javabase with Apache License 2.0 | 6 votes |
/**
 * Collects the post's image URLs (elements with class {@code BDE_Image}) that
 * start with the configured image prefix, converts them, and stores the
 * result in {@code map} keyed by the page id. When no image qualifies, a
 * {@code ContentBean} for the page is published to the "no image" topic
 * instead. On the first invocation only, one target request per entry of
 * {@code pageNumberList} is queued.
 *
 * @param page the crawled content page
 */
@Override
public void process(Page page) {
    List<String> rawImageUrls = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(), "");

    List<String> acceptedUrls = new ArrayList<>();
    for (String candidate : rawImageUrls) {
        if (!candidate.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            continue;
        }
        String converted = convertImageUrl(candidate);
        if (converted != null) {
            acceptedUrls.add(converted);
        }
    }

    if (acceptedUrls.isEmpty()) {
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(),
                JSONObject.toJSONString(new ContentBean(pageId, tiebaName)));
    } else {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY + pageId),
                WebmagicService.getByte(JSONObject.toJSONString(acceptedUrls)));
    }

    if (isAddTarget) {
        return;
    }
    // Queue the follow-up pages exactly once.
    for (String id : pageNumberList) {
        String target = new StringBuilder().append(url).append(id).toString();
        page.addTargetRequests(Arrays.asList(target));
    }
    isAddTarget = true;
}
Example #14
Source File: ConfigurablePageProcessor.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Applies every configured extract rule to the page. Multi-value rules store
 * a list, single-value rules store a string. When a rule is flagged not-null
 * and yields nothing (empty list or null), the page is marked as skipped
 * instead of storing the value.
 *
 * @param page the crawled page
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (!rule.isMulti()) {
            String single = page.getHtml().selectDocument(rule.getSelector());
            if (!rule.isNotNull() || single != null) {
                page.getResultItems().put(rule.getFieldName(), single);
            } else {
                page.setSkip(true);
            }
            continue;
        }

        List<String> many = page.getHtml().selectDocumentForList(rule.getSelector());
        if (!rule.isNotNull() || many.size() != 0) {
            page.getResultItems().put(rule.getFieldName(), many);
        } else {
            page.setSkip(true);
        }
    }
}
Example #15
Source File: ContentIdProcessor.java From javabase with Apache License 2.0 | 6 votes |
/**
 * Reads the {@code data-field} JSON of each thread list item (1..pageSize)
 * and, for entries that carry an {@code id}, records a {@code ContentBean}
 * in {@code pageNumberList}. On the first invocation only, the list pages
 * 2..endNum are queued as target requests.
 *
 * @param page the crawled thread list page
 */
@Override
public void process(Page page) {
    for (int i = 1; i <= pageSize; i++) {
        String json = page.getHtml()
                .xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (i) + "]/@data-field")
                .toString();
        if (json == null) {
            continue;
        }
        // Parse once and reuse; the original parsed the same JSON twice and
        // dereferenced the result without checking for null.
        JSONObject jsonObject = JSONObject.parseObject(json);
        if (jsonObject == null || !jsonObject.containsKey("id")) {
            continue;
        }
        String pageId = jsonObject.getString("id");
        String authorName = jsonObject.getString("author_name");
        String date = praseDate(page, i);
        String title = page.getHtml()
                .xpath("a[@href='" + tieBaConfiguration.getTiebaContentPageUrl() + pageId + "']/@title")
                .toString();
        pageNumberList.add(new ContentBean(pageId, date, tiebaName, authorName, title));
    }

    if (!isAddTarget) {
        // Queue the remaining list pages exactly once.
        for (int i = 2; i <= endNum; i++) {
            StringBuilder sb = new StringBuilder();
            sb.append(tiebaUrl).append("&pn=" + i * pageSize);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}
Example #16
Source File: CourseSpider.java From SmartEducation with Apache License 2.0 | 6 votes |
@Override public void process(Page page) { // 格式:http://mooc.chaoxing.com/category/01/0/1000 if (page.getUrl().regex("http://mooc\\.chaoxing\\.com/category/\\d+/\\d/\\d+") .toString() != null) { System.out.println("第一层"); crawerCourse(page); } // 格式:http://mooc.chaoxing.com/course/55672.html else if (page.getUrl().regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html") .toString() != null) { System.out.println("第二层"); crawCourseInfo(page); } }
Example #17
Source File: DiandianBlogProcessor.java From webmagic with Apache License 2.0 | 6 votes |
@Override public void process(Page page) { //a()表示提取链接,links()表示提取所有链接 //getHtml()返回Html对象,支持链式调用 //r()表示用正则表达式提取一条内容,regex()表示提取多条内容 //toString()表示取单条结果,all()表示取多条 List<String> requests = page.getHtml().links().regex("(.*/post/.*)").all(); //使用page.addTargetRequests()方法将待抓取的链接加入队列 page.addTargetRequests(requests); //page.putField(key,value)将抽取的内容加入结果Map //x()和xs()使用xpath进行抽取 page.putField("title", page.getHtml().xpath("//title").regex("(.*?)\\|").toString()); //smartContent()使用readability技术直接抽取正文,对于规整的文本有比较好的抽取正确率 page.putField("content", page.getHtml().smartContent()); page.putField("date", page.getUrl().regex("post/(\\d+-\\d+-\\d+)/")); page.putField("id", page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)")); }
Example #18
Source File: ModelPageProcessor.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Collects links from the page (or only from the region chosen by
 * {@code urlRegionSelector} when it is non-null) and queues a target request
 * for every match of every URL pattern, using the matched text as the URL.
 *
 * @param page              the crawled page
 * @param urlRegionSelector selector restricting where links are taken from, or null for the whole page
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = urlRegionSelector != null
            ? page.getHtml().selectList(urlRegionSelector).links().all()
            : page.getHtml().links().all();

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher m = pattern.matcher(candidate);
            if (!m.find()) {
                continue;
            }
            // Only the matched portion of the link becomes the request URL.
            page.addTargetRequest(new Request(m.group(0)));
        }
    }
}
Example #19
Source File: NeteaseNewsPageProcesser.java From elasticsearch-jest-example with MIT License | 5 votes |
public void process(Page page) { //列表页 if (page.getUrl().regex(URL_LIST).match()||page.getUrl().regex("http://news\\.163\\.com/domestic").match()||page.getUrl().regex("http://news\\.163\\.com/shehui").match()) { page.addTargetRequests(page.getHtml().links().regex(URL_POST).all()); page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); }else{ page.putField("title", Utils.replaceHTML(page.getHtml().xpath("//h1[@id='h1title']").toString())); page.putField("content", Utils.replaceHTML(page.getHtml().xpath("//div[@id='endText']").toString())); page.putField("create", Utils.replaceHTML(page.getHtml().xpath("//div[@class=\"ep-time-soure cDGray\"]").toString())); page.putField("source", Utils.replaceHTML(page.getHtml().xpath("//a[@id=\"ne_article_source\"]/text()").toString())); page.putField("url", page.getUrl().get()); String title = (String)page.getResultItems().get("title"); String content = (String)page.getResultItems().get("content"); String create = (String)page.getResultItems().get("create"); String source = (String)page.getResultItems().get("source"); String url = (String)page.getResultItems().get("url"); String author = ""; // 创建article Article article = Utils.createArticle(title, content, source, author, url, create); // 索引 Utils.index(article); } }
Example #20
Source File: GithubRepoPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); GithubRepo githubRepo = new GithubRepo(); githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString()); githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); if (githubRepo.getName() == null) { //skip this page page.setSkip(true); } else { page.putField("repo", githubRepo); } }
Example #21
Source File: ZhihuPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all()); page.putField("title", page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString()); page.putField("question", page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString()); page.putField("answer", page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString()); if (page.getResultItems().get("title")==null){ //skip this page page.setSkip(true); } }
Example #22
Source File: AmanzonPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
public void process(Page page) { Html html = page.getHtml(); List<String> questionList = html.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all(); if(questionList != null && questionList.size() > 1) { //i=0是列名称,所以i从1开始 for( int i = 1 ; i < questionList.size(); i++) { System.out.println(questionList.get(i)); Html tempHtml = Html.create("<table>"+questionList.get(i)+"</table>"); String comment = tempHtml.xpath("//td[@class='title']//a/text()").toString(); System.out.println(comment); String answerNum = tempHtml.xpath("//td[@class='num']/text()").toString(); System.out.println(answerNum); String createTime = tempHtml.xpath("//td[3]/text()").toString(); System.out.println(createTime); /* Document doc = Jsoup.parse(questionList.get(i)); Html hmt = Html.create(questionList.get(i)) ; String str = hmt.links().toString(); String content = doc.getElementsByTag("a").text(); String ss = doc.text();*/ } } }
Example #23
Source File: GithubRepoPageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
@Override public void process(Page page) { page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all()); page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString()); if (page.getResultItems().get("name")==null){ //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); }
Example #24
Source File: SSLCompatibilityTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Checks that an HTTPS page can be downloaded successfully with a site
 * configured for five cycle retries.
 */
@Test
public void test_tls12() throws Exception {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Task retryingTask = Site.me().setCycleRetryTimes(5).toTask();
    Request httpsRequest = new Request("https://juejin.im/");
    Page downloaded = downloader.download(httpsRequest, retryingTask);
    assertThat(downloaded.isDownloadSuccess()).isTrue();
}
Example #25
Source File: ZipCodePageProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Queues one request per table cell of the {@code newAlexa} table: the
 * cell's link becomes the request URL and its text is attached as the
 * {@code province} extra, with priority 0.
 *
 * @param page the crawled country page
 */
private void processCountry(Page page) {
    List<String> provinceCells = page.getHtml()
            .xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td")
            .all();
    for (String cell : provinceCells) {
        String href = xpath("//@href").select(cell);
        String provinceName = xpath("/text()").select(cell);
        Request provinceRequest = new Request(href)
                .setPriority(0)
                .putExtra("province", provinceName);
        page.addTargetRequest(provinceRequest);
    }
}
Example #26
Source File: MockGithubDownloader.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Returns a canned page built from the classpath resource
 * {@code /html/mock-github.html}; the request and task arguments are not
 * read. The page's request and URL are fixed to the webmagic GitHub
 * repository.
 *
 * @param request ignored
 * @param task    ignored
 * @return the mock page; its raw text is left unset if the resource cannot be read
 */
@Override
public Page download(Request request, Task task) {
    Page page = new Page();
    // try-with-resources closes the stream, which the original leaked.
    try (InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html")) {
        page.setRawText(IOUtils.toString(resourceAsStream));
    } catch (IOException e) {
        e.printStackTrace();
    }
    page.setRequest(new Request("https://github.com/code4craft/webmagic"));
    page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
    return page;
}
Example #27
Source File: GithubRepoPageProcessor.java From SmartEducation with Apache License 2.0 | 5 votes |
@Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { // 部分二:定义如何抽取页面信息,并保存下来 String author=page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString(); page.putField("author",author ); String name=page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString(); page.putField("name", name); if (page.getResultItems().get("name") == null) { //skip this page page.setSkip(true); } String readme=page.getHtml().xpath("//div[@id='readme']/tidyText()").toString(); page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); // if(name==null){ // Test test=new Test(); // test.setAuthor(author); // test.setName(name); // test.setReadme(readme); // testService.save(test); // } // 部分三:从页面发现后续的url地址来抓取 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); }
Example #28
Source File: HttpClientDownloader.java From webmagic with Apache License 2.0 | 5 votes |
@Override public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { proxyProvider.returnProxy(proxy, page, task); } } }
Example #29
Source File: HuabanProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Queues every huaban.com link on the page. Pages whose URL contains
 * {@code pins} get their image source stored under {@code img}; all other
 * pages have their result items marked as skipped.
 *
 * @param page the crawled page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());

    boolean isPinPage = page.getUrl().toString().contains("pins");
    if (!isPinPage) {
        page.getResultItems().setSkip(true);
        return;
    }
    String imageSrc = page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString();
    page.putField("img", imageSrc);
}
Example #30
Source File: DocumentTest.java From SmartEducation with Apache License 2.0 | 5 votes |
/**
 * Hands course detail pages (URLs of the form
 * {@code http://mooc.chaoxing.com/course/<digits>.html}) to
 * {@code crawCourseInfo}; other pages are ignored.
 *
 * @param page the crawled page
 */
@Override
public void process(Page page) {
    String courseMatch = page.getUrl()
            .regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html")
            .toString();
    if (courseMatch == null) {
        return;
    }
    System.out.println("第二层");
    crawCourseInfo(page);
}