us.codecraft.webmagic.Page Java Examples

Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0

6 votes

/**
 * Checks that a request flagged as binary content produces a page that carries
 * the raw bytes and no decoded text.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer mockServer = httpServer(13423);
    mockServer.response("binary");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            final HttpClientDownloader downloader = new HttpClientDownloader();

            Request binaryRequest = new Request();
            binaryRequest.setBinaryContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");

            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());

            // Raw text must stay unset for binary requests; only the bytes are filled in.
            assertThat(downloaded.getRawText()).isNull();
            assertThat(downloaded.getBytes()).isEqualTo("binary".getBytes());
        }
    });
}

Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0

6 votes

/**
 * Manual check (ignored by default): downloads a Baidu Wenku document through the
 * Selenium downloader and prints the text of every {@code div.inner} element with
 * tags and non-breaking-space entities stripped.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // The entity is "&nbsp;"; the previous "&nsbp;" spelling never matched anything.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}

Source File: ProcessorBenchmark.java From webmagic with Apache License 2.0

6 votes

/**
 * Rough benchmark (ignored by default): processes the same page in two consecutive
 * batches of 1000 iterations and prints the elapsed milliseconds of each batch.
 */
@Ignore
@Test
public void test() {
    ModelPageProcessor processor = ModelPageProcessor.create(Site.me(), OschinaBlog.class);

    Page samplePage = new Page();
    samplePage.setRequest(new Request("http://my.oschina.net/flashsword/blog"));
    samplePage.setUrl(new PlainText("http://my.oschina.net/flashsword/blog"));
    samplePage.setHtml(new Html(html));

    // Two identical batches, timed and reported separately.
    for (int batch = 0; batch < 2; batch++) {
        long startedAt = System.currentTimeMillis();
        int remaining = 1000;
        while (remaining-- > 0) {
            processor.process(samplePage);
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }
}

Source File: BlogProcesser.java From mogu_blog_v2 with Apache License 2.0

6 votes

/**
 * Extracts the blog title and content from the page and stores them as result
 * fields; marks the page as skipped when no title is found.
 *
 * @param page the downloaded page to extract from
 */
private void saveBlogInfo(Page page) {
        // Step 2: pull out what we need, the title and the content.
        final String articleTitle = page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
        final String articleBody = page.getHtml().xpath("//*[@id=\"article_content\"]").toString();

        if (articleTitle == null) {
            // Nothing usable here: skip this page.
            page.setSkip(true);
            return;
        }

        page.putField("title", articleTitle);
        page.putField("content", articleBody);
    }

Source File: HttpClientDownloader.java From webmagic with Apache License 2.0

6 votes

/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>The body bytes are always stored on the page. Unless the request is flagged
 * as binary content, the bytes are also decoded into raw text using
 * {@code charset}, or the result of {@code getHtmlCharset(contentType, bytes)}
 * when {@code charset} is {@code null}.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode the body with, or {@code null} to detect it
 * @param httpResponse the response to read
 * @param task         the task being executed (not read by this method)
 * @return a page marked as successfully downloaded
 * @throws IOException if reading the body fails or the charset name is unsupported
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // A response may legitimately carry no entity (e.g. HEAD, 204, 304); treat that as an empty body
    // instead of failing with a NullPointerException.
    boolean hasEntity = httpResponse.getEntity() != null;
    byte[] bytes = hasEntity ? IOUtils.toByteArray(httpResponse.getEntity().getContent()) : new byte[0];
    String contentType = hasEntity && httpResponse.getEntity().getContentType() != null
            ? httpResponse.getEntity().getContentType().getValue()
            : "";
    Page page = new Page();
    page.setBytes(bytes);
    if (!request.isBinaryContent()) {
        if (charset == null) {
            charset = getHtmlCharset(contentType, bytes);
        }
        page.setCharset(charset);
        page.setRawText(new String(bytes, charset));
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    // Headers are copied only when the responseHeader flag is enabled.
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}

Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0

6 votes

/**
 * Checks that a cookie added to the request is sent to the server: the mock
 * server only answers "ok" when the expected cookie is present.
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer mockServer = httpServer(13423);
    mockServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();

            Request cookieRequest = new Request();
            cookieRequest.setUrl("http://127.0.0.1:13423");
            cookieRequest.addCookie("cookie", "cookie-webmagic");

            Page downloaded = downloader.download(cookieRequest, Site.me().toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}

Source File: HttpClientDownloader.java From plumemo with Apache License 2.0

6 votes

/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>The body bytes are always stored on the page. Unless the request is flagged
 * as binary content, the bytes are also decoded into raw text using
 * {@code charset}, or the result of {@code getHtmlCharset(contentType, bytes)}
 * when {@code charset} is {@code null}.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode the body with, or {@code null} to detect it
 * @param httpResponse the response to read
 * @param task         the task being executed (not read by this method)
 * @return a page marked as successfully downloaded
 * @throws IOException if reading the body fails or the charset name is unsupported
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // Entity-less responses (e.g. HEAD, 204, 304) are handled as an empty body rather than
    // dereferencing a null entity.
    boolean hasEntity = httpResponse.getEntity() != null;
    byte[] bytes = hasEntity ? IOUtils.toByteArray(httpResponse.getEntity().getContent()) : new byte[0];
    String contentType = hasEntity && httpResponse.getEntity().getContentType() != null
            ? httpResponse.getEntity().getContentType().getValue()
            : "";
    Page page = new Page();
    page.setBytes(bytes);
    if (!request.isBinaryContent()) {
        if (charset == null) {
            charset = getHtmlCharset(contentType, bytes);
        }
        page.setCharset(charset);
        page.setRawText(new String(bytes, charset));
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    // Headers are copied only when the responseHeader flag is enabled.
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}

Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0

6 votes

/**
 * Manual check (ignored: needs a chrome driver): downloads the huaban.com front
 * page 100 times, prints the "pins" links found in the waterfall element on each
 * pass, then prints the total elapsed milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    final long startedAt = System.currentTimeMillis();
    int round = 0;
    while (round < 100) {
        // A fresh anonymous task is supplied on every round.
        Task huabanTask = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page fetched = downloader.download(new Request("http://huaban.com/"), huabanTask);
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }
    System.out.println(System.currentTimeMillis() - startedAt);
}

Source File: DownloadPicture.java From Gather-Platform with GNU General Public License v3.0

6 votes

/**
 * Parses the page's raw text as JSON and collects, for every entry of the
 * {@code imgs} array that has an {@code objURL}, that URL together with the
 * entry's {@code fromPageTitleEnc} value. The collected lists are handed to
 * {@code setUrls} and {@code setNames}.
 *
 * <p>An empty body or a missing {@code imgs} array yields empty lists instead of
 * a {@link NullPointerException}.
 *
 * @param page the downloaded page whose raw text is the JSON payload
 */
@Override
public void process(Page page) {
    List<String> urlList = new ArrayList<>();
    List<String> nameList = new ArrayList<>();
    JSONObject jsonObject = (JSONObject) JSONObject.parse(page.getRawText());
    JSONArray data = jsonObject == null ? null : (JSONArray) jsonObject.get("imgs");
    if (data != null) {
        for (int i = 0; i < data.size(); i++) {
            // Look the entry up once and read both attributes from it.
            JSONObject entry = data.getJSONObject(i);
            String url = (String) entry.get("objURL");
            String name = (String) entry.get("fromPageTitleEnc");
            if (url != null) {
                urlList.add(url);
                nameList.add(name);
            }
        }
    }
    setUrls(urlList);
    setNames(nameList);
}

Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0

6 votes

/**
 * Checks that a cookie configured on the site is sent to the server: the mock
 * server only answers "ok" when the expected cookie is present.
 */
@Test
public void test_set_site_cookie() throws Exception {
    HttpServer mockServer = httpServer(13423);
    mockServer.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(mockServer, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();

            Request plainRequest = new Request();
            plainRequest.setUrl("http://127.0.0.1:13423");

            // The cookie lives on the site configuration, not on the request.
            Site cookieSite = Site.me().addCookie("cookie", "cookie-webmagic").setDomain("127.0.0.1");

            Page downloaded = downloader.download(plainRequest, cookieSite.toTask());
            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}

Source File: ProfessionTypeSpider.java From SmartEducation with Apache License 2.0

6 votes

/**
 * Extracts the profession-type names and links from the category list, publishes
 * both lists as result fields, and saves one {@code SpiderProfessionType} per name.
 *
 * @param page the downloaded category page
 */
@Override
public void process(Page page) {
    // Example markup: <li><a href="/category/01">哲学</a></li>
    // Select the names.
    List<String> professionTypeNameList = page.getHtml()
            .xpath("//ul[@class='category']/li/a/html()").all();
    page.putField("professionName", professionTypeNameList);
    // Select the URLs. They get their own key; reusing "professionName" overwrote the names above.
    List<String> professionTypeUrlList = page.getHtml().xpath("//ul[@class='category']/li/a/@href").all();
    page.putField("professionUrl", professionTypeUrlList);

    // Names and URLs are paired by position.
    for (int i = 0; i < professionTypeNameList.size(); i++) {
        SpiderProfessionType model = new SpiderProfessionType(professionTypeNameList.get(i), professionTypeUrlList.get(i));
        spiderProfessionTypeService.save(model);
    }
}

Source File: CasperjsDownloader.java From spider with GNU General Public License v3.0

6 votes

/**
 * Downloads the requested URL through casperjs and wraps the returned HTML in a
 * {@link Page}.
 *
 * <p>When gathering fails, the request is re-queued via {@code addToCycleRetry}
 * if the task's site allows cycle retries; otherwise the exception is attached to
 * the request under the {@code "EXCEPTION"} extra, {@code onError} is invoked and
 * {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be {@code null}, in which case no retry is attempted
 * @return the downloaded page, the result of {@code addToCycleRetry}, or {@code null} on failure
 */
@Override
public Page download(Request request, Task task) {
    String html = null;
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site stays null when no task was supplied; guard it so the failure is reported
        // through onError instead of being masked by a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}

Source File: ContentImageProcessor.java From javabase with Apache License 2.0

6 votes

/**
 * Collects the converted image URLs of a content page. A non-empty result is
 * stored in {@code map} under a key built from the page id; otherwise a
 * "no image" message is published through {@code redisTemplate}. On the first
 * call only, one target request per entry of {@code pageNumberList} is queued.
 *
 * @param page the downloaded content page
 */
@Override
public void process(Page page) {
    List<String> candidates = page.getHtml().$(".BDE_Image", "src").all();
    String pageId = page.getUrl().toString().replace(tieBaConfiguration.getTiebaContentPageUrl(), "");

    List<String> accepted = new ArrayList<>();
    for (String candidate : candidates) {
        if (!candidate.startsWith(tieBaConfiguration.getTiebaImageUrl())) {
            continue;
        }
        String converted = convertImageUrl(candidate);
        if (converted != null) {
            accepted.add(converted);
        }
    }

    if (accepted.isEmpty()) {
        redisTemplate.convertAndSend(tieBaConfiguration.getTiebaContentNoImageIdTopic(), JSONObject.toJSONString(new ContentBean(pageId, tiebaName)));
    } else {
        map.put(WebmagicService.getByte(TieBaImageIdMessageListener.TIEBA_CONTENT_IMAGE_KEY + pageId), WebmagicService.getByte(JSONObject.toJSONString(accepted)));
    }

    // Follow-up pages are queued a single time.
    if (isAddTarget) {
        return;
    }
    for (String id : pageNumberList) {
        String target = new StringBuilder().append(url).append(id).toString();
        page.addTargetRequests(Arrays.asList(target));
    }
    isAddTarget = true;
}

Source File: ConfigurablePageProcessor.java From webmagic with Apache License 2.0

6 votes

/**
 * Applies every configured extract rule to the page. A rule flagged not-null
 * whose selection comes back empty (multi) or {@code null} (single) marks the
 * page as skipped; otherwise the selection is stored under the rule's field name.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    for (ExtractRule rule : extractRules) {
        if (!rule.isMulti()) {
            // Single-value rule.
            String single = page.getHtml().selectDocument(rule.getSelector());
            boolean requiredButMissing = rule.isNotNull() && single == null;
            if (requiredButMissing) {
                page.setSkip(true);
            } else {
                page.getResultItems().put(rule.getFieldName(), single);
            }
            continue;
        }

        // Multi-value rule.
        List<String> many = page.getHtml().selectDocumentForList(rule.getSelector());
        boolean requiredButEmpty = rule.isNotNull() && many.isEmpty();
        if (requiredButEmpty) {
            page.setSkip(true);
        } else {
            page.getResultItems().put(rule.getFieldName(), many);
        }
    }
}

Source File: ContentIdProcessor.java From javabase with Apache License 2.0

6 votes

/**
 * Reads the {@code data-field} JSON of each thread list item (1..pageSize) and,
 * for every item that carries an {@code id}, records a {@code ContentBean} in
 * {@code pageNumberList}. On the first call only, the list pages 2..endNum are
 * queued as target requests.
 *
 * @param page the downloaded thread list page
 */
@Override
public void process(Page page) {
    for (int i = 1; i <= pageSize; i++) {
        String json = page.getHtml().xpath("//ul[@id='thread_list']/li[@class='j_thread_list clearfix'][" + (i) + "]/@data-field").toString();
        if (json == null) {
            continue;
        }
        // Parse once and reuse the result; the payload used to be parsed twice per item.
        JSONObject jsonObject = JSONObject.parseObject(json);
        if (!jsonObject.containsKey("id")) {
            continue;
        }
        String pageId = jsonObject.getString("id");
        String authorName = jsonObject.getString("author_name");
        String date = praseDate(page, i);
        String title = page.getHtml().xpath("a[@href='" + tieBaConfiguration.getTiebaContentPageUrl() + pageId + "']/@title").toString();

        pageNumberList.add(new ContentBean(pageId, date, tiebaName, authorName, title));
    }

    // Follow-up list pages are queued a single time.
    if (!isAddTarget) {
        for (int i = 2; i <= endNum; i++) {
            StringBuilder sb = new StringBuilder();
            sb.append(tiebaUrl).append("&pn=").append(i * pageSize);
            page.addTargetRequests(Arrays.asList(sb.toString()));
        }
        isAddTarget = true;
    }
}

Source File: CourseSpider.java From SmartEducation with Apache License 2.0

6 votes

/**
 * Dispatches the page to the matching crawl step based on its URL: category
 * listing pages go to {@code crawerCourse}, course detail pages go to
 * {@code crawCourseInfo}; any other URL is ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // Format: http://mooc.chaoxing.com/category/01/0/1000
    String categoryMatch = page.getUrl().regex("http://mooc\\.chaoxing\\.com/category/\\d+/\\d/\\d+").toString();
    if (categoryMatch != null) {
        System.out.println("第一层");
        crawerCourse(page);
        return;
    }

    // Format: http://mooc.chaoxing.com/course/55672.html
    String courseMatch = page.getUrl().regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html").toString();
    if (courseMatch != null) {
        System.out.println("第二层");
        crawCourseInfo(page);
    }
}

Source File: DiandianBlogProcessor.java From webmagic with Apache License 2.0

6 votes

/**
 * Queues every post link found on the page and publishes the title, content,
 * date and id of the current page as result fields.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    // getHtml() returns the Html object and supports chained calls;
    // links() extracts all links, regex() filters them by pattern,
    // toString() takes a single result while all() takes every result.
    List<String> postLinks = page.getHtml().links().regex("(.*/post/.*)").all();

    // addTargetRequests() adds the links to the queue of pages still to be crawled.
    page.addTargetRequests(postLinks);

    // putField(key, value) adds an extracted value to the result map; xpath() extracts with XPath.
    page.putField(
            "title",
            page.getHtml().xpath("//title").regex("(.*?)\\|").toString());

    // smartContent() extracts the main text with a readability-style technique,
    // which works fairly well on regularly structured pages.
    page.putField(
            "content",
            page.getHtml().smartContent());

    // Date and id both come from the page URL.
    page.putField(
            "date",
            page.getUrl().regex("post/(\\d+-\\d+-\\d+)/"));
    page.putField(
            "id",
            page.getUrl().regex("post/\\d+-\\d+-\\d+/(\\d+)"));
}

Source File: ModelPageProcessor.java From webmagic with Apache License 2.0

6 votes

/**
 * Collects links from the page (from the whole document, or from the region
 * chosen by {@code urlRegionSelector} when one is given) and queues a target
 * request for the matched part of every link/pattern pair that matches.
 *
 * @param page              the page to read links from and add requests to
 * @param urlRegionSelector optional region selector; {@code null} means the whole page
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidateLinks = (urlRegionSelector != null)
            ? page.getHtml().selectList(urlRegionSelector).links().all()
            : page.getHtml().links().all();

    for (String candidate : candidateLinks) {
        for (Pattern pattern : urlPatterns) {
            Matcher hit = pattern.matcher(candidate);
            if (!hit.find()) {
                continue;
            }
            // Only the matched portion of the link becomes the request URL.
            page.addTargetRequest(new Request(hit.group()));
        }
    }
}

Source File: NeteaseNewsPageProcesser.java From elasticsearch-jest-example with MIT License

5 votes

/**
 * Handles list pages by queueing their post and list links; every other page is
 * treated as an article: its fields are extracted, stored as result fields,
 * turned into an {@code Article} and indexed.
 *
 * @param page the downloaded page
 */
public void process(Page page) {
    boolean isListPage = page.getUrl().regex(URL_LIST).match()
            || page.getUrl().regex("http://news\\.163\\.com/domestic").match()
            || page.getUrl().regex("http://news\\.163\\.com/shehui").match();

    // List page: only follow links.
    if (isListPage) {
        page.addTargetRequests(page.getHtml().links().regex(URL_POST).all());
        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
        return;
    }

    // Article page: publish the extracted fields first.
    page.putField("title", Utils.replaceHTML(page.getHtml().xpath("//h1[@id='h1title']").toString()));
    page.putField("content", Utils.replaceHTML(page.getHtml().xpath("//div[@id='endText']").toString()));
    page.putField("create", Utils.replaceHTML(page.getHtml().xpath("//div[@class=\"ep-time-soure cDGray\"]").toString()));
    page.putField("source", Utils.replaceHTML(page.getHtml().xpath("//a[@id=\"ne_article_source\"]/text()").toString()));
    page.putField("url", page.getUrl().get());

    // Read the values back from the result items to build the article.
    String articleTitle = (String) page.getResultItems().get("title");
    String articleContent = (String) page.getResultItems().get("content");
    String createdAt = (String) page.getResultItems().get("create");
    String articleSource = (String) page.getResultItems().get("source");
    String articleUrl = (String) page.getResultItems().get("url");
    String articleAuthor = "";

    // Create the article and index it.
    Article article = Utils.createArticle(articleTitle, articleContent, articleSource, articleAuthor, articleUrl, createdAt);
    Utils.index(article);
}

Source File: GithubRepoPageProcessor.java From webmagic with Apache License 2.0

5 votes

/**
 * Queues repository and user links found on the page, then builds a
 * {@code GithubRepo} from the page. The repo is published under the "repo" field
 * unless it has no name, in which case the page is skipped.
 *
 * @param page the downloaded GitHub page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());

    GithubRepo repo = new GithubRepo();
    repo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
    repo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString());
    repo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());

    if (repo.getName() != null) {
        page.putField("repo", repo);
        return;
    }
    // No repository name was found: skip this page.
    page.setSkip(true);
}

Source File: ZhihuPageProcessor.java From webmagic with Apache License 2.0

5 votes

/**
 * Queues answer links and publishes the question title, question body and
 * answer text; the page is skipped when no title was extracted.
 *
 * @param page the downloaded Zhihu page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(
            page.getHtml().links().regex("https://www\\.zhihu\\.com/question/\\d+/answer/\\d+.*").all());

    page.putField(
            "title",
            page.getHtml().xpath("//h1[@class='QuestionHeader-title']/text()").toString());
    page.putField(
            "question",
            page.getHtml().xpath("//div[@class='QuestionRichText']//tidyText()").toString());
    page.putField(
            "answer",
            page.getHtml().xpath("//div[@class='QuestionAnswer-content']/tidyText()").toString());

    boolean titleMissing = page.getResultItems().get("title") == null;
    if (titleMissing) {
        // skip this page
        page.setSkip(true);
    }
}

Source File: AmanzonPageProcessor.java From webmagic with Apache License 2.0

5 votes

/**
 * Prints, for every data row of the question table (the header row is skipped),
 * the raw row followed by its title text, answer count and creation time.
 *
 * @param page the downloaded page
 */
public void process(Page page) {
        Html document = page.getHtml();
        List<String> rows = document.xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr").all();

        // Nothing to do without at least one row after the header.
        if (rows == null || rows.size() <= 1) {
            return;
        }

        // Row 0 holds the column names, so the data starts at row 1.
        for (String row : rows.subList(1, rows.size())) {
            System.out.println(row);

            // Wrap the row in a table so it can be parsed as a standalone fragment.
            Html rowHtml = Html.create("<table>" + row + "</table>");

            String comment = rowHtml.xpath("//td[@class='title']//a/text()").toString();
            System.out.println(comment);

            String answerNum = rowHtml.xpath("//td[@class='num']/text()").toString();
            System.out.println(answerNum);

            String createTime = rowHtml.xpath("//td[3]/text()").toString();
            System.out.println(createTime);
        }
    }

Source File: GithubRepoPageProcessor.java From webmagic with Apache License 2.0

5 votes

/**
 * Queues repository and user links found on the page and publishes the author,
 * repository name and readme as result fields. The page is skipped when no
 * repository name was extracted.
 *
 * @param page the downloaded GitHub page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all());
    // The user-profile pattern needs the '+' quantifier; without it only the first
    // character of the user name was captured and queued.
    page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all());
    // Accept hyphens in the author name, consistent with the link patterns above.
    page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString());
    page.putField("name", page.getHtml().xpath("//h1[@class='public']/strong/a/text()").toString());
    if (page.getResultItems().get("name") == null) {
        //skip this page
        page.setSkip(true);
    }
    page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
}

Source File: SSLCompatibilityTest.java From webmagic with Apache License 2.0

5 votes

/**
 * Checks that an HTTPS page can be downloaded successfully.
 */
@Test
public void test_tls12() throws Exception {
    HttpClientDownloader downloader = new HttpClientDownloader();
    Task retryingTask = Site.me().setCycleRetryTimes(5).toTask();
    Page downloaded = downloader.download(new Request("https://juejin.im/"), retryingTask);

    boolean succeeded = downloaded.isDownloadSuccess();
    assertThat(succeeded).isTrue();
}

Source File: ZipCodePageProcessor.java From webmagic with Apache License 2.0

5 votes

/**
 * Queues one request per table cell of the country page, using the cell's link
 * as the URL and its text as the {@code "province"} extra.
 *
 * @param page the downloaded country page
 */
private void processCountry(Page page) {
    List<String> cells = page.getHtml().xpath("//*[@id=\"newAlexa\"]/table/tbody/tr/td").all();
    for (String cell : cells) {
        String href = xpath("//@href").select(cell);
        String provinceName = xpath("/text()").select(cell);
        page.addTargetRequest(
                new Request(href)
                        .setPriority(0)
                        .putExtra("province", provinceName));
    }
}

Source File: MockGithubDownloader.java From webmagic with Apache License 2.0

5 votes

/**
 * Returns a canned page: the raw text comes from the classpath resource
 * {@code /html/mock-github.html}, and the request and URL are fixed to the
 * webmagic repository. The {@code request} and {@code task} arguments are ignored.
 *
 * @param request ignored
 * @param task    ignored
 * @return the mock page; its raw text is left unset if the resource cannot be read
 */
@Override
public Page download(Request request, Task task) {
    Page page = new Page();
    // try-with-resources closes the stream, which was previously leaked; the content is
    // decoded as UTF-8 explicitly rather than with the platform default charset.
    try (InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html")) {
        page.setRawText(IOUtils.toString(resourceAsStream, "UTF-8"));
    } catch (IOException e) {
        e.printStackTrace();
    }
    page.setRequest(new Request("https://github.com/code4craft/webmagic"));
    page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
    return page;
}

Source File: GithubRepoPageProcessor.java From SmartEducation with Apache License 2.0

5 votes

@Override
    /**
     * Core hook for the crawler's custom logic: extracts the author, repository
     * name and readme of the page, skips the page when no name is found, and
     * queues further repository links.
     *
     * @param page the downloaded GitHub page
     */
    public void process(Page page) {
        // Part two: define how page information is extracted and stored.
        String author = page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString();
        page.putField("author", author);
        String name = page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString();
        page.putField("name", name);

        if (page.getResultItems().get("name") == null) {
            //skip this page
            page.setSkip(true);
        }
        // The readme is stored as the selection itself; the former unused local that
        // evaluated the same XPath a second time has been dropped.
        page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));

        // Part three: discover follow-up URLs on the page to crawl next.
        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
    }

Source File: HttpClientDownloader.java From webmagic with Apache License 2.0

5 votes

/**
 * Executes the request with an HTTP client configured for the task's site and
 * converts the response into a {@link Page}.
 *
 * <p>On an {@link IOException} the failure is logged, {@code onError} is invoked
 * and the initial {@code Page.fail()} instance is returned instead of throwing.
 *
 * @param request the request to download
 * @param task    the owning task; it and its site must not be {@code null}
 * @return the downloaded page, or the {@code Page.fail()} instance on I/O failure
 * @throws NullPointerException if {@code task} or its site is {@code null}
 */
@Override
public Page download(Request request, Task task) {
    if (task == null || task.getSite() == null) {
        throw new NullPointerException("task or site can not be null");
    }
    CloseableHttpResponse httpResponse = null;
    CloseableHttpClient httpClient = getHttpClient(task.getSite());
    // A proxy is requested only when a proxy provider is configured.
    Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
    HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
    // Start from the failure page so the catch and finally blocks always have a page to work with.
    Page page = Page.fail();
    try {
        httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
        // The request's own charset takes precedence over the site's charset.
        page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
        onSuccess(request);
        logger.info("downloading page success {}", request.getUrl());
        return page;
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        onError(request);
        return page;
    } finally {
        if (httpResponse != null) {
            //ensure the connection is released back to pool
            EntityUtils.consumeQuietly(httpResponse.getEntity());
        }
        // Hand the proxy back together with the page (the failure page if the download did not succeed).
        if (proxyProvider != null && proxy != null) {
            proxyProvider.returnProxy(proxy, page, task);
        }
    }
}

Source File: HuabanProcessor.java From webmagic with Apache License 2.0

5 votes

/**
 * Queues every huaban.com link on the page. Pages whose URL contains "pins"
 * publish their image source under the "img" field; all other pages have their
 * result items marked as skipped.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    page.addTargetRequests(page.getHtml().links().regex("http://huaban\\.com/.*").all());

    boolean isPinPage = page.getUrl().toString().contains("pins");
    if (!isPinPage) {
        page.getResultItems().setSkip(true);
        return;
    }
    page.putField("img", page.getHtml().xpath("//div[@class='image-holder']/a/img/@src").toString());
}

Source File: DocumentTest.java From SmartEducation with Apache License 2.0

5 votes

/**
 * Hands course detail pages (URLs of the form
 * {@code http://mooc.chaoxing.com/course/<digits>.html}) to
 * {@code crawCourseInfo}; other pages are ignored.
 *
 * @param page the downloaded page
 */
@Override
public void process(Page page) {
    String courseMatch = page.getUrl().regex("http://mooc\\.chaoxing\\.com/course/\\d+\\.html").toString();
    if (courseMatch == null) {
        return;
    }
    System.out.println("第二层");
    crawCourseInfo(page);
}

us.codecraft.webmagic.Page Java Examples