us.codecraft.webmagic.Request Java Examples
The following examples show how to use
us.codecraft.webmagic.Request.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a request flagged as binary content produces a page whose raw text is
 * {@code null} and whose bytes match what the stub server returned.
 */
@Test
public void test_download_binary_content() throws Exception {
    HttpServer server = httpServer(13423);
    server.response("binary");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            // Build the binary-flagged request first, then the downloader that executes it.
            Request binaryRequest = new Request();
            binaryRequest.setBinaryContent(true);
            binaryRequest.setUrl("http://127.0.0.1:13423/");

            final HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(binaryRequest, Site.me().toTask());

            assertThat(downloaded.getRawText()).isNull();
            byte[] expectedBytes = "binary".getBytes();
            assertThat(downloaded.getBytes()).isEqualTo(expectedBytes);
        }
    });
}
Example #2
Source File: HttpClientDownloader.java From plumemo with Apache License 2.0 | 6 votes |
/**
 * Converts an HTTP response into a {@link Page}.
 *
 * <p>The response body is always stored as bytes. Unless the request is flagged as binary
 * content, the body is also decoded to text using {@code charset}, or a charset obtained
 * from {@code getHtmlCharset} when {@code charset} is {@code null}.
 *
 * @param request      the request that produced the response
 * @param charset      charset to decode the body with, or {@code null} to detect it
 * @param httpResponse the raw HTTP response
 * @param task         the task the request belongs to (not used by this method)
 * @return the populated page, marked as successfully downloaded
 * @throws IOException if the response body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // NOTE(review): getEntity() is dereferenced without a null check - confirm callers never pass entity-less responses.
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType;
    if (httpResponse.getEntity().getContentType() == null) {
        contentType = "";
    } else {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page result = new Page();
    result.setBytes(body);

    if (!request.isBinaryContent()) {
        String effectiveCharset = charset != null ? charset : getHtmlCharset(contentType, body);
        result.setCharset(effectiveCharset);
        result.setRawText(new String(body, effectiveCharset));
    }

    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);

    if (responseHeader) {
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
Example #3
Source File: CovertUtil.java From vscrawler with Apache License 2.0 | 6 votes |
/**
 * Converts a webmagic {@link Request} into a vscrawler {@link Seed}.
 *
 * <p>Only requests with an empty method or the GET method (case-insensitive) are converted;
 * any other method is logged and skipped. The request extras, when present, are copied into
 * the seed's ext map: string values are kept as-is, all other values are serialized to JSON.
 *
 * @param request the webmagic request to convert
 * @return the seed for the request URL, or {@code null} when the request method is not GET
 */
public static Seed covertRequest(Request request) {
    if (StringUtils.isNotEmpty(request.getMethod()) && !StringUtils.equalsIgnoreCase(request.getMethod(), "get")) {
        // The original message claimed the "get method" was unsupported, which is the opposite of this check.
        log.warn("vscrawler only supports webmagic GET requests, this request {} will be ignored", request.getUrl());
        return null;
    }
    Seed seed = new Seed(request.getUrl());
    // Extras may be absent; Maps.transformEntries rejects a null map, so only copy when present.
    if (request.getExtras() != null) {
        seed.setExt(Maps.transformEntries(request.getExtras(), new Maps.EntryTransformer<String, Object, String>() {
            @Override
            public String transformEntry(String key, Object value) {
                if (value instanceof String) {
                    return (String) value;
                }
                return JSONObject.toJSONString(value);
            }
        }));
    }
    return seed;
}
Example #4
Source File: CommonSpider.java From Gather-Platform with GNU General Public License v3.0 | 6 votes |
@Override protected void onSuccess(Request request) { super.onSuccess(request); Task task = taskManager.getTaskById(this.getUUID()); boolean reachMax = false, exceedRatio = false; if ( ( //已抓取数量大于最大抓取页数,退出 (reachMax = (SPIDER_INFO.getMaxPageGather() > 0 && task.getCount() >= SPIDER_INFO.getMaxPageGather())) || //如果抓取页面超过最大抓取数量ratio倍的时候,仍未达到最大抓取数量,爬虫也退出 (exceedRatio = (this.getPageCount() > SPIDER_INFO.getMaxPageGather() * staticValue.getCommonsWebpageCrawlRatio())) ) && this.getStatus() == Status.Running) { LOG.info("爬虫ID{}已处理{}个页面,有效页面{}个,最大抓取页数{},reachMax={},exceedRatio={},退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio); task.setDescription("爬虫ID%s已处理%s个页面,有效页面%s个,达到最大抓取页数%s,reachMax=%s,exceedRatio=%s,退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio); this.stop(); } }
Example #5
Source File: PhantomJSDownloader.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Runs the PhantomJS command against the request URL and returns everything the process
 * writes to its standard output, with each line terminated by {@code "\n"}.
 *
 * @param request the request whose URL is passed to the crawl script
 * @return the captured process output, or {@code null} if an {@link IOException} occurred
 */
protected String getPage(Request request) {
    try {
        String url = request.getUrl();
        // NOTE(review): the command line is built by string concatenation and tokenized on
        // whitespace by Runtime.exec(String); a URL containing spaces or shell-significant
        // text will be split into extra arguments. Consider an argument-array form once it
        // is confirmed that phantomJsCommand never carries its own arguments.
        Process process = Runtime.getRuntime().exec(phantomJsCommand + " " + crawlJsPath + " " + url);
        // try-with-resources closes the reader (and the underlying process stream) on every path.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            StringBuilder output = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
            return output.toString();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
Example #6
Source File: ModelPageProcessor.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Collects links from the page and queues every URL-pattern match as a target request.
 *
 * <p>Links come from the whole page when {@code urlRegionSelector} is {@code null},
 * otherwise only from the regions it selects. Each link is tested against each pattern;
 * for every pattern that finds a match, the matched text (group 0) is added as a request.
 *
 * @param page              the page to read links from and add target requests to
 * @param urlRegionSelector selector restricting where links are taken from, or {@code null}
 * @param urlPatterns       patterns a link must match to be queued
 */
private void extractLinks(Page page, Selector urlRegionSelector, List<Pattern> urlPatterns) {
    List<String> candidates = (urlRegionSelector == null)
            ? page.getHtml().links().all()
            : page.getHtml().selectList(urlRegionSelector).links().all();

    for (String candidate : candidates) {
        for (Pattern pattern : urlPatterns) {
            Matcher match = pattern.matcher(candidate);
            if (!match.find()) {
                continue;
            }
            page.addTargetRequest(new Request(match.group(0)));
        }
    }
}
Example #7
Source File: CommonSpider.java From spider with GNU General Public License v3.0 | 6 votes |
@Override protected void onSuccess(Request request) { super.onSuccess(request); Task task = taskManager.getTaskById(this.getUUID()); boolean reachMax = false, exceedRatio = false; if ( ( //已抓取数量大于最大抓取页数,退出 (reachMax = (SPIDER_INFO.getMaxPageGather() > 0 && task.getCount() >= SPIDER_INFO.getMaxPageGather())) || //如果抓取页面超过最大抓取数量ratio倍的时候,仍未达到最大抓取数量,爬虫也退出 (exceedRatio = (this.getPageCount() > SPIDER_INFO.getMaxPageGather() * staticValue.getCommonsWebpageCrawlRatio() && SPIDER_INFO.getMaxPageGather() > 0)) ) && this.getStatus() == Status.Running) { LOG.info("爬虫ID{}已处理{}个页面,有效页面{}个,最大抓取页数{},reachMax={},exceedRatio={},退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio); task.setDescription("爬虫ID%s已处理%s个页面,有效页面%s个,达到最大抓取页数%s,reachMax=%s,exceedRatio=%s,退出.", this.getUUID(), this.getPageCount(), task.getCount(), SPIDER_INFO.getMaxPageGather(), reachMax, exceedRatio); this.stop(); } }
Example #8
Source File: CasperjsDownloader.java From spider with GNU General Public License v3.0 | 6 votes |
/**
 * Downloads the request URL through casperjs and wraps the resulting HTML in a {@link Page}.
 *
 * <p>On failure the request is rescheduled via {@code addToCycleRetry} when the task's site
 * allows cycle retries; otherwise the exception is stored in the request extras under
 * {@code "EXCEPTION"}, {@code onError} is invoked and {@code null} is returned.
 *
 * @param request the request to download
 * @param task    the owning task; may be {@code null}, in which case no retry is attempted
 * @return the downloaded page, the retry result, or {@code null} on unrecoverable failure
 */
@Override
public Page download(Request request, Task task) {
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    String html;
    try {
        html = casperjs.gatherHtml(new com.gs.spider.model.commons.Request(request.getUrl(), true));
    } catch (Exception e) {
        // site is null when no task was supplied; the unguarded dereference used to replace
        // the real failure with a NullPointerException.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        request.putExtra("EXCEPTION", e);
        onError(request);
        return null;
    }
    Page page = new Page();
    page.setRawText(html);
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    onSuccess(request);
    return page;
}
Example #9
Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Manual benchmark (ignored by default): downloads the same page 100 times with the
 * Selenium downloader, prints the matching links of each page and finally the elapsed
 * time in milliseconds.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startMillis = System.currentTimeMillis();
    int round = 0;
    while (round < 100) {
        Request request = new Request("http://huaban.com/");
        Page fetched = downloader.download(request, new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        });
        System.out.println(fetched.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println(elapsedMillis);
}
Example #10
Source File: HttpClientDownloader.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Builds a {@link Page} from an HTTP response.
 *
 * <p>The body bytes are always attached. For non-binary requests the body is additionally
 * decoded to raw text with {@code charset}, falling back to {@code getHtmlCharset} when no
 * charset was supplied.
 *
 * @param request      the request that produced the response
 * @param charset      charset used for decoding, or {@code null} to detect it
 * @param httpResponse the raw HTTP response
 * @param task         the owning task (not used by this method)
 * @return the populated page, flagged as a successful download
 * @throws IOException if reading or decoding the body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // NOTE(review): getEntity() is dereferenced without a null check - confirm callers never pass entity-less responses.
    final byte[] content = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    final boolean hasContentType = httpResponse.getEntity().getContentType() != null;
    final String contentType = hasContentType ? httpResponse.getEntity().getContentType().getValue() : "";

    final Page page = new Page();
    page.setBytes(content);

    final boolean textual = !request.isBinaryContent();
    if (textual) {
        String decodeWith = charset;
        if (decodeWith == null) {
            decodeWith = getHtmlCharset(contentType, content);
        }
        page.setCharset(decodeWith);
        page.setRawText(new String(content, decodeWith));
    }

    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
Example #11
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Creates the HttpClient context for a request.
 *
 * <p>When the proxy carries a username, a proxy auth state with basic credentials is
 * registered. When the request has cookies, each one is added to a fresh cookie store with
 * its domain set to the request URL's domain without the port.
 *
 * @param request the request providing the cookies and URL
 * @param site    the site configuration (not used by this method)
 * @param proxy   the proxy in use, or {@code null}
 * @return the configured context
 */
private HttpClientContext convertHttpClientContext(Request request, Site site, Proxy proxy) {
    HttpClientContext context = new HttpClientContext();

    boolean proxyHasCredentials = proxy != null && proxy.getUsername() != null;
    if (proxyHasCredentials) {
        UsernamePasswordCredentials credentials =
                new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
        AuthState proxyAuth = new AuthState();
        proxyAuth.update(new BasicScheme(ChallengeState.PROXY), credentials);
        context.setAttribute(HttpClientContext.PROXY_AUTH_STATE, proxyAuth);
    }

    boolean hasCookies = request.getCookies() != null && !request.getCookies().isEmpty();
    if (hasCookies) {
        CookieStore store = new BasicCookieStore();
        for (Map.Entry<String, String> entry : request.getCookies().entrySet()) {
            BasicClientCookie cookie = new BasicClientCookie(entry.getKey(), entry.getValue());
            String domain = UrlUtils.getDomain(request.getUrl());
            cookie.setDomain(UrlUtils.removePort(domain));
            store.addCookie(cookie);
        }
        context.setCookieStore(store);
    }

    return context;
}
Example #12
Source File: HttpClientDownloader.java From blog-hunter with MIT License | 6 votes |
/**
 * Turns an HTTP response into a {@link Page}.
 *
 * <p>The body is stored as bytes; unless the request asks for binary content it is also
 * decoded into raw text, using {@code charset} or, when that is {@code null}, the value
 * returned by {@code getHtmlCharset}.
 *
 * @param request      the request that produced the response
 * @param charset      decoding charset, or {@code null} to detect it
 * @param httpResponse the raw HTTP response
 * @param task         the owning task (not used by this method)
 * @return the populated page, marked as downloaded successfully
 * @throws IOException if the body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // NOTE(review): getEntity() is dereferenced without a null check - confirm callers never pass entity-less responses.
    byte[] payload = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page page = new Page();
    page.setBytes(payload);

    if (!request.isBinaryContent()) {
        String resolvedCharset = (charset == null) ? getHtmlCharset(contentType, payload) : charset;
        page.setCharset(resolvedCharset);
        page.setRawText(new String(payload, resolvedCharset));
    }

    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);

    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
Example #13
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a downloader configured with an authenticated proxy sends the expected
 * {@code Proxy-Authorization} basic header: the stub server, acting as the proxy, answers
 * "ok" only when that header is present.
 */
@Test
public void test_download_auth_by_SimpleProxyProvider() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Proxy localProxy = new Proxy("127.0.0.1", 13423, "username", "password");
            downloader.setProxyProvider(SimpleProxyProvider.from(localProxy));

            Request proxied = new Request();
            proxied.setUrl("http://www.baidu.com");

            Page result = downloader.download(proxied, Site.me().toTask());
            String body = result.getRawText();
            assertThat(body).isEqualTo("ok");
        }
    });
}
Example #14
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a header added to the request is sent to the server: the stub answers "ok"
 * only when the {@code header} header equals {@code header-webmagic}.
 */
@Test
public void test_set_request_header() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(eq(header("header"), "header-webmagic")).response("ok");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            Request withHeader = new Request();
            withHeader.setUrl("http://127.0.0.1:13423");
            withHeader.addHeader("header", "header-webmagic");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page result = downloader.download(withHeader, Site.me().toTask());

            String body = result.getRawText();
            assertThat(body).isEqualTo("ok");
        }
    });
}
Example #15
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks the download when cookie management is disabled on the site: the stub answers
 * "ok" only when the {@code cookie} cookie does NOT equal {@code cookie-webmagic}, even
 * though the request has that cookie added.
 */
@Test
public void test_disableCookieManagement() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(not(eq(cookie("cookie"), "cookie-webmagic"))).response("ok");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            Request withCookie = new Request();
            withCookie.setUrl("http://127.0.0.1:13423");
            withCookie.addCookie("cookie", "cookie-webmagic");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page result = downloader.download(withCookie, Site.me().setDisableCookieManagement(true).toTask());

            String body = result.getRawText();
            assertThat(body).isEqualTo("ok");
        }
    });
}
Example #16
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Checks that a cookie added to the request is sent to the server: the stub answers "ok"
 * only when the {@code cookie} cookie equals {@code cookie-webmagic}.
 */
@Test
public void test_set_request_cookie() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(eq(cookie("cookie"), "cookie-webmagic")).response("ok");
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            Request withCookie = new Request();
            withCookie.setUrl("http://127.0.0.1:13423");
            withCookie.addCookie("cookie", "cookie-webmagic");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page result = downloader.download(withCookie, Site.me().toTask());

            String body = result.getRawText();
            assertThat(body).isEqualTo("ok");
        }
    });
}
Example #17
Source File: FilePipelineTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Prepares the shared fixtures: a {@code ResultItems} holding one "content" entry and its
 * originating request, and a task with a random UUID and no site.
 */
@BeforeClass
public static void before() {
    resultItems = new ResultItems();
    resultItems.put("content", "webmagic 爬虫工具");
    resultItems.setRequest(new Request("http://www.baidu.com"));

    task = new Task() {
        @Override
        public String getUUID() {
            UUID generated = UUID.randomUUID();
            return generated.toString();
        }

        @Override
        public Site getSite() {
            return null;
        }
    };
}
Example #18
Source File: RedisScheduler.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Pops the next URL from the task's queue and returns the matching request.
 *
 * <p>If a serialized request is stored for the URL in the task's item hash it is
 * deserialized and returned; otherwise a plain request for the URL is created.
 *
 * @param task the task whose queue is polled
 * @return the next request, or {@code null} when the queue is empty
 */
@Override
public synchronized Request poll(Task task) {
    Jedis jedis = pool.getResource();
    try {
        String url = jedis.lpop(getQueueKey(task));
        if (url == null) {
            return null;
        }
        String key = ITEM_PREFIX + task.getUUID();
        String field = DigestUtils.shaHex(url);
        // Read through the String API so the value is decoded the same way the String-based
        // hset wrote it; new String(bytes) used the platform default charset and could
        // corrupt non-ASCII request data.
        String json = jedis.hget(key, field);
        if (json != null) {
            return JSON.parseObject(json, Request.class);
        }
        return new Request(url);
    } finally {
        // NOTE(review): returnResource is deprecated in newer Jedis releases - confirm the Jedis version before switching to close().
        pool.returnResource(jedis);
    }
}
Example #19
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 6 votes |
private RequestBuilder selectRequestMethod(Request request) { String method = request.getMethod(); if (method == null || method.equalsIgnoreCase(HttpConstant.Method.GET)) { //default get return RequestBuilder.get(); } else if (method.equalsIgnoreCase(HttpConstant.Method.POST)) { return addFormParams(RequestBuilder.post(),request); } else if (method.equalsIgnoreCase(HttpConstant.Method.HEAD)) { return RequestBuilder.head(); } else if (method.equalsIgnoreCase(HttpConstant.Method.PUT)) { return addFormParams(RequestBuilder.put(), request); } else if (method.equalsIgnoreCase(HttpConstant.Method.DELETE)) { return RequestBuilder.delete(); } else if (method.equalsIgnoreCase(HttpConstant.Method.TRACE)) { return RequestBuilder.trace(); } throw new IllegalArgumentException("Illegal HTTP Method " + method); }
Example #20
Source File: RedisPriorityScheduler.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Stores a non-duplicate request in the Redis structure matching its priority, then saves
 * its extras.
 *
 * <p>Positive priorities go to the "plus" sorted set, negative ones to the "minus" sorted
 * set (both scored by priority), and a priority of zero is pushed onto the plain queue.
 *
 * @param request the request to enqueue
 * @param task    the task the request belongs to
 */
@Override
protected void pushWhenNoDuplicate(Request request, Task task) {
    Jedis redis = pool.getResource();
    try {
        if (request.getPriority() > 0) {
            redis.zadd(getZsetPlusPriorityKey(task), request.getPriority(), request.getUrl());
        } else if (request.getPriority() < 0) {
            redis.zadd(getZsetMinusPriorityKey(task), request.getPriority(), request.getUrl());
        } else {
            redis.lpush(getQueueNoPriorityKey(task), request.getUrl());
        }

        setExtrasInItem(redis, request, task);
    } finally {
        pool.returnResource(redis);
    }
}
Example #21
Source File: RedisScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Appends a non-duplicate request's URL to the task's queue and, when
 * {@code checkForAdditionalInfo} reports extra data on the request, stores the request as
 * JSON in the task's item hash under the SHA hex digest of its URL.
 *
 * @param request the request to enqueue
 * @param task    the task the request belongs to
 */
@Override
protected void pushWhenNoDuplicate(Request request, Task task) {
    Jedis redis = pool.getResource();
    try {
        redis.rpush(getQueueKey(task), request.getUrl());
        if (!checkForAdditionalInfo(request)) {
            return;
        }
        String urlDigest = DigestUtils.shaHex(request.getUrl());
        String serialized = JSON.toJSONString(request);
        String itemKey = ITEM_PREFIX + task.getUUID();
        redis.hset(itemKey, urlDigest, serialized);
    } finally {
        redis.close();
    }
}
Example #22
Source File: BlockingQueueScheduler.java From blog-hunter with MIT License | 5 votes |
@Override public void pushWhenNoDuplicate(Request request, Task task) { // 当程序退出方式非URL_COUNT时按照正常逻辑处理 if (realUrlCount == -1) { this.queue.add(request); return; } // 在有效期内(realUrlCount > 0),每次push url时realUrlCount - 1, 当 realUrlCount <= 0 时,当前Scheduler将不再收录新的url if (realUrlCount <= 0) { return; } realUrlCount--; this.queue.add(request); }
Example #23
Source File: BloomFilterDuplicateRemoverTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Checks that the bloom-filter remover reports a URL as new the first time it is seen and
 * as a duplicate the second time, for two different URLs.
 */
@Test
public void testRemove() throws Exception {
    BloomFilterDuplicateRemover remover = new BloomFilterDuplicateRemover(10);

    boolean firstA = remover.isDuplicate(new Request("a"), null);
    assertThat(firstA).isFalse();
    boolean secondA = remover.isDuplicate(new Request("a"), null);
    assertThat(secondA).isTrue();

    boolean firstB = remover.isDuplicate(new Request("b"), null);
    assertThat(firstB).isFalse();
    boolean secondB = remover.isDuplicate(new Request("b"), null);
    assertThat(secondB).isTrue();
}
Example #24
Source File: RedisPriorityScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Loads the stored request for a URL from the task's item hash.
 *
 * @param jedis the Redis connection to read from
 * @param url   the URL whose SHA hex digest is the hash field
 * @param task  the task whose item hash is consulted
 * @return the deserialized stored request, or a plain request for {@code url} when nothing
 *         is stored
 */
private Request getExtrasInItem(Jedis jedis, String url, Task task) {
    String key = getItemKey(task);
    String field = DigestUtils.shaHex(url);
    // Read through the String API so the value is decoded the same way the String-based
    // hset wrote it; new String(bytes) used the platform default charset and could corrupt
    // non-ASCII request data.
    String json = jedis.hget(key, field);
    if (json != null) {
        return JSON.parseObject(json, Request.class);
    }
    return new Request(url);
}
Example #25
Source File: CommonWebpagePipeline.java From Gather-Platform with GNU General Public License v3.0 | 5 votes |
@Override public boolean isDuplicate(Request request, Task task) { Set<String> tempLists = urls.computeIfAbsent(task.getUUID(), k -> Sets.newConcurrentHashSet()); //初始化已采集网站列表缓存 if (tempLists.add(request.getUrl())) {//先检查当前生命周期是否抓取过,如果当前生命周期未抓取,则进一步检查ES GetResponse response = client.prepareGet(INDEX_NAME, TYPE_NAME, Hashing.md5().hashString(request.getUrl(), Charset.forName("utf-8")).toString() ).get(); return response.isExists(); } else {//如果当前生命周期已抓取,直接置为重复 return true; } }
Example #26
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Builds the HttpClient request for a webmagic request.
 *
 * <p>The site's headers, timeouts and cookie spec are applied when a site is given, the
 * proxy is configured when one is given, and the request's own headers are added last.
 *
 * @param request the webmagic request to convert
 * @param site    the site configuration, or {@code null} to use no site settings
 * @param proxy   the proxy to route through, or {@code null}
 * @return the HttpClient request, ready to execute
 */
private HttpUriRequest convertHttpUriRequest(Request request, Site site, Proxy proxy) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(UrlUtils.fixIllegalCharacterInUrl(request.getUrl()));
    // site is treated as optional further down, so it must be null-checked here as well.
    if (site != null && site.getHeaders() != null) {
        for (Map.Entry<String, String> headerEntry : site.getHeaders().entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }

    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
    if (site != null) {
        requestConfigBuilder.setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut())
                .setCookieSpec(CookieSpecs.STANDARD);
    }

    if (proxy != null) {
        requestConfigBuilder.setProxy(new HttpHost(proxy.getHost(), proxy.getPort(), proxy.getScheme()));
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    HttpUriRequest httpUriRequest = requestBuilder.build();
    // Request-level headers are added after the build, on top of the site-level ones.
    if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
        for (Map.Entry<String, String> header : request.getHeaders().entrySet()) {
            httpUriRequest.addHeader(header.getKey(), header.getValue());
        }
    }
    return httpUriRequest;
}
Example #27
Source File: HttpUriRequestConverter.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Attaches the request body, if any, to the builder as a byte-array entity carrying the
 * body's content type.
 *
 * @param requestBuilder the builder to attach the entity to
 * @param request        the request that may carry a body
 * @return the same builder that was passed in
 */
private RequestBuilder addFormParams(RequestBuilder requestBuilder, Request request) {
    if (request.getRequestBody() == null) {
        return requestBuilder;
    }
    ByteArrayEntity bodyEntity = new ByteArrayEntity(request.getRequestBody().getBody());
    bodyEntity.setContentType(request.getRequestBody().getContentType());
    requestBuilder.setEntity(bodyEntity);
    return requestBuilder;
}
Example #28
Source File: MockGithubDownloader.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Returns a canned page read from the classpath resource {@code /html/mock-github.html},
 * regardless of the request passed in.
 *
 * <p>The page's request and URL are always set to the webmagic GitHub repository URL. If
 * the resource cannot be read, the page is returned without raw text.
 *
 * @param request ignored
 * @param task    ignored
 * @return the mock page
 */
@Override
public Page download(Request request, Task task) {
    Page page = new Page();
    // try-with-resources closes the classpath stream, which was previously left open.
    try (InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html")) {
        page.setRawText(IOUtils.toString(resourceAsStream));
    } catch (IOException e) {
        e.printStackTrace();
    }
    page.setRequest(new Request("https://github.com/code4craft/webmagic"));
    page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
    return page;
}
Example #29
Source File: RedisPriorityScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Stores the request as JSON in the task's item hash, keyed by the SHA hex digest of its
 * URL. Requests without extras are not stored.
 *
 * @param jedis   the Redis connection to write to
 * @param request the request to store
 * @param task    the task whose item hash is written
 */
private void setExtrasInItem(Jedis jedis, Request request, Task task) {
    if (request.getExtras() == null) {
        return;
    }
    String urlDigest = DigestUtils.shaHex(request.getUrl());
    String serialized = JSON.toJSONString(request);
    jedis.hset(getItemKey(task), urlDigest, serialized);
}
Example #30
Source File: DelayQueueSchedulerTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Manual check (ignored because it never terminates): pushes one request into a
 * one-second delay-queue scheduler and then polls forever, printing the current time and
 * the polled request on each iteration.
 */
@Ignore("infinite")
@Test
public void test() {
    DelayQueueScheduler scheduler = new DelayQueueScheduler(1, TimeUnit.SECONDS);
    scheduler.push(new Request("1"), null);
    for (;;) {
        Request polled = scheduler.poll(null);
        System.out.println(System.currentTimeMillis() + "\t" + polled);
    }
}