us.codecraft.webmagic.Task Java Examples
The following examples show how to use
us.codecraft.webmagic.Task.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: RedisPriorityScheduler.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Fetches the next URL to crawl for the given task.
 *
 * <p>Lookup order: the head of the "plus priority" sorted set, then the head of the
 * no-priority list, then the head of the "minus priority" sorted set. A URL taken from a
 * sorted set is removed from that set before being returned.
 *
 * @param jedis the Redis connection to use
 * @param task  the task whose keys are queried
 * @return the next URL, or the blank value returned by the list pop when nothing is queued
 */
private String getRequest(Jedis jedis, Task task) {
    // 1. Positive priorities win.
    Set<String> plusHead = jedis.zrevrange(getZsetPlusPriorityKey(task), 0, 0);
    if (!plusHead.isEmpty()) {
        String candidate = plusHead.iterator().next();
        jedis.zrem(getZsetPlusPriorityKey(task), candidate);
        return candidate;
    }

    // 2. Then the plain (no priority) queue.
    String queued = jedis.lpop(getQueueNoPriorityKey(task));
    if (!StringUtils.isBlank(queued)) {
        return queued;
    }

    // 3. Finally the negative priorities.
    Set<String> minusHead = jedis.zrevrange(getZsetMinusPriorityKey(task), 0, 0);
    if (minusHead.isEmpty()) {
        // Nothing anywhere: hand back whatever blank value the list pop produced.
        return queued;
    }
    String candidate = minusHead.iterator().next();
    jedis.zrem(getZsetMinusPriorityKey(task), candidate);
    return candidate;
}
Example #2
Source File: RedisPriorityScheduler.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Takes the next pending request for the given task.
 *
 * @param task the task whose queues are polled
 * @return the next request, or {@code null} when no URL is available
 */
@Override
public synchronized Request poll(Task task) {
    final Jedis connection = pool.getResource();
    try {
        final String nextUrl = getRequest(connection, task);
        return StringUtils.isBlank(nextUrl) ? null : getExtrasInItem(connection, nextUrl, task);
    } finally {
        // Always hand the connection back to the pool.
        pool.returnResource(connection);
    }
}
Example #3
Source File: SeleniumDownloaderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Manual (ignored) check: downloads a Baidu Wenku page through Selenium and prints the
 * text of every {@code div.inner} element with tags and non-breaking-space entities removed.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(
            new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"),
            new Task() {
                @Override
                public String getUUID() {
                    return "huaban.com";
                }

                @Override
                public Site getSite() {
                    return Site.me();
                }
            });
    // The entity is "&nbsp;"; the previous "&nsbp;" spelling never matched anything.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
Example #4
Source File: HttpClientDownloader.java From blog-hunter with MIT License | 6 votes |
/**
 * Converts an HTTP response into a {@code Page}.
 *
 * <p>The body is always stored as raw bytes. Unless the request is flagged as binary content,
 * the body is also decoded to text, using the supplied charset or, when that is {@code null},
 * the charset detected by {@code getHtmlCharset}.
 *
 * @param request      the request that produced the response
 * @param charset      the charset to decode with, or {@code null} to detect it
 * @param httpResponse the response to convert
 * @param task         the task the request belongs to (not used here)
 * @return the populated page, marked as successfully downloaded
 * @throws IOException if the response body cannot be read or decoded
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    byte[] body = IOUtils.toByteArray(httpResponse.getEntity().getContent());

    String contentType = "";
    if (httpResponse.getEntity().getContentType() != null) {
        contentType = httpResponse.getEntity().getContentType().getValue();
    }

    Page page = new Page();
    page.setBytes(body);
    if (!request.isBinaryContent()) {
        String effectiveCharset = charset != null ? charset : getHtmlCharset(contentType, body);
        page.setCharset(effectiveCharset);
        page.setRawText(new String(body, effectiveCharset));
    }
    page.setUrl(new PlainText(request.getUrl()));
    page.setRequest(request);
    page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    page.setDownloadSuccess(true);
    if (responseHeader) {
        page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return page;
}
Example #5
Source File: ScriptConsole.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Builds a script-driven spider from the console parameters and runs it.
 *
 * <p>The spider's pipelines are replaced with a single no-op pipeline. If no URL was
 * supplied, a usage message is printed and the JVM exits with status -1.
 *
 * @param params parsed command-line parameters
 */
private static void startSpider(Params params) {
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404,403, 500,502));

    // Results are discarded by this pipeline.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    };
    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    spider.clearPipeline().addPipeline(discardingPipeline);

    boolean hasUrls = params.getUrls() != null && params.getUrls().size() != 0;
    if (!hasUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }
    for (String startUrl : params.getUrls()) {
        spider.addUrl(startUrl);
    }
    spider.run();
}
Example #6
Source File: FilePipelineTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Prepares the shared fixtures: a {@code ResultItems} holding one "content" entry and a
 * request, and a {@code Task} stub that returns a fresh random UUID on every call and no site.
 */
@BeforeClass
public static void before() {
    resultItems = new ResultItems();
    resultItems.put("content", "webmagic 爬虫工具");
    resultItems.setRequest(new Request("http://www.baidu.com"));

    task = new Task() {
        @Override
        public String getUUID() {
            return UUID.randomUUID().toString();
        }

        @Override
        public Site getSite() {
            return null;
        }
    };
}
Example #7
Source File: ModelPipeline.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Dispatches extracted page models to the pipelines registered for their classes.
 *
 * <p>For each registered class, the result stored under the class's canonical name is looked
 * up. If the class carries an {@code ExtractBy} annotation with {@code multi()} set, the
 * result is treated as a list and each element is processed separately; otherwise the result
 * is processed as a single object.
 *
 * @param resultItems the extraction results of one page
 * @param task        the task the results belong to
 */
@Override
public void process(ResultItems resultItems, Task task) {
    for (Map.Entry<Class, PageModelPipeline> registration : pageModelPipelines.entrySet()) {
        Class modelClass = registration.getKey();
        PageModelPipeline modelPipeline = registration.getValue();

        Object extracted = resultItems.get(modelClass.getCanonicalName());
        if (extracted == null) {
            continue;
        }

        Annotation extractBy = modelClass.getAnnotation(ExtractBy.class);
        boolean multi = extractBy != null && ((ExtractBy) extractBy).multi();
        if (multi) {
            List<Object> models = (List<Object>) extracted;
            for (Object model : models) {
                modelPipeline.process(model, task);
            }
        } else {
            modelPipeline.process(extracted, task);
        }
    }
}
Example #8
Source File: FilePageModelPipeline.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Writes the reflective string form of a page model to an {@code .html} file under
 * {@code <path>/<task UUID>/}.
 *
 * <p>The file is named after {@code HasKey.key()} when the model implements {@code HasKey},
 * otherwise after the MD5 hex digest of the model's string form. I/O failures are logged
 * and not propagated.
 *
 * @param o    the page model to persist
 * @param task the task whose UUID selects the output directory
 */
@Override
public void process(Object o, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    // Serialize once, before the file is opened, so a failure here cannot leave an open writer
    // behind and the same text is used for both the digest and the file content.
    String content = ToStringBuilder.reflectionToString(o);
    try {
        String filename;
        if (o instanceof HasKey) {
            filename = path + ((HasKey) o).key() + ".html";
        } else {
            filename = path + DigestUtils.md5Hex(content) + ".html";
        }
        PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(filename)));
        try {
            printWriter.write(content);
        } finally {
            // Close even if the write fails, so the file handle is never leaked.
            printWriter.close();
        }
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
Example #9
Source File: OneFilePipeline.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Appends one page's results to the shared writer and flushes it.
 *
 * <p>Output starts with a {@code url:} line. Each result entry follows as
 * {@code key:<TAB>value}, except {@code Iterable} values, which are written as a
 * {@code key:} line followed by one line per element.
 *
 * @param resultItems the extraction results of one page
 * @param task        the task the results belong to (not used here)
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> field : resultItems.getAll().entrySet()) {
        String name = field.getKey();
        Object value = field.getValue();
        if (!(value instanceof Iterable)) {
            printWriter.println(name + ":\t" + value);
            continue;
        }
        printWriter.println(name + ":");
        for (Object element : (Iterable) value) {
            printWriter.println(element);
        }
    }
    printWriter.flush();
}
Example #10
Source File: HttpClientDownloader.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Builds a {@code Page} from an HTTP response.
 *
 * <p>Raw bytes are always recorded. For non-binary requests the bytes are additionally
 * decoded into the page's raw text; when no charset is given, it is resolved through
 * {@code getHtmlCharset} from the content type and the body.
 *
 * @param request      the originating request
 * @param charset      the decoding charset, or {@code null} to detect it
 * @param httpResponse the HTTP response to read
 * @param task         the owning task (not used here)
 * @return a page marked as successfully downloaded
 * @throws IOException if reading or decoding the body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final byte[] content = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    final boolean hasContentType = httpResponse.getEntity().getContentType() != null;
    final String contentType = hasContentType ? httpResponse.getEntity().getContentType().getValue() : "";

    final Page result = new Page();
    result.setBytes(content);

    final boolean textual = !request.isBinaryContent();
    if (textual) {
        if (charset == null) {
            charset = getHtmlCharset(contentType, content);
        }
        result.setCharset(charset);
        result.setRawText(new String(content, charset));
    }

    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);
    result.setStatusCode(httpResponse.getStatusLine().getStatusCode());
    result.setDownloadSuccess(true);
    if (responseHeader) {
        result.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
    }
    return result;
}
Example #11
Source File: JsonFilePipeline.java From spider with GNU General Public License v3.0 | 5 votes |
/**
 * Appends the page, serialized as one JSON line, to
 * {@code gather_platform_data/<spider UUID>.json}.
 *
 * <p>I/O failures are logged and not propagated.
 *
 * @param resultItems the extraction results to convert into a {@code Webpage}
 * @param task        the owning task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    try {
        // Write explicitly as UTF-8 so the JSON does not depend on the platform default charset.
        FileUtils.writeStringToFile(
                new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"),
                gson.toJson(webpage) + "\n",
                "UTF-8",
                true);
    } catch (IOException e) {
        // Pass the exception itself so the stack trace is not lost.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
Example #12
Source File: CommonWebpagePipeline.java From spider with GNU General Public License v3.0 | 5 votes |
@Override public boolean isDuplicate(Request request, Task task) { Set<String> tempLists = urls.computeIfAbsent(task.getUUID(), k -> Sets.newConcurrentHashSet()); //初始化已采集网站列表缓存 if (tempLists.add(request.getUrl())) {//先检查当前生命周期是否抓取过,如果当前生命周期未抓取,则进一步检查ES GetResponse response = client.prepareGet(INDEX_NAME, TYPE_NAME, Hashing.md5().hashString(request.getUrl(), Charset.forName("utf-8")).toString() ).get(); return response.isExists(); } else {//如果当前生命周期已抓取,直接置为重复 return true; } }
Example #13
Source File: RedisPriorityScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Stores the JSON form of a request in the task's item hash, keyed by the SHA hex digest
 * of its URL. Requests without extras are not stored.
 *
 * @param jedis   the Redis connection to use
 * @param request the request to store
 * @param task    the task that selects the hash key
 */
private void setExtrasInItem(Jedis jedis, Request request, Task task) {
    if (request.getExtras() == null) {
        return;
    }
    String urlDigest = DigestUtils.shaHex(request.getUrl());
    String serialized = JSON.toJSONString(request);
    jedis.hset(getItemKey(task), urlDigest, serialized);
}
Example #14
Source File: DelayQueueScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Queues a request unless its URL has been queued before.
 *
 * @param request the request to schedule
 * @param task    the owning task (not used here)
 */
@Override
public synchronized void push(Request request, Task task) {
    boolean firstTimeSeen = urls.add(request.getUrl());
    if (!firstTimeSeen) {
        return;
    }
    queue.add(new RequestWrapper(request));
}
Example #15
Source File: PageModelCollectorPipeline.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Forwards the model(s) extracted for {@code clazz} to the wrapped class pipeline.
 *
 * <p>The result is looked up under the class's canonical name. When the class is annotated
 * with {@code ExtractBy} and {@code multi()} is set, the result is handled as a list and
 * each element is forwarded; otherwise the result is forwarded as a single model.
 *
 * @param resultItems the extraction results of one page
 * @param task        the task the results belong to
 */
@Override
public synchronized void process(ResultItems resultItems, Task task) {
    Object extracted = resultItems.get(clazz.getCanonicalName());
    if (extracted == null) {
        return;
    }

    Annotation extractBy = clazz.getAnnotation(ExtractBy.class);
    boolean multi = extractBy != null && ((ExtractBy) extractBy).multi();
    if (!multi) {
        classPipeline.process((T) extracted, task);
        return;
    }

    List<Object> models = (List<Object>) extracted;
    for (Object model : models) {
        classPipeline.process((T) model, task);
    }
}
Example #16
Source File: Kr36NewsModel.java From webmagic with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException, JMException { //Just for benchmark Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() { @Override public void process(Object o, Task task) { } }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/"); thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.register(thread); }
Example #17
Source File: ContentLengthLimitHttpClientDownloader.java From Gather-Platform with GNU General Public License v3.0 | 5 votes |
/**
 * Delegates to the parent implementation. If the parent throws an
 * {@code IllegalArgumentException}, the failure is recorded and logged before being rethrown.
 *
 * @param request      the originating request
 * @param charset      the charset to decode with
 * @param httpResponse the HTTP response to convert
 * @param task         the owning task
 * @return the page produced by the parent implementation
 * @throws IOException if the parent implementation fails to read the response
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    try {
        return super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException rejected) {
        writeExceptionLog(rejected, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), rejected.getLocalizedMessage());
        throw rejected;
    }
}
Example #18
Source File: ESPipeline.java From Gather-Platform with GNU General Public License v3.0 | 5 votes |
/**
 * Indexes one page's results as a single JSON document.
 *
 * <p>Every result entry becomes a field of the document. When the results contain a
 * non-blank {@code "id"}, it is used as the document id; otherwise the id is left to the
 * index. A result other than {@code CREATED} is logged as an error. I/O failures while
 * building the document are logged and not propagated.
 *
 * @param resultItems the extraction results to index
 * @param task        the owning task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        // Typed for-each replaces the raw, decompiler-style iterator and its casts.
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            xContentBuilder.field(entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();

        String id = resultItems.get("id");
        IndexResponse response;
        if (StringUtils.isNotBlank(id)) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, id)
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("索引失败,可能重复创建,resultItem:" + resultItems);
        }
    } catch (IOException e) {
        // Log the exception itself (with its stack trace) instead of printing to stderr.
        LOG.error("索引出错," + e.getLocalizedMessage(), e);
    }
}
Example #19
Source File: JsonFilePipeline.java From Gather-Platform with GNU General Public License v3.0 | 5 votes |
/**
 * Serializes the page to JSON and appends it as one line to
 * {@code gather_platform_data/<spider UUID>.json}.
 *
 * <p>I/O failures are logged and not propagated.
 *
 * @param resultItems the extraction results to convert into a {@code Webpage}
 * @param task        the owning task (not used here)
 */
@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    File target = new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json");
    try {
        // Explicit UTF-8: the output must not vary with the platform default charset.
        FileUtils.writeStringToFile(target, gson.toJson(webpage) + "\n", "UTF-8", true);
    } catch (IOException e) {
        // Include the throwable so the cause and stack trace reach the log.
        LOG.error("序列化网页信息出错,{}", e.getLocalizedMessage(), e);
    }
}
Example #20
Source File: TopicInfoPipeline.java From feiqu-opensource with Apache License 2.0 | 5 votes |
/**
 * Persists a crawled topic and its replies.
 *
 * <p>Topics without a title are skipped. If at least 50 topics were created within the
 * last five hours, the task (cast to {@code OOSpider}) is asked to stop; processing of the
 * current topic still continues. A topic whose title and author already exist is skipped.
 * Otherwise the topic is inserted, followed by each non-empty reply; replies longer than
 * 500 characters are cut to their first 480 characters.
 *
 * @param v2exDTO the crawled topic
 * @param task    the running task; NOTE(review): assumed to always be an {@code OOSpider}
 */
@Override
public void process(V2exDTO v2exDTO, Task task) {
    if (StringUtils.isEmpty(v2exDTO.getTitle())) {
        return;
    }

    Date now = new Date();
    FqTopicExample topicExample = new FqTopicExample();

    // Throttle: stop the spider once enough topics were created recently.
    topicExample.createCriteria().andGmtCreateGreaterThan(DateUtil.offsetHour(now,-5));
    long recentTopics = fqTopicMapper.countByExample(topicExample);
    if (recentTopics >= 50) {
        ((OOSpider) task).stop();
    }

    // De-duplicate on title + author.
    topicExample.clear();
    topicExample.createCriteria().andTitleEqualTo(v2exDTO.getTitle()).andAuthorEqualTo(v2exDTO.getAuthor());
    long sameTopics = fqTopicMapper.countByExample(topicExample);
    if (sameTopics > 0) {
        return;
    }

    FqTopic fqTopic = DTO2DO(v2exDTO);
    fqTopic.setContent(EmojiUtils.toAliases(fqTopic.getContent()));
    fqTopicMapper.insert(fqTopic);

    if (!CollectionUtil.isNotEmpty(v2exDTO.getReply())) {
        return;
    }
    for (String rawReply : v2exDTO.getReply()) {
        if (StringUtils.isEmpty(rawReply)) {
            continue;
        }
        String replyText = rawReply.length() > 500 ? rawReply.substring(0,480) : rawReply;
        replyText = EmojiUtils.toAliases(replyText);

        FqTopicReply fqTopicReply = new FqTopicReply();
        fqTopicReply.setGmtCreate(now);
        fqTopicReply.setContent(replyText);
        fqTopicReply.setTopicId(fqTopic.getId());
        fqTopicReplyMapper.insert(fqTopicReply);
    }
}
Example #21
Source File: ContentLengthLimitHttpClientDownloader.java From spider with GNU General Public License v3.0 | 5 votes |
/**
 * Wraps the parent's response handling: an {@code IllegalArgumentException} raised by the
 * parent is written to the exception log, counted as an error, logged as a warning and
 * then rethrown unchanged.
 *
 * @param request      the originating request
 * @param charset      the charset to decode with
 * @param httpResponse the HTTP response to convert
 * @param task         the owning task
 * @return the page produced by the parent implementation
 * @throws IOException if the parent implementation fails to read the response
 */
@Override
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final Page handled;
    try {
        handled = super.handleResponse(request, charset, httpResponse, task);
    } catch (IllegalArgumentException failure) {
        writeExceptionLog(failure, request);
        onError(request);
        LOG.warn("URL为:{} ,{}", request.getUrl(), failure.getLocalizedMessage());
        throw failure;
    }
    return handled;
}
Example #22
Source File: HttpClientDownloader.java From webmagic with Apache License 2.0 | 5 votes |
@Override public Page download(Request request, Task task) { if (task == null || task.getSite() == null) { throw new NullPointerException("task or site can not be null"); } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext()); page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task); onSuccess(request); logger.info("downloading page success {}", request.getUrl()); return page; } catch (IOException e) { logger.warn("download page {} error", request.getUrl(), e); onError(request); return page; } finally { if (httpResponse != null) { //ensure the connection is released back to pool EntityUtils.consumeQuietly(httpResponse.getEntity()); } if (proxyProvider != null && proxy != null) { proxyProvider.returnProxy(proxy, page, task); } } }
Example #23
Source File: OschinaBlog.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Crawls the oschina blog with 10 threads, discarding every extracted model.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // Extracted models are dropped by this pipeline.
    PageModelPipeline discardingPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
        }
    };

    OOSpider.create(
                    Site.me()
                            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
                            .setSleepTime(0)
                            .setRetryTimes(3),
                    discardingPipeline,
                    OschinaBlog.class)
            .thread(10)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .run();
}
Example #24
Source File: PriorityScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Routes a request to the queue matching the sign of its priority: positive priorities go
 * to the "plus" queue, negative ones to the "minus" queue and zero to the plain queue.
 *
 * @param request the request to enqueue
 * @param task    the owning task (not used here)
 */
@Override
public void pushWhenNoDuplicate(Request request, Task task) {
    if (request.getPriority() > 0) {
        priorityQueuePlus.put(request);
        return;
    }
    if (request.getPriority() < 0) {
        priorityQueueMinus.put(request);
        return;
    }
    noPriorityQueue.add(request);
}
Example #25
Source File: DuplicateRemovedScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Offers a candidate request to the scheduler.
 *
 * <p>The request is queued when it should be reserved, when it needs no duplicate removal,
 * or when the duplicate remover does not report it as a duplicate; otherwise it is dropped.
 *
 * @param request the candidate request
 * @param task    the task the request belongs to
 */
@Override
public void push(Request request, Task task) {
    logger.trace("get a candidate url {}", request.getUrl());
    boolean dropAsDuplicate = !shouldReserved(request)
            && !noNeedToRemoveDuplicate(request)
            && duplicatedRemover.isDuplicate(request, task);
    if (dropAsDuplicate) {
        return;
    }
    logger.debug("push to queue {}", request.getUrl());
    pushWhenNoDuplicate(request, task);
}
Example #26
Source File: GithubRepoPageProcessorTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Runs the GitHub repo processor against the mocked download and checks the extracted
 * repository name and author.
 */
@Test
public void test_github() throws Exception {
    Pipeline assertingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String name = (String) resultItems.get("name");
            assertThat(name.trim()).isEqualTo("webmagic");
            String author = (String) resultItems.get("author");
            assertThat(author.trim()).isEqualTo("code4craft");
        }
    };

    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(assertingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
Example #27
Source File: MockGithubDownloader.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Returns a canned page built from the {@code /html/mock-github.html} classpath resource,
 * regardless of the request passed in.
 *
 * <p>If the resource cannot be read, the stack trace is printed and the page is returned
 * without raw text.
 *
 * @param request ignored; the page always refers to the webmagic repository URL
 * @param task    ignored
 * @return the mocked page
 */
@Override
public Page download(Request request, Task task) {
    Page page = new Page();
    InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
    try {
        page.setRawText(IOUtils.toString(resourceAsStream));
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The stream was previously never closed; closeQuietly is null-safe.
        IOUtils.closeQuietly(resourceAsStream);
    }
    page.setRequest(new Request("https://github.com/code4craft/webmagic"));
    page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
    return page;
}
Example #28
Source File: SSLCompatibilityTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Checks that an HTTPS page can be downloaded successfully with the default downloader.
 */
@Test
public void test_tls12() throws Exception {
    Task task = Site.me().setCycleRetryTimes(5).toTask();
    Page downloaded = new HttpClientDownloader().download(new Request("https://juejin.im/"), task);
    assertThat(downloaded.isDownloadSuccess()).isTrue();
}
Example #29
Source File: HttpClientDownloaderTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Checks that downloading a page that never exists yields a page flagged as failed.
 */
@Test
public void test_download_fail() {
    Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask();
    Page downloaded = new HttpClientDownloader().download(new Request(PAGE_ALWAYS_NOT_EXISTS), task);
    assertThat(downloaded.isDownloadSuccess()).isFalse();
}
Example #30
Source File: RedisScheduler.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Reports the cardinality of the task's Redis set, narrowed to an {@code int}.
 *
 * @param task the task whose set key is queried
 * @return the set cardinality as an {@code int}
 */
@Override
public int getTotalRequestsCount(Task task) {
    final Jedis connection = pool.getResource();
    try {
        final Long cardinality = connection.scard(getSetKey(task));
        return cardinality.intValue();
    } finally {
        // Always hand the connection back to the pool.
        pool.returnResource(connection);
    }
}