us.codecraft.webmagic.ResultItems Java Examples
The following examples show how to use
us.codecraft.webmagic.ResultItems.
Each example notes its original project, source file, and license.
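Before diving into the examples, here is a minimal, self-contained sketch of where ResultItems fits: a PageProcessor stores extracted fields with Page.putField, and every registered Pipeline later receives those fields as a ResultItems in its process method. The URL, the "title" field name, and the console output are illustrative choices for this sketch, not taken from any example below.

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;

public class MinimalResultItemsDemo {

    public static void main(String[] args) {
        Spider.create(new PageProcessor() {
            @Override
            public void process(Page page) {
                // Everything put here ends up in the ResultItems handed to pipelines.
                page.putField("title", page.getHtml().xpath("//title/text()").toString());
            }

            @Override
            public Site getSite() {
                return Site.me().setRetryTimes(3).setSleepTime(1000);
            }
        }).addPipeline(new Pipeline() {
            @Override
            public void process(ResultItems resultItems, Task task) {
                if (resultItems.isSkip()) {
                    return; // a processor may mark a page to be skipped by pipelines
                }
                System.out.println(resultItems.getRequest().getUrl() + " -> " + resultItems.get("title"));
            }
        }).addUrl("https://github.com/code4craft/webmagic")
          .run();
    }
}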
Example #1
Source File: JdbcPipeline.java From elasticsearch-jest-example with MIT License

public void process(ResultItems resultItems, Task task) {
    // skip empty or missing results
    if (resultItems == null || resultItems.getAll().isEmpty()) {
        return;
    }
    Map<String, Object> items = resultItems.getAll();
    Article article = new Article();
    article.setTitle((String) items.get("title"));
    article.setContent((String) items.get("content"));
    article.setSource((String) items.get("source"));
    article.setAuthor((String) items.get("author"));
    article.setUrl((String) items.get("url"));
    String dataStr = (String) items.get("create");
    Pattern pattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}");
    Matcher matcher = pattern.matcher(dataStr);
    if (matcher.find()) {
        dataStr = matcher.group(0);
    }
    try {
        article.setPubdate(new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(dataStr));
    } catch (ParseException e) {
        e.printStackTrace();
    }
    articleDao.save(article);
}
Example #2
Source File: ScriptConsole.java From webmagic with Apache License 2.0

private static void startSpider(Params params) {
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));
    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    spider.clearPipeline().addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    });
    if (params.getUrls() == null || params.getUrls().size() == 0) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }
    for (String url : params.getUrls()) {
        spider.addUrl(url);
    }
    spider.run();
}
Example #3
Source File: ConfigurablePageProcessorTest.java From webmagic with Apache License 2.0

@Test
public void test() throws Exception {
    List<ExtractRule> extractRules = new ArrayList<ExtractRule>();
    ExtractRule extractRule = new ExtractRule();
    extractRule.setExpressionType(ExpressionType.XPath);
    extractRule.setExpressionValue("//title");
    extractRule.setFieldName("title");
    extractRules.add(extractRule);
    extractRule = new ExtractRule();
    extractRule.setExpressionType(ExpressionType.XPath);
    extractRule.setExpressionValue("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()");
    extractRule.setFieldName("star");
    extractRules.add(extractRule);
    ResultItems resultItems = Spider.create(new ConfigurablePageProcessor(Site.me(), extractRules))
            .setDownloader(new MockGithubDownloader())
            .get("https://github.com/code4craft/webmagic");
    assertThat(resultItems.getAll()).containsEntry("title", "<title>code4craft/webmagic · GitHub</title>");
    assertThat(resultItems.getAll()).containsEntry("star", " 86 ");
}
Example #4
Source File: ModelPipeline.java From webmagic with Apache License 2.0

@Override
public void process(ResultItems resultItems, Task task) {
    for (Map.Entry<Class, PageModelPipeline> classPageModelPipelineEntry : pageModelPipelines.entrySet()) {
        Object o = resultItems.get(classPageModelPipelineEntry.getKey().getCanonicalName());
        if (o != null) {
            Annotation annotation = classPageModelPipelineEntry.getKey().getAnnotation(ExtractBy.class);
            if (annotation == null || !((ExtractBy) annotation).multi()) {
                classPageModelPipelineEntry.getValue().process(o, task);
            } else {
                List<Object> list = (List<Object>) o;
                for (Object o1 : list) {
                    classPageModelPipelineEntry.getValue().process(o1, task);
                }
            }
        }
    }
}
Example #5
Source File: OneFilePipeline.java From webmagic with Apache License 2.0

@Override
public synchronized void process(ResultItems resultItems, Task task) {
    printWriter.println("url:\t" + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
        if (entry.getValue() instanceof Iterable) {
            Iterable value = (Iterable) entry.getValue();
            printWriter.println(entry.getKey() + ":");
            for (Object o : value) {
                printWriter.println(o);
            }
        } else {
            printWriter.println(entry.getKey() + ":\t" + entry.getValue());
        }
    }
    printWriter.flush();
}
Example #6
Source File: FilePipelineTest.java From webmagic with Apache License 2.0

@BeforeClass
public static void before() {
    resultItems = new ResultItems();
    resultItems.put("content", "webmagic 爬虫工具");
    Request request = new Request("http://www.baidu.com");
    resultItems.setRequest(request);
    task = new Task() {
        @Override
        public String getUUID() {
            return UUID.randomUUID().toString();
        }

        @Override
        public Site getSite() {
            return null;
        }
    };
}
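The fixture above only prepares the shared resultItems and task fields; a test method that pushes them through the pipeline might look like the sketch below. The temporary output directory is an illustrative assumption, not copied from the original test.

@Test
public void testProcess() {
    // Hypothetical test body: write the prepared ResultItems under a temp directory.
    FilePipeline filePipeline = new FilePipeline("/tmp/webmagic-test/");
    filePipeline.process(resultItems, task);
}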
Example #7
Source File: BaiduBaikePageProcessor.java From webmagic with Apache License 2.0

public static void main(String[] args) {
    // single download
    Spider spider = Spider.create(new BaiduBaikePageProcessor()).thread(2);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    ResultItems resultItems = spider.<ResultItems>get(String.format(urlTemplate, "水力发电"));
    System.out.println(resultItems);
    // multiple downloads
    List<String> list = new ArrayList<String>();
    list.add(String.format(urlTemplate, "风力发电"));
    list.add(String.format(urlTemplate, "太阳能"));
    list.add(String.format(urlTemplate, "地热发电"));
    list.add(String.format(urlTemplate, "地热发电"));
    List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
    for (ResultItems resultItemse : resultItemses) {
        System.out.println(resultItemse.getAll());
    }
    spider.close();
}
Example #8
Source File: FilePipeline.java From webmagic with Apache License 2.0

@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    try {
        PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html")), "UTF-8"));
        printWriter.println("url:\t" + resultItems.getRequest().getUrl());
        for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
            if (entry.getValue() instanceof Iterable) {
                Iterable value = (Iterable) entry.getValue();
                printWriter.println(entry.getKey() + ":");
                for (Object o : value) {
                    printWriter.println(o);
                }
            } else {
                printWriter.println(entry.getKey() + ":\t" + entry.getValue());
            }
        }
        printWriter.close();
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
Example #9
Source File: CommonWebpagePipeline.java From spider with GNU General Public License v3.0

/**
 * Convert webmagic's ResultItems into a Webpage object.
 *
 * @param resultItems the ResultItems produced by the page processor
 * @return the converted Webpage
 */
public static Webpage convertResultItems2Webpage(ResultItems resultItems) {
    Webpage webpage = new Webpage();
    webpage.setContent(resultItems.get("content"));
    webpage.setTitle(resultItems.get("title"));
    webpage.setUrl(resultItems.get("url"));
    webpage.setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString());
    webpage.setDomain(resultItems.get("domain"));
    webpage.setSpiderInfoId(resultItems.get("spiderInfoId"));
    webpage.setGathertime(resultItems.get("gatherTime"));
    webpage.setSpiderUUID(resultItems.get("spiderUUID"));
    webpage.setKeywords(resultItems.get("keywords"));
    webpage.setSummary(resultItems.get("summary"));
    webpage.setNamedEntity(resultItems.get("namedEntity"));
    webpage.setPublishTime(resultItems.get("publishTime"));
    webpage.setCategory(resultItems.get("category"));
    webpage.setRawHTML(resultItems.get("rawHTML"));
    webpage.setDynamicFields(resultItems.get(DYNAMIC_FIELD));
    webpage.setStaticFields(resultItems.get("staticField"));
    webpage.setAttachmentList(resultItems.get("attachmentList"));
    webpage.setImageList(resultItems.get("imageList"));
    webpage.setProcessTime(resultItems.get("processTime"));
    return webpage;
}
Example #10
Source File: HunterProcessor.java From blog-hunter with MIT License

/**
 * Processing method for the custom pipeline.
 *
 * @param resultItems     all fields produced by the custom Processor
 * @param virtualArticles the collection of crawled articles
 */
final void process(ResultItems resultItems, List<VirtualArticle> virtualArticles, Hunter spider) {
    if (null == spider) {
        return;
    }
    Map<String, Object> map = resultItems.getAll();
    if (CollectionUtil.isEmpty(map)) {
        return;
    }
    String title = String.valueOf(map.get("title"));
    ParserConfig jcParserConfig = new ParserConfig();
    jcParserConfig.putDeserializer(Date.class, HunterDateDeserializer.instance);
    VirtualArticle virtualArticle = JSON.parseObject(JSON.toJSONString(map), VirtualArticle.class, jcParserConfig, JSON.DEFAULT_PARSER_FEATURE);
    virtualArticle.setDescription(CommonUtil.getRealDescription(virtualArticle.getDescription(), virtualArticle.getContent()))
            .setKeywords(CommonUtil.getRealKeywords(virtualArticle.getKeywords()));
    if (this.config.isConvertImg()) {
        virtualArticle.setContent(CommonUtil.formatHtml(virtualArticle.getContent()));
        virtualArticle.setImageLinks(CommonUtil.getAllImageLink(virtualArticle.getContent()));
    }
    if (CollectionUtils.isEmpty(virtualArticle.getTags())) {
        virtualArticle.setTags(Collections.singletonList("其他"));
    }
    virtualArticles.add(virtualArticle);
    writer.print(String.format("<a href=\"%s\" target=\"_blank\">%s</a> -- %s -- %s",
            virtualArticle.getSource(), title, virtualArticle.getAuthor(), virtualArticle.getReleaseDate()));
}
Example #11
Source File: WebMagicProcessorDelegator.java From vscrawler with Apache License 2.0

@Override
protected void parse(Seed seed, String result, GrabResult crawlResult) {
    if (result == null) {
        seed.retry();
        return;
    }
    SipSoupPage sipSoupPage = new SipSoupPage();
    sipSoupPage.setRawText(result);
    sipSoupPage.setUrl(new PlainText(seed.getData()));
    sipSoupPage.setRequest(CovertUtil.convertSeed(seed));
    sipSoupPage.setStatusCode(200);
    pageProcessor.process(sipSoupPage);
    // new url
    List<Request> targetRequests = sipSoupPage.getTargetRequests();
    for (Request request : targetRequests) {
        crawlResult.addSeed(CovertUtil.covertRequest(request));
    }
    if (!sipSoupPage.getResultItems().isSkip()) {
        ResultItems resultItems = sipSoupPage.getResultItems();
        crawlResult.addResult(JSONObject.toJSONString(resultItems.getAll()));
    }
}
Example #12
Source File: CommonWebpagePipeline.java From Gather-Platform with GNU General Public License v3.0

/**
 * Convert webmagic's ResultItems into a Webpage object.
 *
 * @param resultItems the ResultItems produced by the page processor
 * @return the converted Webpage
 */
public static Webpage convertResultItems2Webpage(ResultItems resultItems) {
    Webpage webpage = new Webpage();
    webpage.setContent(resultItems.get("content"));
    webpage.setTitle(resultItems.get("title"));
    webpage.setUrl(resultItems.get("url"));
    webpage.setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString());
    webpage.setDomain(resultItems.get("domain"));
    webpage.setSpiderInfoId(resultItems.get("spiderInfoId"));
    webpage.setGathertime(resultItems.get("gatherTime"));
    webpage.setSpiderUUID(resultItems.get("spiderUUID"));
    webpage.setKeywords(resultItems.get("keywords"));
    webpage.setSummary(resultItems.get("summary"));
    webpage.setNamedEntity(resultItems.get("namedEntity"));
    webpage.setPublishTime(resultItems.get("publishTime"));
    webpage.setCategory(resultItems.get("category"));
    webpage.setRawHTML(resultItems.get("rawHTML"));
    webpage.setDynamicFields(resultItems.get(DYNAMIC_FIELD));
    webpage.setStaticFields(resultItems.get("staticField"));
    webpage.setAttachmentList(resultItems.get("attachmentList"));
    webpage.setImageList(resultItems.get("imageList"));
    webpage.setProcessTime(resultItems.get("processTime"));
    return webpage;
}
Example #13
Source File: CommonWebpagePipeline.java From Gather-Platform with GNU General Public License v3.0

@Override
public void process(ResultItems resultItems, Task task) {
    SpiderInfo spiderInfo = resultItems.get("spiderInfo");
    Webpage webpage = convertResultItems2Webpage(resultItems);
    SearchRequestBuilder searchRequestBuilder = client.prepareSearch(INDEX_NAME)
            .setTypes(TYPE_NAME)
            .setQuery(QueryBuilders.matchQuery("url", webpage.getUrl()));
    SearchResponse response = searchRequestBuilder.execute().actionGet();
    if (response.getHits().totalHits() == 0) {
        try {
            client.prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
                    .setSource(gson.toJson(webpage))
                    .get();
        } catch (Exception e) {
            LOG.error("failed to index Webpage, " + e.getLocalizedMessage());
        }
    }
}
Example #14
Source File: WebMagicPipelineDelegator.java From vscrawler with Apache License 2.0

@Override
public void saveItem(GrabResult grabResult, Seed seed) {
    for (Object str : grabResult.allEntityResult()) {
        ResultItems resultItems = new ResultItems();
        resultItems.setRequest(CovertUtil.convertSeed(seed));
        if (str instanceof CharSequence) {
            handleJson(resultItems, str.toString());
        } else {
            handleJsonObject(resultItems, str);
        }
        try {
            webMagicPipeline.process(resultItems, null);
        } catch (Exception e) {
            log.error("error when process result", e);
        }
    }
}
Example #15
Source File: ConsolePipeline.java From webmagic with Apache License 2.0

@Override
public void process(ResultItems resultItems, Task task) {
    System.out.println("get page: " + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
        System.out.println(entry.getKey() + ":\t" + entry.getValue());
    }
}
Example #16
Source File: BlogPipeline.java From mogu_blog_v2 with Apache License 2.0

@Override
public void process(ResultItems res, Task task) {
    // fetch the extracted title and content
    String title = res.get("title");
    String content = res.get("content");
    System.out.println("title: " + title);
    System.out.println("content: " + content);
    if (!StringUtils.isEmpty(title) && !StringUtils.isEmpty(content)) {
        try {
            BlogSpider blog = new BlogSpider();
            blog.setUid(idWorker.nextId() + "");
            blog.setTitle(title);
            blog.setSummary(title);
            blog.setContent(content);
            blog.setTagUid("5c4c541e600ff422ccb371ee788f59d6");
            blog.setClickCount(0);
            blog.setCollectCount(0);
            blog.setStatus(EStatus.ENABLE);
            blog.setAdminUid("1f01cd1d2f474743b241d74008b12333");
            blog.setAuthor("陌溪");
            blog.setArticlesPart("蘑菇博客");
            blog.setBlogSortUid("6a1c7a50c0e7b8e8657949bf02d5d0ca");
            blog.setLevel(0);
            blog.setIsPublish(EPublish.PUBLISH);
            blog.setSort(0);
            blog.insert();
            // download to the local disk
            // DownloadUtil.download("http://pic.netbian.com" + fileUrl, fileName, SAVE_PATH);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Example #17
Source File: CompositePipeline.java From webmagic with Apache License 2.0

@Override
public void process(ResultItems resultItems, Task task) {
    for (SubPipeline subPipeline : subPipelines) {
        if (subPipeline.match(resultItems.getRequest())) {
            RequestMatcher.MatchOther matchOtherProcessorProcessor = subPipeline.processResult(resultItems, task);
            if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != RequestMatcher.MatchOther.YES) {
                return;
            }
        }
    }
}
Example #18
Source File: PageModelCollectorPipeline.java From webmagic with Apache License 2.0

@Override
public synchronized void process(ResultItems resultItems, Task task) {
    Object o = resultItems.get(clazz.getCanonicalName());
    if (o != null) {
        Annotation annotation = clazz.getAnnotation(ExtractBy.class);
        if (annotation == null || !((ExtractBy) annotation).multi()) {
            classPipeline.process((T) o, task);
        } else {
            List<Object> list = (List<Object>) o;
            for (Object o1 : list) {
                classPipeline.process((T) o1, task);
            }
        }
    }
}
Example #19
Source File: JsonFilePipeline.java From Gather-Platform with GNU General Public License v3.0

@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    try {
        FileUtils.writeStringToFile(
                new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"),
                gson.toJson(webpage) + "\n",
                true);
    } catch (IOException e) {
        LOG.error("failed to serialize webpage info, {}", e.getLocalizedMessage());
    }
}
Example #20
Source File: MultiPagePipeline.java From webmagic with Apache License 2.0

@Override
public void process(ResultItems resultItems, Task task) {
    Map<String, Object> resultItemsAll = resultItems.getAll();
    Iterator<Map.Entry<String, Object>> iterator = resultItemsAll.entrySet().iterator();
    while (iterator.hasNext()) {
        handleObject(iterator);
    }
}
Example #21
Source File: JsonFilePipeline.java From webmagic with Apache License 2.0

@Override
public void process(ResultItems resultItems, Task task) {
    String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
    try {
        PrintWriter printWriter = new PrintWriter(new FileWriter(
                getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json")));
        printWriter.write(JSON.toJSONString(resultItems.getAll()));
        printWriter.close();
    } catch (IOException e) {
        logger.warn("write file error", e);
    }
}
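A pipeline such as this is registered on a spider before it runs. The snippet below is an illustrative wiring sketch; MyPageProcessor and the output directory are placeholders, not part of the example above.

Spider.create(new MyPageProcessor())
        .addPipeline(new JsonFilePipeline("/data/webmagic/"))
        .addUrl("https://github.com/code4craft/webmagic")
        .run();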
Example #22
Source File: PhantomJSPageProcessor.java From webmagic with Apache License 2.0

public static void main(String[] args) throws Exception {
    PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);
    CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();
    Spider.create(new PhantomJSPageProcessor())
            .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc") // %B6%AC%D7%B0 is the GBK encoding of "冬装" (winter clothing)
            .setDownloader(phantomDownloader)
            .addPipeline(collectorPipeline)
            .thread((Runtime.getRuntime().availableProcessors() - 1) << 1)
            .run();
    List<ResultItems> resultItemsList = collectorPipeline.getCollected();
    System.out.println(resultItemsList.get(0).get("html").toString());
}
Example #23
Source File: GithubRepoPageProcessorTest.java From webmagic with Apache License 2.0

@Test
public void test_github() throws Exception {
    Spider.create(new GithubRepoPageProcessor()).addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            assertThat(((String) resultItems.get("name")).trim()).isEqualTo("webmagic");
            assertThat(((String) resultItems.get("author")).trim()).isEqualTo("code4craft");
        }
    }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
}
Example #24
Source File: ESPipeline.java From Gather-Platform with GNU General Public License v3.0

@Override
public void process(ResultItems resultItems, Task task) {
    Iterator i$ = resultItems.getAll().entrySet().iterator();
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        while (i$.hasNext()) {
            Map.Entry entry = (Map.Entry) i$.next();
            xContentBuilder.field((String) entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();
        IndexResponse response = null;
        if (StringUtils.isNotBlank(resultItems.get("id"))) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, resultItems.get("id"))
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("indexing failed, possibly a duplicate document, resultItem: " + resultItems);
        }
    } catch (IOException e) {
        LOG.error("indexing error, " + e.getLocalizedMessage());
        e.printStackTrace();
    }
}
Example #25
Source File: MyPipeline.java From spring-boot-demo with MIT License

@Override
public void process(ResultItems resultItems, Task task) {
    log.info("get page: " + resultItems.getRequest().getUrl());
    for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
        log.info(entry.getKey() + ":\t" + entry.getValue());
    }
}
Example #26
Source File: ESPipeline.java From spider with GNU General Public License v3.0

@Override
public void process(ResultItems resultItems, Task task) {
    Iterator i$ = resultItems.getAll().entrySet().iterator();
    try {
        XContentBuilder xContentBuilder = jsonBuilder().startObject();
        while (i$.hasNext()) {
            Map.Entry entry = (Map.Entry) i$.next();
            xContentBuilder.field((String) entry.getKey(), entry.getValue());
        }
        String json = xContentBuilder.endObject().string();
        IndexResponse response = null;
        if (StringUtils.isNotBlank(resultItems.get("id"))) {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME, resultItems.get("id"))
                    .setSource(json).get();
        } else {
            response = client
                    .prepareIndex(INDEX_NAME, TYPE_NAME)
                    .setSource(json).get();
        }
        if (response.getResult() != IndexResponse.Result.CREATED) {
            LOG.error("indexing failed, possibly a duplicate document, resultItem: " + resultItems);
        }
    } catch (IOException e) {
        LOG.error("indexing error, " + e.getLocalizedMessage());
        e.printStackTrace();
    }
}
Example #27
Source File: CommonWebpagePipeline.java From spider with GNU General Public License v3.0

@Override
public void process(ResultItems resultItems, Task task) {
    SpiderInfo spiderInfo = resultItems.get("spiderInfo");
    Webpage webpage = convertResultItems2Webpage(resultItems);
    try {
        client.prepareIndex(INDEX_NAME, TYPE_NAME)
                .setId(Hashing.md5().hashString(webpage.getUrl(), Charset.forName("utf-8")).toString())
                .setSource(gson.toJson(webpage))
                .get();
    } catch (Exception e) {
        LOG.error("failed to index Webpage, " + e.getLocalizedMessage());
    }
}
Example #28
Source File: JsonFilePipeline.java From spider with GNU General Public License v3.0

@Override
public void process(ResultItems resultItems, Task task) {
    Webpage webpage = CommonWebpagePipeline.convertResultItems2Webpage(resultItems);
    try {
        FileUtils.writeStringToFile(
                new File("gather_platform_data/" + webpage.getSpiderUUID() + ".json"),
                gson.toJson(webpage) + "\n",
                true);
    } catch (IOException e) {
        LOG.error("failed to serialize webpage info, {}", e.getLocalizedMessage());
    }
}
Example #29
Source File: WebMagicPipelineDelegator.java From vscrawler with Apache License 2.0

private void handleJson(ResultItems resultItems, String str) {
    try {
        JSONObject jsonObject = JSON.parseObject(str);
        for (Map.Entry<String, Object> entry : jsonObject.entrySet()) {
            resultItems.put(entry.getKey(), entry.getValue());
        }
    } catch (Exception e) {
        log.warn("crawl result is not in json format: {}", str);
        resultItems.put("data", str);
    }
}
Example #30
Source File: WebMagicPipelineDelegator.java From vscrawler with Apache License 2.0

private void handleJsonObject(ResultItems resultItems, Object obj) {
    Field[] declaredFields = obj.getClass().getDeclaredFields();
    for (Field field : declaredFields) {
        try {
            resultItems.put(field.getName(), ReflectUtil.getField(obj, field.getName()));
        } catch (Exception e) {
            // ignore; should not happen
        }
    }
}