us.codecraft.webmagic.model.OOSpider Java Examples
The following examples show how to use
us.codecraft.webmagic.model.OOSpider.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BaiduBaike.java From webmagic with Apache License 2.0 | 6 votes |
public static void main(String[] args) { OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class); //single download String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8"; BaiduBaike baike = ooSpider.<BaiduBaike>get("http://baike.baidu.com/search/word?word=httpclient&pic=1&sug=1&enc=utf8"); System.out.println(baike); //multidownload List<String> list = new ArrayList<String>(); list.add(String.format(urlTemplate,"风力发电")); list.add(String.format(urlTemplate,"太阳能")); list.add(String.format(urlTemplate,"地热发电")); list.add(String.format(urlTemplate,"地热发电")); List<BaiduBaike> resultItemses = ooSpider.<BaiduBaike>getAll(list); for (BaiduBaike resultItemse : resultItemses) { System.out.println(resultItemse); } ooSpider.close(); }
Example #2
Source File: QuickStarter.java From webmagic with Apache License 2.0 | 6 votes |
public static void main(String[] args) { init(); String key = null; key = readKey(key); System.out.println("The demo started and will last 20 seconds..."); //Start spider OOSpider.create(Site.me(), clazzMap.get(key)).addUrl(urlMap.get(key)).addPipeline(new MultiPagePipeline()).addPipeline(new ConsolePipeline()).runAsync(); try { Thread.sleep(20000); } catch (InterruptedException e) { e.printStackTrace(); } System.out.println("The demo stopped!"); System.out.println("To more usage, try to customize your own Spider!"); System.exit(0); }
Example #3
Source File: Kr36NewsModel.java From webmagic with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException, JMException { //Just for benchmark Spider thread = OOSpider.create(Site.me().setSleepTime(0), new PageModelPipeline() { @Override public void process(Object o, Task task) { } }, Kr36NewsModel.class).thread(20).addUrl("http://www.36kr.com/"); thread.start(); SpiderMonitor spiderMonitor = SpiderMonitor.instance(); spiderMonitor.register(thread); }
Example #4
Source File: GithubRepoProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Runs {@code GithubRepoProcessor} against a mocked downloader and asserts the extracted
 * "star" and "fork" values.
 */
@Test
public void test() {
    Pipeline countAssertions = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String star = (String) resultItems.get("star");
            Assert.assertEquals("78", star.trim());
            String fork = (String) resultItems.get("fork");
            Assert.assertEquals("65", fork.trim());
        }
    };

    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(countAssertions)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
Example #5
Source File: AppStore.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Fetches one iTunes lookup result into an {@code AppStore} model and prints several of
 * its fields.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String lookupUrl = "http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software";
    OOSpider spider = OOSpider.create(Site.me(), AppStore.class);
    AppStore appStore = spider.<AppStore>get(lookupUrl);

    System.out.println(appStore.trackName);
    System.out.println(appStore.description);
    System.out.println(appStore.userRatingCount);
    System.out.println(appStore.screenshotUrls);
    System.out.println(appStore.supportedDevices);
}
Example #6
Source File: OschinaBlog.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Crawls the OSChina blog start page with 10 threads using the {@code OschinaBlog} model;
 * extracted models are passed to a pipeline that ignores them.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
            .setSleepTime(0)
            .setRetryTimes(3);

    // No-op pipeline: results are deliberately dropped.
    PageModelPipeline noOpPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
        }
    };

    OOSpider.create(site, noOpPipeline, OschinaBlog.class)
            .thread(10)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .run();
}
Example #7
Source File: JokejiModel.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Crawls www.jokeji.cn with two threads, a {@code RedisScheduler} pointed at 127.0.0.1 and a
 * {@code ConsolePageModelPipeline} for the extracted {@code JokejiModel} objects.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setDomain("www.jokeji.cn")
            .setCharset("gbk")
            .setSleepTime(100)
            .setTimeOut(3000)
            .setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");

    OOSpider.create(site, new ConsolePageModelPipeline(), JokejiModel.class)
            .addUrl("http://www.jokeji.cn/")
            .thread(2)
            .scheduler(new RedisScheduler("127.0.0.1"))
            .run();
}
Example #8
Source File: TopicInfoPipeline.java From feiqu-opensource with Apache License 2.0 | 5 votes |
/**
 * Stores a crawled V2EX topic and its replies.
 *
 * <p>Topics without a title are ignored. When 50 or more topics were created within the
 * last five hours the running spider is asked to stop. A topic whose title and author
 * already exist is not inserted again.
 *
 * @param v2exDTO the extracted topic
 * @param task    the task that produced the topic; it is stopped only when it is an
 *                {@link OOSpider}
 */
@Override
public void process(V2exDTO v2exDTO, Task task) {
    if (StringUtils.isEmpty(v2exDTO.getTitle())) {
        return;
    }

    Date now = new Date();
    FqTopicExample topicExample = new FqTopicExample();
    topicExample.createCriteria().andGmtCreateGreaterThan(DateUtil.offsetHour(now, -5));
    long count = fqTopicMapper.countByExample(topicExample);
    // Guard the cast so that a Task of another type cannot cause a ClassCastException.
    if (count >= 50 && task instanceof OOSpider) {
        ((OOSpider) task).stop();
    }
    // NOTE(review): processing of the current topic continues after stop(); confirm this is intended.

    topicExample.clear();
    topicExample.createCriteria()
            .andTitleEqualTo(v2exDTO.getTitle())
            .andAuthorEqualTo(v2exDTO.getAuthor());
    count = fqTopicMapper.countByExample(topicExample);
    if (count > 0) {
        return;
    }

    FqTopic fqTopic = DTO2DO(v2exDTO);
    fqTopic.setContent(EmojiUtils.toAliases(fqTopic.getContent()));
    fqTopicMapper.insert(fqTopic);

    if (CollectionUtil.isNotEmpty(v2exDTO.getReply())) {
        v2exDTO.getReply().forEach(reply -> {
            if (StringUtils.isEmpty(reply)) {
                return;
            }
            // Replies longer than 500 characters are cut to their first 480 characters.
            String content = reply.length() > 500 ? reply.substring(0, 480) : reply;
            content = EmojiUtils.toAliases(content);

            FqTopicReply fqTopicReply = new FqTopicReply();
            fqTopicReply.setGmtCreate(now);
            fqTopicReply.setContent(content);
            fqTopicReply.setTopicId(fqTopic.getId());
            fqTopicReplyMapper.insert(fqTopicReply);
        });
    }
}
Example #9
Source File: BaiduNews.java From webmagic with Apache License 2.0 | 5 votes |
public static void main(String[] args) { OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class); //single download BaiduNews baike = ooSpider.<BaiduNews>get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient"); System.out.println(baike); ooSpider.close(); }
Example #10
Source File: JobCrawler.java From jobhunter with Apache License 2.0 | 5 votes |
/**
 * Crawls Liepin job listings with five threads, mapping pages to {@code LieTouJobInfo} and
 * handing the results to {@code jobInfoDaoPipeline}.
 */
public void crawl() {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36");

    OOSpider.create(site, jobInfoDaoPipeline, LieTouJobInfo.class)
            .addUrl("https://www.liepin.com/sojob/?dqs=020&curPage=0")
            .thread(5)
            .run();
}
Example #11
Source File: PostsServiceImpl.java From plumemo with Apache License 2.0 | 5 votes |
/**
 * Downloads the article at {@code postsVO.getSourceUri()} with the page model registered
 * for the post's platform type, then copies the title and the converted content back into
 * {@code postsVO}.
 *
 * @param postsVO the post to fill; its platform type selects the page model and its
 *                source URI is the page to download
 */
private void crawler(PostsVO postsVO) {
    Class platformClass = PlatformEnum.getEnumTypeMap().get(postsVO.getPlatformType()).getPlatformClass();
    Spider spider = OOSpider.create(Site.me(), platformClass).setDownloader(new HttpClientDownloader());
    Object object;
    try {
        object = spider.get(postsVO.getSourceUri());
    } finally {
        // A spider is created per call, so close it once the download is done (or failed).
        spider.close();
    }

    String join = "";
    if (postsVO.getPlatformType().equals(PlatformEnum.JIAN_SHU.getType())) {
        JianShuVO jianShuVO = (JianShuVO) object;
        postsVO.setTitle(jianShuVO.getTitle());
        join = String.join("", jianShuVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.JUE_JIN.getType())) {
        JueJinVO jueJinVO = (JueJinVO) object;
        postsVO.setTitle(jueJinVO.getTitle());
        join = String.join("", jueJinVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.SEGMENT_FAULT.getType())) {
        SegmentFaultVO segmentFaultVO = (SegmentFaultVO) object;
        postsVO.setTitle(segmentFaultVO.getTitle());
        join = String.join("", segmentFaultVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.CSDN.getType())) {
        CSDNVO csdnVO = (CSDNVO) object;
        postsVO.setTitle(csdnVO.getTitle());
        join = String.join("", csdnVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.CN_BLOGS.getType())) {
        CNBlogsVO cnBlogsVO = (CNBlogsVO) object;
        postsVO.setTitle(cnBlogsVO.getTitle());
        join = String.join("", cnBlogsVO.getContent());
    } else {
        // Unknown platform type: reported as a parameter error.
        ExceptionUtil.rollback(ErrorEnum.PARAM_ERROR);
    }

    String converted = new Remark().convertFragment(join);
    postsVO.setContent(converted);
}
Example #12
Source File: SpiderController.java From feiqu-opensource with Apache License 2.0 | 5 votes |
/**
 * Runs a blocking crawl of the V2EX "jobs" tab with a randomly picked user agent, feeding
 * the extracted {@code V2exDTO} objects to {@code topicInfoPipeline}.
 *
 * @return always {@code true}, once the crawl has finished
 */
@RequestMapping("v2exSpider")
@ResponseBody
public Object v2exSpider() {
    Site site = Site.me()
            .setUserAgent(CommonConstant.userAgentArray[new Random().nextInt(CommonConstant.userAgentArray.length)])
            .addHeader("Referer", "https://www.v2ex.com/")
            .setSleepTime(5000)
            .setDomain("v2ex.com");

    OOSpider ooSpider = OOSpider.create(site, topicInfoPipeline, V2exDTO.class);
    ooSpider.addUrl("https://www.v2ex.com/?tab=jobs").run();
    return true;
}
Example #13
Source File: IteyeBlog.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Crawls an ITeye blog starting from its index page using the {@code IteyeBlog} model.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String startUrl = "http://flashsword20.iteye.com/blog";
    OOSpider.create(Site.me(), IteyeBlog.class)
            .addUrl(startUrl)
            .run();
}
Example #14
Source File: GithubRepo.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Crawls GitHub starting at the explore page with 15 threads, a file-cache-backed scheduler
 * and a {@code JsonFilePageModelPipeline} for the extracted {@code GithubRepo} models.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setSleepTime(0)
            .setRetryTimes(3);
    String schedulerCacheDir = "/data/webmagic/cache/";

    OOSpider.create(site, new JsonFilePageModelPipeline(), GithubRepo.class)
            .addUrl("https://github.com/explore")
            .setScheduler(new FileCacheQueueScheduler(schedulerCacheDir))
            .thread(15)
            .run();
}
Example #15
Source File: DianpingFtlDataScanner.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Runs an {@code OOSpider} for the {@code DianpingFtlDataScanner} model with five threads
 * and no sleep between requests. No start URL is added here.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(0);
    OOSpider.create(site, DianpingFtlDataScanner.class)
            .thread(5)
            .run();
}
Example #16
Source File: News163.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Crawls a 163.com news article with a Redis-backed scheduler on localhost, passing results
 * through a {@code MultiPagePipeline} and then a {@code ConsolePipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String articleUrl = "http://news.163.com/13/0802/05/958I1E330001124J_2.html";
    OOSpider.create(Site.me(), News163.class)
            .addUrl(articleUrl)
            .scheduler(new RedisScheduler("localhost"))
            .addPipeline(new MultiPagePipeline())
            .addPipeline(new ConsolePipeline())
            .run();
}
Example #17
Source File: QQMeishi.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Crawls the QQ Meishi Beijing listing with four threads and hands the extracted
 * {@code QQMeishi} models to a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String listingUrl = "http://meishi.qq.com/beijing/c/all";
    OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class)
            .addUrl(listingUrl)
            .thread(4)
            .run();
}
Example #18
Source File: GithubRepo.java From SmartEducation with Apache License 2.0 | 4 votes |
/**
 * Crawls the code4craft GitHub profile with five threads and a one-second sleep time,
 * handing the extracted {@code GithubRepo} models to a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(1000);
    OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepo.class)
            .addUrl("https://github.com/code4craft")
            .thread(5)
            .run();
}
Example #19
Source File: GithubRepo.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Crawls the code4craft GitHub profile with ten threads and a 100 ms sleep time, handing
 * the extracted {@code GithubRepo} models to a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(100);
    OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepo.class)
            .addUrl("https://github.com/code4craft")
            .thread(10)
            .run();
}
Example #20
Source File: GithubRepoApi.java From webmagic with Apache License 2.0 | 4 votes |
/**
 * Fetches the webmagic repository from the GitHub API and hands the extracted
 * {@code GithubRepoApi} model to a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(100);
    String apiUrl = "https://api.github.com/repos/code4craft/webmagic";
    OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepoApi.class)
            .addUrl(apiUrl)
            .run();
}
Example #21
Source File: OschinaBlog.java From webmagic with Apache License 2.0 | 4 votes |
public static void main(String[] args) { //results will be saved to "/data/webmagic/" in json format OOSpider.create(Site.me(), new JsonFilePageModelPipeline("/data/webmagic/"), OschinaBlog.class) .addUrl("http://my.oschina.net/flashsword/blog").run(); }
Example #22
Source File: HotContentJob.java From feiqu-opensource with Apache License 2.0 | 4 votes |
/**
 * Scheduled crawl of the V2EX "hot" tab. Extracted {@code V2exDTO} objects go to
 * {@code topicInfoPipeline}; any exception is logged, and the elapsed time in seconds is
 * logged afterwards in every case.
 */
@Scheduled(cron = "0 3 */6 * * ?")
public void spider() {
    Stopwatch stopwatch = Stopwatch.createStarted();
    try {
        Site site = Site.me()
                .setUserAgent(CommonConstant.userAgentArray[new Random().nextInt(CommonConstant.userAgentArray.length)])
                .addHeader("Referer", "https://www.v2ex.com/")
                .setSleepTime(30000)
                .setDomain("v2ex.com");

        OOSpider ooSpider = OOSpider.create(site, topicInfoPipeline, V2exDTO.class);
        ooSpider.addUrl("https://www.v2ex.com/?tab=hot").run();
        stopwatch.stop();
        // A disabled ReadHub import used to follow here: it fetched
        // https://api.readhub.cn/topic?lastCursor=&pageSize=20 and inserted each returned
        // item as an FqTopic through fqTopicService.
    } catch (Exception e) {
        logger.error("爬虫出错", e);
    }

    long seconds = stopwatch.elapsed(TimeUnit.SECONDS);
    logger.info("爬虫数据更新完毕,耗时{}秒", seconds);
}