us.codecraft.webmagic.model.OOSpider Java Examples

The following examples show how to use us.codecraft.webmagic.model.OOSpider. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BaiduBaike.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Demonstrates one-off, synchronous extraction with {@code OOSpider}: a single page is fetched
 * with {@code get}, then a list of pages with {@code getAll}, and each extracted
 * {@code BaiduBaike} object is printed to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduBaike.class);
    try {
        //single download
        BaiduBaike baike = ooSpider.<BaiduBaike>get(String.format(urlTemplate, "httpclient"));
        System.out.println(baike);

        //multidownload
        List<String> list = new ArrayList<String>();
        list.add(String.format(urlTemplate, "风力发电"));
        list.add(String.format(urlTemplate, "太阳能"));
        list.add(String.format(urlTemplate, "地热发电"));
        // NOTE(review): the same keyword is added twice; presumably this exercises how
        // duplicate URLs are handled - confirm before removing.
        list.add(String.format(urlTemplate, "地热发电"));
        List<BaiduBaike> results = ooSpider.<BaiduBaike>getAll(list);
        for (BaiduBaike result : results) {
            System.out.println(result);
        }
    } finally {
        // Close the spider even when a download or extraction step throws.
        ooSpider.close();
    }
}
 
Example #2
Source File: QuickStarter.java    From webmagic with Apache License 2.0 6 votes vote down vote up
/**
 * Entry point of the quick-start demo. It calls {@code init()} and {@code readKey(...)} (both
 * defined elsewhere in this class), looks up the model class and start URL for the chosen key,
 * starts the spider asynchronously, waits 20 seconds, and then exits the JVM.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    init();
    String key = null;
    key = readKey(key);
    System.out.println("The demo started and will last 20 seconds...");
    //Start spider
    OOSpider.create(Site.me(), clazzMap.get(key))
            .addUrl(urlMap.get(key))
            .addPipeline(new MultiPagePipeline())
            .addPipeline(new ConsolePipeline())
            .runAsync();

    try {
        Thread.sleep(20000);
    } catch (InterruptedException e) {
        // An interrupt just ends the wait early; restore the flag instead of swallowing it.
        Thread.currentThread().interrupt();
    }
    System.out.println("The demo stopped!");
    System.out.println("To more usage, try to customize your own Spider!");
    // The spider was started with runAsync(), so the JVM is terminated explicitly.
    System.exit(0);
}
 
Example #3
Source File: Kr36NewsModel.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Benchmark entry point: starts a 20-thread spider for {@code Kr36NewsModel} whose pipeline
 * ignores every extracted object, then registers the spider with {@code SpiderMonitor}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the original signature
 * @throws JMException declared by the original signature
 */
public static void main(String[] args) throws IOException, JMException {
    //Just for benchmark
    PageModelPipeline discardResults = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
            // intentionally empty: extracted objects are not used
        }
    };
    Site site = Site.me().setSleepTime(0);
    Spider spider = OOSpider.create(site, discardResults, Kr36NewsModel.class)
            .thread(20)
            .addUrl("http://www.36kr.com/");
    spider.start();
    SpiderMonitor.instance().register(spider);
}
 
Example #4
Source File: GithubRepoProcessor.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code GithubRepoProcessor} against {@code MockGithubDownloader} for the webmagic
 * repository URL and asserts the extracted "star" and "fork" values.
 */
@Test
public void test() {
    Pipeline starAndForkAssertions = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String star = (String) resultItems.get("star");
            Assert.assertEquals("78", star.trim());
            String fork = (String) resultItems.get("fork");
            Assert.assertEquals("65", fork.trim());
        }
    };
    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(starAndForkAssertions)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
 
Example #5
Source File: AppStore.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Fetches one iTunes lookup response, maps it onto an {@code AppStore} object and prints a
 * handful of its fields to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    OOSpider ooSpider = OOSpider.create(Site.me(), AppStore.class);
    try {
        AppStore appStore = ooSpider.<AppStore>get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software");
        System.out.println(appStore.trackName);
        System.out.println(appStore.description);
        System.out.println(appStore.userRatingCount);
        System.out.println(appStore.screenshotUrls);
        System.out.println(appStore.supportedDevices);
    } finally {
        // Close the spider once the one-off download is finished, whether or not it succeeded.
        ooSpider.close();
    }
}
 
Example #6
Source File: OschinaBlog.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Crawls the OSChina blog start page with 10 threads, using a pipeline that ignores every
 * extracted {@code OschinaBlog} object.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36")
            .setSleepTime(0)
            .setRetryTimes(3);
    PageModelPipeline noOpPipeline = new PageModelPipeline() {
        @Override
        public void process(Object o, Task task) {
            // intentionally empty: extracted objects are not used
        }
    };
    OOSpider.create(site, noOpPipeline, OschinaBlog.class)
            .thread(10)
            .addUrl("http://my.oschina.net/flashsword/blog")
            .run();
}
 
Example #7
Source File: JokejiModel.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Crawls www.jokeji.cn for {@code JokejiModel} objects with two threads, using a
 * {@code RedisScheduler} pointed at 127.0.0.1 and a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me()
            .setDomain("www.jokeji.cn")
            .setCharset("gbk")
            .setSleepTime(100)
            .setTimeOut(3000)
            .setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)");
    OOSpider.create(site, new ConsolePageModelPipeline(), JokejiModel.class)
            .addUrl("http://www.jokeji.cn/")
            .thread(2)
            .scheduler(new RedisScheduler("127.0.0.1"))
            .run();
}
 
Example #8
Source File: TopicInfoPipeline.java    From feiqu-opensource with Apache License 2.0 5 votes vote down vote up
/**
 * Stores one crawled topic and its replies.
 *
 * <p>Topics without a title are ignored. When at least 50 topics have a creation time later
 * than five hours before now, the spider is asked to stop; the current topic is still
 * processed. A topic whose title and author already exist is skipped. Otherwise the topic is
 * inserted, followed by one row per non-empty reply.
 *
 * @param v2exDTO the extracted topic
 * @param task    the running task; {@code stop()} is only called when it is an {@code OOSpider}
 */
@Override
public void process(V2exDTO v2exDTO, Task task) {
    if (StringUtils.isEmpty(v2exDTO.getTitle())) {
        return;
    }
    Date now = new Date();
    FqTopicExample topicExample = new FqTopicExample();
    topicExample.createCriteria().andGmtCreateGreaterThan(DateUtil.offsetHour(now, -5));
    long count = fqTopicMapper.countByExample(topicExample);
    if (count >= 50 && task instanceof OOSpider) {
        // Guarded cast: a Task that is not an OOSpider no longer causes a ClassCastException.
        ((OOSpider) task).stop();
    }
    topicExample.clear();
    topicExample.createCriteria().andTitleEqualTo(v2exDTO.getTitle()).andAuthorEqualTo(v2exDTO.getAuthor());
    count = fqTopicMapper.countByExample(topicExample);
    if (count > 0) {
        return;
    }
    FqTopic fqTopic = DTO2DO(v2exDTO);
    fqTopic.setContent(EmojiUtils.toAliases(fqTopic.getContent()));
    fqTopicMapper.insert(fqTopic);
    if (CollectionUtil.isNotEmpty(v2exDTO.getReply())) {
        v2exDTO.getReply().forEach(reply -> {
            if (StringUtils.isEmpty(reply)) {
                return;
            }
            String content = EmojiUtils.toAliases(truncateReply(reply));
            FqTopicReply fqTopicReply = new FqTopicReply();
            fqTopicReply.setGmtCreate(now);
            fqTopicReply.setContent(content);
            fqTopicReply.setTopicId(fqTopic.getId());
            fqTopicReplyMapper.insert(fqTopicReply);
        });
    }
}

/**
 * Shortens replies longer than 500 chars to at most 480 chars without cutting a surrogate pair
 * in half, so a trailing emoji is dropped whole rather than left as a lone surrogate.
 */
private String truncateReply(String reply) {
    if (reply.length() <= 500) {
        return reply;
    }
    int end = 480;
    if (Character.isHighSurrogate(reply.charAt(end - 1))) {
        end--;
    }
    return reply.substring(0, end);
}
 
Example #9
Source File: BaiduNews.java    From webmagic with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads one Baidu News search result page, maps it onto a {@code BaiduNews} object and
 * prints it to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class);
    try {
        //single download
        BaiduNews news = ooSpider.<BaiduNews>get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient");
        System.out.println(news);
    } finally {
        // Close the spider even when the download or extraction throws.
        ooSpider.close();
    }
}
 
Example #10
Source File: JobCrawler.java    From jobhunter with Apache License 2.0 5 votes vote down vote up
/**
 * Crawls the Liepin job search start page with five threads, handing extracted
 * {@code LieTouJobInfo} objects to {@code jobInfoDaoPipeline}. Blocks until the run finishes.
 */
public void crawl() {
    Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36");
    OOSpider spider = OOSpider.create(site, jobInfoDaoPipeline, LieTouJobInfo.class);
    spider.addUrl("https://www.liepin.com/sojob/?dqs=020&curPage=0")
            .thread(5)
            .run();
}
 
Example #11
Source File: PostsServiceImpl.java    From plumemo with Apache License 2.0 5 votes vote down vote up
/**
 * Downloads the article at {@code postsVO.getSourceUri()}, extracts it into the model class
 * registered for the post's platform type, and copies the title and converted content back
 * onto {@code postsVO}.
 *
 * <p>For an unknown platform type, {@code ExceptionUtil.rollback(ErrorEnum.PARAM_ERROR)} is
 * called (presumably it throws - confirm against its definition).
 *
 * @param postsVO the post to fill; its platform type and source URI are read, its title and
 *                content are written
 */
private void crawler(PostsVO postsVO) {
    Class platformClass = PlatformEnum.getEnumTypeMap().get(postsVO.getPlatformType()).getPlatformClass();
    Spider spider = OOSpider.create(Site.me(), platformClass).setDownloader(new HttpClientDownloader());
    Object object;
    try {
        object = spider.get(postsVO.getSourceUri());
    } finally {
        // The spider is only used for this single synchronous download; close it so it is
        // not left open after every call.
        spider.close();
    }

    String join = "";
    if (postsVO.getPlatformType().equals(PlatformEnum.JIAN_SHU.getType())) {
        JianShuVO jianShuVO = (JianShuVO) object;
        postsVO.setTitle(jianShuVO.getTitle());
        join = String.join("", jianShuVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.JUE_JIN.getType())) {
        JueJinVO jueJinVO = (JueJinVO) object;
        postsVO.setTitle(jueJinVO.getTitle());
        join = String.join("", jueJinVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.SEGMENT_FAULT.getType())) {
        SegmentFaultVO segmentFaultVO = (SegmentFaultVO) object;
        postsVO.setTitle(segmentFaultVO.getTitle());
        join = String.join("", segmentFaultVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.CSDN.getType())) {
        CSDNVO csdnVO = (CSDNVO) object;
        postsVO.setTitle(csdnVO.getTitle());
        join = String.join("", csdnVO.getContent());
    } else if (postsVO.getPlatformType().equals(PlatformEnum.CN_BLOGS.getType())) {
        CNBlogsVO cnBlogsVO = (CNBlogsVO) object;
        postsVO.setTitle(cnBlogsVO.getTitle());
        join = String.join("", cnBlogsVO.getContent());
    } else {
        ExceptionUtil.rollback(ErrorEnum.PARAM_ERROR);
    }
    // The joined content fragments are passed through Remark#convertFragment before storing.
    String converted = new Remark().convertFragment(join);
    postsVO.setContent(converted);
}
 
Example #12
Source File: SpiderController.java    From feiqu-opensource with Apache License 2.0 5 votes vote down vote up
/**
 * Request handler mapped to "v2exSpider": crawls the V2EX "jobs" tab with a randomly chosen
 * user agent, feeding extracted {@code V2exDTO} objects to {@code topicInfoPipeline}.
 * The crawl runs in the calling thread via {@code run()}.
 *
 * @return always {@code true}
 */
@RequestMapping("v2exSpider")
@ResponseBody
public Object v2exSpider(){
    int agentIndex = new Random().nextInt(CommonConstant.userAgentArray.length);
    Site site = Site.me()
            .setUserAgent(CommonConstant.userAgentArray[agentIndex])
            .addHeader("Referer","https://www.v2ex.com/")
            .setSleepTime(5000)
            .setDomain("v2ex.com");
    OOSpider ooSpider = OOSpider.create(site, topicInfoPipeline, V2exDTO.class);
    ooSpider.addUrl("https://www.v2ex.com/?tab=jobs").run();
    return true;
}
 
Example #13
Source File: IteyeBlog.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls the Iteye blog start page, extracting {@code IteyeBlog} objects with default settings.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    OOSpider spider = OOSpider.create(Site.me(), IteyeBlog.class);
    spider.addUrl("http://flashsword20.iteye.com/blog").run();
}
 
Example #14
Source File: GithubRepo.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls GitHub's explore page for {@code GithubRepo} objects with 15 threads, using a
 * {@code FileCacheQueueScheduler} rooted at "/data/webmagic/cache/" and a
 * {@code JsonFilePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(0).setRetryTimes(3);
    OOSpider.create(site, new JsonFilePageModelPipeline(), GithubRepo.class)
            .addUrl("https://github.com/explore")
            .setScheduler(new FileCacheQueueScheduler("/data/webmagic/cache/"))
            .thread(15)
            .run();
}
 
Example #15
Source File: DianpingFtlDataScanner.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Runs a five-thread spider for {@code DianpingFtlDataScanner}. No start URL is added here.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
	Site site = Site.me().setSleepTime(0);
	OOSpider.create(site, DianpingFtlDataScanner.class).thread(5).run();
}
 
Example #16
Source File: News163.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls one 163.com news article for {@code News163} objects, using a {@code RedisScheduler}
 * on localhost and the {@code MultiPagePipeline} followed by the {@code ConsolePipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String startUrl = "http://news.163.com/13/0802/05/958I1E330001124J_2.html";
    OOSpider.create(Site.me(), News163.class)
            .addUrl(startUrl)
            .scheduler(new RedisScheduler("localhost"))
            .addPipeline(new MultiPagePipeline())
            .addPipeline(new ConsolePipeline())
            .run();
}
 
Example #17
Source File: QQMeishi.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls the QQ Meishi Beijing listing for {@code QQMeishi} objects with four threads, using a
 * {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    OOSpider spider = OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class);
    spider.addUrl("http://meishi.qq.com/beijing/c/all")
            .thread(4)
            .run();
}
 
Example #18
Source File: GithubRepo.java    From SmartEducation with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls the code4craft GitHub profile for {@code GithubRepo} objects with five threads and a
 * sleep time of 1000, using a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(1000);
    OOSpider spider = OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepo.class);
    spider.addUrl("https://github.com/code4craft").thread(5).run();
}
 
Example #19
Source File: GithubRepo.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls the code4craft GitHub profile for {@code GithubRepo} objects with ten threads and a
 * sleep time of 100, using a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(100);
    OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepo.class)
            .addUrl("https://github.com/code4craft")
            .thread(10)
            .run();
}
 
Example #20
Source File: GithubRepoApi.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Fetches the webmagic repository from the GitHub API and extracts a {@code GithubRepoApi}
 * object, using a {@code ConsolePageModelPipeline}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    Site site = Site.me().setSleepTime(100);
    OOSpider spider = OOSpider.create(site, new ConsolePageModelPipeline(), GithubRepoApi.class);
    spider.addUrl("https://api.github.com/repos/code4craft/webmagic").run();
}
 
Example #21
Source File: OschinaBlog.java    From webmagic with Apache License 2.0 4 votes vote down vote up
/**
 * Crawls the OSChina blog start page for {@code OschinaBlog} objects, using a
 * {@code JsonFilePageModelPipeline} constructed with the path "/data/webmagic/".
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    //results will be saved to "/data/webmagic/" in json format
    JsonFilePageModelPipeline jsonPipeline = new JsonFilePageModelPipeline("/data/webmagic/");
    OOSpider spider = OOSpider.create(Site.me(), jsonPipeline, OschinaBlog.class);
    spider.addUrl("http://my.oschina.net/flashsword/blog").run();
}
 
Example #22
Source File: HotContentJob.java    From feiqu-opensource with Apache License 2.0 4 votes vote down vote up
@Scheduled(cron = "0 3 */6 * * ?")
public void spider(){
    Stopwatch stopwatch = Stopwatch.createStarted();
    try {
        OOSpider ooSpider = OOSpider.create(Site.me()
                        .setUserAgent(CommonConstant.userAgentArray[new Random().nextInt(CommonConstant.userAgentArray.length)])
                        .addHeader("Referer","https://www.v2ex.com/").setSleepTime(30000).setDomain("v2ex.com"),
                topicInfoPipeline, V2exDTO.class);
        ooSpider.addUrl("https://www.v2ex.com/?tab=hot")
                .run();
        stopwatch.stop();

        /*String s = HttpClientUtil.getWebPage("https://api.readhub.cn/topic?lastCursor=&pageSize=20");
        JSONObject jsonObject = new JSONObject(s);
        JSONArray data = jsonObject.getJSONArray("data");
        Date now = new Date();
        for (Object d : data) {
            JSONObject j = (JSONObject) d;
            String summary = j.getStr("summary");
            String publishDate = j.getStr("publishDate");
            String title = j.getStr("title");
            String url = "";
            JSONArray newsArray = j.getJSONArray("newsArray");
            if (!newsArray.isEmpty()) {
                JSONObject ja = (JSONObject) newsArray.get(0);
                url = ja.getStr("url");
            }
//                order = ((JSONObject) d).getInt("order");
            FqTopic fqTopic = new FqTopic();
            fqTopic.setAuthor("");
            fqTopic.setAuthorIcon("");
            fqTopic.setCommentCount(0);
            fqTopic.setContent(summary+"<br>发布时间:"+publishDate+"<br>相关地址:"+url);
            fqTopic.setTitle(title);
            fqTopic.setSource(SpiderSourceEnum.READ_HUB.getValue());
            fqTopic.setGmtCreate(now);
            fqTopic.setType("");
            fqTopicService.insert(fqTopic);
        }*/
    } catch (Exception e) {
        logger.error("爬虫出错",e);
    }
    long seconds = stopwatch.elapsed(TimeUnit.SECONDS);
    logger.info("爬虫数据更新完毕,耗时{}秒",seconds);
}