us.codecraft.webmagic.scheduler.QueueScheduler Java Examples

The following examples show how to use us.codecraft.webmagic.scheduler.QueueScheduler. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage in the sidebar.
Example #1
Source File: CommonSpider.java    From Gather-Platform with GNU General Public License v3.0 6 votes vote down vote up
/**
     * Test-runs a spider template synchronously and returns the pages it gathered.
     *
     * @param info the spider template (domain, start URLs, callback) to exercise
     * @return the webpages collected during the test crawl
     * @throws JMException if the task cannot be registered
     */
    public List<Webpage> testSpiderInfo(SpiderInfo info) throws JMException {
        final String spiderUuid = UUID.randomUUID().toString();
        final ResultItemsCollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline();
        Task crawlTask = taskManager.initTask(spiderUuid, info.getDomain(), info.getCallbackURL(),
                "spiderInfoId=" + info.getId() + "&spiderUUID=" + spiderUuid);
        crawlTask.addExtraInfo("spiderInfo", info);
        MySpider spider = (MySpider) makeSpider(info, crawlTask)
                .addPipeline(collectorPipeline)
                .setScheduler(new QueueScheduler());
        spider.startUrls(info.getStartURL());
        // Spider monitoring is intentionally disabled: registering every test
        // spider with the monitor may leak memory.
        // spiderMonitor.register(spider);
        spiderMap.put(spiderUuid, spider);
        taskManager.getTaskById(spiderUuid).setState(State.RUNNING);
        // run() blocks until the crawl finishes, so the pipeline holds every result afterwards.
        spider.run();
        List<Webpage> pages = Lists.newLinkedList();
        collectorPipeline.getCollected()
                .forEach(items -> pages.add(CommonWebpagePipeline.convertResultItems2Webpage(items)));
        return pages;
    }
 
Example #2
Source File: CommonSpider.java    From spider with GNU General Public License v3.0 6 votes vote down vote up
/**
     * Test-runs a spider template synchronously and returns the pages it gathered.
     *
     * @param info the spider template (domain, start URLs, callback) to exercise
     * @return the webpages collected during the test crawl
     * @throws JMException if the task cannot be registered
     */
    public List<Webpage> testSpiderInfo(SpiderInfo info) throws JMException {
        final String spiderUuid = UUID.randomUUID().toString();
        final ResultItemsCollectorPipeline collectorPipeline = new ResultItemsCollectorPipeline();
        Task crawlTask = taskManager.initTask(spiderUuid, info.getDomain(), info.getCallbackURL(),
                "spiderInfoId=" + info.getId() + "&spiderUUID=" + spiderUuid);
        crawlTask.addExtraInfo("spiderInfo", info);
        MySpider spider = (MySpider) makeSpider(info, crawlTask)
                .addPipeline(collectorPipeline)
                .setScheduler(new QueueScheduler());
        // Ajax-heavy sites go through the headless (CasperJS) downloader when one
        // is configured; everything else uses the size-limited HTTP downloader.
        boolean useAjaxDownloader =
                info.isAjaxSite() && StringUtils.isNotBlank(staticValue.getAjaxDownloader());
        spider.setDownloader(useAjaxDownloader
                ? casperjsDownloader
                : contentLengthLimitHttpClientDownloader);
        spider.startUrls(info.getStartURL());
        // Spider monitoring is intentionally disabled: registering every test
        // spider with the monitor may leak memory.
        // spiderMonitor.register(spider);
        spiderMap.put(spiderUuid, spider);
        taskManager.getTaskById(spiderUuid).setState(State.RUNNING);
        // run() blocks until the crawl finishes, so the pipeline holds every result afterwards.
        spider.run();
        List<Webpage> pages = Lists.newLinkedList();
        collectorPipeline.getCollected()
                .forEach(items -> pages.add(CommonWebpagePipeline.convertResultItems2Webpage(items)));
        return pages;
    }
 
Example #3
Source File: BlogSpiderRestApi.java    From mogu_blog_v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Starts (or restarts) the CSDN blog crawl asynchronously.
 *
 * @return a status message: restart notice if a spider already exists,
 *         otherwise a "crawl started" notice
 */
@ApiOperation(value = "startSpiderCsdn", notes = "startSpiderCsdn")
@RequestMapping(value = "/startSpiderCsdn", method = RequestMethod.GET)
public String startSpiderCsdn() {

    if (spider != null) {
        // Fix: use the asynchronous start() instead of run(). run() executes the
        // whole crawl on the calling HTTP request thread and blocks the response
        // until the crawl completes; start() spawns the crawl in the background,
        // matching the first-call path below.
        spider.start();
        return "启动爬取";
    }
    // First call: build the spider and launch the crawl in the background.
    // NOTE(review): the null check above is not atomic — two concurrent first
    // requests could each create a spider; synchronize if that matters.
    spider = Spider.create(blogProcesser)
            .addUrl("https://www.csdn.net/")
            .addPipeline(blogPipeline)
            .setScheduler(new QueueScheduler())
            .thread(10);

    spider.start();

    return "开始爬取";
}
 
Example #4
Source File: BlogTask.java    From mogu_blog_v2 with Apache License 2.0 5 votes vote down vote up
/**
 * Scheduled crawl job: fetches articles from CSDN and stores them via the pipeline.
 */
//@Scheduled(cron = "0/20 * * * * ?")
// initialDelay: delay (ms) before the first execution after startup
// fixedDelay: delay (ms) between the end of one run and the start of the next
@Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
public void webArticleTask() {
    // Build a fresh spider for each run and crawl synchronously on the scheduler thread.
    Spider crawler = Spider.create(blogProcesser)
            .addUrl("https://www.csdn.net/")
            .addPipeline(blogPipeline)
            .setScheduler(new QueueScheduler())
            .thread(10);
    crawler.run();
}