us.codecraft.webmagic.pipeline.Pipeline Java Examples
The following examples show how to use
us.codecraft.webmagic.pipeline.Pipeline.
You can vote up the examples you like and vote down the ones you don't,
and follow the links above each example to reach the original project or source file. You may also check out the related API usage in the sidebar.
Example #1
Source File: SpiderTest.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Starts a spider, stops it, then starts it again, sleeping 10000 ms after each
 * transition. The spider is configured with {@code thread(1)} and a pipeline that
 * only prints {@code 1} for every result it receives.
 *
 * <p>Marked {@code @Ignore("long time")} because of the sleeps.
 *
 * @throws InterruptedException if one of the sleeps is interrupted
 */
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
    // Marker pipeline: prints a constant instead of persisting anything.
    Pipeline printingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            System.out.println(1);
        }
    };

    Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/*"))
            .addPipeline(printingPipeline)
            .thread(1)
            .addUrl("http://www.oschina.net/");

    // First run.
    spider.start();
    Thread.sleep(10000);

    // Stop and wait before restarting.
    spider.stop();
    Thread.sleep(10000);

    // Second run on the same instance.
    spider.start();
    Thread.sleep(10000);
}
Example #2
Source File: ScriptConsole.java From webmagic with Apache License 2.0 | 6 votes |
/**
 * Builds a script-driven spider from the parsed command-line parameters and runs it.
 *
 * <p>The URL list is validated first, so a missing URL argument is reported before
 * the script processor or the spider is constructed. When no URL is given, an error
 * and a usage line are printed and the JVM exits with status {@code -1}.
 *
 * @param params parsed command-line options; this method reads its language, script
 *               file name, thread count, sleep time and URL list
 */
private static void startSpider(Params params) {
    // Fail fast: without at least one start URL there is nothing to crawl.
    if (params.getUrls() == null || params.getUrls().size() == 0) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }

    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    // Drop any configured pipelines and install a single no-op pipeline.
    spider.clearPipeline().addPipeline(new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            // intentionally empty
        }
    });

    for (String url : params.getUrls()) {
        spider.addUrl(url);
    }
    spider.run();
}
Example #3
Source File: Spider.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Releases this spider's components.
 *
 * <p>The downloader, page processor, scheduler and each pipeline are handed to
 * {@code destroyEach} in that order; the thread pool is shut down last.
 */
public void close() {
    destroyEach(downloader);
    destroyEach(pageProcessor);
    destroyEach(scheduler);

    for (Pipeline each : pipelines) {
        destroyEach(each);
    }

    threadPool.shutdown();
}
Example #4
Source File: Spider.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Handles a page after it has been downloaded.
 *
 * <p>If the page's status code is among the site's accepted codes, the page is
 * processed, follow-up requests are extracted, and (unless the result items are
 * flagged as skipped) every pipeline receives the result items. Otherwise the
 * status code is logged at info level. In both cases the method then sleeps for
 * the site's configured sleep time.
 *
 * @param request the request that produced the page; its URL is used in the log message
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    boolean statusAccepted = site.getAcceptStatCode().contains(page.getStatusCode());

    if (!statusAccepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);
        // Skipped results never reach the pipelines.
        if (!page.getResultItems().isSkip()) {
            for (Pipeline each : pipelines) {
                each.process(page.getResultItems(), this);
            }
        }
    }

    // Applied after every download, whether or not the status was accepted.
    sleep(site.getSleepTime());
}
Example #5
Source File: GithubRepoPageProcessorTest.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Runs {@code GithubRepoPageProcessor} against the webmagic repository URL with a
 * {@code MockGithubDownloader} and asserts the trimmed "name" and "author" result
 * fields.
 *
 * @throws Exception propagated from the spider run
 */
@Test
public void test_github() throws Exception {
    // Pipeline that verifies the extracted fields instead of storing them.
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String name = (String) resultItems.get("name");
            assertThat(name.trim()).isEqualTo("webmagic");

            String author = (String) resultItems.get("author");
            assertThat(author.trim()).isEqualTo("code4craft");
        }
    };

    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
Example #6
Source File: GithubRepoProcessor.java From webmagic with Apache License 2.0 | 5 votes |
/**
 * Runs {@code GithubRepoProcessor} through an {@code OOSpider} with a
 * {@code MockGithubDownloader} and asserts the trimmed "star" and "fork" result
 * fields.
 */
@Test
public void test() {
    // Pipeline that checks the extracted counters against the expected values.
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String star = (String) resultItems.get("star");
            Assert.assertEquals("78", star.trim());

            String fork = (String) resultItems.get("fork");
            Assert.assertEquals("65", fork.trim());
        }
    };

    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
Example #7
Source File: CommonSpider.java From Gather-Platform with GNU General Public License v3.0 | 4 votes |
/**
 * Returns the pipeline list held by this spider.
 *
 * @return the {@code pipelineList} field itself (no copy is made)
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
Example #8
Source File: CommonSpider.java From Gather-Platform with GNU General Public License v3.0 | 4 votes |
/**
 * Replaces the pipeline list held by this spider.
 *
 * @param pipelineList the list to store; the reference is kept as given
 * @return this spider, so calls can be chained
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
Example #9
Source File: CommonSpider.java From spider with GNU General Public License v3.0 | 4 votes |
/**
 * Gives access to the pipelines configured on this spider.
 *
 * @return the stored {@code pipelineList} reference (not a defensive copy)
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
Example #10
Source File: CommonSpider.java From spider with GNU General Public License v3.0 | 4 votes |
/**
 * Stores the given pipeline list on this spider.
 *
 * @param pipelineList the list to assign to the {@code pipelineList} field
 * @return this instance, for fluent chaining
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
Example #11
Source File: Spider.java From webmagic with Apache License 2.0 | 2 votes |
/**
 * Appends a pipeline to this spider's pipeline list.
 *
 * <p>{@code checkIfRunning()} is invoked before the list is modified.
 *
 * @param pipeline the pipeline to add
 * @return this spider, for chaining
 * @see Pipeline
 * @since 0.2.1
 */
public Spider addPipeline(Pipeline pipeline) {
    checkIfRunning();
    pipelines.add(pipeline);
    return this;
}
Example #12
Source File: Spider.java From webmagic with Apache License 2.0 | 2 votes |
/**
 * Replaces this spider's pipeline list with the given one.
 *
 * <p>{@code checkIfRunning()} is invoked before the field is reassigned. The list
 * reference is stored as given; no copy is made.
 *
 * @param pipelines the new pipeline list
 * @return this spider, for chaining
 * @see Pipeline
 * @since 0.4.1
 */
public Spider setPipelines(List<Pipeline> pipelines) {
    checkIfRunning();
    this.pipelines = pipelines;
    return this;
}
Example #13
Source File: Spider.java From webmagic with Apache License 2.0 | 2 votes |
/**
 * Discards all configured pipelines by assigning a fresh, empty list to the
 * {@code pipelines} field.
 *
 * @return this spider, for chaining
 */
public Spider clearPipeline() {
    this.pipelines = new ArrayList<Pipeline>();
    return this;
}