edu.uci.ics.crawler4j.crawler.CrawlConfig#setMaxPagesToFetch

Source File: CrawlerController.java From Java-for-Data-Science with MIT License

6 votes

/**
 * Entry point: configures a small crawl rooted at a single Wikipedia page and
 * runs it with {@code SampleCrawler} workers.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if setting up or running the crawl fails
 */
public static void main(String[] args) throws Exception {
  final String storageDir = "data";
  final int crawlerThreads = 2;

  // Crawl limits: depth 2, at most 20 pages, no binary content.
  CrawlConfig crawlConfig = new CrawlConfig();
  crawlConfig.setCrawlStorageFolder(storageDir);
  crawlConfig.setPolitenessDelay(500);
  crawlConfig.setMaxDepthOfCrawling(2);
  crawlConfig.setMaxPagesToFetch(20);
  crawlConfig.setIncludeBinaryContentInCrawling(false);

  // The robots.txt server shares the page fetcher with the controller.
  PageFetcher fetcher = new PageFetcher(crawlConfig);
  RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

  CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robots);
  crawlController.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
  crawlController.start(SampleCrawler.class, crawlerThreads);
}

Source File: ImageCrawlerController.java From tutorials with MIT License

6 votes

/**
 * Entry point: crawls https://www.baeldung.com/ with binary content included,
 * creating each {@code ImageCrawler} with the save directory.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if setting up or running the crawl fails
 */
public static void main(String[] args) throws Exception {
    final int crawlerCount = 12;
    final File storageDir = new File("src/test/resources/crawler4j");
    // Same path as the crawl storage folder; passed to every ImageCrawler.
    final File imageOutputDir = new File("src/test/resources/crawler4j");

    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(storageDir.getAbsolutePath());
    crawlConfig.setIncludeBinaryContentInCrawling(true);
    crawlConfig.setMaxPagesToFetch(500);

    PageFetcher fetcher = new PageFetcher(crawlConfig);
    RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);
    CrawlController crawlController = new CrawlController(crawlConfig, fetcher, robots);
    crawlController.addSeed("https://www.baeldung.com/");

    CrawlController.WebCrawlerFactory<ImageCrawler> crawlerFactory =
            () -> new ImageCrawler(imageOutputDir);
    crawlController.start(crawlerFactory, crawlerCount);
}

Source File: VsController.java From visual-spider with MIT License

5 votes

/**
 * Initializes the crawl controller and registers the seed links.
 *
 * <p>Builds a crawler4j configuration from the given limits, applies the proxy
 * settings when the application's proxy switch is on, disables robots.txt
 * handling, and adds every entry of {@code links} accepted by
 * {@code Checker.isHyperLink} as a seed. On success {@code isInited} is set to
 * {@code true}; if anything in that step throws, the failure is logged together
 * with its stack trace and {@code isInited} is left unchanged.
 *
 * @param numberOfCrawlers number of crawler threads
 * @param maxDepthOfCrawling crawl depth
 * @param maxPagesToFetch maximum number of pages to fetch
 * @param politenessDelay delay
 * @param links links to be crawled
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay, String[]
        links) {
    this.numberOfCrawlers = numberOfCrawlers;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
    config.setMaxDepthOfCrawling(maxDepthOfCrawling);
    config.setIncludeHttpsPages(true);
    config.setMaxPagesToFetch(maxPagesToFetch);
    config.setIncludeBinaryContentInCrawling(false);
    config.setPolitenessDelay(politenessDelay);
    config.setUserAgentString(DefaultConfigValues.USER_AGENT);
    config.setResumableCrawling(true);

    // Proxy settings come from the application's own CrawlConfig model, which
    // is referenced by its fully qualified name to avoid clashing with crawler4j's.
    if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
        LOGGER.info("open proxy");
        config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
        config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
        config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
        config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
    }

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String link : links) {
            // Only entries that look like hyperlinks are used as seeds.
            if (Checker.isHyperLink(link)) {
                controller.addSeed(link);
            }
        }
        isInited = true;
    } catch (Exception e) {
        // Pass the throwable so the stack trace and cause are not lost.
        LOGGER.error("start to crawl urls error: " + e.getMessage(), e);
    }
}

Source File: Crawler.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Creates a crawler with explicit depth and page limits.
 *
 * @param storing  crawl storage location; also forwarded to {@code createCrawler}
 * @param urlSeeds seed URLs forwarded to {@code createCrawler}
 * @param maxDept  value passed to {@code setMaxDepthOfCrawling}
 * @param maxPages value passed to {@code setMaxPagesToFetch}
 */
public Crawler(File storing, List<String> urlSeeds, int maxDept, int maxPages)
{
	logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");

	final CrawlConfig crawlSettings = new CrawlConfig();
	crawlSettings.setCrawlStorageFolder(storing.toString());
	crawlSettings.setIncludeHttpsPages(true);
	crawlSettings.setPolitenessDelay(1000);
	crawlSettings.setMaxDepthOfCrawling(maxDept);
	crawlSettings.setMaxPagesToFetch(maxPages);

	createCrawler(crawlSettings, storing, urlSeeds);
}

Source File: Crawler.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Creates a crawler with explicit depth and page limits that also registers
 * authentication info built from the given login details.
 *
 * @param storing           crawl storage location; also forwarded to {@code createCrawler}
 * @param urlSeeds          seed URLs forwarded to {@code createCrawler}
 * @param maxDept           value passed to {@code setMaxDepthOfCrawling}
 * @param maxPages          value passed to {@code setMaxPagesToFetch}
 * @param loginURL          login URL handed to {@code createAuthethicator}
 * @param username          user name handed to {@code createAuthethicator}
 * @param password          password handed to {@code createAuthethicator}
 * @param usernameFieldName user-name field name handed to {@code createAuthethicator}
 * @param passwordFieldName password field name handed to {@code createAuthethicator}
 * @throws MalformedURLException declared by this constructor; presumably raised
 *         for an invalid {@code loginURL} (confirm against {@code createAuthethicator})
 */
public Crawler(File storing, List<String> urlSeeds, int maxDept, int maxPages,String loginURL, String username, String password, String usernameFieldName, String passwordFieldName) throws MalformedURLException
{
	logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");

	final CrawlConfig crawlSettings = new CrawlConfig();
	crawlSettings.setCrawlStorageFolder(storing.toString());
	crawlSettings.setIncludeHttpsPages(true);
	crawlSettings.setPolitenessDelay(1000);
	crawlSettings.setMaxDepthOfCrawling(maxDept);
	crawlSettings.setMaxPagesToFetch(maxPages);
	crawlSettings.addAuthInfo(
			createAuthethicator(username, password, loginURL, usernameFieldName, passwordFieldName));

	createCrawler(crawlSettings, storing, urlSeeds);
}

Source File: Crawler.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Creates a crawler that passes -1 for both the depth and the page limit.
 *
 * @param storing  crawl storage location; also forwarded to {@code createCrawler}
 * @param urlSeeds seed URLs forwarded to {@code createCrawler}
 */
public Crawler(File storing, List<String> urlSeeds)
{
	logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");

	final CrawlConfig crawlSettings = new CrawlConfig();
	crawlSettings.setCrawlStorageFolder(storing.toString());
	crawlSettings.setIncludeHttpsPages(true);
	crawlSettings.setPolitenessDelay(1000);
	// NOTE(review): -1 presumably means "no limit" in crawler4j; confirm against its docs.
	crawlSettings.setMaxDepthOfCrawling(-1);
	crawlSettings.setMaxPagesToFetch(-1);

	createCrawler(crawlSettings, storing, urlSeeds);
}

Source File: Crawler.java From scava with Eclipse Public License 2.0

5 votes

/**
 * Creates a crawler that passes -1 for both the depth and the page limit and
 * registers authentication info built from the given login details.
 *
 * @param storing           crawl storage location; also forwarded to {@code createCrawler}
 * @param urlSeeds          seed URLs forwarded to {@code createCrawler}
 * @param loginURL          login URL handed to {@code createAuthethicator}
 * @param username          user name handed to {@code createAuthethicator}
 * @param password          password handed to {@code createAuthethicator}
 * @param usernameFieldName user-name field name handed to {@code createAuthethicator}
 * @param passwordFieldName password field name handed to {@code createAuthethicator}
 * @throws MalformedURLException declared by this constructor; presumably raised
 *         for an invalid {@code loginURL} (confirm against {@code createAuthethicator})
 */
public Crawler(File storing, List<String> urlSeeds, String loginURL, String username, String password, String usernameFieldName, String passwordFieldName) throws MalformedURLException
{
	logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");

	final CrawlConfig crawlSettings = new CrawlConfig();
	crawlSettings.setCrawlStorageFolder(storing.toString());
	crawlSettings.setIncludeHttpsPages(true);
	crawlSettings.setPolitenessDelay(1000);
	// NOTE(review): -1 presumably means "no limit" in crawler4j; confirm against its docs.
	crawlSettings.setMaxDepthOfCrawling(-1);
	crawlSettings.setMaxPagesToFetch(-1);
	crawlSettings.addAuthInfo(
			createAuthethicator(username, password, loginURL, usernameFieldName, passwordFieldName));

	createCrawler(crawlSettings, storing, urlSeeds);
}

Source File: MultipleCrawlerController.java From tutorials with MIT License

4 votes

/**
 * Entry point: runs an HTML crawl and an image crawl of https://www.baeldung.com/
 * side by side, each with its own configuration, page fetcher and controller,
 * then prints the HTML crawl statistics and a completion message.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if setting up or running either crawl fails
 */
public static void main(String[] args) throws Exception {
    final File storageRoot = new File("src/test/resources/crawler4j");

    // HTML crawl: own storage sub-folder, capped at 500 pages.
    CrawlConfig htmlCfg = new CrawlConfig();
    htmlCfg.setCrawlStorageFolder(new File(storageRoot, "html").getAbsolutePath());
    htmlCfg.setMaxPagesToFetch(500);

    // Image crawl: own storage sub-folder, binary content on, capped at 1000 pages.
    CrawlConfig imageCfg = new CrawlConfig();
    imageCfg.setCrawlStorageFolder(new File(storageRoot, "image").getAbsolutePath());
    imageCfg.setIncludeBinaryContentInCrawling(true);
    imageCfg.setMaxPagesToFetch(1000);

    PageFetcher htmlFetcher = new PageFetcher(htmlCfg);
    PageFetcher imageFetcher = new PageFetcher(imageCfg);

    // One robots.txt server, backed by the HTML fetcher, is shared by both controllers.
    RobotstxtServer sharedRobots = new RobotstxtServer(new RobotstxtConfig(), htmlFetcher);

    CrawlController htmlCrawl = new CrawlController(htmlCfg, htmlFetcher, sharedRobots);
    CrawlController imageCrawl = new CrawlController(imageCfg, imageFetcher, sharedRobots);

    htmlCrawl.addSeed("https://www.baeldung.com/");
    imageCrawl.addSeed("https://www.baeldung.com/");

    CrawlerStatistics crawlStats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> htmlCrawlers = () -> new HtmlCrawler(crawlStats);

    final File imageOutputDir = new File("src/test/resources/crawler4j");
    CrawlController.WebCrawlerFactory<ImageCrawler> imageCrawlers =
            () -> new ImageCrawler(imageOutputDir);

    // Start both crawls without blocking so they run concurrently.
    imageCrawl.startNonBlocking(imageCrawlers, 7);
    htmlCrawl.startNonBlocking(htmlCrawlers, 10);

    htmlCrawl.waitUntilFinish();
    System.out.printf("Crawled %d pages %n", crawlStats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", crawlStats.getTotalLinksCount());

    imageCrawl.waitUntilFinish();
    System.out.printf("Image Crawler is finished.");
}

Java Code Examples for edu.uci.ics.crawler4j.crawler.CrawlConfig#setMaxPagesToFetch()