edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig Java Exaples

Source File: CrawlControllerFactory.java From vividus with Apache License 2.0

6 votes

@Override
public CrawlController createCrawlController(URI mainApplicationPage)
{
    CrawlConfig crawlConfig = createCrawlConfig(mainApplicationPage);

    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    PageFetcher pageFetcher = new PageFetcher(crawlConfig);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

    try
    {
        return new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
    }
    catch (Exception e)
    {
        throw new IllegalArgumentException(e);
    }
}

Source File: CrawlerController.java From Java-for-Data-Science with MIT License

6 votes

public static void main(String[] args) throws Exception {
  int numberOfCrawlers = 2;
  CrawlConfig config = new CrawlConfig();
  String crawlStorageFolder = "data";
  
  config.setCrawlStorageFolder(crawlStorageFolder);
  config.setPolitenessDelay(500);
  config.setMaxDepthOfCrawling(2);
  config.setMaxPagesToFetch(20);
  config.setIncludeBinaryContentInCrawling(false);

  PageFetcher pageFetcher = new PageFetcher(config);
  RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
  RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
  CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

  controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");

  controller.start(SampleCrawler.class, numberOfCrawlers);
}

Source File: HtmlCrawlerController.java From tutorials with MIT License

6 votes

public static void main(String[] args) throws Exception {
    File crawlStorage = new File("src/test/resources/crawler4j");
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
    config.setMaxDepthOfCrawling(2);

    int numCrawlers = 12;

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://www.baeldung.com/");

    CrawlerStatistics stats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> factory = () -> new HtmlCrawler(stats);

    controller.start(factory, numCrawlers);
    System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());
}

Source File: ImageCrawlerController.java From tutorials with MIT License

6 votes

public static void main(String[] args) throws Exception {
    File crawlStorage = new File("src/test/resources/crawler4j");
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
    config.setIncludeBinaryContentInCrawling(true);
    config.setMaxPagesToFetch(500);
    
    File saveDir = new File("src/test/resources/crawler4j");
    
    int numCrawlers = 12;
    
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    
    controller.addSeed("https://www.baeldung.com/");
    
    CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(saveDir);
    
    controller.start(factory, numCrawlers);
}

Source File: VsController.java From visual-spider with MIT License

5 votes

/**
 * 初始化
 *
 * @param numberOfCrawlers 爬虫线程数
 * @param maxDepthOfCrawling 抓取深度
 * @param maxPagesToFetch 最大抓取页数
 * @param politenessDelay 延迟
 * @param links 待爬取链接
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay, String[]
        links) {
    this.numberOfCrawlers = numberOfCrawlers;
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
    config.setMaxDepthOfCrawling(maxDepthOfCrawling);
    config.setIncludeHttpsPages(true);
    config.setMaxPagesToFetch(maxPagesToFetch);
    config.setIncludeBinaryContentInCrawling(false);
    config.setPolitenessDelay(politenessDelay);
    config.setUserAgentString(DefaultConfigValues.USER_AGENT);
    config.setResumableCrawling(true);

    if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
        LOGGER.info("open proxy");
        config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
        config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
        config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
        config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
    }

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String link : links) {
            if (Checker.isHyperLink(link)) {
                controller.addSeed(link);
            }
        }
        isInited = true;
    } catch (Exception e) {
        LOGGER.error("start to crawl urls error: " + e.getMessage());
    }
}

Source File: Crawler.java From scava with Eclipse Public License 2.0

5 votes

private void createCrawler(CrawlConfig config, File storing, List<String> urlSeeds)
{
	
	config.setIncludeBinaryContentInCrawling(true);
       config.setResumableCrawling(false);
       config.setMaxDownloadSize(6250000); //50mb
       
       PageFetcher pageFetcher = new PageFetcher(config);
       RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
       RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
       
       try {
		controller = new CrawlController(config, pageFetcher, robotstxtServer);
		
		for(String seed : urlSeeds)
			controller.addSeed(seed);
		
		core = new CrawlerCore(storing, urlSeeds);
		
		factory = () -> core;
		
		core.getMappingPaths();
		
		
	}  
       catch (Exception e) {
		logger.error("Error in the creation of a crawler:"+e);
	}
}

Source File: MultipleCrawlerController.java From tutorials with MIT License

4 votes

public static void main(String[] args) throws Exception {
    File crawlStorageBase = new File("src/test/resources/crawler4j");
    CrawlConfig htmlConfig = new CrawlConfig();
    CrawlConfig imageConfig = new CrawlConfig();
    
    htmlConfig.setCrawlStorageFolder(new File(crawlStorageBase, "html").getAbsolutePath());
    imageConfig.setCrawlStorageFolder(new File(crawlStorageBase, "image").getAbsolutePath());
    imageConfig.setIncludeBinaryContentInCrawling(true);
    
    htmlConfig.setMaxPagesToFetch(500);
    imageConfig.setMaxPagesToFetch(1000);
    
    PageFetcher pageFetcherHtml = new PageFetcher(htmlConfig);
    PageFetcher pageFetcherImage = new PageFetcher(imageConfig);
    
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcherHtml);

    CrawlController htmlController = new CrawlController(htmlConfig, pageFetcherHtml, robotstxtServer);
    CrawlController imageController = new CrawlController(imageConfig, pageFetcherImage, robotstxtServer);
    
    htmlController.addSeed("https://www.baeldung.com/");
    imageController.addSeed("https://www.baeldung.com/");
    
    CrawlerStatistics stats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> htmlFactory = () -> new HtmlCrawler(stats);
    
    File saveDir = new File("src/test/resources/crawler4j");
    CrawlController.WebCrawlerFactory<ImageCrawler> imageFactory = () -> new ImageCrawler(saveDir);
    
    imageController.startNonBlocking(imageFactory, 7);
    htmlController.startNonBlocking(htmlFactory, 10);
    

    htmlController.waitUntilFinish();
    System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());

    imageController.waitUntilFinish();
    System.out.printf("Image Crawler is finished.");
    
}

edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig Java Examples