edu.uci.ics.crawler4j.crawler.CrawlConfig Java Examples
The following examples show how to use
edu.uci.ics.crawler4j.crawler.CrawlConfig.
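All of the examples follow the same basic crawler4j setup: build a CrawlConfig, hand it to a PageFetcher, create a RobotstxtServer, wire the three into a CrawlController, add one or more seed URLs, and start the crawl with a WebCrawler implementation. The sketch below shows that skeleton in its simplest form; the storage folder, seed URL, and MyCrawler class are placeholders and are not part of any example on this page.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class MinimalController {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");   // placeholder folder for intermediate crawl data

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("https://example.com/");       // placeholder seed URL
        controller.start(MyCrawler.class, 4);             // MyCrawler is a hypothetical WebCrawler subclass
    }
}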
Example #1
Source File: CrawlerController.java From Java-for-Data-Science with MIT License
public static void main(String[] args) throws Exception {
    int numberOfCrawlers = 2;

    CrawlConfig config = new CrawlConfig();
    String crawlStorageFolder = "data";
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(500);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(20);
    config.setIncludeBinaryContentInCrawling(false);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
    controller.start(SampleCrawler.class, numberOfCrawlers);
}
Example #2
Source File: ImageCrawlerController.java From tutorials with MIT License
public static void main(String[] args) throws Exception {
    File crawlStorage = new File("src/test/resources/crawler4j");
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
    config.setIncludeBinaryContentInCrawling(true);
    config.setMaxPagesToFetch(500);

    File saveDir = new File("src/test/resources/crawler4j");
    int numCrawlers = 12;

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://www.baeldung.com/");

    CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(saveDir);
    controller.start(factory, numCrawlers);
}
Example #3
Source File: HtmlCrawlerController.java From tutorials with MIT License
public static void main(String[] args) throws Exception {
    File crawlStorage = new File("src/test/resources/crawler4j");
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
    config.setMaxDepthOfCrawling(2);

    int numCrawlers = 12;

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://www.baeldung.com/");

    CrawlerStatistics stats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> factory = () -> new HtmlCrawler(stats);
    controller.start(factory, numCrawlers);

    System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());
}
Example #4
Source File: CrawlControllerFactory.java From vividus with Apache License 2.0
@Override
public CrawlController createCrawlController(URI mainApplicationPage)
{
    CrawlConfig crawlConfig = createCrawlConfig(mainApplicationPage);

    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);

    PageFetcher pageFetcher = new PageFetcher(crawlConfig);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try
    {
        return new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
    }
    catch (Exception e)
    {
        throw new IllegalArgumentException(e);
    }
}
Example #5
Source File: DownloadService.java From WebVideoBot with MIT License
public void download(CrawlConfig config, String url, File file) throws InterruptedException, IOException {
    PageFetcher pageFetcher = new PageFetcher(config);
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            logger.info("Start download filePath:[{}]", file);
            FileUtils.copyInputStreamToFile(fetchResult.getEntity().getContent(), file);
            logger.info("Download Finish filePath:[{}].", file);
        } else {
            logger.info("Skip download url:[{}], HttpStatus:[{}]", url, fetchResult.getStatusCode());
        }
    } catch (PageBiggerThanMaxSizeException e) {
        logger.debug("PageBiggerThanMaxSizeException", e);
        logger.info("Skip download url:[{}], Out of MaxDownloadSize", url);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
Example #6
Source File: Main.java From navex with GNU General Public License v3.0
public static void main(String[] args) {
    long startTime = System.currentTimeMillis();
    CrawlConfig config = null;
    NeoGraphDatabase graph = new NeoGraphDatabase();
    try {
        config = BasicCrawlController.crawlerMain(args, graph);
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Re-initiate the authentication before analyzing forms.
    PageFetcher pageFetcher = new PageFetcher(config);
    startFormAnalyzer(pageFetcher, graph);

    // Shut down the page fetcher.
    pageFetcher.shutDown();

    long analysisEndTime = System.currentTimeMillis();
    long analysisDiffTime = (analysisEndTime - startTime);
    System.out.println("=====TOTAL Crawling+ processing time is=====" + analysisDiffTime);
}
Example #7
Source File: NavigationDatabaseNode.java From navex with GNU General Public License v3.0
@Override
public Map<String, Object> createPropertiesForms() {
    Map<String, Object> properties = new HashMap<String, Object>();

    if (node != null) {
        int docid = this.docid; // node.getWebURL().getDocid();
        properties.put(NodeKeys.ID, docid);

        String parentUrl = this.parent; // node.getWebURL().getParentUrl();
        if (parentUrl != null)
            properties.put(NodeKeys.PARENT, parentUrl);

        if (CrawlConfig.getRole() != null)
            properties.put(NodeKeys.ROLE, CrawlConfig.getRole());
    }

    properties.put("method", this.method);
    properties.put("params", this.params);
    properties.put(NodeKeys.URL, this.url);

    return properties;
}
Example #8
Source File: CrawlControllerFactory.java From vividus with Apache License 2.0
private CrawlConfig createCrawlConfig(URI mainApplicationPage)
{
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(crawlStorageFolder);
    crawlConfig.setPolitenessDelay(0);
    crawlConfig.setSocketTimeout(SOCKET_TIMEOUT);
    crawlConfig.setRespectNoFollow(false);
    crawlConfig.setRespectNoIndex(false);

    UserInfo userInfo = UriUtils.getUserInfo(mainApplicationPage);
    if (userInfo != null)
    {
        try
        {
            BasicAuthInfo authInfo = new BasicAuthInfo(userInfo.getUser(), userInfo.getPassword(),
                    UriUtils.removeUserInfo(mainApplicationPage).toString());
            crawlConfig.addAuthInfo(authInfo);
        }
        catch (MalformedURLException e)
        {
            throw new IllegalArgumentException(e);
        }
    }
    return crawlConfig;
}
Example #9
Source File: BasicCrawlController.java From navex with GNU General Public License v3.0
public static CrawlConfig crawlerMain(String[] args, NeoGraphDatabase graph) throws Exception {
    // Four arguments are used below (args[0]..args[3]), so require at least four.
    if (args.length < 4) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return null;
    }

    // Authentication info is stored in a file.
    String authFile = args[2];
    ArrayList<String[]> authList = IO.readAuthFile(authFile);

    String seed = args[3];

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    int i = 0;
    CrawlConfig config = null;
    while (i < authList.size()) {
        config = startCrawlling(crawlStorageFolder, numberOfCrawlers, authList.get(i), seed, graph);
        i++;
    }
    return config;
}
Example #10
Source File: PornCrawlControllerFactory.java From WebVideoBot with MIT License
public CrawlController getController() throws Exception {
    CrawlConfig config = prepareConfig();
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    return controller;
}
Example #11
Source File: PornCrawlControllerFactory.java From WebVideoBot with MIT License
private CrawlConfig prepareConfig() {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(CRAWL_STORAGE + "/" + RandomStringUtils.random(10));
    config.setCookieStore(new PornCookieStore());
    config.setMaxDownloadSize(Properties.MAX_VIDEO_SIZE);
    config.setUserAgentString(RandomAgentsUtils.nextAgent());
    config.setResumableCrawling(true);
    config.setThreadShutdownDelaySeconds(2);
    config.setThreadMonitoringDelaySeconds(2);
    return config;
}
Example #12
Source File: VsController.java From visual-spider with MIT License
/**
 * Initialization
 *
 * @param numberOfCrawlers   number of crawler threads
 * @param maxDepthOfCrawling maximum crawl depth
 * @param maxPagesToFetch    maximum number of pages to fetch
 * @param politenessDelay    politeness delay
 * @param links              links to crawl
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
        String[] links) {
    this.numberOfCrawlers = numberOfCrawlers;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
    config.setMaxDepthOfCrawling(maxDepthOfCrawling);
    config.setIncludeHttpsPages(true);
    config.setMaxPagesToFetch(maxPagesToFetch);
    config.setIncludeBinaryContentInCrawling(false);
    config.setPolitenessDelay(politenessDelay);
    config.setUserAgentString(DefaultConfigValues.USER_AGENT);
    config.setResumableCrawling(true);
    if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
        LOGGER.info("open proxy");
        config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
        config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
        config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
        config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
    }

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String link : links) {
            if (Checker.isHyperLink(link)) {
                controller.addSeed(link);
            }
        }
        isInited = true;
    } catch (Exception e) {
        LOGGER.error("start to crawl urls error: " + e.getMessage());
    }
}
Example #13
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds, int maxDept, int maxPages) {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(maxDept);
    config.setMaxPagesToFetch(maxPages);
    createCrawler(config, storing, urlSeeds);
}
Example #14
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds, int maxDept, int maxPages, String loginURL, String username,
        String password, String usernameFieldName, String passwordFieldName) throws MalformedURLException {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(maxDept);
    config.addAuthInfo(createAuthethicator(username, password, loginURL, usernameFieldName, passwordFieldName));
    config.setMaxPagesToFetch(maxPages);
    createCrawler(config, storing, urlSeeds);
}
Example #15
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds) {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(-1);
    config.setMaxPagesToFetch(-1);
    createCrawler(config, storing, urlSeeds);
}
Example #16
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds, String loginURL, String username, String password,
        String usernameFieldName, String passwordFieldName) throws MalformedURLException {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(-1);
    config.setMaxPagesToFetch(-1);
    config.addAuthInfo(createAuthethicator(username, password, loginURL, usernameFieldName, passwordFieldName));
    createCrawler(config, storing, urlSeeds);
}
Example #17
Source File: Crawler.java From scava with Eclipse Public License 2.0
private void createCrawler(CrawlConfig config, File storing, List<String> urlSeeds) {
    config.setIncludeBinaryContentInCrawling(true);
    config.setResumableCrawling(false);
    config.setMaxDownloadSize(6250000); //50mb

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String seed : urlSeeds)
            controller.addSeed(seed);
        core = new CrawlerCore(storing, urlSeeds);
        factory = () -> core;
        core.getMappingPaths();
    } catch (Exception e) {
        logger.error("Error in the creation of a crawler:" + e);
    }
}
Example #18
Source File: MultipleCrawlerController.java From tutorials with MIT License
public static void main(String[] args) throws Exception {
    File crawlStorageBase = new File("src/test/resources/crawler4j");
    CrawlConfig htmlConfig = new CrawlConfig();
    CrawlConfig imageConfig = new CrawlConfig();

    htmlConfig.setCrawlStorageFolder(new File(crawlStorageBase, "html").getAbsolutePath());
    imageConfig.setCrawlStorageFolder(new File(crawlStorageBase, "image").getAbsolutePath());
    imageConfig.setIncludeBinaryContentInCrawling(true);

    htmlConfig.setMaxPagesToFetch(500);
    imageConfig.setMaxPagesToFetch(1000);

    PageFetcher pageFetcherHtml = new PageFetcher(htmlConfig);
    PageFetcher pageFetcherImage = new PageFetcher(imageConfig);

    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcherHtml);

    CrawlController htmlController = new CrawlController(htmlConfig, pageFetcherHtml, robotstxtServer);
    CrawlController imageController = new CrawlController(imageConfig, pageFetcherImage, robotstxtServer);

    htmlController.addSeed("https://www.baeldung.com/");
    imageController.addSeed("https://www.baeldung.com/");

    CrawlerStatistics stats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> htmlFactory = () -> new HtmlCrawler(stats);

    File saveDir = new File("src/test/resources/crawler4j");
    CrawlController.WebCrawlerFactory<ImageCrawler> imageFactory = () -> new ImageCrawler(saveDir);

    imageController.startNonBlocking(imageFactory, 7);
    htmlController.startNonBlocking(htmlFactory, 10);

    htmlController.waitUntilFinish();
    System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());

    imageController.waitUntilFinish();
    System.out.printf("Image Crawler is finished.");
}
Example #19
Source File: NavigationDatabaseNode.java From navex with GNU General Public License v3.0
@Override
public Map<String, Object> createProperties() {
    Map<String, Object> properties = new HashMap<String, Object>();

    if (node != null) {
        int docid = this.docid; // node.getWebURL().getDocid();
        properties.put(NodeKeys.ID, docid);

        String url = this.url; // node.getWebURL().getURL();
        if (url != null)
            properties.put(NodeKeys.URL, url);

        String domain = this.domain;
        if (domain != null)
            properties.put(NodeKeys.DOMAIN, domain);

        String path = this.path; // node.getWebURL().getPath();
        if (path != null)
            properties.put(NodeKeys.PATH, path);

        String parentUrl = this.parent; // node.getWebURL().getParentUrl();
        if (parentUrl != null)
            properties.put(NodeKeys.PARENT, parentUrl);

        if (data != null) {
            int links = data.getOutgoingUrls().size();
            properties.put(NodeKeys.LINKS, links);

            int forms = data.getForms().size();
            properties.put(NodeKeys.FORMS, forms);
        }

        List<NameValuePair> p = this.params;
        if (p != null) {
            String str = "";
            for (NameValuePair pair : p) {
                str += pair.getName() + "=" + pair.getValue() + ",";
            }
            properties.put(NodeKeys.PARAMS, str);
        }

        if (CrawlConfig.getRole() != null)
            properties.put(NodeKeys.ROLE, CrawlConfig.getRole());
    }
    return properties;
}
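The crawler classes referenced by the controllers above (SampleCrawler, ImageCrawler, HtmlCrawler, and so on) are not shown on this page; each one is a WebCrawler subclass defined in its own project. As a rough, hypothetical sketch of what such a class looks like, a minimal WebCrawler overrides shouldVisit and visit; the class name and domain filter below are placeholders, not taken from any of the projects listed above.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Only follow links that stay on the seed's site; the domain is a placeholder.
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        return url.getURL().toLowerCase().startsWith("https://example.com/");
    }

    // Called for every fetched page; here it just reports the URL and extracted text length.
    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> " + htmlParseData.getText().length() + " chars");
        }
    }
}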