edu.uci.ics.crawler4j.crawler.CrawlConfig Java Examples
The following examples show how to use
edu.uci.ics.crawler4j.crawler.CrawlConfig.
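All of the examples follow the same basic crawler4j setup: build a CrawlConfig, hand it to a PageFetcher, create a RobotstxtServer, wire the three into a CrawlController, add one or more seed URLs, and start the crawl with a WebCrawler implementation. The sketch below shows that skeleton in its simplest form; the storage folder, seed URL, and MyCrawler class are placeholders and are not part of any example on this page.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class MinimalController {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");   // placeholder folder for intermediate crawl data

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("https://example.com/");       // placeholder seed URL
        controller.start(MyCrawler.class, 4);             // MyCrawler is a hypothetical WebCrawler subclass
    }
}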
Example #1
Source File: CrawlerController.java From Java-for-Data-Science with MIT License
public static void main(String[] args) throws Exception {
    int numberOfCrawlers = 2;

    CrawlConfig config = new CrawlConfig();
    String crawlStorageFolder = "data";
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(500);
    config.setMaxDepthOfCrawling(2);
    config.setMaxPagesToFetch(20);
    config.setIncludeBinaryContentInCrawling(false);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://en.wikipedia.org/wiki/Bishop_Rock,_Isles_of_Scilly");
    controller.start(SampleCrawler.class, numberOfCrawlers);
}
Example #2
Source File: ImageCrawlerController.java From tutorials with MIT License
public static void main(String[] args) throws Exception {
    File crawlStorage = new File("src/test/resources/crawler4j");
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
    config.setIncludeBinaryContentInCrawling(true);
    config.setMaxPagesToFetch(500);

    File saveDir = new File("src/test/resources/crawler4j");
    int numCrawlers = 12;

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://www.baeldung.com/");

    CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(saveDir);
    controller.start(factory, numCrawlers);
}
Example #3
Source File: HtmlCrawlerController.java From tutorials with MIT License
public static void main(String[] args) throws Exception {
    File crawlStorage = new File("src/test/resources/crawler4j");
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorage.getAbsolutePath());
    config.setMaxDepthOfCrawling(2);

    int numCrawlers = 12;

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("https://www.baeldung.com/");

    CrawlerStatistics stats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> factory = () -> new HtmlCrawler(stats);
    controller.start(factory, numCrawlers);

    System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());
}
Example #4
Source File: CrawlControllerFactory.java From vividus with Apache License 2.0
@Override
public CrawlController createCrawlController(URI mainApplicationPage)
{
    CrawlConfig crawlConfig = createCrawlConfig(mainApplicationPage);

    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);

    PageFetcher pageFetcher = new PageFetcher(crawlConfig);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try
    {
        return new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
    }
    catch (Exception e)
    {
        throw new IllegalArgumentException(e);
    }
}
Example #5
Source File: DownloadService.java From WebVideoBot with MIT License
public void download(CrawlConfig config, String url, File file) throws InterruptedException, IOException {
    PageFetcher pageFetcher = new PageFetcher(config);
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            logger.info("Start download filePath:[{}]", file);
            FileUtils.copyInputStreamToFile(fetchResult.getEntity().getContent(), file);
            logger.info("Download Finish filePath:[{}].", file);
        } else {
            logger.info("Skip download url:[{}], HttpStatus:[{}]", url, fetchResult.getStatusCode());
        }
    } catch (PageBiggerThanMaxSizeException e) {
        logger.debug("PageBiggerThanMaxSizeException", e);
        logger.info("Skip download url:[{}], Out of MaxDownloadSize", url);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
Example #6
Source File: Main.java From navex with GNU General Public License v3.0
public static void main(String[] args) {
    long startTime = System.currentTimeMillis();
    CrawlConfig config = null;
    NeoGraphDatabase graph = new NeoGraphDatabase();
    try {
        config = BasicCrawlController.crawlerMain(args, graph);
    } catch (Exception e) {
        e.printStackTrace();
    }

    // Re-initiate the authentication before analyzing forms.
    PageFetcher pageFetcher = new PageFetcher(config);
    startFormAnalyzer(pageFetcher, graph);

    // Shut down the page fetcher.
    pageFetcher.shutDown();

    long analysisEndTime = System.currentTimeMillis();
    long analysisDiffTime = (analysisEndTime - startTime);
    System.out.println("=====TOTAL Crawling+ processing time is=====" + analysisDiffTime);
}
Example #7
Source File: NavigationDatabaseNode.java From navex with GNU General Public License v3.0
@Override
public Map<String, Object> createPropertiesForms() {
    Map<String, Object> properties = new HashMap<String, Object>();

    if (node != null) {
        int docid = this.docid; // node.getWebURL().getDocid();
        properties.put(NodeKeys.ID, docid);

        String parentUrl = this.parent; // node.getWebURL().getParentUrl();
        if (parentUrl != null)
            properties.put(NodeKeys.PARENT, parentUrl);

        if (CrawlConfig.getRole() != null)
            properties.put(NodeKeys.ROLE, CrawlConfig.getRole());
    }

    properties.put("method", this.method);
    properties.put("params", this.params);
    properties.put(NodeKeys.URL, this.url);

    return properties;
}
Example #8
Source File: CrawlControllerFactory.java From vividus with Apache License 2.0
private CrawlConfig createCrawlConfig(URI mainApplicationPage)
{
    CrawlConfig crawlConfig = new CrawlConfig();
    crawlConfig.setCrawlStorageFolder(crawlStorageFolder);
    crawlConfig.setPolitenessDelay(0);
    crawlConfig.setSocketTimeout(SOCKET_TIMEOUT);
    crawlConfig.setRespectNoFollow(false);
    crawlConfig.setRespectNoIndex(false);

    UserInfo userInfo = UriUtils.getUserInfo(mainApplicationPage);
    if (userInfo != null)
    {
        try
        {
            BasicAuthInfo authInfo = new BasicAuthInfo(userInfo.getUser(), userInfo.getPassword(),
                    UriUtils.removeUserInfo(mainApplicationPage).toString());
            crawlConfig.addAuthInfo(authInfo);
        }
        catch (MalformedURLException e)
        {
            throw new IllegalArgumentException(e);
        }
    }
    return crawlConfig;
}
Example #9
Source File: BasicCrawlController.java From navex with GNU General Public License v3.0
public static CrawlConfig crawlerMain(String[] args, NeoGraphDatabase graph) throws Exception {
    // Four arguments are used below (args[0]..args[3]), so require at least four.
    if (args.length < 4) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return null;
    }

    // Authentication info is stored in a file.
    String authFile = args[2];
    ArrayList<String[]> authList = IO.readAuthFile(authFile);

    String seed = args[3];

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    int i = 0;
    CrawlConfig config = null;
    while (i < authList.size()) {
        config = startCrawlling(crawlStorageFolder, numberOfCrawlers, authList.get(i), seed, graph);
        i++;
    }
    return config;
}
Example #10
Source File: PornCrawlControllerFactory.java From WebVideoBot with MIT License
public CrawlController getController() throws Exception {
    CrawlConfig config = prepareConfig();
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    return controller;
}
Example #11
Source File: PornCrawlControllerFactory.java From WebVideoBot with MIT License
private CrawlConfig prepareConfig() {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(CRAWL_STORAGE + "/" + RandomStringUtils.random(10));
    config.setCookieStore(new PornCookieStore());
    config.setMaxDownloadSize(Properties.MAX_VIDEO_SIZE);
    config.setUserAgentString(RandomAgentsUtils.nextAgent());
    config.setResumableCrawling(true);
    config.setThreadShutdownDelaySeconds(2);
    config.setThreadMonitoringDelaySeconds(2);
    return config;
}
Example #12
Source File: VsController.java From visual-spider with MIT License
/**
 * Initialization
 *
 * @param numberOfCrawlers   number of crawler threads
 * @param maxDepthOfCrawling maximum crawl depth
 * @param maxPagesToFetch    maximum number of pages to fetch
 * @param politenessDelay    politeness delay
 * @param links              links to crawl
 */
public void init(int numberOfCrawlers, int maxDepthOfCrawling, int maxPagesToFetch, int politenessDelay,
        String[] links) {
    this.numberOfCrawlers = numberOfCrawlers;

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(DefaultConfigValues.CRAWL_STORAGE_FOLDER);
    config.setMaxDepthOfCrawling(maxDepthOfCrawling);
    config.setIncludeHttpsPages(true);
    config.setMaxPagesToFetch(maxPagesToFetch);
    config.setIncludeBinaryContentInCrawling(false);
    config.setPolitenessDelay(politenessDelay);
    config.setUserAgentString(DefaultConfigValues.USER_AGENT);
    config.setResumableCrawling(true);
    if (com.zhazhapan.vspider.models.CrawlConfig.getTurnOnProxy().get()) {
        LOGGER.info("open proxy");
        config.setProxyHost(com.zhazhapan.vspider.models.CrawlConfig.getProxyServer().get());
        config.setProxyPort(Formatter.stringToInt(com.zhazhapan.vspider.models.CrawlConfig.getProxyPort().get()));
        config.setProxyUsername(com.zhazhapan.vspider.models.CrawlConfig.getProxyUser().get());
        config.setProxyPassword(com.zhazhapan.vspider.models.CrawlConfig.getProxyPass().get());
    }

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    robotstxtConfig.setEnabled(false);
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String link : links) {
            if (Checker.isHyperLink(link)) {
                controller.addSeed(link);
            }
        }
        isInited = true;
    } catch (Exception e) {
        LOGGER.error("start to crawl urls error: " + e.getMessage());
    }
}
Example #13
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds, int maxDept, int maxPages) {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(maxDept);
    config.setMaxPagesToFetch(maxPages);
    createCrawler(config, storing, urlSeeds);
}
Example #14
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds, int maxDept, int maxPages, String loginURL, String username,
        String password, String usernameFieldName, String passwordFieldName) throws MalformedURLException {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(maxDept);
    config.addAuthInfo(createAuthethicator(username, password, loginURL, usernameFieldName, passwordFieldName));
    config.setMaxPagesToFetch(maxPages);
    createCrawler(config, storing, urlSeeds);
}
Example #15
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds) {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(-1);
    config.setMaxPagesToFetch(-1);
    createCrawler(config, storing, urlSeeds);
}
Example #16
Source File: Crawler.java From scava with Eclipse Public License 2.0
public Crawler(File storing, List<String> urlSeeds, String loginURL, String username, String password,
        String usernameFieldName, String passwordFieldName) throws MalformedURLException {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.webcrawler");
    CrawlConfig config = new CrawlConfig();
    config.setIncludeHttpsPages(true);
    config.setPolitenessDelay(1000);
    config.setCrawlStorageFolder(storing.toString());
    config.setMaxDepthOfCrawling(-1);
    config.setMaxPagesToFetch(-1);
    config.addAuthInfo(createAuthethicator(username, password, loginURL, usernameFieldName, passwordFieldName));
    createCrawler(config, storing, urlSeeds);
}
Example #17
Source File: Crawler.java From scava with Eclipse Public License 2.0
private void createCrawler(CrawlConfig config, File storing, List<String> urlSeeds) {
    config.setIncludeBinaryContentInCrawling(true);
    config.setResumableCrawling(false);
    config.setMaxDownloadSize(6250000); //50mb

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    try {
        controller = new CrawlController(config, pageFetcher, robotstxtServer);
        for (String seed : urlSeeds)
            controller.addSeed(seed);
        core = new CrawlerCore(storing, urlSeeds);
        factory = () -> core;
        core.getMappingPaths();
    } catch (Exception e) {
        logger.error("Error in the creation of a crawler:" + e);
    }
}
Example #18
Source File: MultipleCrawlerController.java From tutorials with MIT License
public static void main(String[] args) throws Exception {
    File crawlStorageBase = new File("src/test/resources/crawler4j");
    CrawlConfig htmlConfig = new CrawlConfig();
    CrawlConfig imageConfig = new CrawlConfig();

    htmlConfig.setCrawlStorageFolder(new File(crawlStorageBase, "html").getAbsolutePath());
    imageConfig.setCrawlStorageFolder(new File(crawlStorageBase, "image").getAbsolutePath());
    imageConfig.setIncludeBinaryContentInCrawling(true);

    htmlConfig.setMaxPagesToFetch(500);
    imageConfig.setMaxPagesToFetch(1000);

    PageFetcher pageFetcherHtml = new PageFetcher(htmlConfig);
    PageFetcher pageFetcherImage = new PageFetcher(imageConfig);

    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcherHtml);

    CrawlController htmlController = new CrawlController(htmlConfig, pageFetcherHtml, robotstxtServer);
    CrawlController imageController = new CrawlController(imageConfig, pageFetcherImage, robotstxtServer);

    htmlController.addSeed("https://www.baeldung.com/");
    imageController.addSeed("https://www.baeldung.com/");

    CrawlerStatistics stats = new CrawlerStatistics();
    CrawlController.WebCrawlerFactory<HtmlCrawler> htmlFactory = () -> new HtmlCrawler(stats);

    File saveDir = new File("src/test/resources/crawler4j");
    CrawlController.WebCrawlerFactory<ImageCrawler> imageFactory = () -> new ImageCrawler(saveDir);

    imageController.startNonBlocking(imageFactory, 7);
    htmlController.startNonBlocking(htmlFactory, 10);

    htmlController.waitUntilFinish();
    System.out.printf("Crawled %d pages %n", stats.getProcessedPageCount());
    System.out.printf("Total Number of outbound links = %d %n", stats.getTotalLinksCount());

    imageController.waitUntilFinish();
    System.out.printf("Image Crawler is finished.");
}
Example #19
Source File: NavigationDatabaseNode.java From navex with GNU General Public License v3.0
@Override
public Map<String, Object> createProperties() {
    Map<String, Object> properties = new HashMap<String, Object>();

    if (node != null) {
        int docid = this.docid; // node.getWebURL().getDocid();
        properties.put(NodeKeys.ID, docid);

        String url = this.url; // node.getWebURL().getURL();
        if (url != null)
            properties.put(NodeKeys.URL, url);

        String domain = this.domain;
        if (domain != null)
            properties.put(NodeKeys.DOMAIN, domain);

        String path = this.path; // node.getWebURL().getPath();
        if (path != null)
            properties.put(NodeKeys.PATH, path);

        String parentUrl = this.parent; // node.getWebURL().getParentUrl();
        if (parentUrl != null)
            properties.put(NodeKeys.PARENT, parentUrl);

        if (data != null) {
            int links = data.getOutgoingUrls().size();
            properties.put(NodeKeys.LINKS, links);

            int forms = data.getForms().size();
            properties.put(NodeKeys.FORMS, forms);
        }

        List<NameValuePair> p = this.params;
        if (p != null) {
            String str = "";
            for (NameValuePair pair : p) {
                str += pair.getName() + "=" + pair.getValue() + ",";
            }
            properties.put(NodeKeys.PARAMS, str);
        }

        if (CrawlConfig.getRole() != null)
            properties.put(NodeKeys.ROLE, CrawlConfig.getRole());
    }
    return properties;
}
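The crawler classes referenced by the controllers above (SampleCrawler, ImageCrawler, HtmlCrawler, and so on) are not shown on this page; each one is a WebCrawler subclass defined in its own project. As a rough, hypothetical sketch of what such a class looks like, a minimal WebCrawler overrides shouldVisit and visit; the class name and domain filter below are placeholders, not taken from any of the projects listed above.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Only follow links that stay on the seed's site; the domain is a placeholder.
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        return url.getURL().toLowerCase().startsWith("https://example.com/");
    }

    // Called for every fetched page; here it just reports the URL and extracted text length.
    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> " + htmlParseData.getText().length() + " chars");
        }
    }
}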