Python cloudscraper.create_scraper() Examples
The following are 14 code examples of cloudscraper.create_scraper(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the cloudscraper module.
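Before the project examples, here is a minimal sketch of the pattern they all share: create_scraper() returns an object that behaves like a requests.Session but transparently solves Cloudflare's anti-bot challenges, so you call .get()/.post() on it as you would on a plain session. The target URL below is a placeholder for illustration only, not taken from any of the projects.

import cloudscraper

# create_scraper() returns a drop-in replacement for a requests.Session that
# handles Cloudflare's JavaScript challenge before returning the response.
scraper = cloudscraper.create_scraper(browser={'browser': 'chrome', 'mobile': False})

# "https://example.com" is a placeholder URL used only for illustration.
response = scraper.get("https://example.com", timeout=10)
print(response.status_code)
print(response.text[:200])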
Example #1
Source File: requester_proxy.py From JAVOneStop with MIT License | 9 votes

def cloudflare_get(url, cookies={}, proxies=None):
    retry = 6
    from JavHelper.core.javlibrary import JavLibraryScraper

    while retry > 0:
        try:
            # update cloudflare cookies when updating
            cookies.update(JavLibraryScraper.load_local_cookies())
            res = cloudscraper.create_scraper().get(url, cookies=cookies, proxies=proxies)
            # print(res.text)
            return res
        # except cloudscraper.exceptions.CloudflareIUAMError:
        except Exception as e:
            print(f'cloudflare get failed on {e}, retrying')
            retry = retry - 1
            sleep(5)

    raise Exception(f'cloudflare get {url} failed')
Example #2
Source File: phi_download.py From ancient-text-restoration with Apache License 2.0 | 6 votes

def main():
    # Create structure
    os.makedirs(FLAGS.output, exist_ok=True)

    # Cloudflare scraper
    scraper = cloudscraper.create_scraper()

    # Download inscriptions
    with concurrent.futures.ThreadPoolExecutor(max_workers=FLAGS.connections) as executor:
        future_to_phi = (executor.submit(load_phi_id, text_i, FLAGS.timeout, FLAGS.output, scraper)
                         for text_i in range(1, FLAGS.max_phi_id))
        for future in tqdm(concurrent.futures.as_completed(future_to_phi), total=FLAGS.max_phi_id):
            try:
                future.result()
            except:
                pass
Example #3
Source File: bufferover.py From OneForAll with GNU General Public License v3.0 | 6 votes

def query(self):
    """
    Query the API for subdomains and match them against the target domain
    """
    # Bypass the Cloudflare check
    scraper = cloudscraper.create_scraper()
    scraper.proxies = self.get_proxy(self.source)
    url = self.addr + self.domain
    try:
        resp = scraper.get(url, timeout=self.timeout)
    except Exception as e:
        logger.log('ERROR', e.args)
        return
    if resp.status_code != 200:
        return
    subdomains = self.match_subdomains(self.domain, str(resp.json()))
    # Merge the subdomain search results
    self.subdomains = self.subdomains.union(subdomains)
Example #4
Source File: url.py From RSScrawler with MIT License | 6 votes

def get_urls_async(urls, configfile, dbfile, scraper=False):
    if not scraper:
        scraper = cloudscraper.create_scraper(browser={'browser': 'chrome', 'mobile': False})
    results = []

    def load_url(url):
        return get_url(url, configfile, dbfile, scraper)

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(load_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            future_to_url[future]
            try:
                results.append(future.result())
            except Exception:
                pass
    return [results, scraper]
Example #5
Source File: mangaChapterDownload.py From comic-dl with MIT License | 6 votes

def json_download(self, page_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate'
    }

    sess = requests.session()
    sess = cloudscraper.create_scraper(sess)

    search_url = "http://www.mangaeden.com/api/chapter/{0}/".format(page_id)
    connection = sess.get(search_url, headers=headers)

    if connection.status_code != 200:
        print("Whoops! Seems like I can't connect to website.")
        print("It's showing : %s" % connection)
        print("Run this script with the --verbose argument and report the issue along with log file on Github.")
        sys.exit(1)
    else:
        json_data = connection.content
        return json_data
Example #6
Source File: mangaChapters.py From comic-dl with MIT License | 6 votes

def json_download(self, chapter_id):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate'
    }

    sess = requests.session()
    sess = cloudscraper.create_scraper(sess)

    search_url = "http://www.mangaeden.com/api/manga/{0}/".format(chapter_id)
    connection = sess.get(search_url, headers=headers)

    if connection.status_code != 200:
        print("Whoops! Seems like I can't connect to website.")
        print("It's showing : %s" % connection)
        print("Run this script with the --verbose argument and report the issue along with log file on Github.")
        sys.exit(1)
    else:
        json_data = connection.content
        return json_data
Example #7
Source File: japscan.py From comic-dl with MIT License | 6 votes

def __init__(self, manga_url, download_directory, chapter_range, **kwargs):
    self.scraper = cloudscraper.create_scraper()
    conversion = kwargs.get("conversion")
    keep_files = kwargs.get("keep_files")
    self.logging = kwargs.get("log_flag")
    self.sorting = kwargs.get("sorting_order")
    self.manga_url = manga_url + '/'
    self.print_index = kwargs.get("print_index")

    if 'manga' in manga_url:
        self.comic_id = str(str(manga_url).split("/")[-1])
        self.full_series(comic_id=self.comic_id, sorting=self.sorting,
                         download_directory=download_directory, chapter_range=chapter_range,
                         conversion=conversion, keep_files=keep_files)

    if 'lecture-en-ligne' in manga_url:
        self.comic_id = str(str(manga_url).split("/")[-2])
        chapter_path = re.sub(re.compile(r'.*japscan.to'), '', str(self.manga_url))
        self.single_chapter(chapter_path, comic_id=self.comic_id,
                            download_directory=download_directory, scraper=self.scraper)
Example #8
Source File: threatcrowd.py From OneForAll with GNU General Public License v3.0 | 5 votes

def query(self):
    # Bypass the Cloudflare check
    scraper = cloudscraper.create_scraper()
    scraper.proxies = self.get_proxy(self.source)
    url = self.addr + self.domain
    try:
        resp = scraper.get(url, timeout=self.timeout)
    except Exception as e:
        logger.log('ERROR', e.args)
        return
    if resp.status_code != 200:
        return
    subdomains = self.match_subdomains(self.domain, str(resp.json()))
    # Merge the subdomain search results
    self.subdomains = self.subdomains.union(subdomains)
Example #9
Source File: crawler.py From lightnovel-crawler with Apache License 2.0 | 5 votes

def __init__(self):
    self._destroyed = False
    self.executor = futures.ThreadPoolExecutor(max_workers=2)

    # Initialize cloudscraper
    self.scraper = cloudscraper.create_scraper(
        browser={
            'browser': 'firefox',
            'mobile': False
        }
    )

    # Must resolve these fields inside `read_novel_info`
    self.novel_title = 'N/A'
    self.novel_author = 'N/A'
    self.novel_cover = None
    self.is_rtl = False

    # Each item must contain these keys:
    # `id` - 1 based index of the volume
    # `title` - the volume title (can be ignored)
    self.volumes = []

    # Each item must contain these keys:
    # `id` - 1 based index of the chapter
    # `title` - the title name
    # `volume` - the volume id of this chapter
    # `volume_title` - the volume title (can be ignored)
    # `url` - the link where to download the chapter
    self.chapters = []

    # Other stuff - not necessary to resolve from the crawler instance.
    self.home_url = ''
    self.novel_url = ''
    self.last_visited_url = None
# end def
Example #10
Source File: update_checker.py From lightnovel-crawler with Apache License 2.0 | 5 votes

def check_updates():
    try:
        logger.info('Checking latest version')
        pypi_short_url = 'http://bit.ly/2yYyFGd'
        scraper = cloudscraper.create_scraper()
        res = scraper.get(pypi_short_url, timeout=5)
        latest_version = res.json()['info']['version']
        if get_value() != latest_version:
            new_version_news(latest_version)
        # end if
    except Exception:
        logger.warn('Failed to check for update')
    # end try
# end def
Example #11
Source File: flhhkk_spider.py From Spiders with Apache License 2.0 | 5 votes

def __init__(self):
    self.scrapper = cloudscraper.create_scraper()
    super().__init__()
Example #12
Source File: mangaSearch.py From comic-dl with MIT License | 5 votes

def json_download(self, manga_language):
    print("Downloading The Latest Data Set...")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate'
    }

    sess = requests.session()
    sess = cloudscraper.create_scraper(sess)

    search_url = "http://www.mangaeden.com/api/list/{0}/".format(manga_language)
    connection = sess.get(search_url, headers=headers)

    if connection.status_code != 200:
        print("Whoops! Seems like I can't connect to website.")
        print("It's showing : %s" % connection)
        print("Run this script with the --verbose argument and report the issue along with log file on Github.")
        sys.exit(1)
    else:
        json_data = connection.content
        # print(json_data)
        try:
            # Let's save the JSON data
            with open("Manga_Eden_Data.json", "wb") as write_file:
                write_file.write(json_data)
        except Exception as WriteError:
            print("Couldn't make Cache : {0}".format(WriteError))
            pass
        return json_data
Example #13
Source File: Core.py From CurseBreaker with GNU General Public License v3.0 | 5 votes

def __init__(self):
    self.path = Path('Interface/AddOns')
    self.configPath = Path('WTF/CurseBreaker.json')
    self.cachePath = Path('WTF/CurseBreaker.cache')
    self.clientType = 'wow_retail'
    self.waCompanionVersion = 110
    self.config = None
    self.cfIDs = None
    self.cfDirs = None
    self.cfCache = {}
    self.wowiCache = {}
    self.checksumCache = {}
    self.scraper = cloudscraper.create_scraper()
Example #14
Source File: batoto.py From comic-dl with MIT License | 4 votes

def user_login(self, username, password, **kwargs):
    session_cookie = ""
    headers = kwargs.get("headers")
    if not headers:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'referer': 'https://bato.to/'
        }

    print("Getting Auth Token...")
    page_source, update_cookie = globalFunctions.GlobalFunctions().page_downloader(
        manga_url="https://bato.to/forums/index.php?app=core&module=global&section=login")
    soup_parse = page_source.find_all('input', {'type': 'hidden'})
    auth_token = str([x['value'] for x in soup_parse][0]).strip()

    payload = {
        'auth_key': auth_token,
        'ips_username': username,
        'ips_password': password,
        'rememberMe': '1'
    }

    sess = requests.session()
    sess = cloudscraper.create_scraper(sess)

    print('Trying To Log In...')
    connection = sess.post("https://bato.to/forums/index.php?app=core&module=global&section=login&do=process",
                           headers=headers, data=payload, cookies=kwargs.get("cookies"))

    if connection.status_code != 200:
        print("Whoops! Seems like I can't connect to website.")
        print("It's showing : %s" % connection)
        print("Run this script with the --verbose argument and report the issue along with log file on Github.")
        sys.exit(1)
    else:
        page_source = BeautifulSoup(connection.text.encode("utf-8"), "html.parser")
        if "logout" in str(page_source):
            print("Successfully Logged In!")
        else:
            print("Couldn't Log You In. Please Check Your Credentials Again!")

    session_cookie = sess.cookies
    return session_cookie