Python cloudscraper.create_scraper() Examples

The following are 14 code examples of cloudscraper.create_scraper(), drawn from open-source projects; the originating project and source file are listed above each example. You may also want to check out all available functions/classes of the module cloudscraper, or try the search function.
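As a starting point, here is a minimal, hedged sketch of typical usage (the URL below is only a placeholder): create_scraper() returns an object that behaves like a requests.Session, so the familiar get/post calls work unchanged while Cloudflare challenges are handled behind the scenes.

import cloudscraper

# create_scraper() returns a requests.Session subclass that transparently
# solves Cloudflare's anti-bot challenges when they appear.
scraper = cloudscraper.create_scraper()

# Use it like any requests session; https://example.com is only a placeholder.
response = scraper.get('https://example.com')
print(response.status_code)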
Example #1
Source File: requester_proxy.py    From JAVOneStop with MIT License
def cloudflare_get(url, cookies=None, proxies=None):
    # Avoid a shared mutable default argument between calls
    cookies = cookies if cookies is not None else {}
    retry = 6
    from JavHelper.core.javlibrary import JavLibraryScraper
    while retry > 0:
        try:
            cookies.update(JavLibraryScraper.load_local_cookies())  # refresh the Cloudflare cookies before each attempt
            res = cloudscraper.create_scraper().get(url, cookies=cookies, proxies=proxies)
            #print(res.text)
            return res
        #except cloudscraper.exceptions.CloudflareIUAMError:
        except Exception as e:
            print(f'cloudflare get failed on {e}, retrying')
            retry = retry - 1
            sleep(5)
    
    raise Exception(f'cloudflare get {url} failed') 
Example #2
Source File: phi_download.py    From ancient-text-restoration with Apache License 2.0
def main():
  # Create structure
  os.makedirs(FLAGS.output, exist_ok=True)

  # Cloudflare scraper
  scraper = cloudscraper.create_scraper()

  # Download inscriptions
  with concurrent.futures.ThreadPoolExecutor(max_workers=FLAGS.connections) as executor:
    future_to_phi = (executor.submit(load_phi_id, text_i, FLAGS.timeout, FLAGS.output, scraper) for text_i in
                     range(1, FLAGS.max_phi_id))
    for future in tqdm(concurrent.futures.as_completed(future_to_phi), total=FLAGS.max_phi_id):
      try:
        future.result()
      except Exception:
        # Ignore individual download failures and keep processing the rest
        pass
Example #3
Source File: bufferover.py    From OneForAll with GNU General Public License v3.0
def query(self):
        """
        向接口查询子域并做子域匹配
        """
        # 绕过cloudFlare验证
        scraper = cloudscraper.create_scraper()
        scraper.proxies = self.get_proxy(self.source)
        url = self.addr + self.domain
        try:
            resp = scraper.get(url, timeout=self.timeout)
        except Exception as e:
            logger.log('ERROR', e.args)
            return
        if resp.status_code != 200:
            return
        subdomains = self.match_subdomains(self.domain, str(resp.json()))
        # Merge the subdomain search results
        self.subdomains = self.subdomains.union(subdomains) 
Example #4
Source File: url.py    From RSScrawler with MIT License
def get_urls_async(urls, configfile, dbfile, scraper=False):
    if not scraper:
        scraper = cloudscraper.create_scraper(browser={'browser': 'chrome', 'mobile': False})
    results = []

    def load_url(url):
        return get_url(url, configfile, dbfile, scraper)

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(load_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                results.append(future.result())
            except Exception:
                pass
    return [results, scraper] 
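Example #4 passes a browser fingerprint to create_scraper(). A rough sketch of the same idea with a few more fields is shown below; the accepted keys ('browser', 'platform', 'mobile', ...) follow the cloudscraper README and may differ between versions, so treat them as an assumption to verify against your installed release.

import cloudscraper

# Request a desktop Chrome fingerprint on Windows; the exact key set is
# version-dependent, so check the documentation of your cloudscraper release.
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'windows',
        'mobile': False,
    }
)
response = scraper.get('https://example.com')  # placeholder URL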
Example #5
Source File: mangaChapterDownload.py    From comic-dl with MIT License
def json_download(self, page_id):
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        search_url = "http://www.mangaeden.com/api/chapter/{0}/".format(page_id)

        connection = sess.get(search_url, headers=headers)
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            json_data = connection.content

            return json_data 
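Examples #5, #6, #12 and #14 pass an existing requests session into create_scraper(), so headers, cookies and adapters configured beforehand carry over to the Cloudflare-aware scraper. A minimal sketch of that pattern follows; recent cloudscraper releases expose the session parameter as sess, which is assumed here, and the URL is a placeholder.

import requests
import cloudscraper

# Configure an ordinary session first (headers, cookies, proxies, adapters...).
session = requests.session()
session.headers.update({'Accept-Encoding': 'gzip, deflate'})

# Wrap it so Cloudflare challenges are handled while keeping the configuration.
scraper = cloudscraper.create_scraper(sess=session)
response = scraper.get('https://example.com')  # placeholder URL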
Example #6
Source File: mangaChapters.py    From comic-dl with MIT License
def json_download(self, chapter_id):
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        search_url = "http://www.mangaeden.com/api/manga/{0}/".format(chapter_id)

        connection = sess.get(search_url, headers=headers)
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            json_data = connection.content

            return json_data 
Example #7
Source File: japscan.py    From comic-dl with MIT License
def __init__(self, manga_url, download_directory, chapter_range, **kwargs):
        self.scraper = cloudscraper.create_scraper()
        conversion = kwargs.get("conversion")
        keep_files = kwargs.get("keep_files")
        self.logging = kwargs.get("log_flag")
        self.sorting = kwargs.get("sorting_order")
        self.manga_url = manga_url + '/'
        self.print_index = kwargs.get("print_index")

        if 'manga' in manga_url:
            self.comic_id = str(str(manga_url).split("/")[-1])
            self.full_series(comic_id=self.comic_id, sorting=self.sorting, download_directory=download_directory,
                             chapter_range=chapter_range, conversion=conversion, keep_files=keep_files)

        if 'lecture-en-ligne' in manga_url:
            self.comic_id = str(str(manga_url).split("/")[-2])
            chapter_path = re.sub(re.compile(r'.*japscan.to'), '', str(self.manga_url))
            self.single_chapter(chapter_path, comic_id=self.comic_id, download_directory=download_directory,
                                scraper=self.scraper)
Example #8
Source File: threatcrowd.py    From OneForAll with GNU General Public License v3.0
def query(self):
        # Bypass the Cloudflare check
        scraper = cloudscraper.create_scraper()
        scraper.proxies = self.get_proxy(self.source)
        url = self.addr + self.domain
        try:
            resp = scraper.get(url, timeout=self.timeout)
        except Exception as e:
            logger.log('ERROR', e.args)
            return
        if resp.status_code != 200:
            return
        subdomains = self.match_subdomains(self.domain, str(resp.json()))
        # Merge the subdomain search results
        self.subdomains = self.subdomains.union(subdomains) 
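Because the object returned by create_scraper() behaves like a requests.Session, session-level settings such as proxies and timeouts can be applied directly, which is what Examples #3 and #8 do via get_proxy(). A short standalone sketch of the same idea; the proxy address and URL below are placeholders.

import cloudscraper

scraper = cloudscraper.create_scraper()

# Session-level proxy configuration; 127.0.0.1:8080 is only a placeholder.
scraper.proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
response = scraper.get('https://example.com', timeout=10)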
Example #9
Source File: crawler.py    From lightnovel-crawler with Apache License 2.0
def __init__(self):
        self._destroyed = False
        self.executor = futures.ThreadPoolExecutor(max_workers=2)

        # Initialize cloudscraper
        self.scraper = cloudscraper.create_scraper(
            browser={
                'browser': 'firefox',
                'mobile': False
            }
        )

        # Must resolve these fields inside `read_novel_info`
        self.novel_title = 'N/A'
        self.novel_author = 'N/A'
        self.novel_cover = None
        self.is_rtl = False

        # Each item must contain these keys:
        # `id` - 1 based index of the volume
        # `title` - the volume title (can be ignored)
        self.volumes = []

        # Each item must contain these keys:
        # `id` - 1 based index of the chapter
        # `title` - the title name
        # `volume` - the volume id of this chapter
        # `volume_title` - the volume title (can be ignored)
        # `url` - the link where to download the chapter
        self.chapters = []

        # Other stuff - not necessary to resolve from the crawler instance.
        self.home_url = ''
        self.novel_url = ''
        self.last_visited_url = None
    # end def 
Example #10
Source File: update_checker.py    From lightnovel-crawler with Apache License 2.0
def check_updates():
    try:
        logger.info('Checking latest version')
        pypi_short_url = 'http://bit.ly/2yYyFGd'
        scraper = cloudscraper.create_scraper()
        res = scraper.get(pypi_short_url, timeout=5)
        latest_version = res.json()['info']['version']
        if get_value() != latest_version:
            new_version_news(latest_version)
        # end if
    except Exception:
        logger.warn('Failed to check for update')
    # end try
# end def 
Example #11
Source File: flhhkk_spider.py    From Spiders with Apache License 2.0
def __init__(self):
        self.scrapper = cloudscraper.create_scraper()
        super().__init__() 
Example #12
Source File: mangaSearch.py    From comic-dl with MIT License
def json_download(self, manga_language):
        print("Downloading The Latest Data Set...")
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        search_url = "http://www.mangaeden.com/api/list/{0}/".format(manga_language)

        connection = sess.get(search_url, headers=headers)
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            json_data = connection.content
            # print(json_data)
            try:
                # Let's save the JSON data
                with open("Manga_Eden_Data.json", "wb") as write_file:
                    write_file.write(json_data)
            except Exception as WriteError:
                print("Couldn't make Cache : {0}".format(WriteError))
                pass

            return json_data 
Example #13
Source File: Core.py    From CurseBreaker with GNU General Public License v3.0
def __init__(self):
        self.path = Path('Interface/AddOns')
        self.configPath = Path('WTF/CurseBreaker.json')
        self.cachePath = Path('WTF/CurseBreaker.cache')
        self.clientType = 'wow_retail'
        self.waCompanionVersion = 110
        self.config = None
        self.cfIDs = None
        self.cfDirs = None
        self.cfCache = {}
        self.wowiCache = {}
        self.checksumCache = {}
        self.scraper = cloudscraper.create_scraper() 
Example #14
Source File: batoto.py    From comic-dl with MIT License
def user_login(self, username, password, **kwargs):
        session_cookie = ""

        headers = kwargs.get("headers")
        if not headers:
            headers = {
                'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate',
                'referer': 'https://bato.to/'
            }
        print("Getting Auth Token...")
        page_source, update_cookie = globalFunctions.GlobalFunctions().page_downloader(
            manga_url="https://bato.to/forums/index.php?app=core&module=global&section=login")

        soup_parse = page_source.find_all('input', {'type': 'hidden'})
        auth_token = str([x['value'] for x in soup_parse][0]).strip()

        payload = {
            'auth_key': auth_token,
            'ips_username': username,
            'ips_password': password,
            'rememberMe': '1'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        print('Trying To Log In...')
        connection = sess.post("https://bato.to/forums/index.php?app=core&module=global&section=login&do=process",
                               headers=headers, data=payload, cookies=kwargs.get("cookies"))
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            page_source = BeautifulSoup(connection.text.encode("utf-8"), "html.parser")
            if "logout" in str(page_source):
                print("Successfully Logged In!")
            else:
                print("Couldn't Log You In. Please Check Your Credentials Again!")
            session_cookie = sess.cookies

        return session_cookie