Python cloudscraper.create_scraper() Examples

The following are 14 code examples of cloudscraper.create_scraper(), drawn from open-source projects; the originating project and source file are listed above each example. You may also want to check out all available functions/classes of the module cloudscraper, or try the search function.
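As a starting point, here is a minimal, hedged sketch of typical usage (the URL below is only a placeholder): create_scraper() returns an object that behaves like a requests.Session, so the familiar get/post calls work unchanged while Cloudflare challenges are handled behind the scenes.

import cloudscraper

# create_scraper() returns a requests.Session subclass that transparently
# solves Cloudflare's anti-bot challenges when they appear.
scraper = cloudscraper.create_scraper()

# Use it like any requests session; https://example.com is only a placeholder.
response = scraper.get('https://example.com')
print(response.status_code)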
Example #1
Source File: requester_proxy.py    From JAVOneStop with MIT License
def cloudflare_get(url, cookies=None, proxies=None):
    # Avoid a shared mutable default argument between calls
    cookies = cookies if cookies is not None else {}
    retry = 6
    from JavHelper.core.javlibrary import JavLibraryScraper
    while retry > 0:
        try:
            cookies.update(JavLibraryScraper.load_local_cookies())  # refresh the Cloudflare cookies before each attempt
            res = cloudscraper.create_scraper().get(url, cookies=cookies, proxies=proxies)
            #print(res.text)
            return res
        #except cloudscraper.exceptions.CloudflareIUAMError:
        except Exception as e:
            print(f'cloudflare get failed on {e}, retrying')
            retry = retry - 1
            sleep(5)
    
    raise Exception(f'cloudflare get {url} failed') 
Example #2
Source File: phi_download.py    From ancient-text-restoration with Apache License 2.0
def main():
  # Create structure
  os.makedirs(FLAGS.output, exist_ok=True)

  # Cloudflare scraper
  scraper = cloudscraper.create_scraper()

  # Download inscriptions
  with concurrent.futures.ThreadPoolExecutor(max_workers=FLAGS.connections) as executor:
    future_to_phi = (executor.submit(load_phi_id, text_i, FLAGS.timeout, FLAGS.output, scraper) for text_i in
                     range(1, FLAGS.max_phi_id))
    for future in tqdm(concurrent.futures.as_completed(future_to_phi), total=FLAGS.max_phi_id):
      try:
        future.result()
      except Exception:
        # Ignore individual download failures and keep processing the rest
        pass
Example #3
Source File: bufferover.py    From OneForAll with GNU General Public License v3.0
def query(self):
        """
        向接口查询子域并做子域匹配
        """
        # 绕过cloudFlare验证
        scraper = cloudscraper.create_scraper()
        scraper.proxies = self.get_proxy(self.source)
        url = self.addr + self.domain
        try:
            resp = scraper.get(url, timeout=self.timeout)
        except Exception as e:
            logger.log('ERROR', e.args)
            return
        if resp.status_code != 200:
            return
        subdomains = self.match_subdomains(self.domain, str(resp.json()))
        # Merge the subdomain search results
        self.subdomains = self.subdomains.union(subdomains) 
Example #4
Source File: url.py    From RSScrawler with MIT License
def get_urls_async(urls, configfile, dbfile, scraper=False):
    if not scraper:
        scraper = cloudscraper.create_scraper(browser={'browser': 'chrome', 'mobile': False})
    results = []

    def load_url(url):
        return get_url(url, configfile, dbfile, scraper)

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(load_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                results.append(future.result())
            except Exception:
                pass
    return [results, scraper] 
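Example #4 passes a browser fingerprint to create_scraper(). A rough sketch of the same idea with a few more fields is shown below; the accepted keys ('browser', 'platform', 'mobile', ...) follow the cloudscraper README and may differ between versions, so treat them as an assumption to verify against your installed release.

import cloudscraper

# Request a desktop Chrome fingerprint on Windows; the exact key set is
# version-dependent, so check the documentation of your cloudscraper release.
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'windows',
        'mobile': False,
    }
)
response = scraper.get('https://example.com')  # placeholder URL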
Example #5
Source File: mangaChapterDownload.py    From comic-dl with MIT License
def json_download(self, page_id):
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        search_url = "http://www.mangaeden.com/api/chapter/{0}/".format(page_id)

        connection = sess.get(search_url, headers=headers)
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            json_data = connection.content

            return json_data 
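Examples #5, #6, #12 and #14 pass an existing requests session into create_scraper(), so headers, cookies and adapters configured beforehand carry over to the Cloudflare-aware scraper. A minimal sketch of that pattern follows; recent cloudscraper releases expose the session parameter as sess, which is assumed here, and the URL is a placeholder.

import requests
import cloudscraper

# Configure an ordinary session first (headers, cookies, proxies, adapters...).
session = requests.session()
session.headers.update({'Accept-Encoding': 'gzip, deflate'})

# Wrap it so Cloudflare challenges are handled while keeping the configuration.
scraper = cloudscraper.create_scraper(sess=session)
response = scraper.get('https://example.com')  # placeholder URL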
Example #6
Source File: mangaChapters.py    From comic-dl with MIT License
def json_download(self, chapter_id):
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        search_url = "http://www.mangaeden.com/api/manga/{0}/".format(chapter_id)

        connection = sess.get(search_url, headers=headers)
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            json_data = connection.content

            return json_data 
Example #7
Source File: japscan.py    From comic-dl with MIT License
def __init__(self, manga_url, download_directory, chapter_range, **kwargs):
        self.scraper = cloudscraper.create_scraper()
        conversion = kwargs.get("conversion")
        keep_files = kwargs.get("keep_files")
        self.logging = kwargs.get("log_flag")
        self.sorting = kwargs.get("sorting_order")
        self.manga_url = manga_url + '/'
        self.print_index = kwargs.get("print_index")

        if 'manga' in manga_url:
            self.comic_id = str(str(manga_url).split("/")[-1])
            self.full_series(comic_id=self.comic_id, sorting=self.sorting, download_directory=download_directory,
                             chapter_range=chapter_range, conversion=conversion, keep_files=keep_files)

        if 'lecture-en-ligne' in manga_url:
            self.comic_id = str(str(manga_url).split("/")[-2])
            chapter_path = re.sub(re.compile(r'.*japscan.to'), '', str(self.manga_url))
            self.single_chapter(chapter_path, comic_id=self.comic_id, download_directory=download_directory,
                                scraper=self.scraper)
Example #8
Source File: threatcrowd.py    From OneForAll with GNU General Public License v3.0
def query(self):
        # Bypass the Cloudflare check
        scraper = cloudscraper.create_scraper()
        scraper.proxies = self.get_proxy(self.source)
        url = self.addr + self.domain
        try:
            resp = scraper.get(url, timeout=self.timeout)
        except Exception as e:
            logger.log('ERROR', e.args)
            return
        if resp.status_code != 200:
            return
        subdomains = self.match_subdomains(self.domain, str(resp.json()))
        # Merge the subdomain search results
        self.subdomains = self.subdomains.union(subdomains) 
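Because the object returned by create_scraper() behaves like a requests.Session, session-level settings such as proxies and timeouts can be applied directly, which is what Examples #3 and #8 do via get_proxy(). A short standalone sketch of the same idea; the proxy address and URL below are placeholders.

import cloudscraper

scraper = cloudscraper.create_scraper()

# Session-level proxy configuration; 127.0.0.1:8080 is only a placeholder.
scraper.proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
response = scraper.get('https://example.com', timeout=10)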
Example #9
Source File: crawler.py    From lightnovel-crawler with Apache License 2.0
def __init__(self):
        self._destroyed = False
        self.executor = futures.ThreadPoolExecutor(max_workers=2)

        # Initialize cloudscraper
        self.scraper = cloudscraper.create_scraper(
            browser={
                'browser': 'firefox',
                'mobile': False
            }
        )

        # Must resolve these fields inside `read_novel_info`
        self.novel_title = 'N/A'
        self.novel_author = 'N/A'
        self.novel_cover = None
        self.is_rtl = False

        # Each item must contain these keys:
        # `id` - 1 based index of the volume
        # `title` - the volume title (can be ignored)
        self.volumes = []

        # Each item must contain these keys:
        # `id` - 1 based index of the chapter
        # `title` - the title name
        # `volume` - the volume id of this chapter
        # `volume_title` - the volume title (can be ignored)
        # `url` - the link where to download the chapter
        self.chapters = []

        # Other stuff - not necessary to resolve from the crawler instance.
        self.home_url = ''
        self.novel_url = ''
        self.last_visited_url = None
    # end def 
Example #10
Source File: update_checker.py    From lightnovel-crawler with Apache License 2.0
def check_updates():
    try:
        logger.info('Checking latest version')
        pypi_short_url = 'http://bit.ly/2yYyFGd'
        scraper = cloudscraper.create_scraper()
        res = scraper.get(pypi_short_url, timeout=5)
        latest_version = res.json()['info']['version']
        if get_value() != latest_version:
            new_version_news(latest_version)
        # end if
    except Exception:
        logger.warn('Failed to check for update')
    # end try
# end def 
Example #11
Source File: flhhkk_spider.py    From Spiders with Apache License 2.0
def __init__(self):
        self.scrapper = cloudscraper.create_scraper()
        super().__init__() 
Example #12
Source File: mangaSearch.py    From comic-dl with MIT License
def json_download(self, manga_language):
        print("Downloading The Latest Data Set...")
        headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        search_url = "http://www.mangaeden.com/api/list/{0}/".format(manga_language)

        connection = sess.get(search_url, headers=headers)
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            json_data = connection.content
            # print(json_data)
            try:
                # Let's save the JSON data
                with open("Manga_Eden_Data.json", "wb") as write_file:
                    write_file.write(json_data)
            except Exception as WriteError:
                print("Couldn't make Cache : {0}".format(WriteError))
                pass

            return json_data 
Example #13
Source File: Core.py    From CurseBreaker with GNU General Public License v3.0
def __init__(self):
        self.path = Path('Interface/AddOns')
        self.configPath = Path('WTF/CurseBreaker.json')
        self.cachePath = Path('WTF/CurseBreaker.cache')
        self.clientType = 'wow_retail'
        self.waCompanionVersion = 110
        self.config = None
        self.cfIDs = None
        self.cfDirs = None
        self.cfCache = {}
        self.wowiCache = {}
        self.checksumCache = {}
        self.scraper = cloudscraper.create_scraper() 
Example #14
Source File: batoto.py    From comic-dl with MIT License
def user_login(self, username, password, **kwargs):
        session_cookie = ""

        headers = kwargs.get("headers")
        if not headers:
            headers = {
                'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate',
                'referer': 'https://bato.to/'
            }
        print("Getting Auth Token...")
        page_source, update_cookie = globalFunctions.GlobalFunctions().page_downloader(
            manga_url="https://bato.to/forums/index.php?app=core&module=global&section=login")

        soup_parse = page_source.find_all('input', {'type': 'hidden'})
        auth_token = str([x['value'] for x in soup_parse][0]).strip()

        payload = {
            'auth_key': auth_token,
            'ips_username': username,
            'ips_password': password,
            'rememberMe': '1'
        }

        sess = requests.session()
        sess = cloudscraper.create_scraper(sess)

        print('Trying To Log In...')
        connection = sess.post("https://bato.to/forums/index.php?app=core&module=global&section=login&do=process",
                               headers=headers, data=payload, cookies=kwargs.get("cookies"))
        if connection.status_code != 200:
            print("Whoops! Seems like I can't connect to website.")
            print("It's showing : %s" % connection)
            print("Run this script with the --verbose argument and report the issue along with log file on Github.")
            sys.exit(1)
        else:
            page_source = BeautifulSoup(connection.text.encode("utf-8"), "html.parser")
            if "logout" in str(page_source):
                print("Successfully Logged In!")
            else:
                print("Couldn't Log You In. Please Check Your Credentials Again!")
            session_cookie = sess.cookies

        return session_cookie