Python requests_html.HTMLSession() Examples
The following are 21 code examples of requests_html.HTMLSession(), collected from open-source projects. Each example notes its source file, project, and license, so you can follow those references back to the original code. You may also want to check out the other functions and classes available in the requests_html module.
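Before the project examples, here is a minimal sketch of the pattern most of them share: create a session, fetch a page, and query the parsed HTML. The URL and selector below are placeholders for illustration, not taken from any example on this page.

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com')           # plain HTTP GET, as with requests
title = r.html.find('title', first=True).text    # CSS-selector lookup on the parsed page
links = r.html.absolute_links                    # set of absolute URLs found on the page
session.close()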
Example #1
Source File: facebook_scraper.py From facebook-scraper with MIT License | 7 votes |
def __init__(self, session=None, requests_kwargs=None):
    if session is None:
        session = HTMLSession()
        session.headers.update(self.default_headers)

    if requests_kwargs is None:
        requests_kwargs = {}

    self.session = session
    self.requests_kwargs = requests_kwargs
Example #2
Source File: instagram.py From EagleEye with Do What The F*ck You Want To Public License | 7 votes |
def getLinks(self):
    session = HTMLSession()
    r = session.get('https://instagram.com/' + self.username)

    l = r.html.find('body > script:nth-child(5)')[0].text
    json_str = l[21:]
    json_str = json_str[:-1]
    json_parsed = json.loads(json_str)

    shortcodes = []
    try:
        images = json_parsed['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
        for image in images:
            node = image['node']
            shortcode = node['shortcode']
            shortcodes.append(shortcode)

        links = []
        for sc in shortcodes:
            r = session.get('https://instagram.com/p/' + sc + '/?taken-by=' + self.username)
            img = r.html.find('meta[property="og:image"]')
            if len(img) > 0:
                img = img[0]
                links.append(img.attrs['content'])

        return links
    except:
        return []
Example #3
Source File: auth.py From codechef-cli with GNU General Public License v3.0 | 6 votes |
def make_login_req(username, password, disconnect_sessions):
    with HTMLSession() as session:
        set_session_cookies(session)

        resp = request(session=session)
        token = get_csrf_token(resp.html, CSRF_TOKEN_INPUT_ID)
        if not token:
            return [{'data': CSRF_TOKEN_MISSING, 'code': 500}]

        data = {
            'name': username,
            'pass': password,
            'form_id': LOGIN_FORM_ID[1:],
            'csrfToken': token
        }
        resp = request(session=session, method='POST', data=data)
        resp_html = resp.html

        if resp.status_code == 200:
            if resp_html.find(SESSION_LIMIT_FORM_ID):
                if disconnect_sessions:
                    resps = disconnect_active_sessions(session, resp_html)
                    save_session_cookies(session, username)
                    return resps
                else:
                    logout(session=session)
                    return [{'data': SESSION_LIMIT_MSG, 'code': 400}]
            elif resp_html.find(LOGOUT_BUTTON_CLASS):
                save_session_cookies(session, username)
                return [{'data': LOGIN_SUCCESS_MSG}]
            return [{'data': INCORRECT_CREDS_MSG, 'code': 400}]

        return [{'code': 503}]
Example #4
Source File: crawler.py From administrative-divisions-of-China-on-Python with GNU General Public License v3.0 | 6 votes |
def __init__(self):
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Accept-Encoding': ''
    }
    """ Request headers """
    self._session = HTMLSession()
    """ HTMLSession object """
Example #5
Source File: test_config.py From wikipron with Apache License 2.0 | 6 votes |
def test_american_english_dialect_selection():
    # Pick a word for which Wiktionary has dialect-specified pronunciations
    # for both US and non-US English.
    word = "mocha"
    html_session = requests_html.HTMLSession()
    response = html_session.get(_PAGE_TEMPLATE.format(word=word))
    # Construct two configs to demonstrate the US dialect (non-)selection.
    config_only_us = config_factory(key="en", dialect="US | American English")
    config_any_dialect = config_factory(key="en")
    # Apply each config's XPath selector.
    results_only_us = response.html.xpath(config_only_us.pron_xpath_selector)
    results_any_dialect = response.html.xpath(
        config_any_dialect.pron_xpath_selector
    )
    assert (
        len(results_any_dialect)  # containing both US and non-US results
        > len(results_only_us)    # containing only the US result
        > 0
    )
Example #6
Source File: helpers.py From codechef-cli with GNU General Public License v3.0 | 5 votes |
def get_session():
    session = HTMLSession()
    if os.path.exists(COOKIES_FILE_PATH):
        set_session_cookies(session)
        session.cookies.load(ignore_discard=True, ignore_expires=True)
    return session
Example #7
Source File: imicrobe.py From grabseqs with MIT License | 5 votes |
def get_imicrobe_acc_metadata(pacc):
    """
    Function to get list of iMicrobe sample accession numbers from a particular project.
    Takes project accession number `pacc` and returns a list of iMicrobe accession numbers.
    """
    # Check accession format
    pacc = pacc.lower()
    if pacc.startswith("p"):
        pacc = pacc[1:]
    elif pacc.startswith("s"):
        return [pacc]
    else:
        raise(Exception("iMicrobe accession numbers should be prefixed with 'p' (project) or 's' (sample)"))

    # Grab sample info
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/projects/' + pacc)
    r.html.render(sleep=1)

    sample_list = []
    for l in r.html.element("a"):
        i = l.items()
        try:
            if i[0][1].startswith("#/samples/"):
                sample_list.append(i[0][1][10:])  # add sample ID only
        except IndexError:
            continue
    session.close()

    # Format and return sample accession numbers
    return ["s" + sID for sID in sample_list]
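The render() call above, which also appears in several later examples, executes the page's JavaScript in a headless browser before the HTML is inspected; on first use, requests_html typically downloads a Chromium build for this. A rough sketch of the keyword arguments used in these examples, with a placeholder URL (my reading of the parameters, not code from the original project):

session = HTMLSession()
r = session.get('https://example.com/js-heavy-page')  # placeholder URL
# sleep waits after the initial render; scrolldown scrolls the page to trigger lazy loading
r.html.render(sleep=1, scrolldown=4)
session.close()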
Example #8
Source File: channels.py From telegram with MIT License | 5 votes |
def extract_html(url, javascript_enabled=False):
    session = HTMLSession()
    response = session.get(url)
    if javascript_enabled:
        response.html.render()
        source_html = response.html.html
        return source_html
    else:
        return response.html.html


# method to parse the HTML from the Lyzem page
Example #9
Source File: live_recorder.py From bilibili-live-recorder with MIT License | 5 votes |
def __init__(self, cid, output_name='opt.mp4'):
    self.cid = cid
    self.api_url = 'http://api.live.bilibili.com/api/playurl?device=phone&platform=ios&scale=3&build=10000&' \
                   'cid={}&otype=json&platform=h5'.format(cid)
    self.output_dir = os.path.join(os.getcwd(), 'files')
    self._s = requests_html.HTMLSession()
Example #10
Source File: worker.py From scylla with Apache License 2.0 | 5 votes |
def __init__(self):
    """Initialize the worker object
    """
    self.session = HTMLSession()
Example #11
Source File: scrape.py From wikipron with Apache License 2.0 | 5 votes |
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word, config.no_skip_spaces_word) or _skip_date(
            date, config.cut_off_date
        ):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            yield word, pron
Example #12
Source File: spider.py From rxivist with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self):
    self.connection = db.Connection(config.db["host"], config.db["db"], config.db["user"], config.db["password"])
    self.session = HTMLSession(mock_browser=False)
    self.session.headers['User-Agent'] = config.user_agent
    self.log = Logger()
Example #13
Source File: check_music.py From snippet with MIT License | 5 votes |
def __init__(self):
    self._session = HTMLSession()
Example #14
Source File: conftest.py From kube-web-view with GNU General Public License v3.0 | 5 votes |
def session(populated_cluster):
    url = populated_cluster["url"].rstrip("/")
    s = HTMLSession()

    def new_request(prefix, f, method, url, *args, **kwargs):
        return f(method, prefix + url, *args, **kwargs)

    s.request = partial(new_request, url, s.request)
    return s
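A short note on the design (my reading of the fixture above, not from the original project): wrapping s.request with functools.partial lets test code pass paths relative to the cluster URL, which the wrapper prefixes before delegating to the real request method. A hypothetical test using it might look like:

def test_clusters_page(session):
    # "/clusters" is a made-up path; the wrapper rewrites it to <cluster url> + "/clusters"
    r = session.get("/clusters")
    assert r.status_code == 200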
Example #15
Source File: search.py From SQL-scanner with MIT License | 5 votes |
def find_links(self):
    session = HTMLSession()
    session.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'

    url = self.base_url + self.parameters.format(self.query)

    while self.is_alive:
        try:
            html = session.get(url).html
        except:
            break

        for r in html.find('.b_algo'):
            a = r.find('h2', first=True).find('a', first=True)
            try:
                link = a.attrs['href']
            except:
                continue
            if self.is_valid(link):
                self.links.put(link)

        next_page = self.next_page(html)
        if not next_page:
            break
        url = next_page

    with self.lock:
        self.is_searching = False
Example #16
Source File: instagram_scraper.py From instagram-scraper with MIT License | 5 votes |
def scrape_instagram_tag(tag: str, total_count: int = 50, existing: set = None):
    """ Scrape and yield recently tagged instagram photos. """
    if existing is None:
        existing = set()
    url = f'https://www.instagram.com/explore/tags/{tag}'
    session = HTMLSession()
    req = session.get(url)
    imgs = set(existing)
    count = 0
    page = 0
    while count <= total_count:
        req.html.render(scrolldown=page)
        images = req.html.xpath('//img[@alt]')
        page += 1
        for image in images:
            if count > total_count:
                break
            try:
                url, caption = image.attrs['src'], image.attrs['alt']
            except:
                pass
            else:
                if url in imgs:
                    continue
                imgs.add(url)
                hashtags = set(REGEXES['hashtag'].findall(caption))
                mentions = set(REGEXES['username'].findall(caption))
                count += 1
                yield url, caption, hashtags, mentions
Example #17
Source File: mtc.py From crypto51 with GNU General Public License v3.0 | 5 votes |
def __init__(self):
    self._session = HTMLSession()
Example #18
Source File: test_helpers.py From codechef-cli with GNU General Public License v3.0 | 5 votes |
def test_get_session_no_cookies(self):
    """Should return requests_html.HTMLSession instance"""
    fake_logout()
    session = get_session()
    self.assertIsInstance(session, HTMLSession)
    self.assertEqual(len(session.cookies), 0)
Example #19
Source File: test_helpers.py From codechef-cli with GNU General Public License v3.0 | 5 votes |
def test_get_session_cookies(self):
    """Should return requests_html.HTMLSession instance preloaded with cookies"""
    fake_login()
    session = get_session()
    self.assertIsInstance(session, HTMLSession)
    self.assertTrue(len(session.cookies) > 0)
Example #20
Source File: list.py From terraenv with MIT License | 4 votes |
def list_remote(args):
    program = args.program
    """ lists terraform/terragrunt versions """

    if program == "terraform":
        session = HTMLSession()
        terraform_url = session.get(
            "https://releases.hashicorp.com/terraform/")
        unstable_releases = '-'
        data = terraform_url.html.links
        data = filter(lambda x: program in x, data)
        data = filter(lambda x: unstable_releases not in x, data)
        available_versions = ['']
        for d in data:
            version = d.split('/')[2]
            available_versions.append(version)
        available_versions.remove('')
        available_versions.sort(key=StrictVersion)
        if args.commands in validate_versions_commands:
            return available_versions
        for version in available_versions:
            print(version)

    elif program == "terragrunt":
        session = HTMLSession()
        terragrunt_url = session.get(
            "https://api.github.com/repos/gruntwork-io/terragrunt/tags?per_page=1000")
        data = terragrunt_url.html.full_text
        parsed_json = (json.loads(data))
        available_versions = ['']
        for version in parsed_json:
            available_versions.append(version['name'].lstrip('v'))
        available_versions.remove('')
        available_versions.sort(key=StrictVersion)
        if args.commands in validate_versions_commands:
            return available_versions
        for version in available_versions:
            print(version)

    else:
        raise Exception(
            'Invalid argument! It should be either terraform / terragrunt')
Example #21
Source File: imicrobe.py From grabseqs with MIT License | 4 votes |
def _parse_imicrobe_readpath_metadata(acc, download_metadata, metadata_agg):
    """
    Helper function to parse sample download paths from a sample page.
    Takes an `acc` with no prefix. Returns a dictionary with download paths for
    one or two reads like: {1:"url"} or {1:"url1", 2:"url2"}.
    Also returns aggregated metadata.
    """
    acc = str(acc)
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/samples/' + acc)
    r.html.render(scrolldown=4, sleep=4)
    file_links = list(r.html.links)

    # Find one or two links immediately followed by a "Reads" column (or equivalent)
    reads_colnames = ["Reads FASTQ", "Reads", "FASTQ", "upload.fastq"]
    for c in reads_colnames:
        hits = [m.start() for m in re.finditer("<td>" + c + "</td>", r.html.html)]
        if len(hits) > 0:
            break

    link_indices = []
    working_file_links = []
    for l in file_links:
        try:
            link_indices.append(r.html.html.index('"' + l + '"'))
            working_file_links.append(l)
        except ValueError:  # sometimes they are formatted differently (if added by the project owner?)
            continue

    read_links = {}
    for j in range(len(hits)):
        read_links[j + 1] = working_file_links[_closest_below_index(link_indices, hits[j])].replace(
            "http://datacommons.cyverse.org/browse", "https://de.cyverse.org/anon-files")

    if download_metadata:
        html_str = str(r.html.html)
        relevant_section = html_str[html_str.index("<h2>Attributes"):html_str.index("<h2>Files")]
        table_only = relevant_section[relevant_section.index("<tbody>") + 7:relevant_section.index("</tbody>")].replace(',', ';')
        formatted_table = table_only.replace('</tr><tr>', '\n').replace('</td><td>', ',').replace('<tr>', '').replace('<td>', '').replace('</tr>', '').replace('</td>', '')
        listed_table = [z.split(',') for z in formatted_table.split('\n')]
        transposed_table = [[z[0] for z in listed_table], [z[1] for z in listed_table]]
        formatted_table = ','.join(transposed_table[0]) + '\n' + ','.join(transposed_table[1])
        if type(metadata_agg) == type(None):
            metadata_agg = pd.read_csv(StringIO(formatted_table))
        else:
            metadata_agg = metadata_agg.append(pd.read_csv(StringIO(formatted_table)), sort=True)

    return read_links, metadata_agg