Python requests_html.HTMLSession() Examples
The following are 21 code examples of requests_html.HTMLSession(), collected from open-source projects. Each example notes its source file, project, and license, so you can follow those references back to the original code. You may also want to check out the other functions and classes available in the requests_html module.
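Before the project examples, here is a minimal sketch of the pattern most of them share: create a session, fetch a page, and query the parsed HTML. The URL and selector below are placeholders for illustration, not taken from any example on this page.

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com')           # plain HTTP GET, as with requests
title = r.html.find('title', first=True).text    # CSS-selector lookup on the parsed page
links = r.html.absolute_links                    # set of absolute URLs found on the page
session.close()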
Example #1
Source File: facebook_scraper.py From facebook-scraper with MIT License | 7 votes |
def __init__(self, session=None, requests_kwargs=None):
    if session is None:
        session = HTMLSession()
        session.headers.update(self.default_headers)

    if requests_kwargs is None:
        requests_kwargs = {}

    self.session = session
    self.requests_kwargs = requests_kwargs
Example #2
Source File: instagram.py From EagleEye with Do What The F*ck You Want To Public License | 7 votes |
def getLinks(self):
    session = HTMLSession()
    r = session.get('https://instagram.com/' + self.username)

    l = r.html.find('body > script:nth-child(5)')[0].text
    json_str = l[21:]
    json_str = json_str[:-1]
    json_parsed = json.loads(json_str)

    shortcodes = []
    try:
        images = json_parsed['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']
        for image in images:
            node = image['node']
            shortcode = node['shortcode']
            shortcodes.append(shortcode)

        links = []
        for sc in shortcodes:
            r = session.get('https://instagram.com/p/' + sc + '/?taken-by=' + self.username)
            img = r.html.find('meta[property="og:image"]')
            if len(img) > 0:
                img = img[0]
                links.append(img.attrs['content'])

        return links
    except:
        return []
Example #3
Source File: auth.py From codechef-cli with GNU General Public License v3.0 | 6 votes |
def make_login_req(username, password, disconnect_sessions):
    with HTMLSession() as session:
        set_session_cookies(session)

        resp = request(session=session)
        token = get_csrf_token(resp.html, CSRF_TOKEN_INPUT_ID)
        if not token:
            return [{'data': CSRF_TOKEN_MISSING, 'code': 500}]

        data = {
            'name': username,
            'pass': password,
            'form_id': LOGIN_FORM_ID[1:],
            'csrfToken': token
        }
        resp = request(session=session, method='POST', data=data)
        resp_html = resp.html

        if resp.status_code == 200:
            if resp_html.find(SESSION_LIMIT_FORM_ID):
                if disconnect_sessions:
                    resps = disconnect_active_sessions(session, resp_html)
                    save_session_cookies(session, username)
                    return resps
                else:
                    logout(session=session)
                    return [{'data': SESSION_LIMIT_MSG, 'code': 400}]
            elif resp_html.find(LOGOUT_BUTTON_CLASS):
                save_session_cookies(session, username)
                return [{'data': LOGIN_SUCCESS_MSG}]
            return [{'data': INCORRECT_CREDS_MSG, 'code': 400}]

        return [{'code': 503}]
Example #4
Source File: crawler.py From administrative-divisions-of-China-on-Python with GNU General Public License v3.0 | 6 votes |
def __init__(self):
    self._headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Accept-Encoding': ''
    }
    """ Request headers """
    self._session = HTMLSession()
    """ HTMLSession object """
Example #5
Source File: test_config.py From wikipron with Apache License 2.0 | 6 votes |
def test_american_english_dialect_selection():
    # Pick a word for which Wiktionary has dialect-specified pronunciations
    # for both US and non-US English.
    word = "mocha"
    html_session = requests_html.HTMLSession()
    response = html_session.get(_PAGE_TEMPLATE.format(word=word))
    # Construct two configs to demonstrate the US dialect (non-)selection.
    config_only_us = config_factory(key="en", dialect="US | American English")
    config_any_dialect = config_factory(key="en")
    # Apply each config's XPath selector.
    results_only_us = response.html.xpath(config_only_us.pron_xpath_selector)
    results_any_dialect = response.html.xpath(
        config_any_dialect.pron_xpath_selector
    )
    assert (
        len(results_any_dialect)  # containing both US and non-US results
        > len(results_only_us)    # containing only the US result
        > 0
    )
Example #6
Source File: helpers.py From codechef-cli with GNU General Public License v3.0 | 5 votes |
def get_session():
    session = HTMLSession()
    if os.path.exists(COOKIES_FILE_PATH):
        set_session_cookies(session)
        session.cookies.load(ignore_discard=True, ignore_expires=True)
    return session
Example #7
Source File: imicrobe.py From grabseqs with MIT License | 5 votes |
def get_imicrobe_acc_metadata(pacc):
    """
    Function to get list of iMicrobe sample accession numbers from a particular project.
    Takes project accession number `pacc` and returns a list of iMicrobe accession numbers.
    """
    # Check accession format
    pacc = pacc.lower()
    if pacc.startswith("p"):
        pacc = pacc[1:]
    elif pacc.startswith("s"):
        return [pacc]
    else:
        raise(Exception("iMicrobe accession numbers should be prefixed with 'p' (project) or 's' (sample)"))

    # Grab sample info
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/projects/' + pacc)
    r.html.render(sleep=1)

    sample_list = []
    for l in r.html.element("a"):
        i = l.items()
        try:
            if i[0][1].startswith("#/samples/"):
                sample_list.append(i[0][1][10:])  # add sample ID only
        except IndexError:
            continue
    session.close()

    # Format and return sample accession numbers
    return ["s" + sID for sID in sample_list]
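The render() call above, which also appears in several later examples, executes the page's JavaScript in a headless browser before the HTML is inspected; on first use, requests_html typically downloads a Chromium build for this. A rough sketch of the keyword arguments used in these examples, with a placeholder URL (my reading of the parameters, not code from the original project):

session = HTMLSession()
r = session.get('https://example.com/js-heavy-page')  # placeholder URL
# sleep waits after the initial render; scrolldown scrolls the page to trigger lazy loading
r.html.render(sleep=1, scrolldown=4)
session.close()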
Example #8
Source File: channels.py From telegram with MIT License | 5 votes |
def extract_html(url, javascript_enabled=False):
    session = HTMLSession()
    response = session.get(url)
    if javascript_enabled:
        response.html.render()
        source_html = response.html.html
        return source_html
    else:
        return response.html.html


# method to parse the HTML from the Lyzem page
Example #9
Source File: live_recorder.py From bilibili-live-recorder with MIT License | 5 votes |
def __init__(self, cid, output_name='opt.mp4'):
    self.cid = cid
    self.api_url = 'http://api.live.bilibili.com/api/playurl?device=phone&platform=ios&scale=3&build=10000&' \
                   'cid={}&otype=json&platform=h5'.format(cid)
    self.output_dir = os.path.join(os.getcwd(), 'files')
    self._s = requests_html.HTMLSession()
Example #10
Source File: worker.py From scylla with Apache License 2.0 | 5 votes |
def __init__(self):
    """Initialize the worker object
    """
    self.session = HTMLSession()
Example #11
Source File: scrape.py From wikipron with Apache License 2.0 | 5 votes |
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word, config.no_skip_spaces_word) or _skip_date(
            date, config.cut_off_date
        ):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            yield word, pron
Example #12
Source File: spider.py From rxivist with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self):
    self.connection = db.Connection(config.db["host"], config.db["db"], config.db["user"], config.db["password"])
    self.session = HTMLSession(mock_browser=False)
    self.session.headers['User-Agent'] = config.user_agent
    self.log = Logger()
Example #13
Source File: check_music.py From snippet with MIT License | 5 votes |
def __init__(self):
    self._session = HTMLSession()
Example #14
Source File: conftest.py From kube-web-view with GNU General Public License v3.0 | 5 votes |
def session(populated_cluster):
    url = populated_cluster["url"].rstrip("/")
    s = HTMLSession()

    def new_request(prefix, f, method, url, *args, **kwargs):
        return f(method, prefix + url, *args, **kwargs)

    s.request = partial(new_request, url, s.request)
    return s
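A short note on the design (my reading of the fixture above, not from the original project): wrapping s.request with functools.partial lets test code pass paths relative to the cluster URL, which the wrapper prefixes before delegating to the real request method. A hypothetical test using it might look like:

def test_clusters_page(session):
    # "/clusters" is a made-up path; the wrapper rewrites it to <cluster url> + "/clusters"
    r = session.get("/clusters")
    assert r.status_code == 200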
Example #15
Source File: search.py From SQL-scanner with MIT License | 5 votes |
def find_links(self):
    session = HTMLSession()
    session.headers['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'

    url = self.base_url + self.parameters.format(self.query)

    while self.is_alive:
        try:
            html = session.get(url).html
        except:
            break

        for r in html.find('.b_algo'):
            a = r.find('h2', first=True).find('a', first=True)
            try:
                link = a.attrs['href']
            except:
                continue
            if self.is_valid(link):
                self.links.put(link)

        next_page = self.next_page(html)
        if not next_page:
            break
        url = next_page

    with self.lock:
        self.is_searching = False
Example #16
Source File: instagram_scraper.py From instagram-scraper with MIT License | 5 votes |
def scrape_instagram_tag(tag: str, total_count: int = 50, existing: set = None):
    """ Scrape and yield recently tagged instagram photos. """
    if existing is None:
        existing = set()
    url = f'https://www.instagram.com/explore/tags/{tag}'
    session = HTMLSession()
    req = session.get(url)
    imgs = set(existing)
    count = 0
    page = 0
    while count <= total_count:
        req.html.render(scrolldown=page)
        images = req.html.xpath('//img[@alt]')
        page += 1
        for image in images:
            if count > total_count:
                break
            try:
                url, caption = image.attrs['src'], image.attrs['alt']
            except:
                pass
            else:
                if url in imgs:
                    continue
                imgs.add(url)
                hashtags = set(REGEXES['hashtag'].findall(caption))
                mentions = set(REGEXES['username'].findall(caption))
                count += 1
                yield url, caption, hashtags, mentions
Example #17
Source File: mtc.py From crypto51 with GNU General Public License v3.0 | 5 votes |
def __init__(self):
    self._session = HTMLSession()
Example #18
Source File: test_helpers.py From codechef-cli with GNU General Public License v3.0 | 5 votes |
def test_get_session_no_cookies(self):
    """Should return requests_html.HTMLSession instance"""
    fake_logout()
    session = get_session()
    self.assertIsInstance(session, HTMLSession)
    self.assertEqual(len(session.cookies), 0)
Example #19
Source File: test_helpers.py From codechef-cli with GNU General Public License v3.0 | 5 votes |
def test_get_session_cookies(self):
    """Should return requests_html.HTMLSession instance preloaded with cookies"""
    fake_login()
    session = get_session()
    self.assertIsInstance(session, HTMLSession)
    self.assertTrue(len(session.cookies) > 0)
Example #20
Source File: list.py From terraenv with MIT License | 4 votes |
def list_remote(args):
    program = args.program
    """ lists terraform/terragrunt versions """

    if program == "terraform":
        session = HTMLSession()
        terraform_url = session.get(
            "https://releases.hashicorp.com/terraform/")
        unstable_releases = '-'
        data = terraform_url.html.links
        data = filter(lambda x: program in x, data)
        data = filter(lambda x: unstable_releases not in x, data)
        available_versions = ['']
        for d in data:
            version = d.split('/')[2]
            available_versions.append(version)
        available_versions.remove('')
        available_versions.sort(key=StrictVersion)
        if args.commands in validate_versions_commands:
            return available_versions
        for version in available_versions:
            print(version)

    elif program == "terragrunt":
        session = HTMLSession()
        terragrunt_url = session.get(
            "https://api.github.com/repos/gruntwork-io/terragrunt/tags?per_page=1000")
        data = terragrunt_url.html.full_text
        parsed_json = (json.loads(data))
        available_versions = ['']
        for version in parsed_json:
            available_versions.append(version['name'].lstrip('v'))
        available_versions.remove('')
        available_versions.sort(key=StrictVersion)
        if args.commands in validate_versions_commands:
            return available_versions
        for version in available_versions:
            print(version)

    else:
        raise Exception(
            'Invalid argument! It should be either terraform / terragrunt')
Example #21
Source File: imicrobe.py From grabseqs with MIT License | 4 votes |
def _parse_imicrobe_readpath_metadata(acc, download_metadata, metadata_agg):
    """
    Helper function to parse sample download paths from a sample page.
    Takes an `acc` with no prefix. Returns a dictionary with download paths for
    one or two reads like: {1:"url"} or {1:"url1", 2:"url2"}.
    Also returns aggregated metadata.
    """
    acc = str(acc)
    session = HTMLSession()
    r = session.get('https://www.imicrobe.us/#/samples/' + acc)
    r.html.render(scrolldown=4, sleep=4)
    file_links = list(r.html.links)

    # Find one or two links immediately followed by a "Reads" column (or equivalent)
    reads_colnames = ["Reads FASTQ", "Reads", "FASTQ", "upload.fastq"]
    for c in reads_colnames:
        hits = [m.start() for m in re.finditer("<td>" + c + "</td>", r.html.html)]
        if len(hits) > 0:
            break

    link_indices = []
    working_file_links = []
    for l in file_links:
        try:
            link_indices.append(r.html.html.index('"' + l + '"'))
            working_file_links.append(l)
        except ValueError:  # sometimes they are formatted differently (if added by the project owner?)
            continue

    read_links = {}
    for j in range(len(hits)):
        read_links[j + 1] = working_file_links[_closest_below_index(link_indices, hits[j])].replace(
            "http://datacommons.cyverse.org/browse", "https://de.cyverse.org/anon-files")

    if download_metadata:
        html_str = str(r.html.html)
        relevant_section = html_str[html_str.index("<h2>Attributes"):html_str.index("<h2>Files")]
        table_only = relevant_section[relevant_section.index("<tbody>") + 7:relevant_section.index("</tbody>")].replace(',', ';')
        formatted_table = table_only.replace('</tr><tr>', '\n').replace('</td><td>', ',').replace('<tr>', '').replace('<td>', '').replace('</tr>', '').replace('</td>', '')
        listed_table = [z.split(',') for z in formatted_table.split('\n')]
        transposed_table = [[z[0] for z in listed_table], [z[1] for z in listed_table]]
        formatted_table = ','.join(transposed_table[0]) + '\n' + ','.join(transposed_table[1])
        if type(metadata_agg) == type(None):
            metadata_agg = pd.read_csv(StringIO(formatted_table))
        else:
            metadata_agg = metadata_agg.append(pd.read_csv(StringIO(formatted_table)), sort=True)

    return read_links, metadata_agg