Python lxml.html.fromstring() Examples

The following are 30 code examples of lxml.html.fromstring(), drawn from open-source projects. Each example is preceded by a note of its original project, source file, and license. You may also want to check out the other available functions and classes of the lxml.html module.
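Before the project examples, here is a minimal, self-contained sketch of the typical fromstring() workflow: parse an HTML string, then query the resulting element tree with XPath or CSS selectors. The sample markup and variable names are illustrative only, and CSS selectors additionally require the cssselect package:

from lxml import html

# Parse an HTML string (str or bytes) into an element tree
doc = html.fromstring('<div><a href="/a">first</a><a href="/b">second</a></div>')

# Query with XPath...
hrefs = doc.xpath('//a/@href')  # ['/a', '/b']

# ...or with CSS selectors
texts = [a.text_content() for a in doc.cssselect('a')]  # ['first', 'second']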
Example #1
Source File: pyhoroscope.py    From Horoscope-API with MIT License
def get_yearly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/yearly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        year = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        year = year.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        horoscope_data = {
            'year': year,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return horoscope_data
Example #2
Source File: dic.py    From yui with GNU Affero General Public License v3.0
def parse(html: str) -> Tuple[Optional[str], List[Attachment]]:
    h = fromstring(html)
    meta = h.cssselect('meta[http-equiv=Refresh]')
    if meta:
        return fix_url(meta[0].get('content')[7:]), []
    else:
        words = h.cssselect('div.search_type')

        attachments: List[Attachment] = []

        for word in words:
            w = word.cssselect('.txt_searchword')[0]
            attachments.append(
                Attachment(
                    title=strip_tags(w.text_content()),
                    title_link=fix_url(w.get('href')),
                    text=fix_blank(
                        word.cssselect('.list_search')[0].text_content()
                    ),
                )
            )

        return None, attachments 
Example #3
Source File: animal.py    From yui with GNU Affero General Public License v3.0
async def get_cat_image_url(timeout: float) -> str:
    api_url = 'http://thecatapi.com/api/images/get'
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                async with session.get(
                    api_url, params={'format': 'xml', 'type': 'jpg,png'}
                ) as res:
                    if res.status != 200:
                        raise APIServerError
                    xml_result = await res.read()
                    tree = etree.fromstring(xml_result)
                    url = tree.find('data/images/image/url').text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                await asyncio.sleep(0.1)
                continue
            try:
                async with async_timeout.timeout(timeout=timeout):
                    async with session.get(url) as res:
                        async with res:
                            if res.status == 200:
                                return url
            except (aiohttp.ClientConnectorError, asyncio.TimeoutError):
                continue 
Example #4
Source File: get_data.py    From X-ray-classification with MIT License
def main():
    for url in url_list:
        try:
            r = requests.get(url)
        except requests.RequestException:
            continue
        tree = html.fromstring(r.text)

        script = tree.xpath('//script[@language="javascript"]/text()')[0]

        json_string = regex.findall(script)[0]
        json_data = json.loads(json_string)

        next_page_url = tree.xpath('//footer/a/@href')

        links = [domain + x['nodeRef'] for x in json_data]
        for link in links:
            extract(link) 
Example #5
Source File: receiving_mail.py    From django-email-gateway with GNU General Public License v3.0
def sns_notification(body):
    json_body = body.decode('utf8')
    js = json.loads(json_body.replace('\n', ''))
    if js["Type"] == "Notification":
        arg_info = js["Message"]
        arg_info = json.loads(arg_info)
        content = arg_info['content']
        subject = arg_info['mail']['commonHeaders']['subject']
        html_content = content.partition('Content-Type: text/html; charset=UTF-8')[2]
        if 'Content-Transfer-Encoding' in html_content:
            html_content = html_content.partition('Content-Transfer-Encoding: quoted-printable')[2]
        text = html_content.replace('\r\n', '')
        table = html.fromstring(text)
        content = ''
        for item in table:
            if item.text:
                content += item.text.strip()
        mail_content = str(content)
        from_mail = arg_info['mail']['source']
        to_mail = arg_info['mail']['destination'][0]
        hash_code = arg_info['mail']['destination'][0].split('@')[0]
        return subject, from_mail, to_mail, hash_code, mail_content 
Example #6
Source File: reportloader.py    From nhlscrapi with Apache License 2.0
def html_doc(self):
        """
        :returns: the lxml processed html document
        :rtype: ``lxml.html.document_fromstring`` output
        """
        
        if self.__lx_doc is None:
            cn = NHLCn()
          
            if hasattr(cn, self.report_type):
                html = getattr(cn, self.report_type)(self.game_key)
            else:
                raise ValueError('Invalid report type: %s' % self.report_type)
          
            if cn.req_err is None:
                self.__lx_doc = fromstring(html)
            else:
                self.req_err = cn.req_err
            
        return self.__lx_doc 
Example #7
Source File: get_image_gevent.py    From girl-atlas-crawler with BSD 2-Clause "Simplified" License
def get_page_urls():

    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []

    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')

        if not next_url:
            break

        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)

    print "get_page_urls done!!!"

    return page_urls

# Get the URL of each girl album
Example #8
Source File: get_image.py    From girl-atlas-crawler with BSD 2-Clause "Simplified" License
def get_image_urls(girl_urls):

    girl_list = []
    
    for url in girl_urls:
        # print "in get_image_urls" + url[0]
        response = get_response(url)
        parsed_body = html.fromstring(response.text)

        # Album title
        girl_title = parsed_body.xpath('//title/text()')
        image_urls = parsed_body.xpath('//li[@class="slide "]/img/@src | //li[@class="slide "]/img/@delay')

        girl_dict = {girl_title[0] : image_urls}
        girl_list.append(girl_dict)
        
    print "get_girl_urls done!!!"
    return girl_list

# Start downloading the images
Example #9
Source File: populate.py    From phageParser with MIT License
def addpositionstodict(gendict):
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # convert to a pandas DataFrame, then to a numpy array, then drop the title rows
        arrtable = pandas.read_html(strtable)[0].to_numpy()[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict 
Example #10
Source File: get_image.py    From girl-atlas-crawler with BSD 2-Clause "Simplified" License
def get_page_urls():

    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []

    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        # Use XPath to extract the URL of the next page
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')

        if not next_url:
            break

        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)

    print "get_page_urls done!!!"

    return page_urls

# Get the URL of each girl album
Example #11
Source File: pyhoroscope.py    From Horoscope-API with MIT License
def get_monthly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/monthly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        month = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        month = month.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()[1]"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        horoscope_data = {
            'month': month,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return horoscope_data
Example #12
Source File: pyhoroscope.py    From Horoscope-API with MIT License
def get_weekly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/weekly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        week = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        week = week.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        horoscope_data = {
            'week': week,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return horoscope_data
Example #13
Source File: imdbutils.py    From python-plexlibrary with BSD 3-Clause "New" or "Revised" License
def _handle_request(self, url):
        """Stolen from Automated IMDB Top 250 Plex library script
           by /u/SwiftPanda16
        """
        r = requests.get(url)
        tree = html.fromstring(r.content)

        # Parallel lists of the IMDB Top 250 titles, years and ids, in chart order
        titles = tree.xpath("//table[contains(@class, 'chart')]"
                            "//td[@class='titleColumn']/a/text()")
        years = tree.xpath("//table[contains(@class, 'chart')]"
                           "//td[@class='titleColumn']/span/text()")
        ids = tree.xpath("//table[contains(@class, 'chart')]"
                         "//td[@class='ratingColumn']/div//@data-titleid")

        return ids, titles, years 
Example #14
Source File: utils.py    From PlayStoreLinks_Bot with MIT License
def get_text_from_markdown(markdown_text):
	renderer = HtmlRenderer()
	markdown = Markdown(renderer, extensions=('tables', 'autolink', 'strikethrough', 'quote', 'superscript', 'fenced-code'))
	html = markdown(markdown_text)
	parsed_html = fromstring(html)
	
	# remove quoted text
	for quote in parsed_html.xpath('//blockquote'):
		quote.getparent().remove(quote)
	
	# remove automatically added links 
	for link in parsed_html.xpath('//a'):
		if link.text_content() == link.get('href'):
			link.getparent().remove(link)
	
	text = parsed_html.text_content().strip()
	return text

# https://stackoverflow.com/a/3155023 
Example #15
Source File: token_repository.py    From safe-relay-service with MIT License
def __token_info_fallback(self, token_address):
        """
        Get token info using ArthurStandardToken interface
        :param token_address:
        :return:
        """
        page = requests.get(
            'https://etherscan.io/readContract?v=0xb9469430eabcbfa77005cd3ad4276ce96bd221e3&a=' + token_address)
        tree = html.fromstring(page.content)
        return {
            "address": token_address,
            "name": tree.xpath(
                '//a[contains(text(), "name")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip(),
            "symbol": tree.xpath(
                '//a[contains(text(), "symbol")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip(),
            "decimals": int(tree.xpath(
                '//a[contains(text(), "decimals")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip())
        } 
Example #16
Source File: wallhaven.py    From daily-wallpaper with MIT License
def resolve_url(self):
        try:
            r = requests.get(URL)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('figure')
                if results:
                    wallpaper_id = results[0].get('data-wallpaper-id')
                    wallpaper_image = 'wallhaven-{0}.jpg'.format(
                        wallpaper_id)
                    self._url = 'https://w.wallhaven.cc/full/{}/{}'.format(
                        wallpaper_id[0:2], wallpaper_image)
                    return True
        except Exception:
            pass
        return False 
Example #17
Source File: vokrugsveta.py    From daily-wallpaper with MIT License
def resolve_url(self):
        url = URL.format('photo_of_the_day/')
        try:
            r = requests.get(url)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('a.article__pic')
                url = URL.format(results[0].get('href'))
                r = requests.get(url, stream=True)
                if r.status_code == 200:
                    doc = fromstring(r.text)
                    results = doc.cssselect('img')
                    for result in results:
                        possible = result.get('src')
                        if re.match(r'/img/bx/iblock/.*\.jpg$',
                                    possible.lower()):
                            self._url = URL.format(possible[1:])
                            return True
        except Exception as e:
            print(e)
        return False 
Example #18
Source File: instance_type_scraper.py    From cloudformation-environmentbase with BSD 2-Clause "Simplified" License
def get_page(url):
    html = urlopen(url).read()
    dom = fromstring(html)
    dom.make_links_absolute(url)
    return dom 
Example #19
Source File: testtools.py    From RSSNewsGAE with Apache License 2.0
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data) 
Example #20
Source File: Facebook.py    From ModLogin with MIT License
def get_name_element(self, login_html_str):
        try:
            login_attempt_html = html.fromstring(login_html_str)
            # Define a page element that only appears if the login is
            # successful
            logged_in_name_element = login_attempt_html.xpath(
              '//*[@id="u_0_1"]/div[1]/div[1]/div/a/span'
            )
            return str(logged_in_name_element[0].text_content()).strip()
        except Exception as e:
            print "Debug: Unable to successfully parse name element: " + str(e)
        return '' 
Example #21
Source File: token_repository.py    From safe-relay-service with MIT License
def __token_website_fallback(self, token_address):
        url = 'https://etherscan.io/token/' + token_address
        logger.debug('Falling back for token with address=%s, url=%s', token_address, url)
        page = requests.get(url)
        tree = html.fromstring(page.content)
        website = tree.xpath('//tr[@id="ContentPlaceHolder1_tr_officialsite_1"]/td/a/text()')
        return website[0].strip() if website else '' 
Example #22
Source File: token_repository.py    From safe-relay-service with MIT License
def __pull_token_info(self, page_number: int = 1) -> List[Any]:
        tokens = []
        page = requests.get('https://etherscan.io/tokens?p=' + str(page_number))
        tree = html.fromstring(page.content)

        token_data = tree.xpath('//div[@id="ContentPlaceHolder1_divresult"]/table/tbody/tr')
        for element in token_data:
            link = element.xpath('td[@align="center"]/a/@href')[0]
            token_address = to_checksum_address(link[7:])
            desc = element.xpath('td/small/font/text()')
            token_request = requests.get(
                "https://raw.githubusercontent.com/ethereum-lists/tokens/master/tokens/eth/" + token_address + ".json")
            if token_request.status_code == 200:
                data = token_request.json()
            else:
                logger.info("Not info for token %s, using fallback source", token_address)
                data = self.__token_info_fallback(token_address)

            if data:
                if not data.get('website'):
                    data['website'] = self.__token_website_fallback(token_address)
                data.setdefault('description', desc[0] if desc else '')
                tokens.append(data)
            else:
                logger.warning("Token info not found for token %s", token_address)

        return tokens 
Example #23
Source File: utils.py    From ImageScraper with GNU General Public License v3.0
def get_img_list(self):
        """ Gets list of images from the page_html. """
        tree = html.fromstring(self.page_html)
        img = tree.xpath('//img/@src')
        links = tree.xpath('//a/@href')
        img_list = self.process_links(img)
        img_links = self.process_links(links)
        img_list.extend(img_links)

        if self.filename_pattern:
            # Compile pattern for efficiency
            pattern = re.compile(self.filename_pattern)

            # Verifies filename in the image URL matches pattern
            def matches_pattern(img_url):
                """ Function to check if pattern is matched. """

                img_filename = urlparse(img_url).path.split('/')[-1]
                return pattern.search(img_filename)

            images = [urljoin(self.url, img_url) for img_url in img_list
                      if matches_pattern(img_url)]
        else:
            images = [urljoin(self.url, img_url) for img_url in img_list]

        images = list(set(images))
        self.images = images
        if self.scrape_reverse:
            self.images.reverse()
        return self.images 
Example #24
Source File: test_frontend.py    From spkrepo with MIT License
def test_post_generate_api_key_developer(self):
        with self.logged_user("developer", api_key=None):
            response = self.client.post(
                url_for("frontend.profile"), data=dict(), follow_redirects=True
            )
            self.assert200(response)
            html = fromstring(response.data.decode(response.charset))
            self.assertTrue(html.forms[0].fields["api_key"] != "") 
Example #25
Source File: test_frontend.py    From spkrepo with MIT License
def test_get_no_api_key_by_default(self):
        with self.logged_user("developer", api_key=None):
            response = self.client.get(url_for("frontend.profile"))
            html = fromstring(response.data.decode(response.charset))
            self.assertTrue(html.forms[0].fields["api_key"] == "") 
Example #26
Source File: util.py    From oadoi with MIT License
def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print u"not parsing, beause etree error in get_tree: {}".format(e)
        tree = None
    return tree 
Example #27
Source File: testtools.py    From lambda-packs with MIT License
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data) 
Example #28
Source File: response.py    From pledgeservice with Apache License 2.0
def lxml(self):
        """
        Returns the response as an `lxml object
        <http://codespeak.net/lxml/>`_.  You must have lxml installed
        to use this.

        If this is an HTML response and you have lxml 2.x installed,
        then an ``lxml.html.HTML`` object will be returned; if you
        have an earlier version of lxml then a ``lxml.HTML`` object
        will be returned.
        """
        if 'html' not in self.content_type and \
           'xml' not in self.content_type:
            raise AttributeError(
                "Not an XML or HTML response body (content-type: %s)"
                % self.content_type)
        try:
            from lxml import etree
        except ImportError:  # pragma: no cover
            raise ImportError(
                "You must have lxml installed to use response.lxml")
        try:
            from lxml.html import fromstring
        except ImportError:  # pragma: no cover
            fromstring = etree.HTML
        ## FIXME: would be nice to set xml:base, in some fashion
        if self.content_type == 'text/html':
            return fromstring(self.testbody, base_url=self.request.url)
        else:
            return etree.XML(self.testbody, base_url=self.request.url) 
Example #29
Source File: Reddit.py    From ModLogin with MIT License
def get_handle_element(self, login_html_str):
        try:
            login_attempt_html = html.fromstring(login_html_str)
            # Define a page element that only appears if the login is
            # successful
            logged_in_handle_element = login_attempt_html.xpath(
                '//*[@id="header-bottom-right"]/span[1]/a'
            )
            return str(logged_in_handle_element[0].text_content())
        except Exception as e:
            print "Debug: Unable to successfully parse handle element: " + \
                  str(e)
        return '' 
Example #30
Source File: 26_stock_scraper.py    From python-scripts with MIT License
def get_stocks(url):
    # Make Request
    page = requests.get(url)
    # Parse/Scrape
    tree = html.fromstring(page.text)
    xpath = '//*[@id="mw-content-text"]/table[1]'
    rows = tree.xpath(xpath)[0].findall("tr")
    rows = [(row.getchildren()[0], row.getchildren()[3]) for row in rows[1:]]
    rows = [(row[0].getchildren()[0].text, row[1].text) for row in rows]
    industries = defaultdict(list)
    for row in rows:
        industries[row[1]].append(row[0])
    return industries