Python lxml.html.fromstring() Examples

The following are 30 code examples of lxml.html.fromstring(), drawn from open-source projects. Each example is preceded by a note of its original project, source file, and license. You may also want to check out the other available functions and classes of the lxml.html module.
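Before the project examples, here is a minimal, self-contained sketch of the typical fromstring() workflow: parse an HTML string, then query the resulting element tree with XPath or CSS selectors. The sample markup and variable names are illustrative only, and CSS selectors additionally require the cssselect package:

from lxml import html

# Parse an HTML string (str or bytes) into an element tree
doc = html.fromstring('<div><a href="/a">first</a><a href="/b">second</a></div>')

# Query with XPath...
hrefs = doc.xpath('//a/@href')  # ['/a', '/b']

# ...or with CSS selectors
texts = [a.text_content() for a in doc.cssselect('a')]  # ['first', 'second']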
Example #1
Source File: pyhoroscope.py    From Horoscope-API with MIT License
def get_yearly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/yearly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        year = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        year = year.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        horoscope_data = {
            'year': year,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return horoscope_data
Example #2
Source File: dic.py    From yui with GNU Affero General Public License v3.0
def parse(html: str) -> Tuple[Optional[str], List[Attachment]]:
    h = fromstring(html)
    meta = h.cssselect('meta[http-equiv=Refresh]')
    if meta:
        return fix_url(meta[0].get('content')[7:]), []
    else:
        words = h.cssselect('div.search_type')

        attachments: List[Attachment] = []

        for word in words:
            w = word.cssselect('.txt_searchword')[0]
            attachments.append(
                Attachment(
                    title=strip_tags(w.text_content()),
                    title_link=fix_url(w.get('href')),
                    text=fix_blank(
                        word.cssselect('.list_search')[0].text_content()
                    ),
                )
            )

        return None, attachments 
Example #3
Source File: animal.py    From yui with GNU Affero General Public License v3.0
async def get_cat_image_url(timeout: float) -> str:
    api_url = 'http://thecatapi.com/api/images/get'
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                async with session.get(
                    api_url, params={'format': 'xml', 'type': 'jpg,png'}
                ) as res:
                    if res.status != 200:
                        raise APIServerError
                    xml_result = await res.read()
                    tree = etree.fromstring(xml_result)
                    url = tree.find('data/images/image/url').text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                await asyncio.sleep(0.1)
                continue
            try:
                async with async_timeout.timeout(timeout=timeout):
                    async with session.get(url) as res:
                        async with res:
                            if res.status == 200:
                                return url
            except (aiohttp.ClientConnectorError, asyncio.TimeoutError):
                continue 
Example #4
Source File: get_data.py    From X-ray-classification with MIT License
def main():
    for url in url_list:
        try:
            r = requests.get(url)
        except requests.RequestException:
            continue
        tree = html.fromstring(r.text)

        script = tree.xpath('//script[@language="javascript"]/text()')[0]

        json_string = regex.findall(script)[0]
        json_data = json.loads(json_string)

        next_page_url = tree.xpath('//footer/a/@href')

        links = [domain + x['nodeRef'] for x in json_data]
        for link in links:
            extract(link) 
Example #5
Source File: receiving_mail.py    From django-email-gateway with GNU General Public License v3.0
def sns_notification(body):
    json_body = body.decode('utf8')
    js = json.loads(json_body.replace('\n', ''))
    if js["Type"] == "Notification":
        arg_info = js["Message"]
        arg_info = json.loads(arg_info)
        content = arg_info['content']
        subject = arg_info['mail']['commonHeaders']['subject']
        html_content = content.partition('Content-Type: text/html; charset=UTF-8')[2]
        if 'Content-Transfer-Encoding' in html_content:
            html_content = html_content.partition('Content-Transfer-Encoding: quoted-printable')[2]
        text = html_content.replace('\r\n', '')
        table = html.fromstring(text)
        content = ''
        for item in table:
            if item.text:
                content += item.text.strip()
        mail_content = str(content)
        from_mail = arg_info['mail']['source']
        to_mail = arg_info['mail']['destination'][0]
        hash_code = arg_info['mail']['destination'][0].split('@')[0]
        return subject, from_mail, to_mail, hash_code, mail_content 
Example #6
Source File: reportloader.py    From nhlscrapi with Apache License 2.0
def html_doc(self):
        """
        :returns: the lxml processed html document
        :rtype: ``lxml.html.document_fromstring`` output
        """
        
        if self.__lx_doc is None:
            cn = NHLCn()
          
            if hasattr(cn, self.report_type):
                html = getattr(cn, self.report_type)(self.game_key)
            else:
                raise ValueError('Invalid report type: %s' % self.report_type)
          
            if cn.req_err is None:
                self.__lx_doc = fromstring(html)
            else:
                self.req_err = cn.req_err
            
        return self.__lx_doc 
Example #7
Source File: get_image_gevent.py    From girl-atlas-crawler with BSD 2-Clause "Simplified" License
def get_page_urls():

    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []

    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')

        if not next_url:
            break

        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)

    print "get_page_urls done!!!"

    return page_urls

# Get the URL of each girl album
Example #8
Source File: get_image.py    From girl-atlas-crawler with BSD 2-Clause "Simplified" License
def get_image_urls(girl_urls):

    girl_list = []
    
    for url in girl_urls:
        # print "in get_image_urls" + url[0]
        response = get_response(url)
        parsed_body = html.fromstring(response.text)

        # Album title
        girl_title = parsed_body.xpath('//title/text()')
        image_urls = parsed_body.xpath('//li[@class="slide "]/img/@src | //li[@class="slide "]/img/@delay')

        girl_dict = {girl_title[0] : image_urls}
        girl_list.append(girl_dict)
        
    print "get_girl_urls done!!!"
    return girl_list

# Start downloading the images
Example #9
Source File: populate.py    From phageParser with MIT License
def addpositionstodict(gendict):
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # convert to a pandas DataFrame, then to a numpy array, then drop the title rows
        arrtable = pandas.read_html(strtable)[0].to_numpy()[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict 
Example #10
Source File: get_image.py    From girl-atlas-crawler with BSD 2-Clause "Simplified" License
def get_page_urls():

    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []

    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        # Use XPath to extract the URL of the next page
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')

        if not next_url:
            break

        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)

    print "get_page_urls done!!!"

    return page_urls

# Get the URL of each girl album
Example #11
Source File: pyhoroscope.py    From Horoscope-API with MIT License
def get_monthly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/monthly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        month = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        month = month.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()[1]"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        horoscope_data = {
            'month': month,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return horoscope_data
Example #12
Source File: pyhoroscope.py    From Horoscope-API with MIT License
def get_weekly_horoscope(sunsign):
        url = "http://www.ganeshaspeaks.com/horoscopes/weekly-horoscope/" + sunsign
        response = requests.get(url)
        tree = html.fromstring(response.content)
        week = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
        week = week.replace("']", "").replace("['", "")
        horoscope = str(tree.xpath(
            "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
        horoscope = horoscope.replace("\\n", "").replace("  ", "").replace("']", "").replace("['", "")
        horoscope_data = {
            'week': week,
            'horoscope': horoscope,
            'sunsign': sunsign
        }

        return horoscope_data
Example #13
Source File: imdbutils.py    From python-plexlibrary with BSD 3-Clause "New" or "Revised" License
def _handle_request(self, url):
        """Stolen from Automated IMDB Top 250 Plex library script
           by /u/SwiftPanda16
        """
        r = requests.get(url)
        tree = html.fromstring(r.content)

        # Parallel lists of the IMDB Top 250 titles, years and ids, in chart order
        titles = tree.xpath("//table[contains(@class, 'chart')]"
                            "//td[@class='titleColumn']/a/text()")
        years = tree.xpath("//table[contains(@class, 'chart')]"
                           "//td[@class='titleColumn']/span/text()")
        ids = tree.xpath("//table[contains(@class, 'chart')]"
                         "//td[@class='ratingColumn']/div//@data-titleid")

        return ids, titles, years 
Example #14
Source File: utils.py    From PlayStoreLinks_Bot with MIT License
def get_text_from_markdown(markdown_text):
	renderer = HtmlRenderer()
	markdown = Markdown(renderer, extensions=('tables', 'autolink', 'strikethrough', 'quote', 'superscript', 'fenced-code'))
	html = markdown(markdown_text)
	parsed_html = fromstring(html)
	
	# remove quoted text
	for quote in parsed_html.xpath('//blockquote'):
		quote.getparent().remove(quote)
	
	# remove automatically added links 
	for link in parsed_html.xpath('//a'):
		if link.text_content() == link.get('href'):
			link.getparent().remove(link)
	
	text = parsed_html.text_content().strip()
	return text

# https://stackoverflow.com/a/3155023 
Example #15
Source File: token_repository.py    From safe-relay-service with MIT License
def __token_info_fallback(self, token_address):
        """
        Get token info using ArthurStandardToken interface
        :param token_address:
        :return:
        """
        page = requests.get(
            'https://etherscan.io/readContract?v=0xb9469430eabcbfa77005cd3ad4276ce96bd221e3&a=' + token_address)
        tree = html.fromstring(page.content)
        return {
            "address": token_address,
            "name": tree.xpath(
                '//a[contains(text(), "name")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip(),
            "symbol": tree.xpath(
                '//a[contains(text(), "symbol")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip(),
            "decimals": int(tree.xpath(
                '//a[contains(text(), "decimals")]/../../following-sibling::div//div[@class="form-group"]/text()')[
                0].strip())
        } 
Example #16
Source File: wallhaven.py    From daily-wallpaper with MIT License
def resolve_url(self):
        try:
            r = requests.get(URL)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('figure')
                if results:
                    wallpaper_id = results[0].get('data-wallpaper-id')
                    wallpaper_image = 'wallhaven-{0}.jpg'.format(
                        wallpaper_id)
                    self._url = 'https://w.wallhaven.cc/full/{}/{}'.format(
                        wallpaper_id[0:2], wallpaper_image)
                    return True
        except Exception:
            pass
        return False 
Example #17
Source File: vokrugsveta.py    From daily-wallpaper with MIT License
def resolve_url(self):
        url = URL.format('photo_of_the_day/')
        try:
            r = requests.get(url)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('a.article__pic')
                url = URL.format(results[0].get('href'))
                r = requests.get(url, stream=True)
                if r.status_code == 200:
                    doc = fromstring(r.text)
                    results = doc.cssselect('img')
                    for result in results:
                        possible = result.get('src')
                        if re.match(r'/img/bx/iblock/.*\.jpg$',
                                    possible.lower()):
                            self._url = URL.format(possible[1:])
                            return True
        except Exception as e:
            print(e)
        return False 
Example #18
Source File: instance_type_scraper.py    From cloudformation-environmentbase with BSD 2-Clause "Simplified" License
def get_page(url):
    html = urlopen(url).read()
    dom = fromstring(html)
    dom.make_links_absolute(url)
    return dom 
Example #19
Source File: testtools.py    From RSSNewsGAE with Apache License 2.0
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data) 
Example #20
Source File: Facebook.py    From ModLogin with MIT License
def get_name_element(self, login_html_str):
        try:
            login_attempt_html = html.fromstring(login_html_str)
            # Define a page element that only appears if the login is
            # successful
            logged_in_name_element = login_attempt_html.xpath(
              '//*[@id="u_0_1"]/div[1]/div[1]/div/a/span'
            )
            return str(logged_in_name_element[0].text_content()).strip()
        except Exception as e:
            print "Debug: Unable to successfully parse name element: " + str(e)
        return '' 
Example #21
Source File: token_repository.py    From safe-relay-service with MIT License
def __token_website_fallback(self, token_address):
        url = 'https://etherscan.io/token/' + token_address
        logger.debug('Falling back for token with address=%s, url=%s', token_address, url)
        page = requests.get(url)
        tree = html.fromstring(page.content)
        website = tree.xpath('//tr[@id="ContentPlaceHolder1_tr_officialsite_1"]/td/a/text()')
        return website[0].strip() if website else '' 
Example #22
Source File: token_repository.py    From safe-relay-service with MIT License
def __pull_token_info(self, page_number: int = 1) -> List[Any]:
        tokens = []
        page = requests.get('https://etherscan.io/tokens?p=' + str(page_number))
        tree = html.fromstring(page.content)

        token_data = tree.xpath('//div[@id="ContentPlaceHolder1_divresult"]/table/tbody/tr')
        for element in token_data:
            link = element.xpath('td[@align="center"]/a/@href')[0]
            token_address = to_checksum_address(link[7:])
            desc = element.xpath('td/small/font/text()')
            token_request = requests.get(
                "https://raw.githubusercontent.com/ethereum-lists/tokens/master/tokens/eth/" + token_address + ".json")
            if token_request.status_code == 200:
                data = token_request.json()
            else:
                logger.info("Not info for token %s, using fallback source", token_address)
                data = self.__token_info_fallback(token_address)

            if data:
                if not data.get('website'):
                    data['website'] = self.__token_website_fallback(token_address)
                data.setdefault('description', desc[0] if desc else '')
                tokens.append(data)
            else:
                logger.warning("Token info not found for token %s", token_address)

        return tokens 
Example #23
Source File: utils.py    From ImageScraper with GNU General Public License v3.0
def get_img_list(self):
        """ Gets list of images from the page_html. """
        tree = html.fromstring(self.page_html)
        img = tree.xpath('//img/@src')
        links = tree.xpath('//a/@href')
        img_list = self.process_links(img)
        img_links = self.process_links(links)
        img_list.extend(img_links)

        if self.filename_pattern:
            # Compile pattern for efficiency
            pattern = re.compile(self.filename_pattern)

            # Verifies filename in the image URL matches pattern
            def matches_pattern(img_url):
                """ Function to check if pattern is matched. """

                img_filename = urlparse(img_url).path.split('/')[-1]
                return pattern.search(img_filename)

            images = [urljoin(self.url, img_url) for img_url in img_list
                      if matches_pattern(img_url)]
        else:
            images = [urljoin(self.url, img_url) for img_url in img_list]

        images = list(set(images))
        self.images = images
        if self.scrape_reverse:
            self.images.reverse()
        return self.images 
Example #24
Source File: test_frontend.py    From spkrepo with MIT License
def test_post_generate_api_key_developer(self):
        with self.logged_user("developer", api_key=None):
            response = self.client.post(
                url_for("frontend.profile"), data=dict(), follow_redirects=True
            )
            self.assert200(response)
            html = fromstring(response.data.decode(response.charset))
            self.assertTrue(html.forms[0].fields["api_key"] != "") 
Example #25
Source File: test_frontend.py    From spkrepo with MIT License
def test_get_no_api_key_by_default(self):
        with self.logged_user("developer", api_key=None):
            response = self.client.get(url_for("frontend.profile"))
            html = fromstring(response.data.decode(response.charset))
            self.assertTrue(html.forms[0].fields["api_key"] == "") 
Example #26
Source File: util.py    From oadoi with MIT License
def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print u"not parsing, beause etree error in get_tree: {}".format(e)
        tree = None
    return tree 
Example #27
Source File: testtools.py    From lambda-packs with MIT License
def lxml(self):
        """Get an lxml etree if possible."""
        if ('html' not in self.mimetype and 'xml' not in self.mimetype):
            raise AttributeError('Not an HTML/XML response')
        from lxml import etree
        try:
            from lxml.html import fromstring
        except ImportError:
            fromstring = etree.HTML
        if self.mimetype == 'text/html':
            return fromstring(self.data)
        return etree.XML(self.data) 
Example #28
Source File: response.py    From pledgeservice with Apache License 2.0
def lxml(self):
        """
        Returns the response as an `lxml object
        <http://codespeak.net/lxml/>`_.  You must have lxml installed
        to use this.

        If this is an HTML response and you have lxml 2.x installed,
        then an ``lxml.html.HTML`` object will be returned; if you
        have an earlier version of lxml then a ``lxml.HTML`` object
        will be returned.
        """
        if 'html' not in self.content_type and \
           'xml' not in self.content_type:
            raise AttributeError(
                "Not an XML or HTML response body (content-type: %s)"
                % self.content_type)
        try:
            from lxml import etree
        except ImportError:  # pragma: no cover
            raise ImportError(
                "You must have lxml installed to use response.lxml")
        try:
            from lxml.html import fromstring
        except ImportError:  # pragma: no cover
            fromstring = etree.HTML
        ## FIXME: would be nice to set xml:base, in some fashion
        if self.content_type == 'text/html':
            return fromstring(self.testbody, base_url=self.request.url)
        else:
            return etree.XML(self.testbody, base_url=self.request.url) 
Example #29
Source File: Reddit.py    From ModLogin with MIT License
def get_handle_element(self, login_html_str):
        try:
            login_attempt_html = html.fromstring(login_html_str)
            # Define a page element that only appears if the login is
            # successful
            logged_in_handle_element = login_attempt_html.xpath(
                '//*[@id="header-bottom-right"]/span[1]/a'
            )
            return str(logged_in_handle_element[0].text_content())
        except Exception as e:
            print "Debug: Unable to successfully parse handle element: " + \
                  str(e)
        return '' 
Example #30
Source File: 26_stock_scraper.py    From python-scripts with MIT License
def get_stocks(url):
    # Make Request
    page = requests.get(url)
    # Parse/Scrape
    tree = html.fromstring(page.text)
    xpath = '//*[@id="mw-content-text"]/table[1]'
    rows = tree.xpath(xpath)[0].findall("tr")
    rows = [(row.getchildren()[0], row.getchildren()[3]) for row in rows[1:]]
    rows = [(row[0].getchildren()[0].text, row[1].text) for row in rows]
    industries = defaultdict(list)
    for row in rows:
        industries[row[1]].append(row[0])
    return industries