Python lxml.html.fromstring() Examples
The following are 30 code examples of lxml.html.fromstring(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html, or try the search function.
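Before the project examples, here is a minimal, self-contained sketch of the basic pattern most of them follow; the HTML snippet and selectors are illustrative only, not taken from any project below:

from lxml import html

# Parse an HTML string into an element tree
doc = html.fromstring('<div><a href="/page">link</a></div>')

# Query the tree with XPath...
hrefs = doc.xpath('//a/@href')    # ['/page']

# ...or with CSS selectors (requires the cssselect package)
anchor = doc.cssselect('a')[0]
text = anchor.text_content()      # 'link'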
Example #1
Source File: pyhoroscope.py From Horoscope-API with MIT License | 6 votes |
def get_yearly_horoscope(sunsign):
    url = "http://www.ganeshaspeaks.com/horoscopes/yearly-horoscope/" + sunsign
    response = requests.get(url)
    tree = html.fromstring(response.content)
    year = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
    year = year.replace("']", "").replace("['", "")
    horoscope = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
    horoscope = horoscope.replace("\\n", "").replace(" ", "").replace("']", "").replace("['", "")
    dict = {
        'year': year,
        'horoscope': horoscope,
        'sunsign': sunsign
    }
    return dict
Example #2
Source File: dic.py From yui with GNU Affero General Public License v3.0 | 6 votes |
def parse(html: str) -> Tuple[Optional[str], List[Attachment]]:
    h = fromstring(html)
    meta = h.cssselect('meta[http-equiv=Refresh]')
    if meta:
        return fix_url(meta[0].get('content')[7:]), []
    else:
        words = h.cssselect('div.search_type')
        attachments: List[Attachment] = []
        for word in words:
            w = word.cssselect('.txt_searchword')[0]
            attachments.append(
                Attachment(
                    title=strip_tags(w.text_content()),
                    title_link=fix_url(w.get('href')),
                    text=fix_blank(
                        word.cssselect('.list_search')[0].text_content()
                    ),
                )
            )
        return None, attachments
Example #3
Source File: animal.py From yui with GNU Affero General Public License v3.0 | 6 votes |
async def get_cat_image_url(timeout: float) -> str:
    api_url = 'http://thecatapi.com/api/images/get'
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                async with session.get(
                    api_url, params={'format': 'xml', 'type': 'jpg,png'}
                ) as res:
                    if res.status != 200:
                        raise APIServerError
                    xml_result = await res.read()
                    tree = etree.fromstring(xml_result)
                    url = tree.find('data/images/image/url').text
            except aiohttp.client_exceptions.ServerDisconnectedError:
                await asyncio.sleep(0.1)
                continue
            try:
                async with async_timeout.timeout(timeout=timeout):
                    async with session.get(url) as res:
                        async with res:
                            if res.status == 200:
                                return url
            except (aiohttp.ClientConnectorError, asyncio.TimeoutError):
                continue
Example #4
Source File: get_data.py From X-ray-classification with MIT License | 6 votes |
def main():
    for url in url_list:
        try:
            r = requests.get(url)
        except:
            continue
        tree = html.fromstring(r.text)
        script = tree.xpath('//script[@language="javascript"]/text()')[0]
        json_string = regex.findall(script)[0]
        json_data = json.loads(json_string)
        next_page_url = tree.xpath('//footer/a/@href')
        links = [domain + x['nodeRef'] for x in json_data]
        for link in links:
            extract(link)
Example #5
Source File: receiving_mail.py From django-email-gateway with GNU General Public License v3.0 | 6 votes |
def sns_notification(body):
    json_body = body.decode('utf8')
    js = json.loads(json_body.replace('\n', ''))
    if js["Type"] == "Notification":
        arg_info = js["Message"]
        arg_info = json.loads(arg_info)
        content = arg_info['content']
        subject = arg_info['mail']['commonHeaders']['subject']
        html_content = content.partition('Content-Type: text/html; charset=UTF-8')[2]
        if 'Content-Transfer-Encoding' in html_content:
            html_content = html_content.partition('Content-Transfer-Encoding: quoted-printable')[2]
        text = html_content.replace('\r\n', '')
        table = html.fromstring(text)
        content = ''
        for item in table:
            if item.text:
                content += item.text.strip()
        mail_content = str(content)
        from_mail = arg_info['mail']['source']
        to_mail = arg_info['mail']['destination'][0]
        hash_code = arg_info['mail']['destination'][0].split('@')[0]
        return subject, from_mail, to_mail, hash_code, mail_content
Example #6
Source File: reportloader.py From nhlscrapi with Apache License 2.0 | 6 votes |
def html_doc(self):
    """
    :returns: the lxml processed html document
    :rtype: ``lxml.html.document_fromstring`` output
    """
    if self.__lx_doc is None:
        cn = NHLCn()
        if hasattr(cn, self.report_type):
            html = getattr(cn, self.report_type)(self.game_key)
        else:
            raise ValueError('Invalid report type: %s' % self.report_type)
        if cn.req_err is None:
            self.__lx_doc = fromstring(html)
        else:
            self.req_err = cn.req_err
    return self.__lx_doc
Example #7
Source File: get_image_gevent.py From girl-atlas-crawler with BSD 2-Clause "Simplified" License | 6 votes |
def get_page_urls():
    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []
    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')
        if not next_url:
            break
        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)
    print "get_page_urls done!!!"
    return page_urls

# Get the URL of each girl album
Example #8
Source File: get_image.py From girl-atlas-crawler with BSD 2-Clause "Simplified" License | 6 votes |
def get_image_urls(girl_urls):
    girl_list = []
    for url in girl_urls:
        # print "in get_image_urls" + url[0]
        response = get_response(url)
        parsed_body = html.fromstring(response.text)
        # Album title
        girl_title = parsed_body.xpath('//title/text()')
        image_urls = parsed_body.xpath('//li[@class="slide "]/img/@src | //li[@class="slide "]/img/@delay')
        girl_dict = {girl_title[0]: image_urls}
        girl_list.append(girl_dict)
    print "get_girl_urls done!!!"
    return girl_list

# Start downloading the images
Example #9
Source File: populate.py From phageParser with MIT License | 6 votes |
def addpositionstodict(gendict):
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # converts to pandas df and then to numpy array then drop titles
        arrtable = pandas.read_html(strtable)[0].as_matrix()[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict
Example #10
Source File: get_image.py From girl-atlas-crawler with BSD 2-Clause "Simplified" License | 6 votes |
def get_page_urls():
    start_url = 'http://girl-atlas.com/'
    response = get_response(start_url)
    page_urls = []
    page_urls.append(start_url)
    while True:
        parsed_body = html.fromstring(response.text)
        # Use XPath to extract the URL of the next page
        next_url = parsed_body.xpath('//a[@class="btn-form next"]/@href')
        if not next_url:
            break
        next_url = start_url + next_url[0]
        page_urls.append(next_url)
        response = get_response(next_url)
    print "get_page_urls done!!!"
    return page_urls

# Get the URL of each girl album
Example #11
Source File: pyhoroscope.py From Horoscope-API with MIT License | 6 votes |
def get_monthly_horoscope(sunsign):
    url = "http://www.ganeshaspeaks.com/horoscopes/monthly-horoscope/" + sunsign
    response = requests.get(url)
    tree = html.fromstring(response.content)
    month = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
    month = month.replace("']", "").replace("['", "")
    horoscope = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()[1]"))
    horoscope = horoscope.replace("\\n", "").replace(" ", "").replace("']", "").replace("['", "")
    dict = {
        'month': month,
        'horoscope': horoscope,
        'sunsign': sunsign
    }
    return dict
Example #12
Source File: pyhoroscope.py From Horoscope-API with MIT License | 6 votes |
def get_weekly_horoscope(sunsign):
    url = "http://www.ganeshaspeaks.com/horoscopes/weekly-horoscope/" + sunsign
    response = requests.get(url)
    tree = html.fromstring(response.content)
    week = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[1]/div[2]/div/p/text()"))
    week = week.replace("']", "").replace("['", "")
    horoscope = str(tree.xpath(
        "//*[@id=\"daily\"]/div/div[1]/div[2]/p[1]/text()"))
    horoscope = horoscope.replace("\\n", "").replace(" ", "").replace("']", "").replace("['", "")
    dict = {
        'week': week,
        'horoscope': horoscope,
        'sunsign': sunsign
    }
    return dict
Example #13
Source File: imdbutils.py From python-plexlibrary with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _handle_request(self, url):
    """Stolen from Automated IMDB Top 250 Plex library script
    by /u/SwiftPanda16
    """
    r = requests.get(url)
    tree = html.fromstring(r.content)
    # Dict of the IMDB top 250 ids in order
    titles = tree.xpath("//table[contains(@class, 'chart')]"
                        "//td[@class='titleColumn']/a/text()")
    years = tree.xpath("//table[contains(@class, 'chart')]"
                       "//td[@class='titleColumn']/span/text()")
    ids = tree.xpath("//table[contains(@class, 'chart')]"
                     "//td[@class='ratingColumn']/div//@data-titleid")
    return ids, titles, years
Example #14
Source File: utils.py From PlayStoreLinks_Bot with MIT License | 6 votes |
def get_text_from_markdown(markdown_text):
    renderer = HtmlRenderer()
    markdown = Markdown(renderer, extensions=('tables', 'autolink', 'strikethrough',
                                              'quote', 'superscript', 'fenced-code'))
    html = markdown(markdown_text)
    parsed_html = fromstring(html)

    # remove quoted text
    [x.getparent().remove(x) for x in parsed_html.xpath('//blockquote')]

    # remove automatically added links
    for link in parsed_html.xpath('//a'):
        if link.text_content() == link.get('href'):
            link.getparent().remove(link)

    text = ''.join(parsed_html.text_content()).strip()
    return text

# https://stackoverflow.com/a/3155023
Example #15
Source File: token_repository.py From safe-relay-service with MIT License | 6 votes |
def __token_info_fallback(self, token_address):
    """
    Get token info using ArthurStandardToken interface
    :param token_address:
    :return:
    """
    page = requests.get(
        'https://etherscan.io/readContract?v=0xb9469430eabcbfa77005cd3ad4276ce96bd221e3&a=' + token_address)
    tree = html.fromstring(page.content)
    return {
        "address": token_address,
        "name": tree.xpath(
            '//a[contains(text(), "name")]/../../following-sibling::div//div[@class="form-group"]/text()')[0].strip(),
        "symbol": tree.xpath(
            '//a[contains(text(), "symbol")]/../../following-sibling::div//div[@class="form-group"]/text()')[0].strip(),
        "decimals": int(tree.xpath(
            '//a[contains(text(), "decimals")]/../../following-sibling::div//div[@class="form-group"]/text()')[0].strip())
    }
Example #16
Source File: wallhaven.py From daily-wallpaper with MIT License | 6 votes |
def resolve_url(self):
    try:
        r = requests.get(URL)
        if r.status_code == 200:
            doc = fromstring(r.text)
            results = doc.cssselect('figure')
            if results:
                wallpaper_id = results[0].get('data-wallpaper-id')
                wallpaper_image = 'wallhaven-{0}.jpg'.format(
                    wallpaper_id)
                self._url = 'https://w.wallhaven.cc/full/{}/{}'.format(
                    wallpaper_id[0:2], wallpaper_image)
                return True
    except Exception:
        pass
    return False
Example #17
Source File: vokrugsveta.py From daily-wallpaper with MIT License | 6 votes |
def resolve_url(self):
    url = URL.format('photo_of_the_day/')
    try:
        r = requests.get(url)
        if r.status_code == 200:
            doc = fromstring(r.text)
            results = doc.cssselect('a.article__pic')
            url = URL.format(results[0].get('href'))
            r = requests.get(url, stream=True)
            if r.status_code == 200:
                doc = fromstring(r.text)
                results = doc.cssselect('img')
                for index, result in enumerate(results):
                    posible = result.get('src')
                    if re.match(r'/img/bx/iblock/.*\.jpg$', posible.lower()):
                        self._url = URL.format(posible[1:])
                        return True
    except Exception as e:
        print(e)
        pass
    return False
Example #18
Source File: instance_type_scraper.py From cloudformation-environmentbase with BSD 2-Clause "Simplified" License | 5 votes |
def get_page(url):
    html = urlopen(url).read()
    dom = fromstring(html)
    dom.make_links_absolute(url)
    return dom
Example #19
Source File: testtools.py From RSSNewsGAE with Apache License 2.0 | 5 votes |
def lxml(self):
    """Get an lxml etree if possible."""
    if ('html' not in self.mimetype and
            'xml' not in self.mimetype):
        raise AttributeError('Not an HTML/XML response')
    from lxml import etree
    try:
        from lxml.html import fromstring
    except ImportError:
        fromstring = etree.HTML
    if self.mimetype == 'text/html':
        return fromstring(self.data)
    return etree.XML(self.data)
Example #20
Source File: Facebook.py From ModLogin with MIT License | 5 votes |
def get_name_element(self, login_html_str):
    try:
        login_attempt_html = html.fromstring(login_html_str)
        # Define a page element that only appears if the login is
        # successful
        logged_in_name_element = login_attempt_html.xpath(
            '//*[@id="u_0_1"]/div[1]/div[1]/div/a/span'
        )
        return str(logged_in_name_element[0].text_content()).strip()
    except Exception as e:
        print "Debug: Unable to successfully parse name element: " + str(e)
        return ''
Example #21
Source File: token_repository.py From safe-relay-service with MIT License | 5 votes |
def __token_website_fallback(self, token_address):
    url = 'https://etherscan.io/token/' + token_address
    logger.debug('Falling back for token with address=%s, url=%s', token_address, url)
    page = requests.get(url)
    tree = html.fromstring(page.content)
    website = tree.xpath('//tr[@id="ContentPlaceHolder1_tr_officialsite_1"]/td/a/text()')
    return website[0].strip() if website else ''
Example #22
Source File: token_repository.py From safe-relay-service with MIT License | 5 votes |
def __pull_token_info(self, page_number: int = 1) -> List[Any]:
    tokens = []
    page = requests.get('https://etherscan.io/tokens?p=' + str(page_number))
    tree = html.fromstring(page.content)
    token_data = tree.xpath('//div[@id="ContentPlaceHolder1_divresult"]/table/tbody/tr')
    for element in token_data:
        link = element.xpath('td[@align="center"]/a/@href')[0]
        token_address = to_checksum_address(link[7:])
        desc = element.xpath('td/small/font/text()')
        token_request = requests.get(
            "https://raw.githubusercontent.com/ethereum-lists/tokens/master/tokens/eth/"
            + token_address + ".json")
        if token_request.status_code == 200:
            data = token_request.json()
        else:
            logger.info("Not info for token %s, using fallback source", token_address)
            data = self.__token_info_fallback(token_address)
        if data:
            if not data.get('website'):
                data['website'] = self.__token_website_fallback(token_address)
            data.setdefault('description', desc[0] if desc else '')
            tokens.append(data)
        else:
            logger.warning("Token info not found for token %s", token_address)
    return tokens
Example #23
Source File: utils.py From ImageScraper with GNU General Public License v3.0 | 5 votes |
def get_img_list(self):
    """ Gets list of images from the page_html. """
    tree = html.fromstring(self.page_html)
    img = tree.xpath('//img/@src')
    links = tree.xpath('//a/@href')
    img_list = self.process_links(img)
    img_links = self.process_links(links)
    img_list.extend(img_links)
    if self.filename_pattern:
        # Compile pattern for efficiency
        pattern = re.compile(self.filename_pattern)

        # Verifies filename in the image URL matches pattern
        def matches_pattern(img_url):
            """ Function to check if pattern is matched. """
            img_filename = urlparse(img_url).path.split('/')[-1]
            return pattern.search(img_filename)

        images = [urljoin(self.url, img_url) for img_url in img_list
                  if matches_pattern(img_url)]
    else:
        images = [urljoin(self.url, img_url) for img_url in img_list]
    images = list(set(images))
    self.images = images
    if self.scrape_reverse:
        self.images.reverse()
    return self.images
Example #24
Source File: test_frontend.py From spkrepo with MIT License | 5 votes |
def test_post_generate_api_key_developer(self):
    with self.logged_user("developer", api_key=None):
        response = self.client.post(
            url_for("frontend.profile"), data=dict(), follow_redirects=True
        )
        self.assert200(response)
        html = fromstring(response.data.decode(response.charset))
        self.assertTrue(html.forms[0].fields["api_key"] != "")
Example #25
Source File: test_frontend.py From spkrepo with MIT License | 5 votes |
def test_get_no_api_key_by_default(self):
    with self.logged_user("developer", api_key=None):
        response = self.client.get(url_for("frontend.profile"))
        html = fromstring(response.data.decode(response.charset))
        self.assertTrue(html.forms[0].fields["api_key"] == "")
Example #26
Source File: util.py From oadoi with MIT License | 5 votes |
def get_tree(page):
    page = page.replace("&nbsp;", " ")  # otherwise starts-with for lxml doesn't work
    try:
        tree = html.fromstring(page)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        print u"not parsing, because etree error in get_tree: {}".format(e)
        tree = None
    return tree
Example #27
Source File: testtools.py From lambda-packs with MIT License | 5 votes |
def lxml(self):
    """Get an lxml etree if possible."""
    if ('html' not in self.mimetype and
            'xml' not in self.mimetype):
        raise AttributeError('Not an HTML/XML response')
    from lxml import etree
    try:
        from lxml.html import fromstring
    except ImportError:
        fromstring = etree.HTML
    if self.mimetype == 'text/html':
        return fromstring(self.data)
    return etree.XML(self.data)
Example #28
Source File: response.py From pledgeservice with Apache License 2.0 | 5 votes |
def lxml(self):
    """
    Returns the response as an `lxml object
    <http://codespeak.net/lxml/>`_.  You must have lxml installed
    to use this.

    If this is an HTML response and you have lxml 2.x installed,
    then an ``lxml.html.HTML`` object will be returned; if you have
    an earlier version of lxml then a ``lxml.HTML`` object will be
    returned.
    """
    if 'html' not in self.content_type and \
       'xml' not in self.content_type:
        raise AttributeError(
            "Not an XML or HTML response body (content-type: %s)"
            % self.content_type)
    try:
        from lxml import etree
    except ImportError:  # pragma: no cover
        raise ImportError(
            "You must have lxml installed to use response.lxml")
    try:
        from lxml.html import fromstring
    except ImportError:  # pragma: no cover
        fromstring = etree.HTML
    ## FIXME: would be nice to set xml:base, in some fashion
    if self.content_type == 'text/html':
        return fromstring(self.testbody, base_url=self.request.url)
    else:
        return etree.XML(self.testbody, base_url=self.request.url)
Example #29
Source File: Reddit.py From ModLogin with MIT License | 5 votes |
def get_handle_element(self, login_html_str):
    try:
        login_attempt_html = html.fromstring(login_html_str)
        # Define a page element that only appears if the login is
        # successful
        logged_in_handle_element = login_attempt_html.xpath(
            '//*[@id="header-bottom-right"]/span[1]/a'
        )
        return str(logged_in_handle_element[0].text_content())
    except Exception as e:
        print "Debug: Unable to successfully parse handle element: " + \
            str(e)
        return ''
Example #30
Source File: 26_stock_scraper.py From python-scripts with MIT License | 5 votes |
def get_stocks(url):
    # Make Request
    page = requests.get(url)
    # Parse/Scrape
    tree = html.fromstring(page.text)
    xpath = '//*[@id="mw-content-text"]/table[1]'
    rows = tree.xpath(xpath)[0].findall("tr")
    rows = [(row.getchildren()[0], row.getchildren()[3]) for row in rows[1:]]
    rows = [(row[0].getchildren()[0].text, row[1].text) for row in rows]
    industries = defaultdict(list)
    for row in rows:
        industries[row[1]].append(row[0])
    return industries