Python lxml.etree.HTMLParser() Examples
The following are 30
code examples of lxml.etree.HTMLParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
lxml.etree
, or try the search function
.
Example #1
Source File: nasa.py From daily-wallpaper with MIT License | 6 votes |
def resolve_url(self):
    """Best-effort discovery of today's APOD image URL.

    Sets ``self._url`` and returns True on success; returns False on
    any failure (network error, unexpected markup, missing image).
    """
    try:
        response = requests.get(URL)
        if response.status_code == 200:
            # recover=True lets lxml tolerate the page's loose HTML.
            lenient = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, lenient)
            images = tree.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    # The first <img> is wrapped in an <a> whose href is
                    # the relative path to the full-size picture.
                    relative = images[0].getparent().attrib['href']
                    self._url = 'https://apod.nasa.gov/' + relative
                    return True
    except Exception:
        # Deliberate best-effort: any failure just means "no wallpaper".
        pass
    return False
Example #2
Source File: fstoppers.py From daily-wallpaper with MIT License | 6 votes |
def resolve_url(self):
    """Best-effort discovery of the fstoppers photo-of-the-day URL.

    Sets ``self._url`` and returns True on success; False otherwise.
    """
    page_url = URL.format('/potd')
    try:
        response = requests.get(page_url)
        if response.status_code == 200:
            # recover=True lets lxml tolerate loose HTML.
            lenient = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, lenient)
            # The first <img> whose parent anchor carries an href links
            # to the photo's own page.
            for img in tree.iter('img'):
                parent_attrs = img.getparent().attrib
                if 'href' in parent_attrs:
                    page_url = URL.format(parent_attrs['href'])
                    break
            if page_url is not None:
                response = requests.get(page_url)
                if response.status_code == 200:
                    tree = etree.HTML(response.content, lenient)
                    # The photo container div exposes the large image in
                    # its data-xlarge attribute.
                    for div in tree.iter('div'):
                        attrs = div.attrib
                        if attrs.get('class') == 'photo' and \
                                'data-xlarge' in attrs:
                            self._url = attrs['data-xlarge']
                            return True
    except Exception:
        # Deliberate best-effort: any failure just means "no wallpaper".
        pass
    return False
Example #3
Source File: spider.py From You-are-Pythonista with GNU General Public License v3.0 | 6 votes |
def parse_page(url):
    """Fetch the hard-coded dianping shop pages and dump the first
    response body to test.html for inspection.

    NOTE(review): the xpath-based href discovery against
    ``//div[@id="shop-all-list"]`` is disabled; the list below is a
    hard-coded snapshot of the crawl targets, and the loop breaks after
    the first page.
    """
    hrefs = [
        'http://www.dianping.com/shop/23093707',
        'http://www.dianping.com/brands/b23093707',
        'http://www.dianping.com/shop/2461336',
        'http://www.dianping.com/shop/90085699',
        'http://www.dianping.com/shop/13810171',
        'http://www.dianping.com/brands/b13810171',
        'http://www.dianping.com/shop/58322041',
        'http://www.dianping.com/shop/80620237',
        'http://www.dianping.com/shop/130946881',
        'http://www.dianping.com/brands/b130946881',
        'http://www.dianping.com/shop/32704021',
        'http://www.dianping.com/brands/b18005322',
        'http://www.dianping.com/shop/75141698',
        'http://www.dianping.com/brands/b10008473',
        'http://www.dianping.com/shop/92384680',
        'http://www.dianping.com/shop/47008792',
        'http://www.dianping.com/brands/b47008792',
        'http://www.dianping.com/shop/67997136',
        'http://www.dianping.com/brands/b4087801',
        'http://www.dianping.com/shop/111533101',
        'http://www.dianping.com/shop/98779037',
        'http://www.dianping.com/shop/102025765',
        'http://www.dianping.com/brands/b23093707',
    ]
    every_page_headers = build_headers(url)
    print(every_page_headers)
    for href in hrefs:
        result = requests.get(href, headers=every_page_headers).text
        with open('test.html', 'w', encoding='utf-8') as fp:
            fp.write(result)
        break
Example #4
Source File: Fun.py From NotSoBot with MIT License | 6 votes |
async def se(self, ctx, em:str):
    """Returns a steam emoji image"""
    # FIX: the body uses `await`, which is a SyntaxError inside a plain
    # `def`; the function must be declared `async def` (it is a bot
    # command coroutine).
    em = em.lower()
    desc = None
    if em == ':b1:' or em == 'b1':
        b = self.files_path('b1.png')
    else:
        url = "https://steamcommunity-a.akamaihd.net/economy/emoticonhover/{0}".format(em)
        txt = await self.get_text(url)
        if not txt:
            await self.bot.say(":warning: `Emoticon Not Found/Invalid`\nRemember to do :steam_emoticon: (optional ':').")
            return
        root = etree.fromstring(txt, etree.HTMLParser())
        base = root.find('.//img[@class="emoticon_large"]')
        # The img src is a data URI; skip the "data:image/png;base64,"
        # prefix (22 characters) before decoding.
        b = BytesIO(base64.b64decode(base.attrib['src'][22:]))
        desc = '**{0}**'.format(root.find('.//div[@class="emoticon_hover_desc"]').text)
    await self.bot.upload(b, filename='steam.png', content=desc)
Example #5
Source File: test_examples.py From dataflows with MIT License | 6 votes |
def country_population():
    """Yield ``{'name', 'population'}`` dicts scraped from the
    Wikipedia list of countries by population."""
    from lxml import etree
    from urllib.request import urlopen
    page = urlopen('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population').read()
    tree = etree.fromstring(page, parser=etree.HTMLParser())
    for table in tree.findall('.//table'):
        # Only the data tables carry the "wikitable" class.
        if 'wikitable' not in table.attrib.get('class', ''):
            continue
        for row in table.findall('.//tr'):
            cells = row.findall('td')
            # Rows with <= 3 cells are headers/spacers — skip them.
            if len(cells) > 3:
                yield dict(
                    name=cells[1].find('.//a').attrib.get('title'),
                    population=cells[2].text,
                )
Example #6
Source File: tweets_scrape.py From tweet_scrapper with GNU General Public License v3.0 | 6 votes |
def __init__(self, twitter_request_url, twitter_request_header,
             twitter_request_params=None, twitter_request_proxies=None,
             scrape_pages=2, twitter_file_path=None,
             twitter_file_format='csv'):
    """Capture the request settings and persistence options for a
    tweet-scraping run.

    twitter_request_url: endpoint to fetch tweets from.
    twitter_request_header: HTTP headers; only stored when not None.
    twitter_request_params / twitter_request_proxies: passed through
        to the request layer as-is (may be None).
    scrape_pages: number of result pages to walk.
    twitter_file_path / twitter_file_format: where and how scraped
        tweets are persisted ('csv' by default).
    """
    self.__twitter_request_url__ = twitter_request_url
    # Keep the class-level default header unless one was supplied.
    if twitter_request_header is not None:
        self.__twitter_request_header__ = twitter_request_header
    self.__twitter_request_params__ = twitter_request_params
    self.__twitter_request_proxies__ = twitter_request_proxies
    self.scrape_pages = scrape_pages
    self.__twitter_tweet_persist_file_path__ = twitter_file_path
    self.__twitter_tweet_persist_file_format__ = twitter_file_format
    # Precompile the hashtag regex; the pattern is a class attribute
    # (note: name is spelled "hastag" elsewhere in the class).
    self.hashtag_capture = re.compile(self._tweet_hastag_pattern_)
    # Strip blank text nodes and comments so the parsed tree stays lean.
    self.html_parser = etree.HTMLParser(remove_blank_text=True,
                                        remove_comments=True)
    self.proxy_json = None
Example #7
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getNum(a):
    """Extract the ID code ("识别码") field from an avsox detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Stringify the xpath hit list and trim the list-repr punctuation.
    hits = tree.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')
    return str(hits).strip(" ['']")
Example #8
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRelease(a):
    """Return the release date (YYYY-MM-DD) from an xcity detail page,
    or '' when the field is absent or unparseable."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
    except IndexError:
        # FIX: was a bare `except:` — catch only the missing-node case
        # so KeyboardInterrupt/SystemExit still propagate.
        return ''
    try:
        # FIX: raw string for the regex (avoids invalid escape sequences
        # in a plain string literal); dates appear as YYYY/MM/DD.
        return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
    except IndexError:
        return ''
Example #9
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRelease(a):
    """Extract the release date field ("发行时间:") from an avsox page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//span[contains(text(),"发行时间:")]/../text()')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(hits).strip(" ['']")
Example #10
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover(htmlcode):
    """Return the large cover image URL from an avsox detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(src).strip(" ['']")
Example #11
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover_small(htmlcode):
    """Return the small (waterfall grid) cover URL from an avsox page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(src).strip(" ['']")
Example #12
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRuntime(a):
    """Return the runtime (first run of digits) from an xcity detail
    page, or '' when the field is absent or has no digits."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
    except IndexError:
        # FIX: was a bare `except:` — catch only the missing-node case
        # so KeyboardInterrupt/SystemExit still propagate.
        return ''
    try:
        # FIX: raw string for the regex (avoids invalid escape sequences).
        return re.findall(r'\d+', result1)[0]
    except IndexError:
        return ''
Example #13
Source File: mgstage.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getSeries(a):
    """Extract the series ("シリーズ") field from an mgstage page,
    combining linked and plain cell text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Each xpath result is stringified, then the list-repr punctuation
    # and literal backslash-n sequences from the repr are trimmed.
    linked = str(tree.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    plain = str(tree.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(linked + plain).strip('+').replace("', '", '').replace('"', '')
Example #14
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getTitle(a):
    """Return the program title text from an xcity detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Raises IndexError when the title node is missing — callers rely
    # on the first match being present.
    return tree.xpath('//*[@id="program_detail_title"]/text()')[0]
Example #15
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getActor(a):
    """Return the first actor name from an xcity detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Fixed positional path into the details list; raises IndexError
    # when the node is absent.
    return tree.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
Example #16
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getStudio(a):
    """Return the studio name from an xcity page, falling back to the
    "片商"-label layout when the fixed positional path fails."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
    except Exception:
        # FIX: was a bare `except:`, which would also trap
        # KeyboardInterrupt/SystemExit.
        result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return result.strip('+').replace("', '", '').replace('"', '')
Example #17
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getStudio(a):
    """Extract the studio ("片商") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #18
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getLabel(a):
    """Extract the series/label ("系列:") field from an avsox page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(hits).strip(" ['']")
Example #19
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getStudio(a):
    """Extract the maker ("制作商:") field from an avsox page, joining
    multiple names with spaces."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')
    # Trim the list-repr punctuation, then turn element separators
    # into single spaces.
    return str(hits).strip(" ['']").replace("', '", ' ')
Example #20
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getTitle(a):
    """Return the page title from an avsox page with '/' removed,
    or '' on any parse failure."""
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except Exception:
        # FIX: was a bare `except:` — narrowed so KeyboardInterrupt/
        # SystemExit still propagate; parse failures still yield ''.
        return ''
Example #21
Source File: ADC_function.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getXpathSingle(htmlcode, xpath):
    """Evaluate *xpath* against *htmlcode* and return the hit list as
    a string with the list-repr punctuation trimmed."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath(xpath)).strip(" ['']")
Example #22
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getSeries(a):
    """Extract the series ("系列") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #23
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getDirector(a):
    """Extract the director ("導演") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #24
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover(htmlcode):
    """Return the video cover image URL from a javdb detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")
    # Stringify the hit list and trim the list-repr punctuation.
    return str(src).strip(" ['']")
Example #25
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover_small(a, index=0):
    """Return the *index*-th small cover URL from a javdb result page.

    javdb sometimes returns multiple results, so the caller picks the
    entry with the correct index rather than always taking the first.
    Raises IndexError when *index* is out of range.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    # FIX: `not 'https' in result` -> `'https' not in result` (PEP 8
    # idiom, same truth table). Protocol-relative URLs ("//...") get an
    # explicit https scheme prepended.
    if 'https' not in result:
        result = 'https:' + result
    return result
Example #26
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getTag(a):
    """Return the list of category ("類別") tags from a javdb page,
    preferring linked tag text and falling back to plain span text."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        return result
    except Exception:
        # FIX: was a bare `except:`, which would also trap
        # KeyboardInterrupt/SystemExit.
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        return result
Example #27
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRelease(a):
    """Extract the release date ("時間") field from a javdb detail
    page, combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+')
Example #28
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getLabel(a):
    """Extract the series/label ("系列") field from a javdb detail
    page, combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #29
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRuntime(a):
    """Extract the runtime ("時長") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    # rstrip('mi') drops trailing 'm'/'i' characters (e.g. "120 mi" of
    # "minute(s)") from the combined value.
    return str(plain + linked).strip('+').rstrip('mi')
Example #30
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover_small(a, index=0):
    """Return the *index*-th small cover URL from a result page.

    The site sometimes returns multiple results, so the caller picks
    the entry with the correct index rather than always taking the
    first. Raises IndexError when *index* is out of range.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    # FIX: `not 'https' in result` -> `'https' not in result` (PEP 8
    # idiom, same truth table). Protocol-relative URLs ("//...") get an
    # explicit https scheme prepended.
    if 'https' not in result:
        result = 'https:' + result
    return result