Python lxml.etree.HTMLParser() Examples
The following are 30
code examples of lxml.etree.HTMLParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
lxml.etree
, or try the search function
.
Example #1
Source File: nasa.py From daily-wallpaper with MIT License | 6 votes |
def resolve_url(self):
    """Best-effort discovery of today's APOD image URL.

    Sets ``self._url`` and returns True on success; returns False on
    any failure (network error, unexpected markup, missing image).
    """
    try:
        response = requests.get(URL)
        if response.status_code == 200:
            # recover=True lets lxml tolerate the page's loose HTML.
            lenient = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, lenient)
            images = tree.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    # The first <img> is wrapped in an <a> whose href is
                    # the relative path to the full-size picture.
                    relative = images[0].getparent().attrib['href']
                    self._url = 'https://apod.nasa.gov/' + relative
                    return True
    except Exception:
        # Deliberate best-effort: any failure just means "no wallpaper".
        pass
    return False
Example #2
Source File: fstoppers.py From daily-wallpaper with MIT License | 6 votes |
def resolve_url(self):
    """Best-effort discovery of the fstoppers photo-of-the-day URL.

    Sets ``self._url`` and returns True on success; False otherwise.
    """
    page_url = URL.format('/potd')
    try:
        response = requests.get(page_url)
        if response.status_code == 200:
            # recover=True lets lxml tolerate loose HTML.
            lenient = etree.HTMLParser(recover=True)
            tree = etree.HTML(response.content, lenient)
            # The first <img> whose parent anchor carries an href links
            # to the photo's own page.
            for img in tree.iter('img'):
                parent_attrs = img.getparent().attrib
                if 'href' in parent_attrs:
                    page_url = URL.format(parent_attrs['href'])
                    break
            if page_url is not None:
                response = requests.get(page_url)
                if response.status_code == 200:
                    tree = etree.HTML(response.content, lenient)
                    # The photo container div exposes the large image in
                    # its data-xlarge attribute.
                    for div in tree.iter('div'):
                        attrs = div.attrib
                        if attrs.get('class') == 'photo' and \
                                'data-xlarge' in attrs:
                            self._url = attrs['data-xlarge']
                            return True
    except Exception:
        # Deliberate best-effort: any failure just means "no wallpaper".
        pass
    return False
Example #3
Source File: spider.py From You-are-Pythonista with GNU General Public License v3.0 | 6 votes |
def parse_page(url):
    """Fetch the hard-coded dianping shop pages and dump the first
    response body to test.html for inspection.

    NOTE(review): the xpath-based href discovery against
    ``//div[@id="shop-all-list"]`` is disabled; the list below is a
    hard-coded snapshot of the crawl targets, and the loop breaks after
    the first page.
    """
    hrefs = [
        'http://www.dianping.com/shop/23093707',
        'http://www.dianping.com/brands/b23093707',
        'http://www.dianping.com/shop/2461336',
        'http://www.dianping.com/shop/90085699',
        'http://www.dianping.com/shop/13810171',
        'http://www.dianping.com/brands/b13810171',
        'http://www.dianping.com/shop/58322041',
        'http://www.dianping.com/shop/80620237',
        'http://www.dianping.com/shop/130946881',
        'http://www.dianping.com/brands/b130946881',
        'http://www.dianping.com/shop/32704021',
        'http://www.dianping.com/brands/b18005322',
        'http://www.dianping.com/shop/75141698',
        'http://www.dianping.com/brands/b10008473',
        'http://www.dianping.com/shop/92384680',
        'http://www.dianping.com/shop/47008792',
        'http://www.dianping.com/brands/b47008792',
        'http://www.dianping.com/shop/67997136',
        'http://www.dianping.com/brands/b4087801',
        'http://www.dianping.com/shop/111533101',
        'http://www.dianping.com/shop/98779037',
        'http://www.dianping.com/shop/102025765',
        'http://www.dianping.com/brands/b23093707',
    ]
    every_page_headers = build_headers(url)
    print(every_page_headers)
    for href in hrefs:
        result = requests.get(href, headers=every_page_headers).text
        with open('test.html', 'w', encoding='utf-8') as fp:
            fp.write(result)
        break
Example #4
Source File: Fun.py From NotSoBot with MIT License | 6 votes |
async def se(self, ctx, em:str):
    """Returns a steam emoji image"""
    # FIX: the body uses `await`, which is a SyntaxError inside a plain
    # `def`; the function must be declared `async def` (it is a bot
    # command coroutine).
    em = em.lower()
    desc = None
    if em == ':b1:' or em == 'b1':
        b = self.files_path('b1.png')
    else:
        url = "https://steamcommunity-a.akamaihd.net/economy/emoticonhover/{0}".format(em)
        txt = await self.get_text(url)
        if not txt:
            await self.bot.say(":warning: `Emoticon Not Found/Invalid`\nRemember to do :steam_emoticon: (optional ':').")
            return
        root = etree.fromstring(txt, etree.HTMLParser())
        base = root.find('.//img[@class="emoticon_large"]')
        # The img src is a data URI; skip the "data:image/png;base64,"
        # prefix (22 characters) before decoding.
        b = BytesIO(base64.b64decode(base.attrib['src'][22:]))
        desc = '**{0}**'.format(root.find('.//div[@class="emoticon_hover_desc"]').text)
    await self.bot.upload(b, filename='steam.png', content=desc)
Example #5
Source File: test_examples.py From dataflows with MIT License | 6 votes |
def country_population():
    """Yield ``{'name', 'population'}`` dicts scraped from the
    Wikipedia list of countries by population."""
    from lxml import etree
    from urllib.request import urlopen
    page = urlopen('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population').read()
    tree = etree.fromstring(page, parser=etree.HTMLParser())
    for table in tree.findall('.//table'):
        # Only the data tables carry the "wikitable" class.
        if 'wikitable' not in table.attrib.get('class', ''):
            continue
        for row in table.findall('.//tr'):
            cells = row.findall('td')
            # Rows with <= 3 cells are headers/spacers — skip them.
            if len(cells) > 3:
                yield dict(
                    name=cells[1].find('.//a').attrib.get('title'),
                    population=cells[2].text,
                )
Example #6
Source File: tweets_scrape.py From tweet_scrapper with GNU General Public License v3.0 | 6 votes |
def __init__(self, twitter_request_url, twitter_request_header,
             twitter_request_params=None, twitter_request_proxies=None,
             scrape_pages=2, twitter_file_path=None,
             twitter_file_format='csv'):
    """Capture the request settings and persistence options for a
    tweet-scraping run.

    twitter_request_url: endpoint to fetch tweets from.
    twitter_request_header: HTTP headers; only stored when not None.
    twitter_request_params / twitter_request_proxies: passed through
        to the request layer as-is (may be None).
    scrape_pages: number of result pages to walk.
    twitter_file_path / twitter_file_format: where and how scraped
        tweets are persisted ('csv' by default).
    """
    self.__twitter_request_url__ = twitter_request_url
    # Keep the class-level default header unless one was supplied.
    if twitter_request_header is not None:
        self.__twitter_request_header__ = twitter_request_header
    self.__twitter_request_params__ = twitter_request_params
    self.__twitter_request_proxies__ = twitter_request_proxies
    self.scrape_pages = scrape_pages
    self.__twitter_tweet_persist_file_path__ = twitter_file_path
    self.__twitter_tweet_persist_file_format__ = twitter_file_format
    # Precompile the hashtag regex; the pattern is a class attribute
    # (note: name is spelled "hastag" elsewhere in the class).
    self.hashtag_capture = re.compile(self._tweet_hastag_pattern_)
    # Strip blank text nodes and comments so the parsed tree stays lean.
    self.html_parser = etree.HTMLParser(remove_blank_text=True,
                                        remove_comments=True)
    self.proxy_json = None
Example #7
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getNum(a):
    """Extract the ID code ("识别码") field from an avsox detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Stringify the xpath hit list and trim the list-repr punctuation.
    hits = tree.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')
    return str(hits).strip(" ['']")
Example #8
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRelease(a):
    """Return the release date (YYYY-MM-DD) from an xcity detail page,
    or '' when the field is absent or unparseable."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[4]/text()')[0]
    except IndexError:
        # FIX: was a bare `except:` — catch only the missing-node case
        # so KeyboardInterrupt/SystemExit still propagate.
        return ''
    try:
        # FIX: raw string for the regex (avoids invalid escape sequences
        # in a plain string literal); dates appear as YYYY/MM/DD.
        return re.findall(r'\d{4}/\d{2}/\d{2}', result)[0].replace('/', '-')
    except IndexError:
        return ''
Example #9
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRelease(a):
    """Extract the release date field ("发行时间:") from an avsox page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//span[contains(text(),"发行时间:")]/../text()')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(hits).strip(" ['']")
Example #10
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover(htmlcode):
    """Return the large cover image URL from an avsox detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(src).strip(" ['']")
Example #11
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover_small(htmlcode):
    """Return the small (waterfall grid) cover URL from an avsox page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(src).strip(" ['']")
Example #12
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRuntime(a):
    """Return the runtime (first run of digits) from an xcity detail
    page, or '' when the field is absent or has no digits."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
    except IndexError:
        # FIX: was a bare `except:` — catch only the missing-node case
        # so KeyboardInterrupt/SystemExit still propagate.
        return ''
    try:
        # FIX: raw string for the regex (avoids invalid escape sequences).
        return re.findall(r'\d+', result1)[0]
    except IndexError:
        return ''
Example #13
Source File: mgstage.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getSeries(a):
    """Extract the series ("シリーズ") field from an mgstage page,
    combining linked and plain cell text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Each xpath result is stringified, then the list-repr punctuation
    # and literal backslash-n sequences from the repr are trimmed.
    linked = str(tree.xpath('//th[contains(text(),"シリーズ")]/../td/a/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    plain = str(tree.xpath('//th[contains(text(),"シリーズ")]/../td/text()')).strip(" ['']").strip('\\n ').strip('\\n')
    return str(linked + plain).strip('+').replace("', '", '').replace('"', '')
Example #14
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getTitle(a):
    """Return the program title text from an xcity detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Raises IndexError when the title node is missing — callers rely
    # on the first match being present.
    return tree.xpath('//*[@id="program_detail_title"]/text()')[0]
Example #15
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getActor(a):
    """Return the first actor name from an xcity detail page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    # Fixed positional path into the details list; raises IndexError
    # when the node is absent.
    return tree.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[3]/a/text()')[0]
Example #16
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getStudio(a):
    """Return the studio name from an xcity page, falling back to the
    "片商"-label layout when the fixed positional path fails."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
    except Exception:
        # FIX: was a bare `except:`, which would also trap
        # KeyboardInterrupt/SystemExit.
        result = str(html.xpath('//strong[contains(text(),"片商")]/../following-sibling::span/a/text()')).strip(" ['']")
    return result.strip('+').replace("', '", '').replace('"', '')
Example #17
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getStudio(a):
    """Extract the studio ("片商") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #18
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getLabel(a):
    """Extract the series/label ("系列:") field from an avsox page."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')
    # Stringify the hit list and trim the list-repr punctuation.
    return str(hits).strip(" ['']")
Example #19
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getStudio(a):
    """Extract the maker ("制作商:") field from an avsox page, joining
    multiple names with spaces."""
    tree = etree.fromstring(a, etree.HTMLParser())
    hits = tree.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')
    # Trim the list-repr punctuation, then turn element separators
    # into single spaces.
    return str(hits).strip(" ['']").replace("', '", ' ')
Example #20
Source File: avsox.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getTitle(a):
    """Return the page title from an avsox page with '/' removed,
    or '' on any parse failure."""
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except Exception:
        # FIX: was a bare `except:` — narrowed so KeyboardInterrupt/
        # SystemExit still propagate; parse failures still yield ''.
        return ''
Example #21
Source File: ADC_function.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getXpathSingle(htmlcode, xpath):
    """Evaluate *xpath* against *htmlcode* and return the hit list as
    a string with the list-repr punctuation trimmed."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    return str(tree.xpath(xpath)).strip(" ['']")
Example #22
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getSeries(a):
    """Extract the series ("系列") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #23
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getDirector(a):
    """Extract the director ("導演") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #24
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover(htmlcode):
    """Return the video cover image URL from a javdb detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")
    # Stringify the hit list and trim the list-repr punctuation.
    return str(src).strip(" ['']")
Example #25
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover_small(a, index=0):
    """Return the *index*-th small cover URL from a javdb result page.

    javdb sometimes returns multiple results, so the caller picks the
    entry with the correct index rather than always taking the first.
    Raises IndexError when *index* is out of range.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    # FIX: `not 'https' in result` -> `'https' not in result` (PEP 8
    # idiom, same truth table). Protocol-relative URLs ("//...") get an
    # explicit https scheme prepended.
    if 'https' not in result:
        result = 'https:' + result
    return result
Example #26
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getTag(a):
    """Return the list of category ("類別") tags from a javdb page,
    preferring linked tag text and falling back to plain span text."""
    html = etree.fromstring(a, etree.HTMLParser())
    try:
        result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
        return result
    except Exception:
        # FIX: was a bare `except:`, which would also trap
        # KeyboardInterrupt/SystemExit.
        result = html.xpath('//strong[contains(text(),"類別")]/../span/text()')
        return result
Example #27
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRelease(a):
    """Extract the release date ("時間") field from a javdb detail
    page, combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時間")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時間")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+')
Example #28
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getLabel(a):
    """Extract the series/label ("系列") field from a javdb detail
    page, combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
    return str(plain + linked).strip('+').replace("', '", '').replace('"', '')
Example #29
Source File: javdb.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getRuntime(a):
    """Extract the runtime ("時長") field from a javdb detail page,
    combining plain and linked span text."""
    tree = etree.fromstring(a, etree.HTMLParser())
    plain = str(tree.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
    linked = str(tree.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
    # rstrip('mi') drops trailing 'm'/'i' characters (e.g. "120 mi" of
    # "minute(s)") from the combined value.
    return str(plain + linked).strip('+').rstrip('mi')
Example #30
Source File: xcity.py From AV_Data_Capture with GNU General Public License v3.0 | 5 votes |
def getCover_small(a, index=0):
    """Return the *index*-th small cover URL from a result page.

    The site sometimes returns multiple results, so the caller picks
    the entry with the correct index rather than always taking the
    first. Raises IndexError when *index* is out of range.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
    # FIX: `not 'https' in result` -> `'https' not in result` (PEP 8
    # idiom, same truth table). Protocol-relative URLs ("//...") get an
    # explicit https scheme prepended.
    if 'https' not in result:
        result = 'https:' + result
    return result