Python lxml.etree.HTML Examples
The following are 16 code examples of lxml.etree.HTML(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
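Before the examples, here is a minimal orientation sketch (not taken from any of the projects below): lxml.etree.HTML() parses a string or bytes of HTML into an element tree that can be queried with XPath, and it does so tolerantly, even when the markup is invalid. The markup and XPath expressions here are illustrative only.

from lxml import etree

# etree.HTML() parses (possibly broken) HTML into an element tree.
markup = "<html><body><h1>Title</h1><p class='intro'>Hello"
root = etree.HTML(markup)  # the missing closing tags are recovered automatically

# Query the tree with XPath.
print(root.xpath('//h1/text()'))                 # ['Title']
print(root.xpath("//p[@class='intro']/text()"))  # ['Hello']

# Serialize the repaired document back out.
print(etree.tostring(root))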
Example #1
Source File: diagnose.py From pledgeservice with Apache License 2.0

def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    # Requires: import time, traceback; from bs4 import BeautifulSoup, __version__
    # rdoc() is defined in the same file (see Example #4).
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))
Example #2
Source File: html_parser.py From amazon-review-spider with MIT License

def get_reviews_info(self, content):
    content = str(content)
    content = content.replace("<br>", "")
    content = content.replace("<br />", "")
    html = etree.HTML(content)
    star_list = html.xpath('//a/i[@data-hook="review-star-rating"]/span[@class="a-icon-alt"]/text()')
    title_list = html.xpath('//div[@class="a-row"]/a[@data-hook="review-title"]/text()')
    review_body_list = html.xpath('//div[@class="a-row review-data"]/span[@data-hook="review-body"]/text()')
    all_review_list = []
    for index in range(len(star_list)):
        star_num = star_list[index][:1]
        if int(star_num) < 4:
            continue
        all_review_list.append(
            {"star": star_num,
             "title": title_list[index],
             "body": review_body_list[index],
             "trans": self.trans.transEn2Zh(review_body_list[index])})
    return all_review_list
Example #3
Source File: parser.py From scraper-fourone-jobs with GNU General Public License v2.0

def parse(self, html: bytes) -> ApplyContactPerson:
    # Extract the information via XPath
    tree = etree.HTML(html)
    contact_name = str(tree.xpath(Config.CONTACT_PERSON_XPATH)[0])
    # The fields below carry the "txticon" style and are obfuscated
    raw_email = str(tree.xpath(Config.EMAIL_XPATH)[0])
    raw_telphone = str(tree.xpath(Config.TELPHONE_XPATH)[0])
    raw_mobile = str(tree.xpath(Config.MOBILE_PHONE_XPATH)[0])
    # Get the URL of the CSS file that defines the custom font
    custom_font_path = self._find_custom_font(html)
    woff: WOFFContent = WebOpenFontReader.read(custom_font_path)
    # Decode the obfuscated content
    decoder = FontTextDecoder(woff, Config.FONT_GLYPHID_TRANSLATOR)
    email = decoder.decode(raw_email)
    telphone = decoder.decode(raw_telphone)
    mobile = decoder.decode(raw_mobile)
    return ApplyContactPerson(contact_name, email, telphone, mobile)
Example #4
Source File: diagnose.py From ServerlessCrawler-VancouverRealState with MIT License

def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    # Requires: import random; rsentence() is defined in the same file.
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>"
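rdoc() never calls lxml.etree.HTML() itself, but the invalid documents it generates make a handy stress test for lxml's recovering HTML parser. A minimal self-contained sketch, with a simplified stand-in for rsentence() (which is defined elsewhere in diagnose.py):

import random
from lxml import etree

def rdoc_lite(num_elements=1000):
    """Simplified rdoc(): emit randomly opened/closed tags with filler text."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            elements.append("some filler text")  # stand-in for rsentence()
        elif choice == 2:
            elements.append("</%s>" % random.choice(tag_names))
    return "<html>" + "\n".join(elements) + "</html>"

# etree.HTML() recovers from the mismatched tags instead of raising.
root = etree.HTML(rdoc_lite())
print(len(root.xpath('//*')))  # number of elements lxml salvaged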
Example #5
Source File: crawl.py From Vxscan with Apache License 2.0

def jsparse(self, r):
    # Requires: import re, logging; self.host and self.js are set on the crawler.
    try:
        html = etree.HTML(r.text)
        result = html.xpath('//script/@src')
        for i in result:
            if not re.search(
                    r'jquery|bootstrap|adsbygoogle|angular|javascript|#|vue|react|51.la/=|map\.baidu\.com|canvas|cnzz\.com|slick\.js|autofill-event\.js|tld\.js|clipboard|Chart\.js',
                    i):
                if '://' not in i:
                    i = re.sub(r'^/|^\.\./', '', i)
                    i = self.host + '/' + i
                self.js.append(i)
    except (AttributeError, ValueError):
        pass
    except Exception as e:
        logging.exception(e)
Example #6
Source File: district.py From BeikeSpider with Apache License 2.0

def get_districts(city):
    """
    Fetch the Chinese/English district name mapping for a city.
    :param city: city name
    :return: list of English district names
    """
    # Requires: import requests; create_headers() and chinese_city_district_dict
    # are defined elsewhere in the project.
    url = 'https://{0}.ke.com/xiaoqu/'.format(city)
    headers = create_headers()
    response = requests.get(url, timeout=10, headers=headers)
    html = response.content
    root = etree.HTML(html)
    elements = root.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div/a")
    en_names = list()
    ch_names = list()
    for element in elements:
        link = element.attrib['href']
        en_names.append(link.split('/')[-2])
        ch_names.append(element.text)
    # Print the English/Chinese district name pairs
    for index, name in enumerate(en_names):
        chinese_city_district_dict[name] = ch_names[index]
        print(name + ' -> ' + ch_names[index])
    return en_names
Example #7
Source File: fetch.py From twstock with MIT License

def fetch_data(url):
    # Requires: import requests; get_proxies() and make_row_tuple() are defined in the same module.
    r = requests.get(url, proxies=get_proxies())
    root = etree.HTML(r.text)
    trs = root.xpath('//tr')[1:]
    result = []
    typ = ''
    for tr in trs:
        tr = list(map(lambda x: x.text, tr.iter()))
        if len(tr) == 4:
            # This is a type row
            typ = tr[2].strip(' ')
        else:
            # This is a data row
            result.append(make_row_tuple(typ, tr))
    return result
Example #8
Source File: fstoppers.py From daily-wallpaper with MIT License

def resolve_url(self):
    url = URL.format('/potd')
    try:
        r = requests.get(url)
        if r.status_code == 200:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            for element in html.iter('img'):
                if 'href' in element.getparent().attrib:
                    url = URL.format(element.getparent().attrib['href'])
                    break
            if url is not None:
                r = requests.get(url)
                if r.status_code == 200:
                    html = etree.HTML(r.content, parser)
                    for element in html.iter('div'):
                        if 'class' in element.attrib and \
                                element.attrib['class'] == 'photo':
                            if 'data-xlarge' in element.attrib:
                                self._url = element.attrib['data-xlarge']
                                return True
    except Exception:
        pass
    return False
Example #9
Source File: nasa.py From daily-wallpaper with MIT License

def resolve_url(self):
    url = URL
    try:
        r = requests.get(url)
        if r.status_code == 200:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            images = html.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    image_url = images[0].getparent().attrib['href']
                    self._url = 'https://apod.nasa.gov/' + image_url
                    return True
    except Exception:
        pass
    return False
Example #10
Source File: data_spider.py From dialogbot with Apache License 2.0

def symptom_spider(self, url):
    """Parse the symptom information."""
    html = self.get_html(url)
    selector = etree.HTML(html)
    symptoms = selector.xpath('//a[@class="gre" ]/text()')
    ps = selector.xpath('//p')
    detail = []
    for p in ps:
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        detail.append(info)
    symptoms_data = {}
    symptoms_data['symptoms'] = symptoms
    symptoms_data['symptoms_detail'] = detail
    return symptoms, detail
Example #11
Source File: data_spider.py From dialogbot with Apache License 2.0

def common_spider(self, url):
    """Generic parsing module."""
    html = self.get_html(url)
    selector = etree.HTML(html)
    ps = selector.xpath('//p')
    infobox = []
    for p in ps:
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        if info:
            infobox.append(info)
    return '\n'.join(infobox)
Example #12
Source File: get_title.py From Vxscan with Apache License 2.0

def get_title(url):
    # Requires: import chardet; req is the module's requests session.
    code = 0
    try:
        r = req.get(url)
        code = r.status_code
        coding = chardet.detect(r.content).get('encoding')
        text = r.content[:10000].decode(coding)
        html = etree.HTML(text)
        title = html.xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
        else:
            return url + ' | Status_code: ' + str(code)
    except Exception:
        pass
    return url + ' | Status_code: ' + str(code)
Example #13
Source File: response.py From pledgeservice with Apache License 2.0

def showbrowser(self):
    """
    Show this response in a browser window (for debugging purposes,
    when it's hard to read the HTML).
    """
    import webbrowser
    import tempfile
    f = tempfile.NamedTemporaryFile(prefix='webtest-page', suffix='.html')
    name = f.name
    f.close()
    f = open(name, 'w')
    if PY3:
        f.write(self.body.decode(self.charset or 'ascii', 'replace'))
    else:
        f.write(self.body)
    f.close()
    if name[0] != '/':  # pragma: no cover
        # windows ...
        url = 'file:///' + name
    else:
        url = 'file://' + name
    webbrowser.open_new(url)
Example #14
Source File: doub.spider.py From ParrotSecCN_Community_QQbot with GNU General Public License v2.0

def getNeedInfo(sourceHtml):
    """Get the SS/SSR request URLs."""
    selector = etree.HTML(sourceHtml)
    lists = []
    for i in range(5, 9):
        ca_1 = selector.xpath(
            '/html/body/section/div[3]/div/div[1]/table/tbody/tr[' + str(i) + ']/td/a/@href')
        for j in ca_1:
            print(j)
            lists.append(j)
    return lists
    # lists = [j for j in selector.xpath('/html/body/section/div[3]/div/div[1]/table/tbody/tr[' + str(i) + ']/td/a/@href')]
Example #15
Source File: newrelic.py From changelogs with MIT License

def get_content(session, urls):
    log = ""
    for url in urls:
        r = session.get(url)
        if r.status_code == 200:
            root = etree.HTML(r.content)
            try:
                article = root.xpath("//article/div[@class='content']")[0]
                content = etree.tostring(article, method="text", encoding='utf-8')
                if sys.version_info > (3, 0):
                    content = content.decode("utf-8")
                # remove first two lines
                content = '\n'.join(content.split('\n')[2:-1])
                log += "# {version}\n{content}\n\n".format(
                    version=url.split("-")[-1],
                    content=content,
                )
            except IndexError:
                pass
    return log
Example #16
Source File: spider.py From job-web-demo with MIT License

def _parse_company_detail(self, detail_url):
    # Requires: import time; from html import unescape
    resp = self._request('get', detail_url)
    resp.encoding = resp.apparent_encoding
    html = etree.HTML(resp.text)
    name = html.xpath('//div[@class="company_main"]/h1/a/text()')
    # Better to check this first, to avoid an exception when nothing was extracted
    if not name:
        self.logger.debug('Got an error page')
        time.sleep(30)
        return self._parse_company_detail(detail_url)
    # The returned dict must contain these keys, otherwise writing the record fails
    supply = {
        'details': unescape(str(etree.tostring(html.xpath(
            '//span[@class="company_content"]')[0]), encoding='utf8')).replace(
            '<span class="company_content">', '').replace('\n', '').replace('\xa0', ''),
        'website': html.xpath('//div[@class="company_main"]/a[1]/@href')[0].split('?')[0],
    }
    return supply