Python lxml.etree.HTML Examples
The following are 16 code examples of lxml.etree.HTML(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.etree, or try the search function.
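Before the examples, here is a minimal orientation sketch (not taken from any of the projects below): lxml.etree.HTML() parses a string or bytes of HTML into an element tree that can be queried with XPath, and it does so tolerantly, even when the markup is invalid. The markup and XPath expressions here are illustrative only.

from lxml import etree

# etree.HTML() parses (possibly broken) HTML into an element tree.
markup = "<html><body><h1>Title</h1><p class='intro'>Hello"
root = etree.HTML(markup)  # the missing closing tags are recovered automatically

# Query the tree with XPath.
print(root.xpath('//h1/text()'))                 # ['Title']
print(root.xpath("//p[@class='intro']/text()"))  # ['Hello']

# Serialize the repaired document back out.
print(etree.tostring(root))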
Example #1
Source File: diagnose.py From pledgeservice with Apache License 2.0

def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    # Requires: import time, traceback; from bs4 import BeautifulSoup, __version__
    # rdoc() is defined in the same file (see Example #4).
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))
Example #2
Source File: html_parser.py From amazon-review-spider with MIT License

def get_reviews_info(self, content):
    content = str(content)
    content = content.replace("<br>", "")
    content = content.replace("<br />", "")
    html = etree.HTML(content)
    star_list = html.xpath('//a/i[@data-hook="review-star-rating"]/span[@class="a-icon-alt"]/text()')
    title_list = html.xpath('//div[@class="a-row"]/a[@data-hook="review-title"]/text()')
    review_body_list = html.xpath('//div[@class="a-row review-data"]/span[@data-hook="review-body"]/text()')
    all_review_list = []
    for index in range(len(star_list)):
        star_num = star_list[index][:1]
        if int(star_num) < 4:
            continue
        all_review_list.append(
            {"star": star_num,
             "title": title_list[index],
             "body": review_body_list[index],
             "trans": self.trans.transEn2Zh(review_body_list[index])})
    return all_review_list
Example #3
Source File: parser.py From scraper-fourone-jobs with GNU General Public License v2.0

def parse(self, html: bytes) -> ApplyContactPerson:
    # Extract the information via XPath
    tree = etree.HTML(html)
    contact_name = str(tree.xpath(Config.CONTACT_PERSON_XPATH)[0])
    # The fields below carry the "txticon" style and are obfuscated
    raw_email = str(tree.xpath(Config.EMAIL_XPATH)[0])
    raw_telphone = str(tree.xpath(Config.TELPHONE_XPATH)[0])
    raw_mobile = str(tree.xpath(Config.MOBILE_PHONE_XPATH)[0])
    # Get the URL of the CSS file that defines the custom font
    custom_font_path = self._find_custom_font(html)
    woff: WOFFContent = WebOpenFontReader.read(custom_font_path)
    # Decode the obfuscated content
    decoder = FontTextDecoder(woff, Config.FONT_GLYPHID_TRANSLATOR)
    email = decoder.decode(raw_email)
    telphone = decoder.decode(raw_telphone)
    mobile = decoder.decode(raw_mobile)
    return ApplyContactPerson(contact_name, email, telphone, mobile)
Example #4
Source File: diagnose.py From ServerlessCrawler-VancouverRealState with MIT License

def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document."""
    # Requires: import random; rsentence() is defined in the same file.
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag.
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
    return "<html>" + "\n".join(elements) + "</html>"
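rdoc() never calls lxml.etree.HTML() itself, but the invalid documents it generates make a handy stress test for lxml's recovering HTML parser. A minimal self-contained sketch, with a simplified stand-in for rsentence() (which is defined elsewhere in diagnose.py):

import random
from lxml import etree

def rdoc_lite(num_elements=1000):
    """Simplified rdoc(): emit randomly opened/closed tags with filler text."""
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for _ in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            elements.append("<%s>" % random.choice(tag_names))
        elif choice == 1:
            elements.append("some filler text")  # stand-in for rsentence()
        elif choice == 2:
            elements.append("</%s>" % random.choice(tag_names))
    return "<html>" + "\n".join(elements) + "</html>"

# etree.HTML() recovers from the mismatched tags instead of raising.
root = etree.HTML(rdoc_lite())
print(len(root.xpath('//*')))  # number of elements lxml salvaged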
Example #5
Source File: crawl.py From Vxscan with Apache License 2.0

def jsparse(self, r):
    # Requires: import re, logging; self.host and self.js are set on the crawler.
    try:
        html = etree.HTML(r.text)
        result = html.xpath('//script/@src')
        for i in result:
            if not re.search(
                    r'jquery|bootstrap|adsbygoogle|angular|javascript|#|vue|react|51.la/=|map\.baidu\.com|canvas|cnzz\.com|slick\.js|autofill-event\.js|tld\.js|clipboard|Chart\.js',
                    i):
                if '://' not in i:
                    i = re.sub(r'^/|^\.\./', '', i)
                    i = self.host + '/' + i
                self.js.append(i)
    except (AttributeError, ValueError):
        pass
    except Exception as e:
        logging.exception(e)
Example #6
Source File: district.py From BeikeSpider with Apache License 2.0

def get_districts(city):
    """
    Fetch the Chinese/English district name mapping for a city.
    :param city: city name
    :return: list of English district names
    """
    # Requires: import requests; create_headers() and chinese_city_district_dict
    # are defined elsewhere in the project.
    url = 'https://{0}.ke.com/xiaoqu/'.format(city)
    headers = create_headers()
    response = requests.get(url, timeout=10, headers=headers)
    html = response.content
    root = etree.HTML(html)
    elements = root.xpath("/html/body/div[3]/div[1]/dl[2]/dd/div/div/a")
    en_names = list()
    ch_names = list()
    for element in elements:
        link = element.attrib['href']
        en_names.append(link.split('/')[-2])
        ch_names.append(element.text)
    # Print the English/Chinese district name pairs
    for index, name in enumerate(en_names):
        chinese_city_district_dict[name] = ch_names[index]
        print(name + ' -> ' + ch_names[index])
    return en_names
Example #7
Source File: fetch.py From twstock with MIT License

def fetch_data(url):
    # Requires: import requests; get_proxies() and make_row_tuple() are defined in the same module.
    r = requests.get(url, proxies=get_proxies())
    root = etree.HTML(r.text)
    trs = root.xpath('//tr')[1:]
    result = []
    typ = ''
    for tr in trs:
        tr = list(map(lambda x: x.text, tr.iter()))
        if len(tr) == 4:
            # This is a type row
            typ = tr[2].strip(' ')
        else:
            # This is a data row
            result.append(make_row_tuple(typ, tr))
    return result
Example #8
Source File: fstoppers.py From daily-wallpaper with MIT License

def resolve_url(self):
    url = URL.format('/potd')
    try:
        r = requests.get(url)
        if r.status_code == 200:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            for element in html.iter('img'):
                if 'href' in element.getparent().attrib:
                    url = URL.format(element.getparent().attrib['href'])
                    break
            if url is not None:
                r = requests.get(url)
                if r.status_code == 200:
                    html = etree.HTML(r.content, parser)
                    for element in html.iter('div'):
                        if 'class' in element.attrib and \
                                element.attrib['class'] == 'photo':
                            if 'data-xlarge' in element.attrib:
                                self._url = element.attrib['data-xlarge']
                                return True
    except Exception:
        pass
    return False
Example #9
Source File: nasa.py From daily-wallpaper with MIT License

def resolve_url(self):
    url = URL
    try:
        r = requests.get(url)
        if r.status_code == 200:
            parser = etree.HTMLParser(recover=True)
            html = etree.HTML(r.content, parser)
            images = html.iter('img')
            if images is not None:
                images = list(images)
                if len(images) > 0:
                    image_url = images[0].getparent().attrib['href']
                    self._url = 'https://apod.nasa.gov/' + image_url
                    return True
    except Exception:
        pass
    return False
Example #10
Source File: data_spider.py From dialogbot with Apache License 2.0

def symptom_spider(self, url):
    """Parse the symptom information."""
    html = self.get_html(url)
    selector = etree.HTML(html)
    symptoms = selector.xpath('//a[@class="gre" ]/text()')
    ps = selector.xpath('//p')
    detail = []
    for p in ps:
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        detail.append(info)
    symptoms_data = {}
    symptoms_data['symptoms'] = symptoms
    symptoms_data['symptoms_detail'] = detail
    return symptoms, detail
Example #11
Source File: data_spider.py From dialogbot with Apache License 2.0

def common_spider(self, url):
    """Generic parsing module."""
    html = self.get_html(url)
    selector = etree.HTML(html)
    ps = selector.xpath('//p')
    infobox = []
    for p in ps:
        info = p.xpath('string(.)') \
            .replace('\r', '') \
            .replace('\n', '') \
            .replace('\xa0', '') \
            .replace(' ', '') \
            .replace('\t', '')
        if info:
            infobox.append(info)
    return '\n'.join(infobox)
Example #12
Source File: get_title.py From Vxscan with Apache License 2.0

def get_title(url):
    # Requires: import chardet; req is the module's requests session.
    code = 0
    try:
        r = req.get(url)
        code = r.status_code
        coding = chardet.detect(r.content).get('encoding')
        text = r.content[:10000].decode(coding)
        html = etree.HTML(text)
        title = html.xpath('//title/text()')
        if title:
            return url + ' | ' + title[0]
        else:
            return url + ' | Status_code: ' + str(code)
    except Exception:
        pass
    return url + ' | Status_code: ' + str(code)
Example #13
Source File: response.py From pledgeservice with Apache License 2.0

def showbrowser(self):
    """
    Show this response in a browser window (for debugging purposes,
    when it's hard to read the HTML).
    """
    import webbrowser
    import tempfile
    f = tempfile.NamedTemporaryFile(prefix='webtest-page', suffix='.html')
    name = f.name
    f.close()
    f = open(name, 'w')
    if PY3:
        f.write(self.body.decode(self.charset or 'ascii', 'replace'))
    else:
        f.write(self.body)
    f.close()
    if name[0] != '/':  # pragma: no cover
        # windows ...
        url = 'file:///' + name
    else:
        url = 'file://' + name
    webbrowser.open_new(url)
Example #14
Source File: doub.spider.py From ParrotSecCN_Community_QQbot with GNU General Public License v2.0

def getNeedInfo(sourceHtml):
    """Get the SS/SSR request URLs."""
    selector = etree.HTML(sourceHtml)
    lists = []
    for i in range(5, 9):
        ca_1 = selector.xpath(
            '/html/body/section/div[3]/div/div[1]/table/tbody/tr[' + str(i) + ']/td/a/@href')
        for j in ca_1:
            print(j)
            lists.append(j)
    return lists
    # lists = [j for j in selector.xpath('/html/body/section/div[3]/div/div[1]/table/tbody/tr[' + str(i) + ']/td/a/@href')]
Example #15
Source File: newrelic.py From changelogs with MIT License

def get_content(session, urls):
    log = ""
    for url in urls:
        r = session.get(url)
        if r.status_code == 200:
            root = etree.HTML(r.content)
            try:
                article = root.xpath("//article/div[@class='content']")[0]
                content = etree.tostring(article, method="text", encoding='utf-8')
                if sys.version_info > (3, 0):
                    content = content.decode("utf-8")
                # remove first two lines
                content = '\n'.join(content.split('\n')[2:-1])
                log += "# {version}\n{content}\n\n".format(
                    version=url.split("-")[-1],
                    content=content,
                )
            except IndexError:
                pass
    return log
Example #16
Source File: spider.py From job-web-demo with MIT License

def _parse_company_detail(self, detail_url):
    # Requires: import time; from html import unescape
    resp = self._request('get', detail_url)
    resp.encoding = resp.apparent_encoding
    html = etree.HTML(resp.text)
    name = html.xpath('//div[@class="company_main"]/h1/a/text()')
    # Better to check this first, to avoid an exception when nothing was extracted
    if not name:
        self.logger.debug('Got an error page')
        time.sleep(30)
        return self._parse_company_detail(detail_url)
    # The returned dict must contain these keys, otherwise writing the record fails
    supply = {
        'details': unescape(str(etree.tostring(html.xpath(
            '//span[@class="company_content"]')[0]), encoding='utf8')).replace(
            '<span class="company_content">', '').replace('\n', '').replace('\xa0', ''),
        'website': html.xpath('//div[@class="company_main"]/a[1]/@href')[0].split('?')[0],
    }
    return supply