Python lxml.html.document_fromstring() Examples
The following are 16
code examples of lxml.html.document_fromstring().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module lxml.html, or try the search function.
Example #1
Source File: parsers.py From crestify with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, file_name, user_id):
    """Load an Instapaper export HTML file and prepare parsing state.

    :param file_name: path to the Instapaper HTML export to parse
    :param user_id: id of the user the bookmarks will belong to
    """
    with open(file_name, 'r') as self.opened_file:
        # So Instapaper doesn't close <li> tags
        # This was causing infinite recursion when using BS directly
        # Hence why the stuff below is being done, so that the <li> tags get closed
        self.html = html.document_fromstring(self.opened_file.read())
        self.html = html.tostring(self.html)
    # Parse the now well-formed HTML with BeautifulSoup.
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    # Map of main_url -> existing Bookmark, used to skip duplicates on import.
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user, Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    # URL validator: scheme://(domain | IPv4 | IPv6)(:port)?(/path)?
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #2
Source File: importer.py From python-ooxml with GNU Affero General Public License v3.0 | 6 votes |
def parse_html_string(s):
    """Parse an HTML string with an explicit UTF-8 parser.

    :param s: raw HTML markup
    :return: the parsed lxml document tree
    """
    from lxml import html

    utf8_parser = html.HTMLParser(encoding='utf-8')
    return html.document_fromstring(s, parser=utf8_parser)
Example #3
Source File: server.py From autologin with Apache License 2.0 | 6 votes |
def download_page(url, cookie_jar):
    """
    Request page using authenticated cookies (cookiejar).
    Download html source and save in browser directory, to be used by
    in show_in_browser().

    :return: ``(error, None)`` on request failure, ``(None, filename)`` on success.
    """
    target_dir = os.path.join(server_path, 'static/browser')
    # Keep only the latest snapshot: clear previous downloads first.
    delete_directory_files(target_dir)
    page_name = '{}.html'.format(uuid.uuid4())
    destination = os.path.join(target_dir, page_name)
    try:
        response = cookie_request(url, cookie_jar)
    except requests.RequestException as exc:
        return exc, None
    # Round-trip through lxml to normalize the markup before saving.
    parsed = html.document_fromstring(response.text)
    with open(destination, 'wb') as out:
        out.write(html.tostring(parsed))
    return None, page_name
Example #4
Source File: scraping.py From text-mining-class with MIT License | 5 votes |
def __init__(self, html_content, encoding="utf-8"):
    """Parse *html_content* (str or bytes) and strip all <style> elements.

    :param html_content: HTML markup, either text or encoded bytes
    :param encoding: codec used to decode byte input (default UTF-8)
    """
    if isinstance(html_content, bytes):
        html_content = html_content.decode(encoding)
    self.document = document_fromstring(html_content)
    # CSS is noise for text mining; drop it up front.
    strip_elements(self.document, "style")
Example #5
Source File: scraping.py From text-mining-class with MIT License | 5 votes |
def __init__(self, html_content, encoding="utf-8"):
    """Build the document tree from *html_content*, decoding bytes first.

    :param html_content: HTML source as text or encoded bytes
    :param encoding: codec for decoding byte input (default UTF-8)
    """
    text = html_content.decode(encoding) if isinstance(html_content, bytes) else html_content
    self.document = document_fromstring(text)
    # Remove embedded stylesheets before any text extraction.
    strip_elements(self.document, "style")
Example #6
Source File: googlerbot.py From SnapchatBot with MIT License | 5 votes |
def reverse_image_search(url):
    """Run a Google reverse image search for *url* and return the URL of the
    first matching image result.

    :param url: publicly reachable image URL to search by
    :return: the ``imgurl`` query-parameter value of the first ``.bia`` result
    :raises IndexError: if the results page contains no ``.bia`` element
    """
    value = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
    # BUG FIX: the header key was 'User-Bot', which is not a real HTTP header;
    # the browser identification header is 'User-Agent'. Without it Google
    # serves a page without the expected '.bia' elements.
    headers = {'User-Agent': value}
    search_url = 'https://www.google.com/searchbyimage?image_url=%s' % url
    resp = requests.get(search_url, headers=headers)
    root = document_fromstring(resp.content)
    # First visually-similar-image link on the results page.
    href = root.cssselect(".bia")[0].attrib['href']
    print(search_url)
    new_url = "https://www.google.com" + href
    resp = requests.get(new_url, headers=headers)
    # BUG FIX: resp.content is bytes on Python 3, which makes re.search with a
    # str pattern raise TypeError; use the decoded text instead.
    return re.search("imgurl=([^&]*)", resp.text).group(1)
Example #7
Source File: cnae.py From socios-brasil with GNU Lesser General Public License v3.0 | 5 votes |
def parse_items(self, response, root_name=None):
    """Recursively get data/make requests for all parser hierarchical levels"""
    # Accumulated data from parent levels, carried through request meta.
    data = response.request.meta.get("data", {})
    root_name = root_name or response.request.meta["root_name"]
    # Per-level parsing configuration (xpaths and the next level's name).
    metadata = self.parsers[root_name]
    xpath_id = metadata["xpath_id"]
    xpath_description = metadata["xpath_description"]
    xpath_url = metadata["xpath_url"]
    item_name = metadata["next"]
    for item in response.xpath(metadata["xpath_items"]):
        # Re-parse the item fragment with lxml to run the level's xpaths.
        tree = document_fromstring(item.extract())
        url = urljoin("https://cnae.ibge.gov.br/", tree.xpath(xpath_url)[0])
        item_id = get_text(tree.xpath(xpath_id))
        item_description = get_text(tree.xpath(xpath_description))
        item_data = {}
        # If the id length matches the expected next level (or we are at the
        # last level, "subclasse"), descend normally; otherwise a level was
        # skipped in the page and its data must be read from the highlight span.
        if item_name == "subclasse" or len(item_id) == self.parsers[item_name]["id_length"]:
            next_root_name = item_name
        else:
            descricao = response.xpath("//span[@class = 'destaque']//text()").extract()[0]
            item_data[f"id_{item_name}"] = descricao.split()[0]
            item_data[f"descricao_{item_name}"] = descricao
            next_root_name = self.parsers[item_name]["next"]
        item_data.update({
            f"id_{next_root_name}": item_id.strip(),
            f"descricao_{next_root_name}": item_description.strip(),
        })
        # Merge in everything inherited from the parent levels.
        item_data.update(data)
        # The last level has its own terminal callback.
        callback = self.parse_items if next_root_name != "subclasse" else self.parse_subclasse
        yield scrapy.Request(
            url=url,
            meta={"data": item_data, "root_name": next_root_name},
            callback=callback,
        )
Example #8
Source File: cnae.py From socios-brasil with GNU Lesser General Public License v3.0 | 5 votes |
def parse_subclasse(self, response):
    """Yield the subclass item (last mile of the recursive strategy)"""
    data = response.request.meta["data"]
    tree = document_fromstring(response.body)
    # Collect the explanatory-notes text, one non-blank line per row.
    note_lines = tree.xpath('//div[@id = "notas-explicativas"]//text()')
    cleaned = (line.strip() for line in note_lines)
    data["notas_explicativas"] = "\n".join(line for line in cleaned if line)
    data["url"] = response.request.url
    # Numeric id: subclass code with '/' and '-' separators removed.
    data["id"] = int(data["id_subclasse"].replace("/", "").replace("-", ""))
    data["versao"] = self.versao
    yield data
Example #9
Source File: cleaner.py From wanish with MIT License | 5 votes |
def initial_output(html_partial=False):
    """
    Creates initial HTML document according to the given flag

    :param html_partial: determines if there should be the html page or only a fragment
    :return: html output element
    """
    if html_partial:
        return fragment_fromstring('<div/>')
    return document_fromstring('<div/>')
Example #10
Source File: column.py From zhihu2ebook with MIT License | 5 votes |
def replace_img_url(self, content):
    """Rewrite every <img> src in *content* to the full-size zhimg CDN URL.

    :param content: HTML fragment (anything str()-convertible)
    :return: the serialized HTML with rewritten image sources
    """
    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(str(content), parser=utf8_parser)
    for _pic_link in tree.xpath("//img"):
        href = str(_pic_link.get('src'))
        # BUG FIX: split('.') raised "too many values to unpack" whenever the
        # src contained more than one dot (e.g. a host name in the URL); split
        # on the LAST dot only, which separates the id from the extension.
        pic_id, pic_type = href.rsplit('.', 1)
        # '_b' selects the big-size variant on the zhimg CDN.
        _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
    replaced_content = etree.tostring(tree, encoding=str)
    return replaced_content
Example #11
Source File: utils.py From jorvik with GNU General Public License v3.0 | 5 votes |
def get_drive_file(file):
    """Download a Google Docs document exported as HTML and return its <head>
    and <body> serialized as two consecutive <div> elements.

    :param file: Google Docs document id
    :return: serialized HTML (bytes, as returned by ``html.tostring``)
    """
    req = urllib.request.Request("https://docs.google.com/feeds/download/documents/export/Export?id=%s&exportFormat=html" % (file,))
    # BUG FIX: the original bound the downloaded text to a local named ``str``,
    # shadowing the builtin; use a descriptive name instead.
    page = urllib.request.urlopen(req).read().decode('UTF-8')
    doc = html.document_fromstring(page)
    # Re-tag <head>/<body> as <div> so the fragments can embed inside a page.
    head = doc.xpath('//head')[0]
    head.tag = 'div'
    body = doc.xpath('//body')[0]
    body.tag = 'div'
    return html.tostring(head) + html.tostring(body)
Example #12
Source File: models.py From jorvik with GNU General Public License v3.0 | 5 votes |
def corpo_body(self):
    """Try to extract the page body from ``self.corpo``.

    :return: the <body> element re-tagged as a <div> and serialized,
        or "" when the page content is empty.
    """
    if not self.corpo:
        return ""
    document = html.document_fromstring(self.corpo)
    body_element = document.xpath('//body')[0]
    # Re-tag as <div> so the fragment can be embedded in another page.
    body_element.tag = 'div'
    return html.tostring(body_element)
Example #13
Source File: models.py From jorvik with GNU General Public License v3.0 | 5 votes |
def processa_link(self):
    """Check the relative links in the e-mail body and make them absolute."""
    document = html.document_fromstring(self.corpo)
    for anchor in document.xpath('//a'):
        try:
            target = anchor.attrib['href']
        except KeyError:
            # Anchor without href: nothing to rewrite.
            continue
        if '://' not in target:
            anchor.attrib['href'] = "https://gaia.cri.it%s" % (target,)
    self.corpo = html.tostring(document, pretty_print=True).decode('UTF-8')
Example #14
Source File: test_parsers.py From crestify with BSD 3-Clause "New" or "Revised" License | 4 votes |
def setUp(self):
    """Create a clean test user, import an Instapaper export, and build the
    expected-bookmarks map the test assertions compare against."""
    # Remove any leftover test user (and their bookmarks) from a previous run.
    query_user = User.query.filter_by(email='instapaper@example.com').first()
    if query_user:
        query_bookmarks = Bookmark.query.filter_by(user=query_user.id)
        for bmark in query_bookmarks:
            db.session.delete(bmark)
        db.session.commit()
        db.session.delete(query_user)
        db.session.commit()
    # Create a fresh, confirmed test user.
    create_user = User()
    create_user.first_name = 'Instapaper'
    create_user.last_name = 'Test'
    create_user.email = 'instapaper@example.com'
    create_user.password = 'instapaper_pass'
    create_user.active = True
    create_user.confirmed_at = datetime.datetime.utcnow()
    db.session.add(create_user)
    db.session.commit()
    self.user = create_user
    with open('Instapaper.html') as json_file:
        create_file = open(os.path.join(app.config['CRESTIFY_UPLOAD_DIRECTORY'], 'test_instapaper.html'), 'w+')
        # Round-trip through lxml so unclosed <li> tags get closed before BS4.
        self.data = html.document_fromstring(json_file.read())
        self.data = html.tostring(self.data)
        self.html_data = BeautifulSoup4(self.data)
        self.bookmarks = {}
        # Each <h1> is a folder; its following <ol> holds that folder's links.
        for tag in self.html_data.find_all('h1'):
            parent_elem = tag.find_next_sibling('ol')
            links = parent_elem.find_all('a')
            for link in links:
                title = link.text
                url = link['href']
                tags = [tag.text]
                tags.append('Imported')
                # Thanks Instapaper for not adding timestamps
                self.bookmarks[url] = {
                    'href': url,
                    'title': title,
                    'tags': tags
                }
        create_file.write(self.data)
        self.file_path = create_file.name
        create_file.close()
    # Run the parser under test against the normalized export file.
    init_parser = InstapaperParser(self.file_path, self.user.id)
    init_parser.process()
    init_parser.add_to_database()
    self.query = Bookmark.query.filter_by(user=self.user.id).all()
    self.html_parser = HTMLParser()
Example #15
Source File: parse-bbc-html-data.py From XSum with MIT License | 4 votes |
def __init__(self, story, corpus):
    """Parse a raw story's HTML and set up the per-corpus XPath selectors.

    :param story: story record whose ``html`` attribute holds the raw page bytes
    :param corpus: corpus key selecting which selector set to use (e.g. 'bbc')
    """
    self.story = story
    self.corpus = corpus
    # Detect the page encoding so lxml decodes the raw HTML correctly.
    self.parser = html.HTMLParser(encoding=chardet.detect(self.story.html)['encoding'])
    self.tree = html.document_fromstring(self.story.html, parser=self.parser)
    # Elements to delete.
    self.delete_selectors = {
        'bbc': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ]
    }
    # Title Selector
    self.title_selectors = {
        'bbc': [
            '//h1[contains(@class, "story-headline")]',
            '//h1[contains(@class, "story-body__h1")]'
        ]
    }
    # Introduction Selector
    self.introduction_selectors = {
        'bbc': [
            '//p[contains(@class, "story-body__introduction")]'
        ]
    }
    # Rest Content exclusions: ads, links, bylines, comments, headline and story introduction
    self.bbc_exclude = (
        'not(contains(@class, "story-headline"))'
        ' and not(contains(@class, "story-body__h1"))'
        ' and not(contains(@class, "story-body__introduction"))'
        ' and not(contains(@class, "with-extracted-share-icons"))'
    )
    # Rest Content Selector
    self.restcontent_selectors = {
        'bbc': [
            '//div[contains(@class, "story-body")]//p[%s]' % self.bbc_exclude  # story-body__inner
        ]
    }
Example #16
Source File: tests.py From jorvik with GNU General Public License v3.0 | 4 votes |
def test_us_attivazione_credenziali(self):
    """End-to-end test: a president creates credentials for a member, the
    credentials e-mail is sent, and the member can log in with the new
    password extracted from that e-mail."""
    # Create a president, a member with a seat/membership, and log the president in.
    EMAIL_UTENZA = email_fittizzia()
    presidente = crea_persona()
    persona, sede, appartenenza = crea_persona_sede_appartenenza(presidente=presidente)
    sessione_presidente = self.sessione_utente(persona=presidente)
    # The president submits the member's new login e-mail from the profile page.
    sessione_presidente.visit("%s%s" % (self.live_server_url, persona.url_profilo_credenziali))
    sessione_presidente.fill('email', EMAIL_UTENZA)
    sessione_presidente.find_by_xpath("//button[@type='submit']").first.click()
    self.assertTrue(
        Utenza.objects.filter(persona=persona).exists(),
        msg="L'utenza e' stata creata correttamente"
    )
    self.assertTrue(
        Utenza.objects.get(persona=persona).email == EMAIL_UTENZA,
        msg="L'email e' stata correttamente creata"
    )
    # Retrieve the credentials e-mail that was sent to the member.
    msg = Messaggio.objects.filter(oggetto__icontains="credenziali", oggetti_destinatario__persona=persona)
    self.assertTrue(
        msg.exists(),
        msg="Email delle credenziali spedita"
    )
    corpo_msg = msg.first().corpo
    self.assertTrue(
        EMAIL_UTENZA in corpo_msg,
        msg="L'email contiene il nuovo indirizzo e-mail"
    )
    # Extract the generated password from the e-mail's HTML body.
    doc = html.document_fromstring(corpo_msg)
    nuova_pwd = doc.xpath("//*[@id='nuova-password']")[0].text.strip()
    utenza = persona.utenza
    utenza.password_testing = nuova_pwd  # password used for login
    # Try logging in with the new credentials.
    sessione_persona = self.sessione_utente(utente=utenza)