Python lxml.html() Examples
The following are 30
code examples of lxml.html().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module lxml, or try the search function.
Example #1
Source File: structural_similarity.py From html-similarity with BSD 3-Clause "New" or "Revised" License | 6 votes |
def structural_similarity(document_1, document_2):
    """
    Score how alike the tag structure of two HTML documents is.

    Both documents are parsed with ``lxml.html.parse`` and flattened into tag
    sequences by ``get_tags``; the score is the ``difflib.SequenceMatcher``
    ratio of those two sequences.

    :param document_1: html string
    :param document_2: html string
    :return: ``SequenceMatcher.ratio()`` of the two tag sequences, or ``0``
        when parsing either document raises (the exception is printed)
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as e:
        print(e)
        return 0

    matcher = difflib.SequenceMatcher(a=get_tags(tree_1), b=get_tags(tree_2))
    return matcher.ratio()
Example #2
Source File: html_quotations_test.py From talon with Apache License 2.0 | 6 votes |
def test_remove_namespaces():
    """Check that ``o:``-prefixed tags come out of extract_from_html as plain tags."""
    msg_body = """ <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40"> <body> <o:p>Dear Sir,</o:p> <o:p>Thank you for the email.</o:p> <blockquote>thing</blockquote> </body> </html> """
    rendered = quotations.extract_from_html(msg_body)

    # Fragments that must survive extraction.
    for expected in ("<p>", "xmlns"):
        assert_true(expected in rendered)
    # Namespaced forms that must be gone.
    for unexpected in ("<o:p>", "<xmlns:o>"):
        assert_true(unexpected not in rendered)
Example #3
Source File: scrape-cdc-state-case-counts.py From zika-data with MIT License | 6 votes |
def scrape():
    """
    Fetch ``URL`` and turn the first ``<table>`` on the page into a DataFrame.

    For every table row only the 1st, 2nd and 4th ``<td>`` cells are kept and
    passed through ``parse_cell``.  The first row is dropped, as are empty
    rows and rows whose first cell is ``""``, ``"States"`` or
    ``"Territories"``.  Columns listed in ``INT_COLS`` have commas removed and
    leading/trailing ``*`` stripped before conversion to ``int``.

    :return: DataFrame with columns ``COLS``
    """
    # A random "_" query parameter is sent with each request
    # (presumably cache-busting -- confirm).
    response = requests.get(URL, params={"_": random.random()})
    dom = lxml.html.fromstring(response.content)
    table = dom.cssselect("table")[0]

    wanted_cells = "td:nth-child(1), td:nth-child(2), td:nth-child(4)"
    rows = []
    for tr in table.cssselect("tr"):
        rows.append([parse_cell(td.text_content()) for td in tr.cssselect(wanted_cells)])

    skipped_labels = ["", "States", "Territories"]
    data = []
    for row in rows[1:]:
        if len(row) and row[0] not in skipped_labels:
            data.append(row)

    df = pd.DataFrame(data, columns=COLS)
    for c in INT_COLS:
        without_commas = df[c].str.replace(",", "")
        df[c] = without_commas.str.strip("*").astype(int)
    return df
Example #4
Source File: test_home.py From zulip with Apache License 2.0 | 6 votes |
def test_terms_of_service(self) -> None:
    """
    The home page asks the user to accept the new Terms of Service.

    Exercised for a user whose ``tos_version`` is ``None``, ``'1.1'`` and
    ``'2.0.3.4'`` while ``TOS_VERSION`` is overridden to ``'99.99'``.
    """
    hamlet = self.example_user('hamlet')
    self.login_user(hamlet)

    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source -- confirm that only the request sits inside the `with`.
    for tos_version in [None, '1.1', '2.0.3.4']:
        hamlet.tos_version = tos_version
        hamlet.save()

        with self.settings(TERMS_OF_SERVICE='whatever'):
            with self.settings(TOS_VERSION='99.99'):
                response = self.client_get('/', dict(stream='Denmark'))

        page = response.content.decode('utf-8')
        self.assertIn('Accept the new Terms of Service', page)
Example #5
Source File: test_home.py From zulip with Apache License 2.0 | 6 votes |
def test_invites_by_admins_only(self) -> None:
    """
    With ``invite_by_admins_only`` set, only realm admins see the invite link.

    The home page must not contain 'Invite more users' for a non-admin user,
    and must contain it once the same user is given the realm administrator
    role.
    """
    hamlet = self.example_user('hamlet')
    realm = hamlet.realm
    realm.invite_by_admins_only = True
    realm.save()

    self.login_user(hamlet)
    self.assertFalse(hamlet.is_realm_admin)

    # Non-admin: the invite link is absent.
    page = self._get_home_page().content.decode('utf-8')
    self.assertNotIn('Invite more users', page)

    # Promote to realm administrator and fetch the page again.
    hamlet.role = UserProfile.ROLE_REALM_ADMINISTRATOR
    hamlet.save()

    page = self._get_home_page().content.decode('utf-8')
    self.assertIn('Invite more users', page)
Example #6
Source File: generic.py From n6 with GNU Affero General Public License v3.0 | 6 votes |
def rss_item_to_relevant_data(self, item):
    """
    Extract the relevant data from a single RSS item (abstract hook).

    Args:
        `item`:
            One item of the RSS feed, i.e. an element of the list
            returned by a `<lxml etree/html document>.xpath(...)` call
            (see the source code of the _process_rss() method).

    Returns:
        Some hashable object, e.g. a tuple or a string; the exact type
        is up to the concrete subclass of BaseRSSCollector.

    Raises:
        NotImplementedError: always -- this implementation does nothing else.
    """
    raise NotImplementedError
Example #7
Source File: jurisdiction.py From clarify with MIT License | 6 votes |
def _get_subjurisdictions_url(self): """ Returns a URL for the county detail page, which lists URLs for each of the counties in a state. If original jurisdiction is not a state, returns None. """ if self.level != 'state': return None elif 'Web01/' in self.url: return None else: newpath = '/'.join(self.parsed_url.path.split('/')[:-1]) + '/select-county.html' parts = ( self.parsed_url.scheme, self.parsed_url.netloc, newpath, self.parsed_url.query, self.parsed_url.fragment, ) return parse.urlunsplit(parts)
Example #8
Source File: html_quotations_test.py From talon with Apache License 2.0 | 6 votes |
def test_quotation_splitter_inside_blockquote():
    """A blockquote holding the 'On ... wrote:' splitter is dropped; only 'Reply' remains."""
    msg_body = """Reply <blockquote> <div> On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: </div> <div> Test </div> </blockquote>"""
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped so only the markup structure is compared.
    eq_("<html><head></head><body>Reply</body></html>",
        RE_WHITESPACE.sub('', extracted))
Example #9
Source File: Utilities.py From table-extractor with GNU General Public License v3.0 | 6 votes |
def url_composer(self, query, service):
    """
    Compose the url used to call a web service, such as a sparql endpoint.

    :param query: the string used in the rest call; it is url-encoded with
        ``quote_plus`` before being appended
    :param service: type of service requested: ``'dbpedia'`` (dbpedia sparql
        endpoint) or ``'html'``
    :return url: the composed url, or the string ``"ERROR"`` when ``service``
        is not one of the known values
    """
    # quote_plus lives in urllib.parse on Python 3 and directly in urllib on
    # Python 2; importing it locally keeps this method working on both.
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus

    # Special characters must be encoded before calling the web service.
    query = quote_plus(query)

    # Each service composes its url from a pre-formatted string plus the query.
    if service == 'dbpedia':
        url = self.dbpedia_sparql_url + query + self.call_format_sparql
    elif service == 'html':
        url = self.html_format + query
    else:
        url = "ERROR"
    return url
Example #10
Source File: HtmlTableParser.py From table-extractor with GNU General Public License v3.0 | 6 votes |
def remove_html_encode_errors(self, headers, error):
    """
    Strip an unwanted substring from every header cell, in place.

    Use it to remove html special characters, encoding errors or other
    unicode text from the headers.

    :param headers: list of header rows; each row is a list of cells and each
        cell's text is stored under the ``'th'`` key
    :param error: unicode string to delete from the header cells
    :return: nothing (the cells in ``headers`` are modified directly)
    """
    all_cells = (cell for row in headers for cell in row)
    for cell in all_cells:
        cleaned = cell['th'].replace(error, u'')
        cell['th'] = cleaned
Example #11
Source File: html_quotations_test.py From talon with Apache License 2.0 | 6 votes |
def test_regular_blockquote():
    """A plain blockquote before the 'On ... wrote:' splitter is kept; the quoted part is removed."""
    msg_body = """Reply <blockquote>Regular</blockquote> <div> On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: </div> <blockquote> <div> <blockquote>Nested</blockquote> </div> </blockquote> """
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped so only the markup structure is compared.
    eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
        RE_WHITESPACE.sub('', extracted))
Example #12
Source File: processor.py From rssant with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_html_redirect_url(html: str, base_url: str = None) -> str:
    """
    Resolve an HTML meta refresh client-side redirect.

    https://www.w3.org/TR/WCAG20-TECHS/H76.html

    Example:
        <meta http-equiv="refresh" content="0;URL='http://example.com/'"/>

    Returns ``None`` when ``html`` is empty or longer than 2048 characters,
    when neither redirect pattern matches, or when the normalized URL is
    rejected by ``validate_url``; otherwise returns the validated URL.
    """
    if not html:
        return None
    if len(html) > 2048:
        return None

    refresh_match = RE_HTML_REDIRECT.search(html)
    if not refresh_match:
        return None

    # The URL is searched for inside the first group of the first match.
    target_match = RE_HTML_REDIRECT_URL.search(refresh_match.group(1))
    if not target_match:
        return None

    candidate = normalize_url(target_match.group(1).strip(), base_url=base_url)
    try:
        return validate_url(candidate)
    except Invalid:
        return None
Example #13
Source File: coinmarketcap.py From coinmarketcap-scraper with MIT License | 6 votes |
def testParseCurrencyListAll(self):
    """Test parseCurrencyListAll against the bundled example/currencylist.html page."""
    fixture_path = "{0}/example/currencylist.html".format(
        os.path.dirname(os.path.abspath(__file__)))
    # The context manager closes the fixture even if read() raises.
    with codecs.open(fixture_path, 'r', 'utf-8') as f:
        html = f.read()

    data = parseCurrencyListAll(html)
    self.assertEqual(len(data), 452)

    expectedFirst = {
        'name': 'Bitcoin',
        'slug': 'bitcoin',
        'symbol': 'BTC',
        'explorer_link': 'http://blockchain.info'
    }
    self.assertEqual(data[0], expectedFirst)

    expectedLast = {
        'name': 'Marscoin',
        'slug': 'marscoin',
        'symbol': 'MRS',
        'explorer_link': 'http://explore.marscoin.org/chain/Marscoin/'
    }
    self.assertEqual(data[-1], expectedLast)
Example #14
Source File: html_quotations_test.py From talon with Apache License 2.0 | 6 votes |
def test_validate_output_html():
    """Extractor output contains <html>/</html> and no self-closing <div/>."""
    msg_body = """Reply <div> On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: <blockquote> <div> Test </div> </blockquote> </div> <div/> """
    out = quotations.extract_from_html(msg_body)

    has_html_wrapper = '<html>' in out and '</html>' in out
    ok_(has_html_wrapper,
        'Invalid HTML - <html>/</html> tag not present')

    has_self_closing_div = '<div/>' in out
    ok_(not has_self_closing_div,
        'Invalid HTML output - <div/> element is not valid')
Example #15
Source File: structural_similarity.py From html-similarity with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_tags(doc):
    '''
    Collect one tag per node of a parsed document, in iteration order.

    :param doc: lxml parsed object (its ``getroot()`` is iterated)
    :return: list holding ``el.tag`` for each ``HtmlElement`` and the
        literal string ``'comment'`` for each ``HtmlComment``
    :raises ValueError: for a node that is neither of those two types
    '''
    tags = []
    for el in doc.getroot().iter():
        if isinstance(el, lxml.html.HtmlElement):
            tag = el.tag
        elif isinstance(el, lxml.html.HtmlComment):
            tag = 'comment'
        else:
            raise ValueError("Don't know what to do with element: {}".format(el))
        tags.append(tag)
    return tags
Example #16
Source File: html_quotations_test.py From talon with Apache License 2.0 | 6 votes |
def test_date_block():
    """Everything from the <hr> + Date/To/From/Subject header block onward is removed."""
    msg_body = """ <div> message<br> <div> <hr> Date: Fri, 23 Mar 2012 12:35:31 -0600<br> To: <a href="mailto:bob@example.com">bob@example.com</a><br> From: <a href="mailto:rob@example.com">rob@example.com</a><br> Subject: You Have New Mail From Mary!<br><br> text </div> </div> """
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped so only the markup structure is compared.
    eq_('<html><head></head><body><div>message<br></div></body></html>',
        RE_WHITESPACE.sub('', extracted))
Example #17
Source File: html_quotations_test.py From talon with Apache License 2.0 | 6 votes |
def test_from_block_and_quotations_in_separate_divs():
    """The From/Date div and the sibling quoted-message div are both removed."""
    msg_body = ''' Reply <div> <hr/> <div> <font> <b>From: bob@example.com</b> <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> </font> </div> <div> Quoted message </div> </div> '''
    extracted = quotations.extract_from_html(msg_body)
    # Whitespace is stripped so only the markup structure is compared.
    eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
        RE_WHITESPACE.sub('', extracted))
Example #18
Source File: state_bill_extractors.py From policy_diffusion with MIT License | 5 votes |
def ct_text_extractor(doc_source):
    """Return the text of every ``//body/p`` node of the document, joined by single spaces."""
    paragraphs = lxml.html.fromstring(doc_source).xpath('//body/p')
    return ' '.join([p.text_content() for p in paragraphs])
Example #19
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_ms_outlook_2007_reply():
    """Run extract_reply_and_check on the MS Outlook 2007 reply fixture."""
    fixture = "tests/fixtures/html_replies/ms_outlook_2007.html"
    extract_reply_and_check(fixture)
Example #20
Source File: state_bill_extractors.py From policy_diffusion with MIT License | 5 votes |
def ca_text_extractor(doc_source):
    """
    Return the text of the bill container div of the document.

    ``//div[@id="bill"]`` is tried first, then ``//div[@id="bill_all"]``; the
    text content of the first match found is returned.  When neither xpath
    matches, ``None`` is returned.
    """
    doc = lxml.html.fromstring(doc_source)
    for xpath in ('//div[@id="bill"]', '//div[@id="bill_all"]'):
        matches = doc.xpath(xpath)
        if not matches:
            continue
        return matches[0].text_content()
    return None
Example #21
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_reply_separated_by_hr():
    """Extraction of REPLY_SEPARATED_BY_HR leaves only the nested 'Hi' / 'there' divs."""
    extracted = quotations.extract_from_html(REPLY_SEPARATED_BY_HR)
    # Whitespace is stripped so only the markup structure is compared.
    eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>',
        RE_WHITESPACE.sub('', extracted))
Example #22
Source File: state_bill_extractors.py From policy_diffusion with MIT License | 5 votes |
def mi_text_extractor(doc_source):
    """Return the text content of the first ``//body`` node of the document."""
    body = lxml.html.fromstring(doc_source).xpath('//body')[0]
    return body.text_content()
Example #23
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_bad_html():
    """An empty <html></html> document is returned unchanged by the extractor."""
    empty_document = "<html></html>"
    extracted = quotations.extract_from_html(empty_document)
    eq_(empty_document, extracted)
Example #24
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_CRLF():
    """CR characters in the input do not show up as ``symbol`` in the output."""
    # NOTE(review): `symbol` is a bare space here; presumably the original was
    # a character reference for CR that was lost when this text was extracted
    # -- confirm against the upstream test before relying on it.
    symbol = ' '

    extracted = quotations.extract_from_html('<html>\r\n</html>')
    assert_false(symbol in extracted)
    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))

    msg_body = """My reply <blockquote> <div> On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: </div> <div> Test </div> </blockquote>"""
    crlf_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(crlf_body)
    assert_false(symbol in extracted)
    # Keep new lines otherwise "My reply" becomes one word - "Myreply"
    eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
Example #25
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_windows_mail_reply():
    """Run extract_reply_and_check on the Windows Mail reply fixture."""
    fixture = "tests/fixtures/html_replies/windows_mail.html"
    extract_reply_and_check(fixture)
Example #26
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_thunderbird_reply():
    """Run extract_reply_and_check on the Thunderbird reply fixture."""
    fixture = "tests/fixtures/html_replies/thunderbird.html"
    extract_reply_and_check(fixture)
Example #27
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_gmail_reply():
    """Run extract_reply_and_check on the Gmail reply fixture."""
    fixture = "tests/fixtures/html_replies/gmail.html"
    extract_reply_and_check(fixture)
Example #28
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_mail_ru_reply():
    """Run extract_reply_and_check on the Mail.ru reply fixture."""
    fixture = "tests/fixtures/html_replies/mail_ru.html"
    extract_reply_and_check(fixture)
Example #29
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_ms_outlook_2010_reply():
    """Run extract_reply_and_check on the MS Outlook 2010 reply fixture."""
    fixture = "tests/fixtures/html_replies/ms_outlook_2010.html"
    extract_reply_and_check(fixture)
Example #30
Source File: html_quotations_test.py From talon with Apache License 2.0 | 5 votes |
def test_ms_outlook_2003_reply():
    """Run extract_reply_and_check on the MS Outlook 2003 reply fixture."""
    fixture = "tests/fixtures/html_replies/ms_outlook_2003.html"
    extract_reply_and_check(fixture)