Python lxml.html() Examples

The following are 30 code examples of lxml.html(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml, or try the search function.
Example #1
Source File: structural_similarity.py    From html-similarity with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def structural_similarity(document_1, document_2):
    """
    Score how alike the tag sequences of two HTML documents are.

    Each input is parsed with ``lxml.html.parse`` and reduced to a tag list
    by ``get_tags``; the two lists are compared with
    ``difflib.SequenceMatcher``.

    :param document_1: html string
    :param document_2: html string
    :return: ``SequenceMatcher.ratio()`` of the two tag lists, or ``0`` when
        parsing raises (the exception is printed)
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as parse_error:
        # Best-effort: report the failure and treat the pair as dissimilar.
        print(parse_error)
        return 0

    matcher = difflib.SequenceMatcher(None, get_tags(tree_1), get_tags(tree_2))
    return matcher.ratio()
Example #2
Source File: html_quotations_test.py    From talon with Apache License 2.0 6 votes vote down vote up
def test_remove_namespaces():
    # Input uses an "o:" namespace prefix on its paragraph tags and declares
    # xmlns attributes on the root element.
    msg_body = """
    <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
        <body>
            <o:p>Dear Sir,</o:p>
            <o:p>Thank you for the email.</o:p>
            <blockquote>thing</blockquote>
        </body>
    </html>
    """

    rendered = quotations.extract_from_html(msg_body)

    # Substrings that must be present in the extracted HTML.
    for kept in ("<p>", "xmlns"):
        assert_true(kept in rendered)

    # Substrings that must be absent from the extracted HTML.
    for dropped in ("<o:p>", "<xmlns:o>"):
        assert_true(dropped not in rendered)
Example #3
Source File: scrape-cdc-state-case-counts.py    From zika-data with MIT License 6 votes vote down vote up
def scrape():
    """
    Download the page at ``URL`` and return the first table's data rows as a
    DataFrame with columns ``COLS``.

    Only the 1st, 2nd and 4th cell of each table row are kept; every cell is
    passed through ``parse_cell``. The first row, empty rows, and rows whose
    first cell is "", "States" or "Territories" are discarded. Columns listed
    in ``INT_COLS`` are converted to int after removing "," and stripping "*".
    """
    # A random "_" query parameter is sent with every request
    # (presumably a cache-buster -- confirm).
    response = requests.get(URL, params={"_": random.random()})
    dom = lxml.html.fromstring(response.content)

    first_table = dom.cssselect("table")[0]
    wanted_cells = "td:nth-child(1), td:nth-child(2), td:nth-child(4)"

    rows = []
    for tr in first_table.cssselect("tr"):
        rows.append([parse_cell(td.text_content())
                     for td in tr.cssselect(wanted_cells)])

    skipped_labels = ["", "States", "Territories"]
    data = [row for row in rows[1:]
            if row and row[0] not in skipped_labels]

    df = pd.DataFrame(data, columns=COLS)
    for col in INT_COLS:
        cleaned = df[col].str.replace(",", "").str.strip("*")
        df[col] = cleaned.astype(int)

    return df
Example #4
Source File: test_home.py    From zulip with Apache License 2.0 6 votes vote down vote up
def test_terms_of_service(self) -> None:
        # For each stored tos_version value, fetch '/' while TERMS_OF_SERVICE
        # and TOS_VERSION are overridden, then check that the response body
        # contains the "Accept the new Terms of Service" text.
        hamlet = self.example_user('hamlet')
        self.login_user(hamlet)

        for stored_version in (None, '1.1', '2.0.3.4'):
            hamlet.tos_version = stored_version
            hamlet.save()

            with self.settings(TERMS_OF_SERVICE='whatever'):
                with self.settings(TOS_VERSION='99.99'):
                    response = self.client_get('/', dict(stream='Denmark'))

            body = response.content.decode('utf-8')
            self.assertIn('Accept the new Terms of Service', body)
Example #5
Source File: test_home.py    From zulip with Apache License 2.0 6 votes vote down vote up
def test_invites_by_admins_only(self) -> None:
        # With realm.invite_by_admins_only set, the home page must not contain
        # 'Invite more users' for a non-admin, and must contain it once the
        # same user has the realm administrator role.
        def home_page_html() -> str:
            return self._get_home_page().content.decode('utf-8')

        hamlet = self.example_user('hamlet')

        realm = hamlet.realm
        realm.invite_by_admins_only = True
        realm.save()

        self.login_user(hamlet)
        self.assertFalse(hamlet.is_realm_admin)
        self.assertNotIn('Invite more users', home_page_html())

        hamlet.role = UserProfile.ROLE_REALM_ADMINISTRATOR
        hamlet.save()
        self.assertIn('Invite more users', home_page_html())
Example #6
Source File: generic.py    From n6 with GNU Affero General Public License v3.0 6 votes vote down vote up
def rss_item_to_relevant_data(self, item):
        """
        Reduce one RSS item to the data that matters for this collector.

        This is an abstract hook: the implementation here always raises
        NotImplementedError.

        Args:
            `item`:
                One element of the list produced by calling
                `.xpath(...)` on an lxml etree/html document
                (see the source code of the _process_rss()
                method).

        Returns:
            A hashable object, e.g. a tuple or a string; the
            concrete type is chosen by the particular subclass
            of BaseRSSCollector providing the implementation.

        Raises:
            NotImplementedError: always, in this implementation.
        """
        raise NotImplementedError
Example #7
Source File: jurisdiction.py    From clarify with MIT License 6 votes vote down vote up
def _get_subjurisdictions_url(self):
        """
        Build the URL of the page listing the counties of a state.

        The last segment of this jurisdiction's URL path is replaced with
        'select-county.html'; scheme, host, query and fragment are kept.

        Returns None when the jurisdiction level is not 'state' or when the
        URL contains 'Web01/'.
        """
        if self.level != 'state' or 'Web01/' in self.url:
            return None

        url = self.parsed_url
        # Everything up to (not including) the final path segment.
        parent_dir = '/'.join(url.path.split('/')[:-1])
        return parse.urlunsplit((
            url.scheme,
            url.netloc,
            parent_dir + '/select-county.html',
            url.query,
            url.fragment,
        ))
Example #8
Source File: html_quotations_test.py    From talon with Apache License 2.0 6 votes vote down vote up
def test_quotation_splitter_inside_blockquote():
    # The "On ... wrote:" line sits inside the blockquote; the expected
    # output keeps only the "Reply" text.
    msg_body = """Reply
<blockquote>

  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>

  <div>
    Test
  </div>

</blockquote>"""

    extracted = quotations.extract_from_html(msg_body)
    eq_("<html><head></head><body>Reply</body></html>",
        RE_WHITESPACE.sub('', extracted))
Example #9
Source File: Utilities.py    From table-extractor with GNU General Public License v3.0 6 votes vote down vote up
def url_composer(self, query, service):
        """
        Compose the URL used to call a web service, such as a SPARQL endpoint.

        :param query: string used in the REST call; it is URL-encoded with
            ``quote_plus`` before being appended to the URL.
        :param service: type of service requested: 'dbpedia' (SPARQL endpoint)
            or 'html'.
        :return url: the composed URL, or the literal string "ERROR" when
            ``service`` is neither of the supported values.
        """
        # quote_plus lives in urllib.parse on Python 3 and directly in urllib
        # on Python 2; import it locally so the method works on both.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        # Encode special characters (required by the web services).
        query = quote_plus(query)

        # Each service has its own pre-formatted prefix/suffix; the encoded
        # query is spliced into it.
        if service == 'dbpedia':
            return self.dbpedia_sparql_url + query + self.call_format_sparql
        if service == 'html':
            return self.html_format + query
        return "ERROR"
Example #10
Source File: HtmlTableParser.py    From table-extractor with GNU General Public License v3.0 6 votes vote down vote up
def remove_html_encode_errors(self, headers, error):
        """
        Strip an unwanted substring (e.g. an html entity such as &nbsp;, an
        encoding artifact or other unicode text) from every header cell.

        The cells are modified in place: each cell's 'th' value has every
        occurrence of ``error`` removed.

        :param headers: list of header rows, each row a list of cells with a 'th' entry
        :param error: unicode string to delete from the header cells
        :return: nothing
        """
        # Walk all cells of all rows as one flat stream.
        every_cell = (cell for header_row in headers for cell in header_row)
        for cell in every_cell:
            cell['th'] = cell['th'].replace(error, u'')
Example #11
Source File: html_quotations_test.py    From talon with Apache License 2.0 6 votes vote down vote up
def test_regular_blockquote():
    # A blockquote that precedes the "On ... wrote:" line is expected to be
    # kept; the later blockquote is expected to be removed.
    msg_body = """Reply
<blockquote>Regular</blockquote>

<div>
  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>

<blockquote>
  <div>
    <blockquote>Nested</blockquote>
  </div>
</blockquote>
"""
    expected = "<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>"
    extracted = quotations.extract_from_html(msg_body)
    eq_(expected, RE_WHITESPACE.sub('', extracted))
Example #12
Source File: processor.py    From rssant with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_html_redirect_url(html: str, base_url: str = None) -> str:
    """
    Extract the target of an HTML meta-refresh (client-side) redirect.

    https://www.w3.org/TR/WCAG20-TECHS/H76.html
    Example:
        <meta http-equiv="refresh" content="0;URL='http://example.com/'"/>

    Returns None when ``html`` is empty or longer than 2048 characters, when
    no redirect is found, or when the normalized URL fails validation.
    """
    if not html or len(html) > 2048:
        return None

    # First locate the redirect, then the URL inside what it captured.
    redirect_match = RE_HTML_REDIRECT.search(html)
    url_match = redirect_match and RE_HTML_REDIRECT_URL.search(redirect_match.group(1))
    if not url_match:
        return None

    candidate = normalize_url(url_match.group(1).strip(), base_url=base_url)
    try:
        return validate_url(candidate)
    except Invalid:
        return None
Example #13
Source File: coinmarketcap.py    From coinmarketcap-scraper with MIT License 6 votes vote down vote up
def testParseCurrencyListAll(self):
        """Test parseCurrencyListAll."""
        # The fixture lives in an "example" directory next to this file.
        fixture_path = "{0}/example/currencylist.html".format(
            os.path.dirname(os.path.abspath(__file__)))
        # The context manager closes the file even if read() raises.
        with codecs.open(fixture_path, 'r', 'utf-8') as f:
            html = f.read()

        data = parseCurrencyListAll(html)
        self.assertEqual(len(data), 452)

        # Spot-check the first and last parsed entries.
        expectedFirst = {
            'name': 'Bitcoin',
            'slug': 'bitcoin',
            'symbol': 'BTC',
            'explorer_link': 'http://blockchain.info'
        }
        self.assertEqual(data[0], expectedFirst)
        expectedLast = {
            'name': 'Marscoin',
            'slug': 'marscoin',
            'symbol': 'MRS',
            'explorer_link': 'http://explore.marscoin.org/chain/Marscoin/'
        }
        self.assertEqual(data[-1], expectedLast)
Example #14
Source File: html_quotations_test.py    From talon with Apache License 2.0 6 votes vote down vote up
def test_validate_output_html():
    # The input ends with a self-closing <div/>; the output must be wrapped
    # in <html>...</html> and must not contain '<div/>'.
    msg_body = """Reply
<div>
  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:

    <blockquote>
      <div>
        Test
      </div>
    </blockquote>
</div>

<div/>
"""
    out = quotations.extract_from_html(msg_body)

    has_html_wrapper = '<html>' in out and '</html>' in out
    ok_(has_html_wrapper,
        'Invalid HTML - <html>/</html> tag not present')

    has_self_closing_div = '<div/>' in out
    ok_(not has_self_closing_div,
        'Invalid HTML output - <div/> element is not valid')
Example #15
Source File: structural_similarity.py    From html-similarity with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_tags(doc):
    '''
    Collect the tag of every node in a parsed document, in iteration order.

    Elements contribute their tag; comments contribute the string 'comment'.

    :param doc: lxml parsed object
    :return: list of tags
    :raises ValueError: if a node is neither an HtmlElement nor an HtmlComment
    '''
    collected = []

    for el in doc.getroot().iter():
        if isinstance(el, lxml.html.HtmlElement):
            label = el.tag
        elif isinstance(el, lxml.html.HtmlComment):
            label = 'comment'
        else:
            raise ValueError('Don\'t know what to do with element: {}'.format(el))
        collected.append(label)

    return collected
Example #16
Source File: html_quotations_test.py    From talon with Apache License 2.0 6 votes vote down vote up
def test_date_block():
    # A Date/To/From/Subject header block follows an <hr>; the expected
    # output keeps only the "message<br>" part of the outer div.
    msg_body = """
<div>
  message<br>
  <div>
    <hr>
    Date: Fri, 23 Mar 2012 12:35:31 -0600<br>
    To: <a href="mailto:bob@example.com">bob@example.com</a><br>
    From: <a href="mailto:rob@example.com">rob@example.com</a><br>
    Subject: You Have New Mail From Mary!<br><br>

    text
  </div>
</div>
"""
    expected = '<html><head></head><body><div>message<br></div></body></html>'
    extracted = quotations.extract_from_html(msg_body)
    eq_(expected, RE_WHITESPACE.sub('', extracted))
Example #17
Source File: html_quotations_test.py    From talon with Apache License 2.0 6 votes vote down vote up
def test_from_block_and_quotations_in_separate_divs():
    # The From/Date block and the quoted message are in sibling divs; the
    # expected output keeps "Reply" and the <hr> but neither of those divs.
    msg_body = '''
Reply
<div>
  <hr/>
  <div>
    <font>
      <b>From: bob@example.com</b>
      <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
    </font>
  </div>
  <div>
    Quoted message
  </div>
</div>
'''
    expected = '<html><head></head><body>Reply<div><hr></div></body></html>'
    extracted = quotations.extract_from_html(msg_body)
    eq_(expected, RE_WHITESPACE.sub('', extracted))
Example #18
Source File: state_bill_extractors.py    From policy_diffusion with MIT License 5 votes vote down vote up
def ct_text_extractor(doc_source):
    """Return the text of every ``//body/p`` element, joined by single spaces."""
    paragraphs = lxml.html.fromstring(doc_source).xpath('//body/p')
    return ' '.join([p.text_content() for p in paragraphs])
Example #19
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_ms_outlook_2007_reply():
    # Run the shared reply check against the ms_outlook_2007.html fixture.
    fixture = "tests/fixtures/html_replies/ms_outlook_2007.html"
    extract_reply_and_check(fixture)
Example #20
Source File: state_bill_extractors.py    From policy_diffusion with MIT License 5 votes vote down vote up
def ca_text_extractor(doc_source):
    """
    Return the text of the first div found with id "bill", falling back to
    id "bill_all"; return None when neither is present.
    """
    doc = lxml.html.fromstring(doc_source)
    for xpath in ('//div[@id="bill"]', '//div[@id="bill_all"]'):
        matches = doc.xpath(xpath)
        if matches:
            return matches[0].text_content()
    return None
Example #21
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_reply_separated_by_hr():
    # Compare the whitespace-stripped extraction of REPLY_SEPARATED_BY_HR
    # with the expected markup.
    expected = '<html><head></head><body><div>Hi<div>there</div></div></body></html>'
    extracted = quotations.extract_from_html(REPLY_SEPARATED_BY_HR)
    eq_(expected, RE_WHITESPACE.sub('', extracted))
Example #22
Source File: state_bill_extractors.py    From policy_diffusion with MIT License 5 votes vote down vote up
def mi_text_extractor(doc_source):
    """Return the text content of the first ``//body`` element of the document."""
    body = lxml.html.fromstring(doc_source).xpath('//body')[0]
    return body.text_content()
Example #23
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_bad_html():
    # An <html> document with no content is expected to come back unchanged.
    bad_html = "<html></html>"
    extracted = quotations.extract_from_html(bad_html)
    eq_(bad_html, extracted)
Example #24
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_CRLF():
    """CR is not converted to '&#13;'
    """
    symbol = '&#13;'

    # Case 1: a bare CRLF between the html tags.
    bare = quotations.extract_from_html('<html>\r\n</html>')
    assert_false(symbol in bare)
    eq_('<html></html>', RE_WHITESPACE.sub('', bare))

    # Case 2: a full reply whose line endings are all converted to CRLF.
    msg_body = """My
reply
<blockquote>

  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>

  <div>
    Test
  </div>

</blockquote>""".replace('\n', '\r\n')
    reply = quotations.extract_from_html(msg_body)
    assert_false(symbol in reply)
    # Keep new lines otherwise "My reply" becomes one word - "Myreply"
    eq_("<html><head></head><body>My\nreply\n</body></html>", reply)
Example #25
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_windows_mail_reply():
    # Run the shared reply check against the windows_mail.html fixture.
    fixture = "tests/fixtures/html_replies/windows_mail.html"
    extract_reply_and_check(fixture)
Example #26
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_thunderbird_reply():
    # Run the shared reply check against the thunderbird.html fixture.
    fixture = "tests/fixtures/html_replies/thunderbird.html"
    extract_reply_and_check(fixture)
Example #27
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_gmail_reply():
    # Run the shared reply check against the gmail.html fixture.
    fixture = "tests/fixtures/html_replies/gmail.html"
    extract_reply_and_check(fixture)
Example #28
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_mail_ru_reply():
    # Run the shared reply check against the mail_ru.html fixture.
    fixture = "tests/fixtures/html_replies/mail_ru.html"
    extract_reply_and_check(fixture)
Example #29
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_ms_outlook_2010_reply():
    # Run the shared reply check against the ms_outlook_2010.html fixture.
    fixture = "tests/fixtures/html_replies/ms_outlook_2010.html"
    extract_reply_and_check(fixture)
Example #30
Source File: html_quotations_test.py    From talon with Apache License 2.0 5 votes vote down vote up
def test_ms_outlook_2003_reply():
    # Run the shared reply check against the ms_outlook_2003.html fixture.
    fixture = "tests/fixtures/html_replies/ms_outlook_2003.html"
    extract_reply_and_check(fixture)