Python Examples of lxml.html

Source File: structural_similarity.py From html-similarity with BSD 3-Clause "New" or "Revised" License

6 votes

def structural_similarity(document_1, document_2):
    """
    Score how alike the tag structure of two HTML documents is.

    Both inputs are parsed with ``lxml.html.parse``; the tag sequences
    produced by ``get_tags`` are then compared with
    ``difflib.SequenceMatcher``.

    :param document_1: html string
    :param document_2: html string
    :return: ``SequenceMatcher.ratio()`` of the two tag sequences, or ``0``
        when parsing either document raises (the error is printed)
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as parse_error:
        # Best-effort: report the problem and treat the pair as dissimilar.
        print(parse_error)
        return 0

    matcher = difflib.SequenceMatcher(None, get_tags(tree_1), get_tags(tree_2))
    return matcher.ratio()

Source File: html_quotations_test.py From talon with Apache License 2.0

6 votes

def test_remove_namespaces():
    """
    Check the extracted HTML contains ``<p>`` and ``xmlns`` but neither
    ``<o:p>`` nor ``<xmlns:o>`` for a namespaced input document.
    """
    source_html = """
    <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
        <body>
            <o:p>Dear Sir,</o:p>
            <o:p>Thank you for the email.</o:p>
            <blockquote>thing</blockquote>
        </body>
    </html>
    """

    output = quotations.extract_from_html(source_html)

    # Fragments that must be present in the output.
    for expected in ("<p>", "xmlns"):
        assert_true(expected in output)

    # Fragments that must be absent from the output.
    for unexpected in ("<o:p>", "<xmlns:o>"):
        assert_true(unexpected not in output)

Source File: scrape-cdc-state-case-counts.py From zika-data with MIT License

6 votes

def scrape():
    """
    Download the page at ``URL`` and return its first table as a DataFrame.

    Only the 1st, 2nd and 4th ``<td>`` of each row are kept and passed
    through ``parse_cell``.  The first row is dropped, as are empty rows and
    rows whose first cell is "", "States" or "Territories".  Columns listed
    in ``INT_COLS`` are stripped of commas and surrounding "*" and cast to
    int.

    :return: ``pandas.DataFrame`` with columns ``COLS``
    """
    # NOTE(review): the random "_" query parameter looks like a cache-buster.
    response = requests.get(URL, params={"_": random.random()})
    dom = lxml.html.fromstring(response.content)

    first_table = dom.cssselect("table")[0]
    cell_selector = "td:nth-child(1), td:nth-child(2), td:nth-child(4)"

    rows = []
    for tr in first_table.cssselect("tr"):
        cells = tr.cssselect(cell_selector)
        rows.append([parse_cell(td.text_content()) for td in cells])

    skipped_labels = ["", "States", "Territories"]
    data = [row for row in rows[1:] if row and row[0] not in skipped_labels]

    df = pd.DataFrame(data, columns=COLS)
    for col in INT_COLS:
        without_commas = df[col].str.replace(",", "")
        df[col] = without_commas.str.strip("*").astype(int)

    return df

Source File: test_home.py From zulip with Apache License 2.0

6 votes

def test_terms_of_service(self) -> None:
        """
        For each stored ``tos_version`` (None, '1.1', '2.0.3.4'), the home
        page fetched with TERMS_OF_SERVICE set and TOS_VERSION='99.99' must
        contain the "Accept the new Terms of Service" text.
        """
        hamlet = self.example_user('hamlet')
        self.login_user(hamlet)

        for accepted_version in [None, '1.1', '2.0.3.4']:
            hamlet.tos_version = accepted_version
            hamlet.save()

            with self.settings(TERMS_OF_SERVICE='whatever'):
                with self.settings(TOS_VERSION='99.99'):
                    response = self.client_get('/', {'stream': 'Denmark'})

            page = response.content.decode('utf-8')
            self.assertIn('Accept the new Terms of Service', page)

Source File: test_home.py From zulip with Apache License 2.0

6 votes

def test_invites_by_admins_only(self) -> None:
        """
        With ``invite_by_admins_only`` enabled on the realm, the home page
        omits "Invite more users" for a non-admin and shows it once the same
        user's role is ROLE_REALM_ADMINISTRATOR.
        """
        def home_page_html() -> str:
            # Fetch the home page and decode the response body.
            return self._get_home_page().content.decode('utf-8')

        hamlet = self.example_user('hamlet')

        org = hamlet.realm
        org.invite_by_admins_only = True
        org.save()

        self.login_user(hamlet)
        self.assertFalse(hamlet.is_realm_admin)
        self.assertNotIn('Invite more users', home_page_html())

        hamlet.role = UserProfile.ROLE_REALM_ADMINISTRATOR
        hamlet.save()
        self.assertIn('Invite more users', home_page_html())

Source File: generic.py From n6 with GNU Affero General Public License v3.0

6 votes

def rss_item_to_relevant_data(self, item):
        """
        Abstract hook: pull the relevant data out of one RSS item.

        Args:
            `item`:
                One item of the RSS feed, i.e. an element of the
                list returned by a
                `<lxml etree/html document>.xpath(...)` call
                (see the source code of the _process_rss()
                method).

        Returns:
            A hashable object, e.g. a tuple or a string; the
            concrete type is decided by the BaseRSSCollector
            subclass that implements this method.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError()

Source File: jurisdiction.py From clarify with MIT License

6 votes

def _get_subjurisdictions_url(self):
        """
        Build the URL of the county detail page ("select-county.html"),
        which lists URLs for each of the counties in a state.

        Returns None when this jurisdiction's level is not 'state' or when
        its URL contains 'Web01/'.  Otherwise the last path segment of the
        parsed URL is replaced with 'select-county.html'; scheme, netloc,
        query and fragment are kept.
        """
        if self.level != 'state' or 'Web01/' in self.url:
            return None

        url_parts = self.parsed_url
        # Everything before the final '/' of the path (empty if no '/').
        parent_dir = url_parts.path.rpartition('/')[0]
        county_path = parent_dir + '/select-county.html'
        return parse.urlunsplit((
            url_parts.scheme,
            url_parts.netloc,
            county_path,
            url_parts.query,
            url_parts.fragment,
        ))

Source File: html_quotations_test.py From talon with Apache License 2.0

6 votes

def test_quotation_splitter_inside_blockquote():
    """
    A blockquote that contains the "On ... wrote:" splitter is removed
    entirely, leaving only the "Reply" text in the extracted HTML.
    """
    html_in = """Reply
<blockquote>

  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>

  <div>
    Test
  </div>

</blockquote>"""

    expected = "<html><head></head><body>Reply</body></html>"
    extracted = quotations.extract_from_html(html_in)
    eq_(expected, RE_WHITESPACE.sub('', extracted))

Source File: Utilities.py From table-extractor with GNU General Public License v3.0

6 votes

def url_composer(self, query, service):
        """
        Compose a url to call a web service, such as a sparql endpoint.

        The query is percent-encoded with ``quote_plus`` (spaces become
        '+') and inserted into the pre-formatted url for the service.

        :param query: is the string used in some rest calls.
        :param service: type of service you request: 'dbpedia' (dbpedia
            sparql endpoint) or 'html'
        :return url: the url composed, or the string "ERROR" when the
            service is not recognised
        """
        # `quote_plus` lives in `urllib.parse` on Python 3 and directly in
        # `urllib` on Python 2; import whichever one exists so the method
        # works on both.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        # Encode special characters (required by the web services).
        query = quote_plus(query)

        # The branches differ by the service requested, but in every case the
        # url is a pre-formatted string combined with the encoded query.
        if service == 'dbpedia':
            url = self.dbpedia_sparql_url + query + self.call_format_sparql

        elif service == 'html':
            url = self.html_format + query

        else:
            url = "ERROR"
        return url

Source File: HtmlTableParser.py From table-extractor with GNU General Public License v3.0

6 votes

def remove_html_encode_errors(self, headers, error):
        """
        Strip an unwanted substring (an html special character such as
        &nbsp;, an encoding error or other unicode text) from header cells.

        Every cell's 'th' text is rewritten in place with each occurrence of
        `error` removed.

        :param headers: rows list of headers; each row is a list of cells and
            each cell is a mapping with a 'th' text entry
        :param error: unicode string you want to delete from header cells
        :return: nothing
        """
        # Walk all cells of all rows, in order
        all_cells = (cell for header_row in headers for cell in header_row)
        for cell in all_cells:
            cleaned = cell['th'].replace(error, u'')
            cell['th'] = cleaned

Source File: html_quotations_test.py From talon with Apache License 2.0

6 votes

def test_regular_blockquote():
    """
    A plain blockquote before the "On ... wrote:" splitter is kept, while
    the splitter and the blockquote after it are dropped.
    """
    html_in = """Reply
<blockquote>Regular</blockquote>

<div>
  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>

<blockquote>
  <div>
    <blockquote>Nested</blockquote>
  </div>
</blockquote>
"""
    expected = "<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>"
    extracted = quotations.extract_from_html(html_in)
    eq_(expected, RE_WHITESPACE.sub('', extracted))

Source File: processor.py From rssant with BSD 3-Clause "New" or "Revised" License

6 votes

def get_html_redirect_url(html: str, base_url: str = None) -> str:
    """
    Extract the target of an HTML meta refresh (client-side redirect).

    https://www.w3.org/TR/WCAG20-TECHS/H76.html
    Example:
        <meta http-equiv="refresh" content="0;URL='http://example.com/'"/>

    Returns None when `html` is empty or longer than 2048 characters, when
    no redirect pattern is found, or when the resulting url fails
    validation; otherwise returns the normalized, validated url.
    """
    if not html or len(html) > 2048:
        return None

    meta_match = RE_HTML_REDIRECT.search(html)
    if not meta_match:
        return None

    target_match = RE_HTML_REDIRECT_URL.search(meta_match.group(1))
    if not target_match:
        return None

    raw_target = target_match.group(1).strip()
    candidate = normalize_url(raw_target, base_url=base_url)
    try:
        return validate_url(candidate)
    except Invalid:
        return None

Source File: coinmarketcap.py From coinmarketcap-scraper with MIT License

6 votes

def testParseCurrencyListAll(self):
        """
        Test parseCurrencyListAll.

        Parses the bundled example/currencylist.html fixture and checks the
        number of entries plus the first and last parsed currencies.
        """
        fixture_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'example', 'currencylist.html')
        # The context manager closes the fixture even if reading it fails.
        with codecs.open(fixture_path, 'r', 'utf-8') as f:
            html = f.read()

        data = parseCurrencyListAll(html)
        self.assertEqual(len(data), 452)
        expectedFirst = {
            'name': 'Bitcoin',
            'slug': 'bitcoin',
            'symbol': 'BTC',
            'explorer_link': 'http://blockchain.info'
        }
        self.assertEqual(data[0], expectedFirst)
        expectedLast = {
            'name': 'Marscoin',
            'slug': 'marscoin',
            'symbol': 'MRS',
            'explorer_link': 'http://explore.marscoin.org/chain/Marscoin/'
        }
        self.assertEqual(data[-1], expectedLast)

Source File: html_quotations_test.py From talon with Apache License 2.0

6 votes

def test_validate_output_html():
    """
    The extracted output must contain <html> and </html> tags and must not
    contain a self-closed <div/> element.
    """
    html_in = """Reply
<div>
  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:

    <blockquote>
      <div>
        Test
      </div>
    </blockquote>
</div>

<div/>
"""
    result = quotations.extract_from_html(html_in)

    has_html_tags = '<html>' in result and '</html>' in result
    ok_(has_html_tags,
        'Invalid HTML - <html>/</html> tag not present')

    no_self_closed_div = '<div/>' not in result
    ok_(no_self_closed_div,
        'Invalid HTML output - <div/> element is not valid')

Source File: structural_similarity.py From html-similarity with BSD 3-Clause "New" or "Revised" License

6 votes

def get_tags(doc):
    '''
    Collect one label per node of a DOM tree, in ``iter()`` order.

    Elements contribute their tag name and comments contribute the string
    'comment'.

    :param doc: lxml parsed object
    :return: list of labels
    :raises ValueError: for a node that is neither an HtmlElement nor an
        HtmlComment
    '''
    def label_for(node):
        # Map a single node to its label, rejecting unknown node types.
        if isinstance(node, lxml.html.HtmlElement):
            return node.tag
        if isinstance(node, lxml.html.HtmlComment):
            return 'comment'
        raise ValueError('Don\'t know what to do with element: {}'.format(node))

    return [label_for(node) for node in doc.getroot().iter()]

Source File: html_quotations_test.py From talon with Apache License 2.0

6 votes

def test_date_block():
    """
    A nested div holding an <hr> followed by Date/To/From/Subject lines is
    removed, leaving only the leading "message<br>" content.
    """
    html_in = """
<div>
  message<br>
  <div>
    <hr>
    Date: Fri, 23 Mar 2012 12:35:31 -0600<br>
    To: <a href="mailto:bob@example.com">bob@example.com</a><br>
    From: <a href="mailto:rob@example.com">rob@example.com</a><br>
    Subject: You Have New Mail From Mary!<br><br>

    text
  </div>
</div>
"""
    expected = '<html><head></head><body><div>message<br></div></body></html>'
    extracted = quotations.extract_from_html(html_in)
    eq_(expected, RE_WHITESPACE.sub('', extracted))

Source File: html_quotations_test.py From talon with Apache License 2.0

6 votes

def test_from_block_and_quotations_in_separate_divs():
    """
    When the From/Date block and the quoted message sit in sibling divs
    after an <hr>, both divs are removed and the <hr> wrapper remains.
    """
    html_in = '''
Reply
<div>
  <hr/>
  <div>
    <font>
      <b>From: bob@example.com</b>
      <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
    </font>
  </div>
  <div>
    Quoted message
  </div>
</div>
'''
    expected = '<html><head></head><body>Reply<div><hr></div></body></html>'
    extracted = quotations.extract_from_html(html_in)
    eq_(expected, RE_WHITESPACE.sub('', extracted))

Source File: state_bill_extractors.py From policy_diffusion with MIT License

5 votes

def ct_text_extractor(doc_source):
    """
    Return the text of every ``<p>`` directly under ``<body>``, joined with
    single spaces.

    :param doc_source: HTML source accepted by ``lxml.html.fromstring``
    """
    tree = lxml.html.fromstring(doc_source)
    paragraph_texts = [node.text_content() for node in tree.xpath('//body/p')]
    return ' '.join(paragraph_texts)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_ms_outlook_2007_reply():
    """Run extract_reply_and_check on the MS Outlook 2007 reply fixture."""
    fixture = "tests/fixtures/html_replies/ms_outlook_2007.html"
    extract_reply_and_check(fixture)

Source File: state_bill_extractors.py From policy_diffusion with MIT License

5 votes

def ca_text_extractor(doc_source):
    """
    Return the text of the first ``<div id="bill">`` found, falling back to
    the first ``<div id="bill_all">``; return None when neither exists.

    :param doc_source: HTML source accepted by ``lxml.html.fromstring``
    """
    tree = lxml.html.fromstring(doc_source)
    for container_xpath in ('//div[@id="bill"]', '//div[@id="bill_all"]'):
        matches = tree.xpath(container_xpath)
        if matches:
            return matches[0].text_content()
    return None

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_reply_separated_by_hr():
    """
    Extracting from REPLY_SEPARATED_BY_HR must yield, once whitespace is
    stripped, just the nested "Hi" / "there" divs.
    """
    expected = '<html><head></head><body><div>Hi<div>there</div></div></body></html>'
    extracted = quotations.extract_from_html(REPLY_SEPARATED_BY_HR)
    eq_(expected, RE_WHITESPACE.sub('', extracted))

Source File: state_bill_extractors.py From policy_diffusion with MIT License

5 votes

def mi_text_extractor(doc_source):
    """
    Return the text content of the document's first ``<body>`` element.

    :param doc_source: HTML source accepted by ``lxml.html.fromstring``
    """
    body = lxml.html.fromstring(doc_source).xpath('//body')[0]
    return body.text_content()

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_bad_html():
    """An empty ``<html></html>`` document must be returned unchanged."""
    document = "<html></html>"
    result = quotations.extract_from_html(document)
    eq_(document, result)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_CRLF():
    """CR is not converted to '&#13;'
    """
    cr_entity = '&#13;'

    # Case 1: a bare document that only contains a CRLF.
    bare_result = quotations.extract_from_html('<html>\r\n</html>')
    assert_false(cr_entity in bare_result)
    eq_('<html></html>', RE_WHITESPACE.sub('', bare_result))

    # Case 2: a reply with a quoted blockquote, converted to CRLF endings.
    unix_body = """My
reply
<blockquote>

  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>

  <div>
    Test
  </div>

</blockquote>"""
    crlf_body = unix_body.replace('\n', '\r\n')
    reply_result = quotations.extract_from_html(crlf_body)
    assert_false(cr_entity in reply_result)
    # Line breaks must survive, otherwise "My reply" collapses to "Myreply"
    eq_("<html><head></head><body>My\nreply\n</body></html>", reply_result)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_windows_mail_reply():
    """Run extract_reply_and_check on the Windows Mail reply fixture."""
    fixture = "tests/fixtures/html_replies/windows_mail.html"
    extract_reply_and_check(fixture)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_thunderbird_reply():
    """Run extract_reply_and_check on the Thunderbird reply fixture."""
    fixture = "tests/fixtures/html_replies/thunderbird.html"
    extract_reply_and_check(fixture)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_gmail_reply():
    """Run extract_reply_and_check on the Gmail reply fixture."""
    fixture = "tests/fixtures/html_replies/gmail.html"
    extract_reply_and_check(fixture)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_mail_ru_reply():
    """Run extract_reply_and_check on the Mail.ru reply fixture."""
    fixture = "tests/fixtures/html_replies/mail_ru.html"
    extract_reply_and_check(fixture)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_ms_outlook_2010_reply():
    """Run extract_reply_and_check on the MS Outlook 2010 reply fixture."""
    fixture = "tests/fixtures/html_replies/ms_outlook_2010.html"
    extract_reply_and_check(fixture)

Source File: html_quotations_test.py From talon with Apache License 2.0

5 votes

def test_ms_outlook_2003_reply():
    """Run extract_reply_and_check on the MS Outlook 2003 reply fixture."""
    fixture = "tests/fixtures/html_replies/ms_outlook_2003.html"
    extract_reply_and_check(fixture)

Python lxml.html() Examples