Python lxml.html.HTMLParser() Examples
The following are 24
code examples of lxml.html.HTMLParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
lxml.html, or try the search function.
Example #1
Source File: importer.py From python-ooxml with GNU Affero General Public License v3.0 | 6 votes |
def parse_html_string(s):
    """Parse the HTML string *s* into an lxml document tree.

    The input is decoded as UTF-8 regardless of any meta charset it may
    declare, because the parser is constructed with an explicit encoding.
    """
    from lxml import html

    parser = html.HTMLParser(encoding='utf-8')
    return html.document_fromstring(s, parser=parser)
Example #2
Source File: form.py From wextracto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def create_html_parser(headers):
    """Build an ``HTMLParser`` whose encoding comes from the HTTP headers.

    A charset that resolves to latin-1 is widened to windows-1252, the
    superset browsers actually apply; unknown charsets pass through
    unchanged, and a missing charset yields the parser default
    (ISO 8859-1) behavior via ``encoding=None``.
    """
    encoding = headers.get_content_charset()
    if encoding:
        try:
            if codecs.lookup(encoding).name == 'iso8859-1':
                encoding = 'windows-1252'
        except LookupError:
            # Charset name unknown to the codecs registry: keep it as-is
            # and let the parser deal with it.
            pass
    return HTMLParser(encoding=encoding)
Example #3
Source File: questions.py From legco-watch with MIT License | 5 votes |
def parse(self, response):
    """Scrapy callback: yield a :class:`Question` item for every question row
    found on a LegCo questions page.

    The page layout is a sequence of h2-table pairs, one pair per meeting
    date; the header text decides which regex (Chinese/English) matches.
    """
    sel = Selector(response)
    body = sel.xpath('//div[@id="_content_"]')
    # Bail out (yielding nothing) if the page doesn't have exactly one
    # content div — anything else means the page layout changed.
    if len(body) != 1:
        self.log(u'Expected single body element, but found {} on {}'.format(len(body), response.url), level=log.WARNING)
        return
    body = body[0]
    # Language is inferred from the URL; it selects both the item language
    # code and the date-header regex to use.
    if u'chinese' in response.url:
        language = 'C'
        matcher = self.HEADER_RE_C
    else:
        language = 'E'
        matcher = self.HEADER_RE_E
    # We'll need lxml to parse this
    parser = HTMLParser(encoding='utf-8')
    body_extract = body.extract().encode('utf-8')
    body_elements = lxml.html.fromstring(body_extract, parser=parser)
    # Iterate over the body elements, processing each h2-table pair for each meeting
    count_sessions = 0
    count_questions = 0
    for elem in body_elements:
        # Skip comments
        if elem.tag == lxml.etree.Comment:
            continue
        # Take the first 50 characters, so RE doesn't scan the whole body of text for large elements
        match = re.search(matcher, elem.text_content()[:50])
        if match is not None:
            this_date = match.groupdict()['date']
            self.log(u'Found table for date {}'.format(this_date))
            count_sessions += 1
            # The table of questions immediately follows the date header.
            questions_table = elem.getnext()
            for row in questions_table.xpath('./tr'):
                # We ignore the header row, which is indicated by ths
                if row[0].tag == 'th':
                    continue
                this_question = self.make_question(language, response, row, this_date)
                count_questions += 1
                yield Question(**this_question)
    self.log(u'Processed {} questions in {} sessions'.format(count_questions, count_sessions), level=log.INFO)
Example #4
Source File: agenda.py From legco-watch with MIT License | 5 votes |
def _load(self): """ Load the ElementTree from the source """ # Convert directional quotation marks to regular quotes double_quotes = ur'[\u201c\u201d]' self.source = re.sub(double_quotes, u'"', self.source) single_quotes = ur'[\u2019\u2018]' self.source = re.sub(single_quotes, u"'", self.source) # Convert colons self.source = self.source.replace(u'\uff1a', u':') # Remove line breaks and tabs self.source = self.source.replace(u'\n', u'') self.source = self.source.replace(u'\t', u'') # There are also some "zero width joiners" in random places in the text # Should remove them here, since they make string search unreliable # these are the codes: ‍,   (nbsp), \xa0 (nbsp), \u200d zero_width_joiners = u'\u200d' self.source = self.source.replace(zero_width_joiners, u'') # Also previously had some non breaking spaces in unicode \u00a0, but this # may have been fixed by changing the parser below # Use the lxml cleaner cleaner = Cleaner() parser = HTMLParser(encoding='utf-8') # Finally, load the cleaned string to an ElementTree self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser)) # self.tree = lxml.html.fromstring(to_string(self.source))
Example #5
Source File: column.py From zhihu2ebook with MIT License | 5 votes |
def replace_img_url(self, content):
    """Rewrite every ``<img src>`` in *content* to a zhimg.com big-image URL.

    Each src is split into an id and an extension, then rebuilt as
    ``https://pic4.zhimg.com/<id>_b.<ext>``. Returns the re-serialized
    HTML as a string.
    """
    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(str(content), parser=utf8_parser)
    for _pic_link in tree.xpath("//img"):
        href = str(_pic_link.get('src'))
        # BUGFIX: split('.') raised ValueError for any src containing more
        # than one dot (e.g. an absolute URL with a dotted hostname);
        # rsplit('.', 1) splits only on the final dot before the extension.
        pic_id, pic_type = href.rsplit('.', 1)
        _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
    replaced_content = etree.tostring(tree, encoding=str)
    return replaced_content
Example #6
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_html(cls, response, namespaces=None):
    """Convenience constructor: build an HTML selector from *response*."""
    return cls.from_response(
        response,
        parser=HTMLParser,
        translator=CssHTMLTranslator,
        fmt='html',
        namespaces=namespaces,
    )
Example #7
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_response(cls, response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None):
    """Build a selector from an HTTP *response* object.

    The response supplies the document body, its base URL and its
    declared encoding; everything else is forwarded to ``from_text``.
    """
    return cls.from_text(
        response.content,
        response.url,
        parser,
        translator,
        fmt,
        namespaces=namespaces,
        encoding=response.encoding,
    )
Example #8
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_html_text(cls, text, base_url=None, namespaces=None, encoding=None):
    """Convenience constructor: build an HTML selector from raw *text*."""
    return cls.from_text(
        text,
        base_url=base_url,
        parser=HTMLParser,
        translator=CssHTMLTranslator,
        fmt='html',
        namespaces=namespaces,
        encoding=encoding,
    )
Example #9
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_text(cls, text, base_url=None, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None, encoding=None):
    """Parse *text* and wrap the resulting document root in a selector.

    A recovering parser instance is created with the detected (or given)
    encoding; relative links are made absolute when a base URL is known.
    """
    log.debug('Parsing {} with {}'.format(fmt, parser))
    detected = cls._get_encoding(text, encoding)
    doc_parser = parser(recover=True, encoding=detected)
    root = fromstring(text, parser=doc_parser, base_url=base_url)
    # Only element roots support link rewriting, hence the hasattr guard.
    if base_url and hasattr(root, 'make_links_absolute'):
        root.make_links_absolute()
    return cls(root, translator=translator, fmt=fmt, namespaces=namespaces)
Example #10
Source File: htmlstream.py From wextracto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def pre_parse(self):
    """Sniff candidate character encodings from the start of the response.

    Feeds the response to an lxml parser in bounded chunks so that any
    declared encodings (meta tags collected by ``HTMLEncodings``, plus a
    leading BOM if present) are recorded, then returns the list of
    ``(source, encoding)`` candidates found.
    """
    http_content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(http_content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    total_bytes = 0
    self.response.seek(0)
    # ``target`` is falsy once it has decided no more encodings can come.
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            # End of input: flush the parser; a syntax error on close is
            # irrelevant because we only care about encodings seen so far.
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break
        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try the longest BOM first (4, 3, then 2 bytes).
            for i in range(4, 1, -1):
                if chunk[:i] in BOM_ENC:
                    self.bom = chunk[:i]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break
        parser.feed(chunk)
        total_bytes += len(chunk)
        # Cap how much of the document we sniff.
        if total_bytes >= MAX_PRE_PARSE_BYTES:
            break
    return target.encodings
Example #11
Source File: etree.py From wextracto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def parse(src):
    """ Returns an element tree created by `LXML <http://lxml.de/>`_.

    :param src: A readable object such as a :class:`wex.response.Response`.

    Retries the parse with successive candidate encodings on decode
    errors; on IOError, or when nothing could be parsed, the returned
    tree's root is the ``UNPARSEABLE`` sentinel.
    """
    if not hasattr(src, 'read'):
        # Already a parsed tree (or anything non-readable): pass through.
        return src
    etree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # Sometimes we get URLs containing characters that aren't
        # acceptable to lxml (e.g. "http:/foo.com/bar?this=array[]").
        # When this happens lxml will quote the whole URL.
        # We don't want to have to check for this so we just always
        # quote it here and then unquote it in the `base_url` function.
        quoted_base_url = quote_base_url(src.url) if src.url else src.url
        while True:
            try:
                fp = replace_invalid_ncr(stream)
                # fp is a Unicode stream
                # The lxml FAQ tells us that it is inefficient to do this
                # http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode
                # but actually it seems just fine as long as you tell the parser to use 'utf-8'!?
                parser = HTMLParser(encoding='utf-8')
                etree.parse(fp, parser=parser, base_url=quoted_base_url)
                break
            except UnicodeDecodeError as exc:
                # Wrong guess: advance to the next candidate encoding and retry.
                stream.next_encoding()
    except IOError as exc:
        logger = logging.getLogger(__name__)
        logger.warning("IOError parsing %s (%s)", src.url, exc)
    root = etree.getroot()
    if root is None:
        etree._setroot(UNPARSEABLE)
    return etree
Example #12
Source File: core.py From libextract with MIT License | 5 votes |
def parse_html(fileobj, encoding):
    """Parse the HTML in *fileobj* into an ElementTree instance.

    *encoding* names the character encoding used to decode the stream;
    blank text nodes are removed during parsing.
    """
    html_parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, html_parser)
Example #13
Source File: __init__.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def fragments_to_tree(fragment):
    """Wrap an HTML *fragment* string in a ``<div>`` element tree.

    Parse failures are swallowed and yield an empty div; genuinely
    unexpected failures (anything other than an empty-document error on
    non-empty input) are logged.
    """
    tree = html.Element('div')
    try:
        parsed = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as e:
        # "Document is empty" on empty input is expected; log everything else.
        if fragment and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        return tree
    # A leading bare-text fragment becomes the div's text, not a child.
    if parsed and isinstance(parsed[0], str):
        tree.text = parsed[0]
        parsed = parsed[1:]
    tree.extend(parsed)
    return tree
Example #14
Source File: __init__.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialize the renderer, consuming our own options before delegating.

    ``nofollow`` (default True) and ``texoid`` (default False) are popped
    from *kwargs* so the base renderer never sees them.
    """
    self.nofollow = kwargs.pop('nofollow', True)
    if kwargs.pop('texoid', False):
        self.texoid = TexoidRenderer()
    else:
        self.texoid = None
    self.parser = HTMLParser()
    super(AwesomeRenderer, self).__init__(*args, **kwargs)
Example #15
Source File: lxml_tree.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, str):
    """Leniently parse *str* into an lxml tree, falling back to an empty div.

    NOTE(review): the parameter name shadows the ``str`` builtin; kept
    because renaming could break keyword callers.
    """
    lenient_parser = html.HTMLParser(recover=True)
    try:
        self._tree = html.fromstring(str, parser=lenient_parser)
    except (XMLSyntaxError, ParserError) as e:
        # An empty-document ParserError is expected for empty input;
        # anything else on non-empty input is worth logging.
        is_empty_doc = isinstance(e, ParserError) and e.args[0] == 'Document is empty'
        if str and not is_empty_doc:
            logger.exception('Failed to parse HTML string')
        self._tree = html.Element('div')
Example #16
Source File: jinja_migration.py From INGInious with GNU Affero General Public License v3.0 | 5 votes |
def check_same_tpl(html_a, html_b):
    """
    Given html_a and html_b, two HTML pages, check that they contain the same structure.
    Raises an exception if it's not the case. Otherwise, returns html_a.
    """
    def _as_tree(page):
        # Blank text nodes are irrelevant to the structural comparison.
        return fromstring(str(page), parser=HTMLParser(remove_blank_text=True))

    if not elements_equal(_as_tree(html_a), _as_tree(html_b)):
        raise Exception("The two templates do not contain the same thing!")
    return html_a
Example #17
Source File: html.py From recruit with Apache License 2.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Recovering parser: tolerate malformed markup using self.encoding.
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # A URL that failed to open/parse: propagate the original error.
            raise e
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #18
Source File: rsc.py From ChemDataExtractor with MIT License | 4 votes |
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector.

    Detects the encoding with UnicodeDammit, parses with a recovering
    parser, then wraps orphan text inside the ``#wrapper`` element in
    ``<p class="otherpara">`` tags so all text lives in block elements.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # BUGFIX: UnicodeDecodeError takes exactly five constructor args
        # (encoding, object, start, end, reason); the old single-argument
        # call raised TypeError instead of the intended exception, and its
        # '%s' placeholder was never filled.
        data = htmlstring if isinstance(htmlstring, bytes) else htmlstring.encode('utf-8', 'replace')
        raise UnicodeDecodeError('unknown', data, 0, max(len(data), 1),
                                 'Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            # Close the open paragraph before any block element, any new
            # section, or at the end of the wrapper.
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        # Orphan tail text after a block element starts a new paragraph.
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
Example #19
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Recovering parser: tolerate malformed markup using self.encoding.
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # A URL that failed to open/parse: propagate the original error.
            raise e
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #20
Source File: parse-bbc-html-data.py From XSum with MIT License | 4 votes |
def __init__(self, story, corpus):
    """Parse a BBC story's HTML and set up the XPath selector tables used
    to extract its title, introduction and body content.

    ``story.html`` is assumed to be raw bytes — its encoding is detected
    with chardet before parsing.
    """
    self.story = story
    self.corpus = corpus
    # Detect the page encoding from the raw bytes, then parse with it.
    self.parser = html.HTMLParser(encoding=chardet.detect(self.story.html)['encoding'])
    self.tree = html.document_fromstring(self.story.html, parser=self.parser)

    # Elements to delete (embedded tweets/instagram posts).
    self.delete_selectors = {
        'bbc': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ]
    }

    # Title Selector
    self.title_selectors = {
        'bbc': [
            '//h1[contains(@class, "story-headline")]',
            '//h1[contains(@class, "story-body__h1")]'
        ]
    }

    # Introduction Selector
    self.introduction_selectors = {
        'bbc': [
            '//p[contains(@class, "story-body__introduction")]'
        ]
    }

    # Rest Content exclusions: ads, links, bylines, comments, headline and story introduction
    self.bbc_exclude = (
        'not(contains(@class, "story-headline"))'
        ' and not(contains(@class, "story-body__h1"))'
        ' and not(contains(@class, "story-body__introduction"))'
        ' and not(contains(@class, "with-extracted-share-icons"))'
    )

    # Rest Content Selector
    self.restcontent_selectors = {
        'bbc': [
            '//div[contains(@class, "story-body")]//p[%s]' % self.bbc_exclude  # story-body__inner
        ]
    }
Example #21
Source File: html.py From Splunking-Crime with GNU Affero General Public License v3.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Strict parser (recover=False): malformed markup raises rather than
    # being silently repaired.
    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #22
Source File: html.py From elasticintel with GNU General Public License v3.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Strict parser (recover=False): malformed markup raises rather than
    # being silently repaired.
    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #23
Source File: html.py From Computable with MIT License | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Strict parser (recover=False): malformed markup raises rather than
    # being silently repaired.  No explicit encoding in this version.
    parser = HTMLParser(recover=False)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = ('%r is not a valid url scheme, valid schemes are '
                       '%s') % (scheme, _valid_schemes)
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #24
Source File: html.py From vnpy_crypto with MIT License | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Recovering parser: tolerate malformed markup using self.encoding.
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # A URL that failed to open/parse: propagate the original error.
            raise e
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r