Python lxml.html.HTMLParser() Examples
The following are 24
code examples of lxml.html.HTMLParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module
lxml.html, or try the search function.
Example #1
Source File: importer.py From python-ooxml with GNU Affero General Public License v3.0 | 6 votes |
def parse_html_string(s):
    """Parse the HTML string *s* into an lxml document tree.

    The input is decoded as UTF-8 regardless of any meta charset it may
    declare, because the parser is constructed with an explicit encoding.
    """
    from lxml import html

    parser = html.HTMLParser(encoding='utf-8')
    return html.document_fromstring(s, parser=parser)
Example #2
Source File: form.py From wextracto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def create_html_parser(headers):
    """Build an ``HTMLParser`` whose encoding comes from the HTTP headers.

    A charset that resolves to latin-1 is widened to windows-1252, the
    superset browsers actually apply; unknown charsets pass through
    unchanged, and a missing charset yields the parser default
    (ISO 8859-1) behavior via ``encoding=None``.
    """
    encoding = headers.get_content_charset()
    if encoding:
        try:
            if codecs.lookup(encoding).name == 'iso8859-1':
                encoding = 'windows-1252'
        except LookupError:
            # Charset name unknown to the codecs registry: keep it as-is
            # and let the parser deal with it.
            pass
    return HTMLParser(encoding=encoding)
Example #3
Source File: questions.py From legco-watch with MIT License | 5 votes |
def parse(self, response):
    """Scrapy callback: yield a :class:`Question` item for every question row
    found on a LegCo questions page.

    The page layout is a sequence of h2-table pairs, one pair per meeting
    date; the header text decides which regex (Chinese/English) matches.
    """
    sel = Selector(response)
    body = sel.xpath('//div[@id="_content_"]')
    # Bail out (yielding nothing) if the page doesn't have exactly one
    # content div — anything else means the page layout changed.
    if len(body) != 1:
        self.log(u'Expected single body element, but found {} on {}'.format(len(body), response.url), level=log.WARNING)
        return
    body = body[0]
    # Language is inferred from the URL; it selects both the item language
    # code and the date-header regex to use.
    if u'chinese' in response.url:
        language = 'C'
        matcher = self.HEADER_RE_C
    else:
        language = 'E'
        matcher = self.HEADER_RE_E
    # We'll need lxml to parse this
    parser = HTMLParser(encoding='utf-8')
    body_extract = body.extract().encode('utf-8')
    body_elements = lxml.html.fromstring(body_extract, parser=parser)
    # Iterate over the body elements, processing each h2-table pair for each meeting
    count_sessions = 0
    count_questions = 0
    for elem in body_elements:
        # Skip comments
        if elem.tag == lxml.etree.Comment:
            continue
        # Take the first 50 characters, so RE doesn't scan the whole body of text for large elements
        match = re.search(matcher, elem.text_content()[:50])
        if match is not None:
            this_date = match.groupdict()['date']
            self.log(u'Found table for date {}'.format(this_date))
            count_sessions += 1
            # The table of questions immediately follows the date header.
            questions_table = elem.getnext()
            for row in questions_table.xpath('./tr'):
                # We ignore the header row, which is indicated by ths
                if row[0].tag == 'th':
                    continue
                this_question = self.make_question(language, response, row, this_date)
                count_questions += 1
                yield Question(**this_question)
    self.log(u'Processed {} questions in {} sessions'.format(count_questions, count_sessions), level=log.INFO)
Example #4
Source File: agenda.py From legco-watch with MIT License | 5 votes |
def _load(self): """ Load the ElementTree from the source """ # Convert directional quotation marks to regular quotes double_quotes = ur'[\u201c\u201d]' self.source = re.sub(double_quotes, u'"', self.source) single_quotes = ur'[\u2019\u2018]' self.source = re.sub(single_quotes, u"'", self.source) # Convert colons self.source = self.source.replace(u'\uff1a', u':') # Remove line breaks and tabs self.source = self.source.replace(u'\n', u'') self.source = self.source.replace(u'\t', u'') # There are also some "zero width joiners" in random places in the text # Should remove them here, since they make string search unreliable # these are the codes: ‍,   (nbsp), \xa0 (nbsp), \u200d zero_width_joiners = u'\u200d' self.source = self.source.replace(zero_width_joiners, u'') # Also previously had some non breaking spaces in unicode \u00a0, but this # may have been fixed by changing the parser below # Use the lxml cleaner cleaner = Cleaner() parser = HTMLParser(encoding='utf-8') # Finally, load the cleaned string to an ElementTree self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser)) # self.tree = lxml.html.fromstring(to_string(self.source))
Example #5
Source File: column.py From zhihu2ebook with MIT License | 5 votes |
def replace_img_url(self, content):
    """Rewrite every ``<img src>`` in *content* to a zhimg.com big-image URL.

    Each src is split into an id and an extension, then rebuilt as
    ``https://pic4.zhimg.com/<id>_b.<ext>``. Returns the re-serialized
    HTML as a string.
    """
    utf8_parser = html.HTMLParser(encoding='utf-8')
    tree = html.document_fromstring(str(content), parser=utf8_parser)
    for _pic_link in tree.xpath("//img"):
        href = str(_pic_link.get('src'))
        # BUGFIX: split('.') raised ValueError for any src containing more
        # than one dot (e.g. an absolute URL with a dotted hostname);
        # rsplit('.', 1) splits only on the final dot before the extension.
        pic_id, pic_type = href.rsplit('.', 1)
        _pic_link.set('src', "https://pic4.zhimg.com/" + pic_id + "_b." + pic_type)
    replaced_content = etree.tostring(tree, encoding=str)
    return replaced_content
Example #6
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_html(cls, response, namespaces=None):
    """Convenience constructor: build an HTML selector from *response*."""
    return cls.from_response(
        response,
        parser=HTMLParser,
        translator=CssHTMLTranslator,
        fmt='html',
        namespaces=namespaces,
    )
Example #7
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_response(cls, response, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None):
    """Build a selector from an HTTP *response* object.

    The response supplies the document body, its base URL and its
    declared encoding; everything else is forwarded to ``from_text``.
    """
    return cls.from_text(
        response.content,
        response.url,
        parser,
        translator,
        fmt,
        namespaces=namespaces,
        encoding=response.encoding,
    )
Example #8
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_html_text(cls, text, base_url=None, namespaces=None, encoding=None):
    """Convenience constructor: build an HTML selector from raw *text*."""
    return cls.from_text(
        text,
        base_url=base_url,
        parser=HTMLParser,
        translator=CssHTMLTranslator,
        fmt='html',
        namespaces=namespaces,
        encoding=encoding,
    )
Example #9
Source File: selector.py From ChemDataExtractor with MIT License | 5 votes |
def from_text(cls, text, base_url=None, parser=HTMLParser, translator=CssHTMLTranslator, fmt='html', namespaces=None, encoding=None):
    """Parse *text* and wrap the resulting document root in a selector.

    A recovering parser instance is created with the detected (or given)
    encoding; relative links are made absolute when a base URL is known.
    """
    log.debug('Parsing {} with {}'.format(fmt, parser))
    detected = cls._get_encoding(text, encoding)
    doc_parser = parser(recover=True, encoding=detected)
    root = fromstring(text, parser=doc_parser, base_url=base_url)
    # Only element roots support link rewriting, hence the hasattr guard.
    if base_url and hasattr(root, 'make_links_absolute'):
        root.make_links_absolute()
    return cls(root, translator=translator, fmt=fmt, namespaces=namespaces)
Example #10
Source File: htmlstream.py From wextracto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def pre_parse(self):
    """Sniff candidate character encodings from the start of the response.

    Feeds the response to an lxml parser in bounded chunks so that any
    declared encodings (meta tags collected by ``HTMLEncodings``, plus a
    leading BOM if present) are recorded, then returns the list of
    ``(source, encoding)`` candidates found.
    """
    http_content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(http_content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    total_bytes = 0
    self.response.seek(0)
    # ``target`` is falsy once it has decided no more encodings can come.
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            # End of input: flush the parser; a syntax error on close is
            # irrelevant because we only care about encodings seen so far.
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break
        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try the longest BOM first (4, 3, then 2 bytes).
            for i in range(4, 1, -1):
                if chunk[:i] in BOM_ENC:
                    self.bom = chunk[:i]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break
        parser.feed(chunk)
        total_bytes += len(chunk)
        # Cap how much of the document we sniff.
        if total_bytes >= MAX_PRE_PARSE_BYTES:
            break
    return target.encodings
Example #11
Source File: etree.py From wextracto with BSD 3-Clause "New" or "Revised" License | 5 votes |
def parse(src):
    """ Returns an element tree created by `LXML <http://lxml.de/>`_.

    :param src: A readable object such as a :class:`wex.response.Response`.

    Retries the parse with successive candidate encodings on decode
    errors; on IOError, or when nothing could be parsed, the returned
    tree's root is the ``UNPARSEABLE`` sentinel.
    """
    if not hasattr(src, 'read'):
        # Already a parsed tree (or anything non-readable): pass through.
        return src
    etree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # Sometimes we get URLs containing characters that aren't
        # acceptable to lxml (e.g. "http:/foo.com/bar?this=array[]").
        # When this happens lxml will quote the whole URL.
        # We don't want to have to check for this so we just always
        # quote it here and then unquote it in the `base_url` function.
        quoted_base_url = quote_base_url(src.url) if src.url else src.url
        while True:
            try:
                fp = replace_invalid_ncr(stream)
                # fp is a Unicode stream
                # The lxml FAQ tells us that it is inefficient to do this
                # http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode
                # but actually it seems just fine as long as you tell the parser to use 'utf-8'!?
                parser = HTMLParser(encoding='utf-8')
                etree.parse(fp, parser=parser, base_url=quoted_base_url)
                break
            except UnicodeDecodeError as exc:
                # Wrong guess: advance to the next candidate encoding and retry.
                stream.next_encoding()
    except IOError as exc:
        logger = logging.getLogger(__name__)
        logger.warning("IOError parsing %s (%s)", src.url, exc)
    root = etree.getroot()
    if root is None:
        etree._setroot(UNPARSEABLE)
    return etree
Example #12
Source File: core.py From libextract with MIT License | 5 votes |
def parse_html(fileobj, encoding):
    """Parse the HTML in *fileobj* into an ElementTree instance.

    *encoding* names the character encoding used to decode the stream;
    blank text nodes are removed during parsing.
    """
    html_parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, html_parser)
Example #13
Source File: __init__.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def fragments_to_tree(fragment):
    """Wrap an HTML *fragment* string in a ``<div>`` element tree.

    Parse failures are swallowed and yield an empty div; genuinely
    unexpected failures (anything other than an empty-document error on
    non-empty input) are logged.
    """
    tree = html.Element('div')
    try:
        parsed = html.fragments_fromstring(fragment, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as e:
        # "Document is empty" on empty input is expected; log everything else.
        if fragment and (not isinstance(e, ParserError) or e.args[0] != 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        return tree
    # A leading bare-text fragment becomes the div's text, not a child.
    if parsed and isinstance(parsed[0], str):
        tree.text = parsed[0]
        parsed = parsed[1:]
    tree.extend(parsed)
    return tree
Example #14
Source File: __init__.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialize the renderer, consuming our own options before delegating.

    ``nofollow`` (default True) and ``texoid`` (default False) are popped
    from *kwargs* so the base renderer never sees them.
    """
    self.nofollow = kwargs.pop('nofollow', True)
    if kwargs.pop('texoid', False):
        self.texoid = TexoidRenderer()
    else:
        self.texoid = None
    self.parser = HTMLParser()
    super(AwesomeRenderer, self).__init__(*args, **kwargs)
Example #15
Source File: lxml_tree.py From online-judge with GNU Affero General Public License v3.0 | 5 votes |
def __init__(self, str):
    """Leniently parse *str* into an lxml tree, falling back to an empty div.

    NOTE(review): the parameter name shadows the ``str`` builtin; kept
    because renaming could break keyword callers.
    """
    lenient_parser = html.HTMLParser(recover=True)
    try:
        self._tree = html.fromstring(str, parser=lenient_parser)
    except (XMLSyntaxError, ParserError) as e:
        # An empty-document ParserError is expected for empty input;
        # anything else on non-empty input is worth logging.
        is_empty_doc = isinstance(e, ParserError) and e.args[0] == 'Document is empty'
        if str and not is_empty_doc:
            logger.exception('Failed to parse HTML string')
        self._tree = html.Element('div')
Example #16
Source File: jinja_migration.py From INGInious with GNU Affero General Public License v3.0 | 5 votes |
def check_same_tpl(html_a, html_b):
    """
    Given html_a and html_b, two HTML pages, check that they contain the same structure.
    Raises an exception if it's not the case. Otherwise, returns html_a.
    """
    def _as_tree(page):
        # Blank text nodes are irrelevant to the structural comparison.
        return fromstring(str(page), parser=HTMLParser(remove_blank_text=True))

    if not elements_equal(_as_tree(html_a), _as_tree(html_b)):
        raise Exception("The two templates do not contain the same thing!")
    return html_a
Example #17
Source File: html.py From recruit with Apache License 2.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Recovering parser: tolerate malformed markup using self.encoding.
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # A URL that failed to open/parse: propagate the original error.
            raise e
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #18
Source File: rsc.py From ChemDataExtractor with MIT License | 4 votes |
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector.

    Detects the encoding with UnicodeDammit, parses with a recovering
    parser, then wraps orphan text inside the ``#wrapper`` element in
    ``<p class="otherpara">`` tags so all text lives in block elements.
    """
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # BUGFIX: UnicodeDecodeError takes exactly five constructor args
        # (encoding, object, start, end, reason); the old single-argument
        # call raised TypeError instead of the intended exception, and its
        # '%s' placeholder was never filled.
        data = htmlstring if isinstance(htmlstring, bytes) else htmlstring.encode('utf-8', 'replace')
        raise UnicodeDecodeError('unknown', data, 0, max(len(data), 1),
                                 'Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            # Close the open paragraph before any block element, any new
            # section, or at the end of the wrapper.
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        # Orphan tail text after a block element starts a new paragraph.
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
Example #19
Source File: html.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Recovering parser: tolerate malformed markup using self.encoding.
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # A URL that failed to open/parse: propagate the original error.
            raise e
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #20
Source File: parse-bbc-html-data.py From XSum with MIT License | 4 votes |
def __init__(self, story, corpus):
    """Parse a BBC story's HTML and set up the XPath selector tables used
    to extract its title, introduction and body content.

    ``story.html`` is assumed to be raw bytes — its encoding is detected
    with chardet before parsing.
    """
    self.story = story
    self.corpus = corpus
    # Detect the page encoding from the raw bytes, then parse with it.
    self.parser = html.HTMLParser(encoding=chardet.detect(self.story.html)['encoding'])
    self.tree = html.document_fromstring(self.story.html, parser=self.parser)

    # Elements to delete (embedded tweets/instagram posts).
    self.delete_selectors = {
        'bbc': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ]
    }

    # Title Selector
    self.title_selectors = {
        'bbc': [
            '//h1[contains(@class, "story-headline")]',
            '//h1[contains(@class, "story-body__h1")]'
        ]
    }

    # Introduction Selector
    self.introduction_selectors = {
        'bbc': [
            '//p[contains(@class, "story-body__introduction")]'
        ]
    }

    # Rest Content exclusions: ads, links, bylines, comments, headline and story introduction
    self.bbc_exclude = (
        'not(contains(@class, "story-headline"))'
        ' and not(contains(@class, "story-body__h1"))'
        ' and not(contains(@class, "story-body__introduction"))'
        ' and not(contains(@class, "with-extracted-share-icons"))'
    )

    # Rest Content Selector
    self.restcontent_selectors = {
        'bbc': [
            '//div[contains(@class, "story-body")]//p[%s]' % self.bbc_exclude  # story-body__inner
        ]
    }
Example #21
Source File: html.py From Splunking-Crime with GNU Affero General Public License v3.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Strict parser (recover=False): malformed markup raises rather than
    # being silently repaired.
    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #22
Source File: html.py From elasticintel with GNU General Public License v3.0 | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Strict parser (recover=False): malformed markup raises rather than
    # being silently repaired.
    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #23
Source File: html.py From Computable with MIT License | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Strict parser (recover=False): malformed markup raises rather than
    # being silently repaired.  No explicit encoding in this version.
    parser = HTMLParser(recover=False)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = ('%r is not a valid url scheme, valid schemes are '
                       '%s') % (scheme, _valid_schemes)
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r
Example #24
Source File: html.py From vnpy_crypto with MIT License | 4 votes |
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError
    # Recovering parser: tolerate malformed markup using self.encoding.
    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if _is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            # ``parse`` returns an ElementTree; unwrap to its root element.
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError) as e:
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # A URL that failed to open/parse: propagate the original error.
            raise e
    else:
        # Sanity check: a usable document root must expose text_content().
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return r