Python html5lib.HTMLParser() Examples
The following are 30 code examples of html5lib.HTMLParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the html5lib module, or try the search function.
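All of the examples below revolve around the same small API, shown here as a minimal sketch (assumes html5lib is installed; the markup is just an illustration):

import html5lib

parser = html5lib.HTMLParser()

# parse() builds a full document tree (an xml.etree element by default).
document = parser.parse("<p>Hello <b>world</b></p>")

# parseFragment() parses markup as a fragment rather than a full document.
fragment = parser.parseFragment("<b>just a fragment</b>")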
Example #1
Source File: test_alphabeticalattributes.py From bazarr with GNU General Public License v3.0 | 6 votes |
def test_with_serializer():
    """Verify filter works in the context of everything else"""
    parser = html5lib.HTMLParser()
    dom = parser.parseFragment('<svg><pattern xlink:href="#patt2" id="patt1"></svg>')
    walker = html5lib.getTreeWalker('etree')
    ser = HTMLSerializer(
        alphabetical_attributes=True,
        quote_attr_values='always'
    )

    # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When
    # that gets fixed, we can fix this expected result.
    assert (
        ser.render(walker(dom)) ==
        '<svg><pattern id="patt1" href="#patt2"></pattern></svg>'
    )
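For context, the test above drives the usual parse / tree-walk / serialize pipeline. A minimal sketch of that pipeline follows (recent html5lib releases; the markup is only an illustration):

import html5lib
from html5lib.serializer import HTMLSerializer

parser = html5lib.HTMLParser()
dom = parser.parseFragment('<p id="a" class="b">hi</p>')

walker = html5lib.getTreeWalker('etree')
serializer = HTMLSerializer(alphabetical_attributes=True,
                            quote_attr_values='always')

# Attributes are emitted in alphabetical order (class before id).
print(serializer.render(walker(dom)))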
Example #2
Source File: _html5lib.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def __init__(self, namespaceHTMLElements, soup=None,
             store_line_numbers=True, **kwargs):
    if soup:
        self.soup = soup
    else:
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup(
            "", "html.parser", store_line_numbers=store_line_numbers,
            **kwargs
        )
    super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    # This will be set later to an html5lib.html5parser.HTMLParser
    # object, which we can use to track the current line number.
    self.parser = None
    self.store_line_numbers = store_line_numbers
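In practice this tree builder is reached through Beautiful Soup's public API rather than instantiated directly; a minimal sketch (assumes bs4 and html5lib are installed):

from bs4 import BeautifulSoup

# Passing "html5lib" selects the html5lib tree builder under the hood.
soup = BeautifulSoup("<p>Unclosed <b>markup", "html5lib")

# html5lib repairs the markup the way a browser would.
print(soup.prettify())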
Example #3
Source File: _html5lib.py From Tautulli with GNU General Public License v3.0 | 6 votes |
def __init__(self, namespaceHTMLElements, soup=None,
             store_line_numbers=True, **kwargs):
    if soup:
        self.soup = soup
    else:
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup(
            "", "html.parser", store_line_numbers=store_line_numbers,
            **kwargs
        )
    super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    # This will be set later to an html5lib.html5parser.HTMLParser
    # object, which we can use to track the current line number.
    self.parser = None
    self.store_line_numbers = store_line_numbers
Example #4
Source File: __init__.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment and return it

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults to
        ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s)

    return _render(parser.parseFragment(text))
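For illustration, a clean() defined this way might be called roughly as follows; the markup and the whitelists here are assumptions, not values from the project:

# Hypothetical usage of the clean() helper above.
dirty = '<script>alert("xss")</script><a href="http://example.com" onclick="evil()">link</a>'

# Only <a> tags with an href survive; with strip=True, disallowed
# elements are removed rather than escaped.
safe = clean(dirty, tags=['a'], attributes={'a': ['href']}, strip=True)
print(safe)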
Example #5
Source File: _html5lib.py From bazarr with GNU General Public License v3.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, unicode):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, basestring):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
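As a quick illustration of the encoding handling above, newer html5lib releases accept override_encoding when parsing byte input; a minimal sketch (the byte string is only an example):

import html5lib

parser = html5lib.HTMLParser()
doc = parser.parse(b"<p>caf\xe9</p>", override_encoding="windows-1252")

# As the snippet notes, in html5lib 0.99999999 and up this is an Encoding
# object rather than a plain string.
detected = parser.tokenizer.stream.charEncoding[0]
print(getattr(detected, "name", detected))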
Example #6
Source File: diagnose.py From bazarr with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #7
Source File: _html5lib.py From bazarr with GNU General Public License v3.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
Example #8
Source File: diagnose.py From bazarr with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #9
Source File: _html5lib.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    self.underlying_builder.parser = parser

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
    self.underlying_builder.parser = None
Example #10
Source File: diagnose.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
Example #11
Source File: diagnose.py From MIA-Dictionary-Addon with GNU General Public License v3.0 | 5 votes |
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
Example #12
Source File: _html5lib.py From moviegrabber with GNU General Public License v3.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #13
Source File: diagnose.py From moviegrabber with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #14
Source File: diagnose.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
Example #15
Source File: __init__.py From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License | 5 votes |
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed to use to transform content to be used in
    non-web-page contexts.

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults to
        ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    """
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        allowed_protocols = protocols
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s)

    return _render(parser.parseFragment(text))
Example #16
Source File: diagnose.py From Tautulli with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #17
Source File: diagnose.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #18
Source File: test_parser2.py From bazarr with GNU General Public License v3.0 | 5 votes |
def test_maintain_duplicate_attribute_order():
    # This is here because we impl it in parser and not tokenizer
    p = HTMLParser()
    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
    token = {'name': 'html',
             'selfClosing': False,
             'selfClosingAcknowledged': False,
             'type': tokenTypes["StartTag"],
             'data': attrs + [('a', len(attrs))]}
    out = p.normalizeToken(token)
    attr_order = list(out["data"].keys())
    assert attr_order == [x for x, i in attrs]
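As a companion sketch, the rule this test checks is that the first occurrence of a duplicated attribute wins during normalization (the markup below is only an illustration):

import html5lib

# When an attribute appears twice, html5lib keeps the first value.
doc = html5lib.parse('<p id="first" id="second">text</p>',
                     namespaceHTMLElements=False)
p = doc.find('.//p')
print(p.get('id'))  # expected: 'first'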
Example #19
Source File: diagnose.py From MIA-Dictionary-Addon with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #20
Source File: _html5lib.py From MIA-Dictionary-Addon with GNU General Public License v3.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
Example #21
Source File: diagnose.py From FastWordQuery with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #22
Source File: _html5lib.py From Crunchyroll-XML-Decoder with GNU General Public License v2.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #23
Source File: diagnose.py From POC-EXP with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #24
Source File: _html5lib.py From POC-EXP with GNU General Public License v3.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #25
Source File: _html5lib.py From python-for-android with Apache License 2.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)
    doc = parser.parse(markup, encoding=self.user_specified_encoding)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
Example #26
Source File: diagnose.py From plugin.git.browser with GNU General Public License v3.0 | 5 votes |
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception as e:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        if success:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))

    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b-a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
Example #27
Source File: diagnose.py From plugin.git.browser with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #28
Source File: _html5lib.py From plugin.git.browser with GNU General Public License v3.0 | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, str):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, str):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, str):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding
Example #29
Source File: diagnose.py From ru with GNU General Public License v2.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.
    """
    parser = AnnouncingParser()
    parser.feed(data)
Example #30
Source File: _html5lib.py From ServerlessCrawler-VancouverRealState with MIT License | 5 votes |
def feed(self, markup):
    if self.soup.parse_only is not None:
        warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
    parser = html5lib.HTMLParser(tree=self.create_treebuilder)

    extra_kwargs = dict()
    if not isinstance(markup, unicode):
        if new_html5lib:
            extra_kwargs['override_encoding'] = self.user_specified_encoding
        else:
            extra_kwargs['encoding'] = self.user_specified_encoding
    doc = parser.parse(markup, **extra_kwargs)

    # Set the character encoding detected by the tokenizer.
    if isinstance(markup, unicode):
        # We need to special-case this because html5lib sets
        # charEncoding to UTF-8 if it gets Unicode input.
        doc.original_encoding = None
    else:
        original_encoding = parser.tokenizer.stream.charEncoding[0]
        if not isinstance(original_encoding, basestring):
            # In 0.99999999 and up, the encoding is an html5lib
            # Encoding object. We want to use a string for compatibility
            # with other tree builders.
            original_encoding = original_encoding.name
        doc.original_encoding = original_encoding