Python HTMLParser.HTMLParser() Examples
The following are 30 code examples of HTMLParser.HTMLParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module HTMLParser, or try the search function.
Example #1
Source File: nginx_CVE_2017_7529.py From xunfeng with GNU General Public License v3.0 | 6 votes |
def get_url(domain,port,timeout):
    """Fetch the homepage of *domain* and return same-site URLs found in it.

    Scrapes ``src``/``href`` attributes of ``img``/``link``/``script`` tags,
    keeps absolute URLs only when they point back at *domain*, resolves
    relative URLs against the final (post-redirect) page URL, and HTML-
    unescapes every result.  Python 2 only: relies on module-level
    ``urllib2``, ``urlparse``, ``HTMLParser`` and ``re`` imports.
    """
    url_list = []
    # Port 443 implies HTTPS; anything else is treated as plain HTTP.
    if port ==443:
        surl = 'https://' + domain
    else:
        surl = 'http://' + domain
    res = urllib2.urlopen(surl, timeout=timeout)
    html = res.read()
    # geturl() gives the URL actually landed on, after any redirects.
    root_url = res.geturl()
    # Group 1 captures the quote character, group 2 the URL; \1 backreference
    # ensures the closing quote matches the opening one.
    m = re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I)
    if m:
        for url in m:
            ParseResult = urlparse.urlparse(url[1])
            if ParseResult.netloc and ParseResult.scheme:
                # Absolute URL: keep only same-host links.
                if domain == ParseResult.hostname:
                    url_list.append(HTMLParser.HTMLParser().unescape(url[1]))
            elif not ParseResult.netloc and not ParseResult.scheme:
                # Relative URL: resolve against the redirected root URL.
                url_list.append(HTMLParser.HTMLParser().unescape(urlparse.urljoin(root_url, url[1])))
    # set() removes duplicates; note this loses the original ordering.
    return list(set(url_list))
Example #2
Source File: html_linter.py From html-linter with Apache License 2.0 | 6 votes |
def __init__(self, html): self._messages = [] # Variables used to get the indentation self._last_data = '' self._last_data_position = (0, 1) self._last_indent = 0 # Variables used to check if a charset tag should be required. self._first_meta_line_col = None self._after_head_line_col = None self._has_charset = False # Variables to extend the feature set of HTMLParser. self._endtag_text = None HTMLParser.HTMLParser.__init__(self) # In case we are dealing with Python 3, set it to non-strict mode. if hasattr(self, 'strict'): self.strict = False self.feed(html) self.close()
Example #3
Source File: parsers.py From crestify with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, file_name, user_id):
    """
    Reads data from file, loads it as JSON
    """
    with open(file_name, 'r') as self.opened_file:
        self.data = self.opened_file.read()
    self.user = user_id
    # NOTE(review): ujson and Bookmark are assumed imported at module level.
    self.data = ujson.loads(self.data)
    self.urls = dict()  # Keeps track of all the urls in the import file, used when adding to db
    self.tags_dict = dict()  # Store tag objects for imported bookmarks
    self.tags_set = set()  # Keeps track of all the tags in the import file
    self.check_duplicates = dict()  # Store all current bookmarks for the user
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user, Bookmark.deleted == False).all()
    for x in self.check_duplicates_query:
        self.check_duplicates[x.main_url] = x  # Add bookmark object to dict
    # Python 2 HTMLParser instance used to unescape bookmark titles/URLs.
    self.html_parser = HTMLParser.HTMLParser()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)  # We only want valid URLs in the database
Example #4
Source File: html_linter.py From html-linter with Apache License 2.0 | 6 votes |
def get_attribute_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the provided attribute.

    Args:
      tag_definition: str with the definition of the tag.
      line: line where the tag starts.
      column: column where the tag starts (1-based).
      attribute: str representing the attribute to find.

    Return:
      A (line, column) tuple representing the position of the attribute.
    """
    # Scan every attribute in the tag text and stop at the first whose
    # (lower-cased) name matches the one requested.
    hits = (m for m in HTMLParser.attrfind.finditer(tag_definition)
            if m.group(1).lower() == attribute)
    found = next(hits, None)
    assert found is not None, 'Could not find the requested attribute %s' % attribute
    return get_line_column(tag_definition, line, column, found.start(1))
Example #5
Source File: parsers.py From crestify with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, file_name, user_id):
    """Load an HTML bookmark-export file and prepare import state.

    Reads the file, parses it with BeautifulSoup, and caches the user's
    existing (non-deleted) bookmarks so duplicates can be skipped.
    """
    with open(file_name, 'r') as self.opened_file:
        self.html = self.opened_file.read()
    # NOTE(review): BeautifulSoup4 and Bookmark are assumed imported at
    # module level elsewhere in this file.
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user, Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        # Key by main URL so duplicate detection is a single dict lookup.
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    # Python 2 HTMLParser instance used to unescape imported text.
    self.html_parser = HTMLParser.HTMLParser()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #6
Source File: _htmlparser.py From svg-animation-tools with MIT License | 6 votes |
def feed(self, markup):
    """Parse *markup* with a fresh BeautifulSoupHTMLParser instance.

    Python 2 only: uses the old ``except X, e`` syntax and the
    ``HTMLParseError`` class, both removed in Python 3.
    """
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    # The parser needs a back-reference to the soup it is building into.
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError, e:
        # Warn that the stdlib parser failed, then re-raise for the caller.
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Example #7
Source File: test_htmlparser.py From BinderFilter with MIT License | 6 votes |
def test_cdata_with_closing_tags(self):
    # see issue #13358
    # make sure that HTMLParser calls handle_data only once for each CDATA.
    # The normal event collector normalizes the events in get_events,
    # so we override it to return the original list of events.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    # NOTE(review): parts of this literal look mangled by HTML-entity
    # unescaping during extraction ('¬' was likely '&not', '&' likely
    # '&amp;'); verify against the original test_htmlparser.py.
    content = """<!-- not a comment --> ¬-an-entity-ref; <a href="" /> </p><p> & <span></span></style> '</script' + '>' </html> </head> </scripter>!"""
    # Only an exact '</script' (any surrounding whitespace) may terminate
    # the CDATA section; every variant below must yield a single data event.
    for element in [' script', 'script ', ' script ', '\nscript', 'script\n', '\nscript\n']:
        s = u'<script>{content}</{element}>'.format(element=element, content=content)
        self._run_check(s, [("starttag", "script", []),
                            ("data", content),
                            ("endtag", "script")],
                        collector=Collector)
Example #8
Source File: matcher.py From romcollectionbrowser with GNU General Public License v2.0 | 6 votes |
def resolveParseResult(self, result, itemName):
    """Pick the first value for *itemName* out of a parse-result dict,
    clean it up (HTML-unescape twice, strip whitespace) and return it.
    Returns an empty string when the item is missing or resolution fails.
    """
    value = ""
    try:
        value = result[itemName][0]
        value = util.html_unescape(value)
        value = value.strip()
        # unescape ugly html encoding from websites
        value = HTMLParser().unescape(value)
    except Exception as err:
        # Swallow resolution errors: a missing/broken item just yields "".
        log.warn("Error while resolving item: {0} : {1} {2}".format(itemName, type(err), str(err)))
    try:
        log.debug("Result " + itemName + " = " + value)
    except:
        # Logging must never break resolution (e.g. on encoding errors).
        pass
    return value
Example #9
Source File: parsers.py From crestify with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, file_name, user_id):
    """Load a JSON bookmark-export file and prepare import state.

    Mirrors the other parser constructors in this module: reads the file,
    decodes it with ujson, and caches the user's existing (non-deleted)
    bookmarks for duplicate detection.
    """
    with open(file_name, 'r') as self.opened_file:
        self.data = self.opened_file.read()
    self.user = user_id
    # NOTE(review): ujson and Bookmark are assumed imported at module level.
    self.data = ujson.loads(self.data)
    self.urls = dict()
    self.tags_dict = dict()
    self.tags_set = set()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user, Bookmark.deleted == False).all()
    for x in self.check_duplicates_query:
        # Key by main URL so duplicate detection is a single dict lookup.
        self.check_duplicates[x.main_url] = x
    # Python 2 HTMLParser instance used to unescape imported text.
    self.html_parser = HTMLParser.HTMLParser()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #10
Source File: test_htmlparser.py From ironpython2 with Apache License 2.0 | 6 votes |
def test_cdata_with_closing_tags(self):
    # see issue #13358
    # make sure that HTMLParser calls handle_data only once for each CDATA.
    # The normal event collector normalizes the events in get_events,
    # so we override it to return the original list of events.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    # NOTE(review): parts of this literal look mangled by HTML-entity
    # unescaping during extraction ('¬' was likely '&not', '&' likely
    # '&amp;'); verify against the original test_htmlparser.py.
    content = """<!-- not a comment --> ¬-an-entity-ref; <a href="" /> </p><p> & <span></span></style> '</script' + '>' </html> </head> </scripter>!"""
    # Only an exact '</script' (any surrounding whitespace) may terminate
    # the CDATA section; every variant below must yield a single data event.
    for element in [' script', 'script ', ' script ', '\nscript', 'script\n', '\nscript\n']:
        s = u'<script>{content}</{element}>'.format(element=element, content=content)
        self._run_check(s, [("starttag", "script", []),
                            ("data", content),
                            ("endtag", "script")],
                        collector=Collector)
Example #11
Source File: test_htmlparser.py From oss-ftp with MIT License | 6 votes |
def test_cdata_with_closing_tags(self):
    # see issue #13358
    # make sure that HTMLParser calls handle_data only once for each CDATA.
    # The normal event collector normalizes the events in get_events,
    # so we override it to return the original list of events.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    # NOTE(review): parts of this literal look mangled by HTML-entity
    # unescaping during extraction ('¬' was likely '&not', '&' likely
    # '&amp;'); verify against the original test_htmlparser.py.
    content = """<!-- not a comment --> ¬-an-entity-ref; <a href="" /> </p><p> & <span></span></style> '</script' + '>' </html> </head> </scripter>!"""
    # Only an exact '</script' (any surrounding whitespace) may terminate
    # the CDATA section; every variant below must yield a single data event.
    for element in [' script', 'script ', ' script ', '\nscript', 'script\n', '\nscript\n']:
        s = u'<script>{content}</{element}>'.format(element=element, content=content)
        self._run_check(s, [("starttag", "script", []),
                            ("data", content),
                            ("endtag", "script")],
                        collector=Collector)
Example #12
Source File: _htmlparser.py From weeman with GNU General Public License v3.0 | 6 votes |
def feed(self, markup):
    """Parse *markup* with a fresh BeautifulSoupHTMLParser instance.

    Python 2 only: uses the old ``except X, e`` syntax and the
    ``HTMLParseError`` class, both removed in Python 3.
    """
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    # The parser needs a back-reference to the soup it is building into.
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError, e:
        # Warn that the stdlib parser failed, then re-raise for the caller.
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Example #13
Source File: _htmlparser.py From svg-animation-tools with MIT License | 6 votes |
def feed(self, markup):
    """Parse *markup* with a fresh BeautifulSoupHTMLParser instance.

    Python 2 only: uses the old ``except X, e`` syntax and the
    ``HTMLParseError`` class, both removed in Python 3.
    """
    args, kwargs = self.parser_args
    parser = BeautifulSoupHTMLParser(*args, **kwargs)
    # The parser needs a back-reference to the soup it is building into.
    parser.soup = self.soup
    try:
        parser.feed(markup)
    except HTMLParseError, e:
        # Warn that the stdlib parser failed, then re-raise for the caller.
        warnings.warn(RuntimeWarning(
            "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
        raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
Example #14
Source File: twokenize.py From geoinference with BSD 3-Clause "New" or "Revised" License | 5 votes |
def normalizeTextForTagger(text):
    """Undo Twitter's double HTML-escaping before running the POS tagger.

    Tweets often contain double-escaped entities such as ``&amp;lt;``;
    collapsing the outer ``&amp;`` first lets the unescape step resolve
    the real entity.  Returns the unescaped text.

    NOTE(review): the scraped original read ``text.replace("&", "&")`` —
    a no-op produced by entity-unescaping during extraction; the intended
    replacement of "&amp;" with "&" has been restored.
    """
    text = text.replace("&amp;", "&")
    try:
        # Python 2
        import HTMLParser
        text = HTMLParser.HTMLParser().unescape(text)
    except ImportError:
        # Python 3: HTMLParser module is gone; use html.unescape instead.
        import html
        text = html.unescape(text)
    return text

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
#
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
Example #15
Source File: test_htmlparser.py From oss-ftp with MIT License | 5 votes |
def _parse_error(self, source):
    """Assert that parsing *source* raises HTMLParseError."""
    def run(markup=source):
        # Feeding plus closing must blow up somewhere along the way.
        p = HTMLParser.HTMLParser()
        p.feed(markup)
        p.close()
    self.assertRaises(HTMLParser.HTMLParseError, run)
Example #16
Source File: extract_recipe.py From extract_recipe with Apache License 2.0 | 5 votes |
def close(self):
    """Finish parsing and return the accumulated output text.

    Python 2 only: uses ``unichr`` and a ``u''`` sentinel.
    """
    HTMLParser.HTMLParser.close(self)
    self.pbr()
    self.o('', 0, 'end')
    # self.outtext starts as '' so this is effectively ''.join(...).
    self.outtext = self.outtext.join(self.outtextlist)
    if self.unicode_snob:
        nbsp = unichr(name2cp('nbsp'))
    else:
        nbsp = u' '
    # NOTE(review): the sentinel below looks mangled by entity-unescaping
    # during extraction (likely u'&nbsp_place_holder;' originally) —
    # verify against the upstream html2text source.
    self.outtext = self.outtext.replace(u' _place_holder;', nbsp)
    return self.outtext
Example #17
Source File: diagnose.py From weeman with GNU General Public License v3.0 | 5 votes |
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no Beautiful
    Soup code is running.
    """
    # AnnouncingParser prints each handler callback as it fires.
    AnnouncingParser().feed(data)
Example #18
Source File: XHTMLparse.py From pycopia with Apache License 2.0 | 5 votes |
def close(self):
    """Finalise parsing: reject unbalanced documents, then attach the
    parsed tree and collected comments to the document object."""
    # Any tags still open at EOF mean the XHTML was not well-formed.
    if self.stack:
        raise ValidationError("XHTML document has unmatched tags")
    HTMLParser.HTMLParser.close(self)
    doc = self.doc
    doc.set_root(self.topelement)
    doc.comments = self.comments
Example #19
Source File: hrefgetter.py From pycopia with Apache License 2.0 | 5 votes |
def __init__(self, writer):
    """Initialise the href collector.

    *writer* must be a callable; it receives each URL as it is found.
    Python 2 only: uses the old ``raise ValueError, "..."`` syntax.
    """
    HTMLParser.HTMLParser.__init__(self)
    # State for the anchor currently being parsed.
    self.current_href = None
    self.adata = ""
    self.state = ""
    if callable(writer):
        self.writer = writer
    else:
        raise ValueError, "HrefGetter: writer must be callable."
Example #20
Source File: extract_recipe.py From extract_recipe with Apache License 2.0 | 5 votes |
def feed(self, data):
    """Feed *data* to the base parser, first neutralising the literal
    "</' + 'script>" fragment that would otherwise confuse it."""
    sanitized = data.replace("</' + 'script>", "</ignore>")
    HTMLParser.HTMLParser.feed(self, sanitized)
Example #21
Source File: html_linter.py From html-linter with Apache License 2.0 | 5 votes |
def get_value_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the value of the provided attribute.

    Args:
      tag_definition: str with the definition of the tag.
      line: line where the tag starts.
      column: column where the tag starts (1-based).
      attribute: str representing the attribute for which we want its value.

    Return:
      A (line, column) tuple representing the position of the value.
    """
    for m in HTMLParser.attrfind.finditer(tag_definition):
        if m.group(1).lower() != attribute:
            continue
        value = m.group(3)
        if not value:
            # Valueless attribute: point just past the attribute name.
            offset = m.end(1)
        else:
            offset = m.start(3)
            if value[0] in '"\'':
                # Skip the opening quote so we point at the value itself.
                offset += 1
        return get_line_column(tag_definition, line, column, offset)
    assert False, 'Could not find the requested attribute %s' % attribute

# pylint: disable=too-many-public-methods
Example #22
Source File: test_htmlparser.py From oss-ftp with MIT License | 5 votes |
def __init__(self):
    """Set up the event buffer before initialising the base parser."""
    self.events = []
    # Bound-method shortcut so handlers can record events with one call.
    self.append = self.events.append
    HTMLParser.HTMLParser.__init__(self)
Example #23
Source File: client.py From plugin.video.sparkle with GNU General Public License v3.0 | 5 votes |
def replaceHTMLCodes(txt):
    """Decode HTML character references in *txt* and return plain text.

    First repairs numeric references missing their trailing ';'
    (e.g. '&#38 ' -> '&#38; '), then unescapes all entities, then cleans
    up any remaining literal '&quot;'/'&amp;' sequences.

    NOTE(review): the scraped original contained entity-unescaped (and
    syntactically invalid) replace() arguments; the intended
    '&quot;' -> '"' and '&amp;' -> '&' replacements have been restored.
    """
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    try:
        # Python 2
        import HTMLParser
        txt = HTMLParser.HTMLParser().unescape(txt)
    except ImportError:
        # Python 3: HTMLParser module is gone; use html.unescape instead.
        import html
        txt = html.unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
Example #24
Source File: _htmlparser.py From weeman with GNU General Public License v3.0 | 5 votes |
def handle_charref(self, name):
    """Decode a numeric character reference, tolerating out-of-range
    values.  Python 2 only (``unichr``, ``except ..., e`` syntax)."""
    # XXX workaround for a bug in HTMLParser. Remove this once
    # it's fixed in all supported versions.
    # http://bugs.python.org/issue13633
    if name.startswith('x'):
        # Hexadecimal reference, e.g. &#x41;
        real_name = int(name.lstrip('x'), 16)
    elif name.startswith('X'):
        real_name = int(name.lstrip('X'), 16)
    else:
        # Decimal reference, e.g. &#65;
        real_name = int(name)
    try:
        data = unichr(real_name)
    except (ValueError, OverflowError), e:
        # Out-of-range code point: substitute U+FFFD.
        data = u"\N{REPLACEMENT CHARACTER}"
    # NOTE(review): the example appears truncated here — upstream bs4
    # continues with self.handle_data(data); verify against the original.
Example #25
Source File: website.py From plugin.video.netflix with MIT License | 5 votes |
def parse_html(html_value):
    """Parse HTML entities in *html_value* and return the decoded string.

    Works on both Python 2 and Python 3.  The original Python 3 branch
    called ``HTMLParser().unescape()``, which was deprecated and then
    removed in Python 3.9 — it is replaced with ``html.unescape``.
    """
    try:
        # Python 2
        from HTMLParser import HTMLParser
        return HTMLParser().unescape(html_value)
    except ImportError:
        # Python 3
        from html import unescape
        return unescape(html_value)
Example #26
Source File: test_htmlparser.py From BinderFilter with MIT License | 5 votes |
def test_unescape_function(self):
    """unescape() leaves malformed numeric references untouched."""
    parser = HTMLParser.HTMLParser()
    self.assertEqual(parser.unescape('&#bad;'),'&#bad;')
    # NOTE(review): this second assertion is trivially true as written
    # ('&' == '&'); the literals were likely entity-unescaped during
    # extraction (probably '&amp;#bad;' vs '&#bad;' originally) —
    # verify against the original test_htmlparser.py before relying on it.
    self.assertEqual(parser.unescape('&'),'&')
Example #27
Source File: test_htmlparser.py From BinderFilter with MIT License | 5 votes |
def test_bad_nesting(self):
    # Strangely, this *is* supposed to test that overlapping
    # elements are allowed. HTMLParser is more geared toward
    # lexing the input that parsing the structure.
    expected = [
        ("starttag", "a", []),
        ("starttag", "b", []),
        ("endtag", "a"),
        ("endtag", "b"),
    ]
    self._run_check("<a><b></a></b>", expected)
Example #28
Source File: test_htmlparser.py From BinderFilter with MIT License | 5 votes |
def _parse_error(self, source):
    """Assert that feeding *source* through a parser raises
    HTMLParseError at some point before (or during) close()."""
    def attempt(markup=source):
        parser = HTMLParser.HTMLParser()
        parser.feed(markup)
        parser.close()
    self.assertRaises(HTMLParser.HTMLParseError, attempt)
Example #29
Source File: test_htmlparser.py From BinderFilter with MIT License | 5 votes |
def __init__(self):
    """Set up the event buffer before initialising the base parser."""
    self.events = []
    # Bound-method shortcut so handlers can record events with one call.
    self.append = self.events.append
    HTMLParser.HTMLParser.__init__(self)
Example #30
Source File: html.py From machinae with MIT License | 5 votes |
def html_unescape(content):
    """Decode HTML entities in *content*, on Python 3 or Python 2.

    Prefers the modern ``html`` module; falls back to the legacy
    ``HTMLParser`` module when ``html`` is unavailable (Python 2).
    """
    try:
        import html
    except ImportError:
        # Python 2 fallback: the html module does not exist there.
        import HTMLParser
        return HTMLParser.HTMLParser().unescape(content)
    return html.unescape(content)