Python HTMLParser.HTMLParser() Examples

The following are 30 code examples of HTMLParser.HTMLParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module HTMLParser, or try the search function.
Example #1
Source File: nginx_CVE_2017_7529.py    From xunfeng with GNU General Public License v3.0 6 votes vote down vote up
def get_url(domain, port, timeout):
    """Collect resource URLs referenced by a site's landing page.

    Fetches ``https://<domain>`` when ``port`` is 443 and ``http://<domain>``
    otherwise, then scans the returned HTML for quoted ``src``/``href``
    attributes of ``img``, ``link`` and ``script`` tags.

    Args:
        domain: host name to fetch; absolute URLs are kept only when their
            host equals this value.
        port: target port; only used to choose between https and http.
        timeout: timeout handed to ``urllib2.urlopen``.

    Returns:
        A de-duplicated list (in no particular order) of HTML-unescaped
        URLs: absolute URLs on ``domain`` plus relative URLs resolved
        against the final URL reported by the response.
    """
    scheme = 'https://' if port == 443 else 'http://'
    res = urllib2.urlopen(scheme + domain, timeout=timeout)
    try:
        html = res.read()
        root_url = res.geturl()
    finally:
        # Release the connection even if reading the body fails.
        res.close()

    # One parser instance is enough; it is only used for entity unescaping.
    unescape = HTMLParser.HTMLParser().unescape
    found = set()
    # Each match is (quote_char, attribute_value).
    for _quote, link in re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I):
        parsed = urlparse.urlparse(link)
        if parsed.netloc and parsed.scheme:
            # Absolute URL: keep it only when it points at the same host.
            if domain == parsed.hostname:
                found.add(unescape(link))
        elif not parsed.netloc and not parsed.scheme:
            # Relative URL: resolve against the page's final location.
            found.add(unescape(urlparse.urljoin(root_url, link)))
        # Anything else (scheme without host, or host without scheme) is skipped.
    return list(found)
Example #2
Source File: html_linter.py    From html-linter with Apache License 2.0 6 votes vote down vote up
def __init__(self, html):
        """Set up the linter state and immediately parse ``html``.

        All bookkeeping attributes are created before the base parser is
        initialised, and the document is fed and closed as part of
        construction.

        Args:
            html: the markup handed to ``self.feed``.
        """
        self._messages = []

        # Variables used to get the indentation
        self._last_data = ''
        self._last_data_position = (0, 1)
        self._last_indent = 0

        # Variables used to check if a charset tag should be required.
        self._first_meta_line_col = None
        self._after_head_line_col = None
        self._has_charset = False

        # Variables to extend the feature set of HTMLParser.
        self._endtag_text = None

        HTMLParser.HTMLParser.__init__(self)

        # In case we are dealing with Python 3, set it to non-strict mode.
        if hasattr(self, 'strict'):
            self.strict = False

        # Parsing happens eagerly, inside the constructor.
        self.feed(html)
        self.close()
Example #3
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def __init__(self, file_name, user_id):
        """
        Load a JSON bookmark export and prepare the import bookkeeping.

        The file is read in full and decoded with ujson; the user's existing,
        non-deleted bookmarks are indexed by ``main_url``.
        """
        with open(file_name, 'r') as self.opened_file:
            self.data = self.opened_file.read()
        self.user = user_id
        self.data = ujson.loads(self.data)

        # URLs seen in the import file, used when adding to the db.
        self.urls = {}
        # Tag objects for imported bookmarks.
        self.tags_dict = {}
        # Every tag name seen in the import file.
        self.tags_set = set()

        # NOTE(review): `== False` looks like a query-expression comparison,
        # so it must stay an equality test -- confirm against the model.
        existing_bookmarks = Bookmark.query.filter(Bookmark.user == self.user,
                                                   Bookmark.deleted == False)
        self.check_duplicates_query = existing_bookmarks.all()
        # Current bookmarks of the user, keyed by their main URL.
        self.check_duplicates = {
            bookmark.main_url: bookmark
            for bookmark in self.check_duplicates_query
        }

        self.html_parser = HTMLParser.HTMLParser()
        # We only want valid URLs in the database.
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #4
Source File: html_linter.py    From html-linter with Apache License 2.0 6 votes vote down vote up
def get_attribute_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the provided attribute.

    Args:
        tag_definition: str with the definition of the tag.
        line: line where the tag starts.
        column: column where the tag starts (1-based).
        attribute: str representing the attribute to find.

    Return:
       A (line, column) tuple representing the position of the attribute.

    Raises:
        AssertionError: if no attribute in tag_definition matches.
    """
    for match in HTMLParser.attrfind.finditer(tag_definition):
        if match.group(1).lower() == attribute:
            return get_line_column(tag_definition, line, column, match.start(1))

    # Raised explicitly instead of `assert False`: assert statements are
    # removed under `python -O`, which would make this silently return None.
    raise AssertionError(
        'Could not find the requested attribute %s' % attribute)
Example #5
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def __init__(self, file_name, user_id):
        """
        Load an HTML bookmark export and prepare the import bookkeeping.

        The file is read in full and parsed into ``self.soup``; the user's
        existing, non-deleted bookmarks are indexed by ``main_url``.
        """
        with open(file_name, 'r') as self.opened_file:
            self.html = self.opened_file.read()
        self.soup = BeautifulSoup4(self.html)
        self.user = user_id
        self.urls = {}

        # NOTE(review): `== False` looks like a query-expression comparison,
        # so it must stay an equality test -- confirm against the model.
        existing_bookmarks = Bookmark.query.filter(Bookmark.user == self.user,
                                                   Bookmark.deleted == False)
        self.check_duplicates_query = existing_bookmarks.all()
        # Current bookmarks of the user, keyed by their main URL.
        self.check_duplicates = {
            bookmark.main_url: bookmark
            for bookmark in self.check_duplicates_query
        }

        self.tags_dict = {}
        self.tags_set = set()
        self.html_parser = HTMLParser.HTMLParser()
        # Pattern used to recognise acceptable URLs.
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #6
Source File: _htmlparser.py    From svg-animation-tools with MIT License 6 votes vote down vote up
def feed(self, markup):
        """Parse ``markup`` with a fresh BeautifulSoupHTMLParser.

        The parser is built from ``self.parser_args`` and attached to
        ``self.soup`` before the markup is fed to it.

        Raises:
            HTMLParseError: re-raised after a RuntimeWarning is issued when
                the parser rejects the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare `raise` re-raises the active exception with its original
            # traceback and is valid syntax on both Python 2 and Python 3.
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #7
Source File: test_htmlparser.py    From BinderFilter with MIT License 6 votes vote down vote up
def test_cdata_with_closing_tags(self):
        # Regression test for issue #13358: the body of a CDATA element must
        # reach handle_data as a single chunk. get_events is overridden so the
        # raw event list is returned as recorded.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        # Closing-tag names padded with spaces/newlines on either side.
        closing_names = [' script', 'script ', ' script ',
                         '\nscript', 'script\n', '\nscript\n']
        for closing_name in closing_names:
            markup = u'<script>{content}</{element}>'.format(
                element=closing_name, content=content)
            expected_events = [("starttag", "script", []),
                               ("data", content),
                               ("endtag", "script")]
            self._run_check(markup, expected_events, collector=Collector)
Example #8
Source File: matcher.py    From romcollectionbrowser with GNU General Public License v2.0 6 votes vote down vote up
def resolveParseResult(self, result, itemName):
        """Return the first value stored under ``itemName`` in ``result``.

        This method is due to the fact that our result set is a list of dicts.
        The value is HTML-unescaped and stripped of surrounding whitespace.

        Args:
            result: mapping whose entry for ``itemName`` is indexable.
            itemName: key to look up in ``result``.

        Returns:
            The cleaned value. If any step fails, a warning is logged and the
            value reached so far is returned ("" when the lookup itself
            failed).
        """
        resultValue = ""

        try:
            resultValue = result[itemName][0]
            resultValue = util.html_unescape(resultValue)
            resultValue = resultValue.strip()
            # unescape ugly html encoding from websites
            resultValue = HTMLParser().unescape(resultValue)

        except Exception as e:
            log.warn("Error while resolving item: {0} : {1} {2}".format(itemName, type(e), str(e)))

        try:
            log.debug("Result " + itemName + " = " + resultValue)
        except Exception:
            # Debug output is best-effort; a failure to build or emit the
            # message must not affect the result. Only Exception is caught so
            # KeyboardInterrupt/SystemExit still propagate.
            pass

        return resultValue
Example #9
Source File: parsers.py    From crestify with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def __init__(self, file_name, user_id):
        """
        Read a JSON import file and set up the state used while importing.

        Existing, non-deleted bookmarks of the user are indexed by
        ``main_url`` in ``self.check_duplicates``.
        """
        with open(file_name, 'r') as self.opened_file:
            self.data = self.opened_file.read()
        self.user = user_id
        self.data = ujson.loads(self.data)
        self.urls = {}
        self.tags_dict = {}
        self.tags_set = set()

        # NOTE(review): `== False` looks like a query-expression comparison,
        # so it must stay an equality test -- confirm against the model.
        existing_bookmarks = Bookmark.query.filter(Bookmark.user == self.user,
                                                   Bookmark.deleted == False)
        self.check_duplicates_query = existing_bookmarks.all()
        self.check_duplicates = {
            bookmark.main_url: bookmark
            for bookmark in self.check_duplicates_query
        }

        self.html_parser = HTMLParser.HTMLParser()
        # Pattern used to recognise acceptable URLs.
        self.valid_url = re.compile(
            r'^(?:[a-z0-9\.\-]*)://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
            r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
Example #10
Source File: test_htmlparser.py    From ironpython2 with Apache License 2.0 6 votes vote down vote up
def test_cdata_with_closing_tags(self):
        # Regression test for issue #13358: the body of a CDATA element must
        # reach handle_data as a single chunk. get_events is overridden so the
        # raw event list is returned as recorded.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        # Closing-tag names padded with spaces/newlines on either side.
        closing_names = [' script', 'script ', ' script ',
                         '\nscript', 'script\n', '\nscript\n']
        for closing_name in closing_names:
            markup = u'<script>{content}</{element}>'.format(
                element=closing_name, content=content)
            expected_events = [("starttag", "script", []),
                               ("data", content),
                               ("endtag", "script")]
            self._run_check(markup, expected_events, collector=Collector)
Example #11
Source File: test_htmlparser.py    From oss-ftp with MIT License 6 votes vote down vote up
def test_cdata_with_closing_tags(self):
        # Regression test for issue #13358: the body of a CDATA element must
        # reach handle_data as a single chunk. get_events is overridden so the
        # raw event list is returned as recorded.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        # Closing-tag names padded with spaces/newlines on either side.
        closing_names = [' script', 'script ', ' script ',
                         '\nscript', 'script\n', '\nscript\n']
        for closing_name in closing_names:
            markup = u'<script>{content}</{element}>'.format(
                element=closing_name, content=content)
            expected_events = [("starttag", "script", []),
                               ("data", content),
                               ("endtag", "script")]
            self._run_check(markup, expected_events, collector=Collector)
Example #12
Source File: _htmlparser.py    From weeman with GNU General Public License v3.0 6 votes vote down vote up
def feed(self, markup):
        """Parse ``markup`` with a fresh BeautifulSoupHTMLParser.

        The parser is built from ``self.parser_args`` and attached to
        ``self.soup`` before the markup is fed to it.

        Raises:
            HTMLParseError: re-raised after a RuntimeWarning is issued when
                the parser rejects the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare `raise` re-raises the active exception with its original
            # traceback and is valid syntax on both Python 2 and Python 3.
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #13
Source File: _htmlparser.py    From svg-animation-tools with MIT License 6 votes vote down vote up
def feed(self, markup):
        """Parse ``markup`` with a fresh BeautifulSoupHTMLParser.

        The parser is built from ``self.parser_args`` and attached to
        ``self.soup`` before the markup is fed to it.

        Raises:
            HTMLParseError: re-raised after a RuntimeWarning is issued when
                the parser rejects the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare `raise` re-raises the active exception with its original
            # traceback and is valid syntax on both Python 2 and Python 3.
            raise

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3. 
Example #14
Source File: twokenize.py    From geoinference with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def normalizeTextForTagger(text):
    """Return ``text`` with ``&amp;`` replaced by ``&`` and HTML entities unescaped.

    The literal ``&amp;`` replacement runs first, so the result of that pass
    is what gets handed to the HTML unescaper.
    """
    ampersands_decoded = text.replace("&amp;", "&")
    return HTMLParser.HTMLParser().unescape(ampersands_decoded)

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
# 
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text. 
Example #15
Source File: test_htmlparser.py    From oss-ftp with MIT License 5 votes vote down vote up
def _parse_error(self, source):
        """Assert that feeding and closing a plain HTMLParser on ``source`` raises HTMLParseError."""
        def feed_and_close(source=source):
            html_parser = HTMLParser.HTMLParser()
            html_parser.feed(source)
            html_parser.close()

        self.assertRaises(HTMLParser.HTMLParseError, feed_and_close)
Example #16
Source File: extract_recipe.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def close(self):
        """Finish parsing and return the assembled output text.

        After the base parser is closed and the final output calls are made,
        the collected fragments are joined and every nbsp placeholder is
        replaced, either by the real nbsp character (``unicode_snob``) or by
        a plain space.
        """
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)
        nbsp = unichr(name2cp('nbsp')) if self.unicode_snob else u' '
        self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)

        return self.outtext
Example #17
Source File: diagnose.py    From weeman with GNU General Public License v3.0 5 votes vote down vote up
def htmlparser_trace(data):
    """Feed ``data`` to an AnnouncingParser to show the HTMLParser events.

    Useful for seeing how HTMLParser handles a document when no
    Beautiful Soup code is running.
    """
    AnnouncingParser().feed(data)
Example #18
Source File: XHTMLparse.py    From pycopia with Apache License 2.0 5 votes vote down vote up
def close(self):
        """Finish parsing and hand the result over to ``self.doc``.

        Raises:
            ValidationError: if ``self.stack`` is non-empty when called.
        """
        # Checked before closing the base parser: a non-empty stack is
        # reported as unmatched tags.
        if self.stack:
            raise ValidationError("XHTML document has unmatched tags")
        HTMLParser.HTMLParser.close(self)
        # Publish the top element and the collected comments on the document.
        self.doc.set_root(self.topelement)
        self.doc.comments = self.comments
Example #19
Source File: hrefgetter.py    From pycopia with Apache License 2.0 5 votes vote down vote up
def __init__(self, writer):
        """Initialise the parser state and store the output callback.

        Args:
            writer: callable stored as ``self.writer``.

        Raises:
            ValueError: if ``writer`` is not callable.
        """
        # Validate up front so an unusable instance is never half-initialised.
        # The call form of `raise` is valid on both Python 2 and Python 3.
        if not callable(writer):
            raise ValueError("HrefGetter: writer must be callable.")
        HTMLParser.HTMLParser.__init__(self)
        self.current_href = None
        self.adata = ""
        self.state = ""
        self.writer = writer
Example #20
Source File: extract_recipe.py    From extract_recipe with Apache License 2.0 5 votes vote down vote up
def feed(self, data):
        """Feed ``data`` to the base parser after rewriting one split script-close literal to ``</ignore>``."""
        sanitized = data.replace("</' + 'script>", "</ignore>")
        HTMLParser.HTMLParser.feed(self, sanitized)
Example #21
Source File: html_linter.py    From html-linter with Apache License 2.0 5 votes vote down vote up
def get_value_line_column(tag_definition, line, column, attribute):
    """Returns the line and column of the value of the provided attribute.

    Args:
        tag_definition: str with the definition of the tag.
        line: line where the tag starts.
        column: column where the tag starts (1-based).
        attribute: str representing the attribute for which we want its value.

    Return:
       A (line, column) tuple representing the position of the value.

    Raises:
        AssertionError: if no attribute in tag_definition matches.
    """
    for match in HTMLParser.attrfind.finditer(tag_definition):
        if match.group(1).lower() == attribute:
            if not match.group(3):
                # No value: point just past the attribute name.
                pos = match.end(1)
            elif match.group(3)[0] in '"\'':
                # Quoted value: skip the opening quote.
                pos = match.start(3) + 1
            else:
                pos = match.start(3)
            return get_line_column(tag_definition, line, column, pos)

    # Raised explicitly instead of `assert False`: assert statements are
    # removed under `python -O`, which would make this silently return None.
    raise AssertionError(
        'Could not find the requested attribute %s' % attribute)


# pylint: disable=too-many-public-methods 
Example #22
Source File: test_htmlparser.py    From oss-ftp with MIT License 5 votes vote down vote up
def __init__(self):
        """Create the event list and then initialise the base HTMLParser."""
        # Recorded events accumulate here.
        self.events = []
        # Shortcut: self.append(x) appends directly to self.events.
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)
Example #23
Source File: client.py    From plugin.video.sparkle with GNU General Public License v3.0 5 votes vote down vote up
def replaceHTMLCodes(txt):
    """Return ``txt`` with HTML character references decoded.

    Numeric references that lack their terminating semicolon get one
    inserted first; after unescaping, any remaining literal ``&quot;`` and
    ``&amp;`` are replaced as well.
    """
    repaired = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    decoded = HTMLParser.HTMLParser().unescape(repaired)
    return decoded.replace("&quot;", "\"").replace("&amp;", "&")
Example #24
Source File: _htmlparser.py    From weeman with GNU General Public License v3.0 5 votes vote down vote up
def handle_charref(self, name):
        """Decode a numeric character reference into a unicode character.

        Args:
            name: the reference body; a leading ``x`` or ``X`` selects
                hexadecimal, anything else is parsed as decimal.

        Raises:
            ValueError: if ``name`` is not a valid number in the chosen base.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            # Code point outside the representable range.
            data = u"\N{REPLACEMENT CHARACTER}"
        # NOTE(review): `data` is not used after this point in the visible
        # code -- the snippet appears to be cut off here; confirm upstream.
Example #25
Source File: website.py    From plugin.video.netflix with MIT License 5 votes vote down vote up
def parse_html(html_value):
    """Parse HTML entities

    Args:
        html_value: text that may contain HTML character references.

    Returns:
        ``html_value`` with character references replaced by the characters
        they stand for.
    """
    try:  # Python 3.4+
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # the supported replacement.
        from html import unescape
    except ImportError:  # Python 2
        from HTMLParser import HTMLParser
        unescape = HTMLParser().unescape
    return unescape(html_value)
Example #26
Source File: test_htmlparser.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_unescape_function(self):
        # A malformed numeric reference is returned untouched, while a
        # zero-padded decimal reference is decoded.
        unescape = HTMLParser.HTMLParser().unescape
        self.assertEqual(unescape('&#bad;'), '&#bad;')
        self.assertEqual(unescape('&#0038;'), '&')
Example #27
Source File: test_htmlparser.py    From BinderFilter with MIT License 5 votes vote down vote up
def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        overlapping_markup = "<a><b></a></b>"
        expected_events = [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
        ]
        self._run_check(overlapping_markup, expected_events)
Example #28
Source File: test_htmlparser.py    From BinderFilter with MIT License 5 votes vote down vote up
def _parse_error(self, source):
        """Assert that feeding and closing a plain HTMLParser on ``source`` raises HTMLParseError."""
        def feed_and_close(source=source):
            html_parser = HTMLParser.HTMLParser()
            html_parser.feed(source)
            html_parser.close()

        self.assertRaises(HTMLParser.HTMLParseError, feed_and_close)
Example #29
Source File: test_htmlparser.py    From BinderFilter with MIT License 5 votes vote down vote up
def __init__(self):
        """Create the event list and then initialise the base HTMLParser."""
        # Recorded events accumulate here.
        self.events = []
        # Shortcut: self.append(x) appends directly to self.events.
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)
Example #30
Source File: html.py    From machinae with MIT License 5 votes vote down vote up
def html_unescape(content):
    """Return ``content`` with HTML character references decoded.

    Uses the ``html`` module when it can be imported and falls back to the
    Python 2 ``HTMLParser`` module otherwise.
    """
    try:
        import html
    except ImportError:
        import HTMLParser
        return HTMLParser.HTMLParser().unescape(content)
    return html.unescape(content)