Python sgmllib.SGMLParser() Examples

The following are 30 code examples of sgmllib.SGMLParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sgmllib, or try the search function.
Example #1
Source File: beautifulsoup-bk.py    From Cloudmare with GNU General Public License v3.0 6 votes vote down vote up
def parse_declaration(self, i):
        """Parse the declaration that starts at offset ``i`` of ``self.rawdata``.

        A ``<![CDATA[`` section is handed to ``self._toStringSubclass`` as a
        ``CData`` object.  Any other declaration is delegated to
        ``sgmllib.SGMLParser.parse_declaration``; if that raises
        ``sgmllib.SGMLParseError`` the rest of the buffer is passed to
        ``handle_data`` as raw text.

        Returns the offset at which parsing should resume.
        """
        cdata_open = '<![CDATA['
        if self.rawdata[i:i+9] != cdata_open:
            try:
                return sgmllib.SGMLParser.parse_declaration(self, i)
            except sgmllib.SGMLParseError:
                # Bogus declaration: emit everything that is left as data.
                remainder = self.rawdata[i:]
                self.handle_data(remainder)
                return i + len(remainder)

        end = self.rawdata.find(']]>', i)
        if end == -1:
            # No terminator: the section runs to the end of the buffer.
            end = len(self.rawdata)
        self._toStringSubclass(self.rawdata[i+9:end], CData)
        return end + 3
Example #2
Source File: beautifulsoup-bk.py    From Cloudmare with GNU General Public License v3.0 5 votes vote down vote up
def convert_charref(self, name):
        """Convert a numeric character reference, restricted to ASCII.

        The original author describes this override as a fix for a bug in
        Python's SGMLParser.  Returns ``None`` when ``name`` is not an
        integer literal or lies outside 0-127; otherwise returns the result
        of ``self.convert_codepoint`` for that number.
        """
        try:
            codepoint = int(name)
        except ValueError:
            return None
        if codepoint < 0 or codepoint > 127:  # ASCII ends at 127, not 255
            return None
        return self.convert_codepoint(codepoint)
Example #3
Source File: tags.py    From razzy-spinner with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self):
		"""Initialise the instance by running ``sgmllib.SGMLParser.__init__``."""
		sgmllib.SGMLParser.__init__(self) 
Example #4
Source File: feedparser.py    From incremental-reading with ISC License 5 votes vote down vote up
def feed(self, data):
        """Pre-process ``data``, feed it to the SGML parser and close it.

        Steps, in order:
          * rewrite ``<!`` as ``&lt;!`` unless it starts a DOCTYPE, a comment
            (``--``) or a ``[`` section;
          * pass short tags such as ``<br/>`` to ``self._shorttag_replace``;
          * replace the ``&#39;`` and ``&#34;`` references with literal quotes;
          * adjust ``self.encoding`` or encode ``data`` (see the probe below);
          * call ``sgmllib.SGMLParser.feed`` and then ``close``.
        """
        bogus_declaration = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE)
        data = bogus_declaration.sub(r'&lt;!\1', data)
        short_tag = r'<([^<>\s]+?)\s*/>'
        data = re.sub(short_tag, self._shorttag_replace, data)
        data = data.replace('&#39;', "'").replace('&#34;', '"')
        try:
            # Probe: a ``bytes`` builtin distinct from ``str`` means the
            # encoding name gets a suffix; otherwise fall through to the
            # NameError branch.  NOTE(review): the intent of the suffix is
            # not visible here -- confirm against callers.
            bytes
            if bytes is str:
                raise NameError
            self.encoding = self.encoding + '_INVALID_PYTHON_3'
        except NameError:
            # ``bytes`` is missing or is just an alias of ``str``.
            if self.encoding and isinstance(data, str):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)
Example #5
Source File: beautifulsoup-bk.py    From Cloudmare with GNU General Public License v3.0 5 votes vote down vote up
def __getattr__(self, methodName):
        """Resolve a missing attribute through SGMLParser or Tag.

        Names beginning with ``start_``, ``end_`` or ``do_`` are looked up via
        ``sgmllib.SGMLParser.__getattr__``.  Other names go to
        ``Tag.__getattr__`` unless they begin with a double underscore, in
        which case ``AttributeError`` is raised.
        """
        if methodName.startswith(('start_', 'end_', 'do_')):
            return sgmllib.SGMLParser.__getattr__(self, methodName)
        if methodName.startswith('__'):
            raise AttributeError
        return Tag.__getattr__(self, methodName)
Example #6
Source File: beautifulsoup-bk.py    From Cloudmare with GNU General Public License v3.0 5 votes vote down vote up
def reset(self):
        """Reinitialise the object as a hidden root tag with empty parse state.

        Runs ``Tag.__init__`` with this object as its own parser and
        ``ROOT_TAG_NAME`` as the name, resets the SGML parser, clears the
        data/tag/quote bookkeeping and finally pushes this object as the
        first tag.
        """
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        sgmllib.SGMLParser.reset(self)
        self.currentTag = None
        self.currentData = []
        self.tagStack, self.quoteStack = [], []
        self.pushTag(self)
Example #7
Source File: beautifulsoup-bk.py    From Cloudmare with GNU General Public License v3.0 5 votes vote down vote up
def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert ``self.markup`` to text, massage it and parse it.

        Markup that is not already ``text_type`` is run through
        ``UnicodeDammit`` (given ``self.fromEncoding`` and
        ``inDocumentEncoding`` as candidate encodings) and the encodings it
        reports are stored on the instance.  Non-empty markup then has each
        ``(pattern, replacement)`` pair of ``self.markupMassage`` applied,
        after which that attribute is deleted.  Finally the object is reset,
        the markup is fed to the SGML parser, and pending data and open tags
        are closed.
        """
        markup = self.markup
        if not isinstance(markup, text_type):
            converter = UnicodeDammit(
                markup, [self.fromEncoding, inDocumentEncoding],
                smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = converter.unicode
            self.originalEncoding = converter.originalEncoding
            self.declaredHTMLEncoding = converter.declaredHTMLEncoding
        elif not hasattr(self, 'originalEncoding'):
            self.originalEncoding = None

        if markup and self.markupMassage:
            if not hasattr(self.markupMassage, "__iter__"):
                # A non-iterable value selects the class-level defaults.
                self.markupMassage = self.MARKUP_MASSAGE
            for pattern, replacement in self.markupMassage:
                markup = pattern.sub(replacement, markup)
            # TODO: We get rid of markupMassage so that the
            # soup object can be deepcopied later on. Some
            # Python installations can't copy regexes. If anyone
            # was relying on the existence of markupMassage, this
            # might cause problems.
            del self.markupMassage

        self.reset()
        sgmllib.SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
Example #8
Source File: clientform.py    From POC-EXP with GNU General Public License v3.0 5 votes vote down vote up
def feed(self, data):
        """Feed ``data`` to the SGML parser, translating parse failures.

        Raises:
            ParseError: wraps any ``SGMLLIB_PARSEERROR`` raised by
                ``sgmllib.SGMLParser.feed``.
        """
        try:
            sgmllib.SGMLParser.feed(self, data)
        except SGMLLIB_PARSEERROR as exc:
            # ``except X as e`` parses on Python 2.6+ and Python 3; the old
            # ``except X, e`` spelling is a SyntaxError on Python 3.
            raise ParseError(exc)
Example #9
Source File: clientform.py    From POC-EXP with GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        """Run the ``SGMLParser`` and ``_AbstractFormParser`` initialisers.

        ``entitydefs`` and ``encoding`` are forwarded unchanged to
        ``_AbstractFormParser.__init__``; ``encoding`` defaults to the
        module-level ``DEFAULT_ENCODING``.
        """
        sgmllib.SGMLParser.__init__(self)
        _AbstractFormParser.__init__(self, entitydefs, encoding) 
Example #10
Source File: feedparser.py    From incremental-reading with ISC License 5 votes vote down vote up
def __init__(self, baseuri, baselang, encoding, entities):
        """Run the three parent initialisers and store ``entities``.

        ``baseuri``, ``baselang`` and ``encoding`` go to
        ``_FeedParserMixin.__init__``; ``_BaseHTMLProcessor.__init__`` gets
        ``encoding`` together with the fixed type ``'application/xhtml+xml'``.
        """
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        # Kept on the instance as given; not interpreted here.
        self.entities=entities 
Example #11
Source File: feedparser.py    From incremental-reading with ISC License 5 votes vote down vote up
def parse_declaration(self, i):
        """Parse the declaration at offset ``i``, tolerating malformed ones.

        Delegates to ``sgmllib.SGMLParser.parse_declaration``.  If that
        raises ``sgmllib.SGMLParseError``, an escaped ``&lt;`` is emitted as
        data and parsing resumes one character further on.

        Returns the offset at which parsing should continue.
        """
        try:
            resume_at = sgmllib.SGMLParser.parse_declaration(self, i)
        except sgmllib.SGMLParseError:
            # escape the doctype declaration and continue parsing
            self.handle_data('&lt;')
            resume_at = i + 1
        return resume_at
Example #12
Source File: clientform.py    From POC-EXP with GNU General Public License v3.0 5 votes vote down vote up
def close(self):
        """Close the SGML parser, then call ``self.end_body()``."""
        sgmllib.SGMLParser.close(self)
        self.end_body()


# sigh, must support mechanize by allowing dynamic creation of classes based on
# its bundled copy of BeautifulSoup (which was necessary because of dependency
# problems) 
Example #13
Source File: htmllib.py    From medicare-demo with Apache License 2.0 5 votes vote down vote up
def reset(self):
        """Reset the SGML parser and restore this parser's own attributes.

        After the base-class reset, ``savedata``, ``title``, ``base`` and
        ``anchor`` are ``None``, ``isindex`` and ``nofill`` are ``0``, and
        ``anchorlist`` and ``list_stack`` are fresh empty lists.
        """
        sgmllib.SGMLParser.reset(self)
        # Immutable initial values can safely share one chained assignment.
        self.savedata = self.title = self.base = self.anchor = None
        self.isindex = self.nofill = 0
        # Each list attribute needs its own object.
        self.anchorlist = []
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden 
Example #14
Source File: htmllib.py    From medicare-demo with Apache License 2.0 5 votes vote down vote up
def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser; it is stored as ``self.formatter``.  The verbose
        parameter is passed through to ``sgmllib.SGMLParser.__init__``.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter 
Example #15
Source File: htmllib.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def reset(self):
        """Reset the SGML parser and restore this parser's own attributes.

        After the base-class reset, ``savedata``, ``title``, ``base`` and
        ``anchor`` are ``None``, ``isindex`` and ``nofill`` are ``0``, and
        ``anchorlist`` and ``list_stack`` are fresh empty lists.
        """
        sgmllib.SGMLParser.reset(self)
        # Immutable initial values can safely share one chained assignment.
        self.savedata = self.title = self.base = self.anchor = None
        self.isindex = self.nofill = 0
        # Each list attribute needs its own object.
        self.anchorlist = []
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden 
Example #16
Source File: htmllib.py    From Splunking-Crime with GNU Affero General Public License v3.0 5 votes vote down vote up
def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser; it is stored as ``self.formatter``.  The verbose
        parameter is passed through to ``sgmllib.SGMLParser.__init__``.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter 
Example #17
Source File: parse.py    From flo with MIT License 5 votes vote down vote up
def __init__(self, writer):
        """Initialise the SGML parser and this parser's own state.

        ``writer`` is stored as ``self.writer``; it is not used in this
        method.
        """
        sgmllib.SGMLParser.__init__(self)
        self.writer = writer
        # Initial state: no document id, not inside a body, no text yet.
        self.document_id = None
        self.in_body = False
        self.text = '' 
Example #18
Source File: htmllib.py    From datafari with Apache License 2.0 5 votes vote down vote up
def reset(self):
        """Reset the SGML parser and restore this parser's own attributes.

        After the base-class reset, ``savedata``, ``title``, ``base`` and
        ``anchor`` are ``None``, ``isindex`` and ``nofill`` are ``0``, and
        ``anchorlist`` and ``list_stack`` are fresh empty lists.
        """
        sgmllib.SGMLParser.reset(self)
        # Immutable initial values can safely share one chained assignment.
        self.savedata = self.title = self.base = self.anchor = None
        self.isindex = self.nofill = 0
        # Each list attribute needs its own object.
        self.anchorlist = []
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden 
Example #19
Source File: htmllib.py    From datafari with Apache License 2.0 5 votes vote down vote up
def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser; it is stored as ``self.formatter``.  The verbose
        parameter is passed through to ``sgmllib.SGMLParser.__init__``.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter 
Example #20
Source File: webchecker.py    From datafari with Apache License 2.0 5 votes vote down vote up
def __init__(self, url, verbose=VERBOSE, checker=None):
        """Store the arguments, start with empty link state, then init the parser.

        ``url`` and ``checker`` are kept as given.  ``base`` starts as
        ``None``, ``links`` as an empty dict and ``names`` as an empty list.
        The ``sgmllib.SGMLParser`` initialiser runs last, after every
        attribute has been set.
        """
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self) 
Example #21
Source File: devilzc0de.py    From d4rkc0de with GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, verbose=0): 
        """Initialise an object, passing 'verbose' to the superclass.

        Also starts ``self.hyperlinks`` as an empty list.
        """
 
        sgmllib.SGMLParser.__init__(self, verbose) 
        self.hyperlinks = [] 
Example #22
Source File: reuters.py    From SCDV with MIT License 5 votes vote down vote up
def __init__(self, verbose=0):
        """Initialise the SGML parser with ``verbose``, then call ``self._reset()``."""
        sgmllib.SGMLParser.__init__(self, verbose)
        self._reset() 
Example #23
Source File: feedparser.py    From xbmc-addons-chinese with GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, baseuri, baselang, encoding, entities):
        """Run the three parent initialisers and store ``entities``.

        ``baseuri``, ``baselang`` and ``encoding`` go to
        ``_FeedParserMixin.__init__``; ``_BaseHTMLProcessor.__init__`` gets
        ``encoding`` together with the fixed type ``'application/xhtml+xml'``.
        """
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
        # Kept on the instance as given; not interpreted here.
        self.entities=entities 
Example #24
Source File: feedparser.py    From xbmc-addons-chinese with GNU General Public License v2.0 5 votes vote down vote up
def parse_declaration(self, i):
        """Parse the declaration at offset ``i``, tolerating malformed ones.

        Delegates to ``sgmllib.SGMLParser.parse_declaration``.  If that
        raises ``sgmllib.SGMLParseError``, an escaped ``&lt;`` is emitted as
        data and parsing resumes one character further on.

        Returns the offset at which parsing should continue.
        """
        try:
            resume_at = sgmllib.SGMLParser.parse_declaration(self, i)
        except sgmllib.SGMLParseError:
            # escape the doctype declaration and continue parsing
            self.handle_data('&lt;')
            resume_at = i + 1
        return resume_at
Example #25
Source File: feedparser.py    From xbmc-addons-chinese with GNU General Public License v2.0 5 votes vote down vote up
def feed(self, data):
        """Pre-process ``data``, feed it to the SGML parser and close it.

        Steps, in order:
          * rewrite ``<!`` as ``&lt;!`` unless it starts a DOCTYPE, a comment
            (``--``) or a ``[`` section;
          * pass short tags such as ``<br/>`` to ``self._shorttag_replace``;
          * replace the ``&#39;`` and ``&#34;`` references with literal quotes;
          * adjust ``self.encoding`` or encode ``data`` (see the probe below);
          * call ``sgmllib.SGMLParser.feed`` and then ``close``.
        """
        bogus_declaration = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE)
        data = bogus_declaration.sub(r'&lt;!\1', data)
        short_tag = r'<([^<>\s]+?)\s*/>'
        data = re.sub(short_tag, self._shorttag_replace, data)
        data = data.replace('&#39;', "'").replace('&#34;', '"')
        try:
            # Probe: a ``bytes`` builtin distinct from ``str`` means the
            # encoding name gets a suffix; otherwise fall through to the
            # NameError branch.  NOTE(review): the intent of the suffix is
            # not visible here -- confirm against callers.
            bytes
            if bytes is str:
                raise NameError
            self.encoding = self.encoding + u'_INVALID_PYTHON_3'
        except NameError:
            # ``bytes`` is missing or is just an alias of ``str``.
            if self.encoding and isinstance(data, unicode):
                data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
        sgmllib.SGMLParser.close(self)
Example #26
Source File: feedparser.py    From xbmc-addons-chinese with GNU General Public License v2.0 5 votes vote down vote up
def reset(self):
        """Empty ``self.pieces`` and reset the underlying SGML parser."""
        self.pieces = []
        sgmllib.SGMLParser.reset(self) 
Example #27
Source File: feedparser.py    From xbmc-addons-chinese with GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, encoding, _type):
        """Store ``encoding`` and ``_type``, then initialise the SGML parser.

        Both attributes are assigned before ``sgmllib.SGMLParser.__init__``
        runs.
        """
        self.encoding = encoding
        self._type = _type
        sgmllib.SGMLParser.__init__(self) 
Example #28
Source File: feedparser.py    From pyrobotlab with Apache License 2.0 5 votes vote down vote up
def __init__(self, baseuri, baselang, encoding):
        """Run the ``SGMLParser`` initialiser, then ``_FeedParserMixin``'s.

        ``baseuri``, ``baselang`` and ``encoding`` are forwarded unchanged to
        ``_FeedParserMixin.__init__``.
        """
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding) 
Example #29
Source File: feedparser.py    From pyrobotlab with Apache License 2.0 5 votes vote down vote up
def feed(self, data):
        """Pre-process ``data`` and feed it to the SGML parser.

        Steps, in order:
          * rewrite ``<!`` as ``&lt;!`` unless it starts a DOCTYPE, a comment
            (``--``) or a ``[`` section;
          * pass short tags such as ``<br/>`` to ``self._shorttag_replace``;
          * replace the ``&#39;`` and ``&#34;`` references with literal quotes;
          * encode unicode input with ``self.encoding`` when one is set;
          * call ``sgmllib.SGMLParser.feed``.
        """
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        # The tag-name pattern excludes '<'; the earlier ``\S+?`` form was
        # reported as bug 1399464 ("Bad regexp for _shorttag_replace").
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        # isinstance also covers subclasses of the unicode type, which an
        # exact ``type(...) ==`` comparison would leave unencoded.
        if self.encoding and isinstance(data, type(u'')):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)
Example #30
Source File: feedparser.py    From pyrobotlab with Apache License 2.0 5 votes vote down vote up
def reset(self):
        """Empty ``self.pieces`` and reset the underlying SGML parser."""
        self.pieces = []
        sgmllib.SGMLParser.reset(self)