Python sgmllib.SGMLParser.__init__() Examples
The following are 30 code examples of sgmllib.SGMLParser.__init__().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sgmllib.SGMLParser, or try the search function.
Example #1
Source File: uspto-sgml-parser.py From uspto-patents-parsing-tools with MIT License | 6 votes |
def __init__(self):
    """Initialise the SGML parser base and this parser's extraction state.

    Each section of interest gets an integer state flag (initially 0) and,
    except for ``readable``, a text buffer (initially the empty string).
    """
    # The second positional argument is handed straight to the base
    # initialiser (sgmllib names that parameter ``verbose``).
    SGMLParser.__init__(self, False)

    # gather data
    self.information = []

    # document id: state flag and buffer
    self.document_id = 0
    self.docid_buffer = ''

    # date: state flag and buffer
    self.date = 0
    self.date_buffer = ''

    # IPC data: state flag and buffer
    self.ipc_data = 0
    self.ipc_buffer = ''

    # USC data: state flag and buffer
    self.usc_data = 0
    self.usc_buffer = ''

    # title: state flag and buffer
    self.title = 0
    self.title_buffer = ''

    # inventor: state flag and buffer
    self.inventor = 0
    self.inventor_buffer = ''

    # assignee: state flag and buffer
    self.assignee = 0
    self.assignee_buffer = ''

    # abstract: state flag and buffer
    self.abstract = 0
    self.abstract_buffer = ''

    # state flag without a buffer of its own
    self.readable = 0
Example #2
Source File: sgml.py From learn_python3_spider with MIT License | 6 votes |
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
             process_value=None, deny_extensions=None, restrict_css=(), strip=True,
             restrict_text=()):
    """Build a deprecated SgmlLinkExtractor.

    Emits a ScrapyDeprecationWarning, wraps the requested ``tags`` and
    ``attrs`` in membership predicates, creates a BaseSgmlLinkExtractor
    from them and forwards it, together with the filtering options, to the
    parent initialiser.
    """
    warnings.warn(
        "SgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning, stacklevel=2,
    )

    tag_set = set(arg_to_iter(tags))
    attr_set = set(arg_to_iter(attrs))

    def wants_tag(candidate):
        return candidate in tag_set

    def wants_attr(candidate):
        return candidate in attr_set

    with warnings.catch_warnings():
        # Ignore ScrapyDeprecationWarning while the inner extractor is
        # being constructed; the warning above has already been issued.
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        extractor = BaseSgmlLinkExtractor(
            tag=wants_tag,
            attr=wants_attr,
            unique=unique,
            process_value=process_value,
            strip=strip,
            canonicalized=canonicalize,
        )

    super(SgmlLinkExtractor, self).__init__(
        extractor,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        restrict_css=restrict_css,
        canonicalize=canonicalize,
        deny_extensions=deny_extensions,
        restrict_text=restrict_text,
    )
Example #3
Source File: sgml.py From learn_python3_spider with MIT License | 6 votes |
def __init__(self, tag="a", attr="href", unique=False, process_value=None,
             strip=True, canonicalized=False):
    """Build a deprecated BaseSgmlLinkExtractor.

    ``tag`` and ``attr`` may each be a callable predicate or a plain value
    that is compared for equality.  ``process_value`` defaults to the
    identity function.  ``canonicalized`` selects whether ``link_key``
    returns the link URL as-is or passes it through ``canonicalize_url``.
    """
    warnings.warn(
        "BaseSgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning, stacklevel=2,
    )
    SGMLParser.__init__(self)

    def same_tag(candidate):
        return candidate == tag

    def same_attr(candidate):
        return candidate == attr

    def identity(value):
        return value

    def raw_url(link):
        return link.url

    def canonical_url(link):
        return canonicalize_url(link.url, keep_fragments=True)

    if callable(tag):
        self.scan_tag = tag
    else:
        self.scan_tag = same_tag

    if callable(attr):
        self.scan_attr = attr
    else:
        self.scan_attr = same_attr

    if process_value is None:
        self.process_value = identity
    else:
        self.process_value = process_value

    self.current_link = None
    self.unique = unique
    self.strip = strip
    # URLs flagged as already canonicalized are used verbatim as keys.
    self.link_key = raw_url if canonicalized else canonical_url
Example #4
Source File: sgml.py From learn_python3_spider with MIT License | 6 votes |
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
             process_value=None, deny_extensions=None, restrict_css=(), strip=True,
             restrict_text=()):
    """Build a deprecated SgmlLinkExtractor.

    Emits a ScrapyDeprecationWarning, wraps the requested ``tags`` and
    ``attrs`` in membership predicates, creates a BaseSgmlLinkExtractor
    from them and forwards it, together with the filtering options, to the
    parent initialiser.
    """
    warnings.warn(
        "SgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning, stacklevel=2,
    )

    tag_set = set(arg_to_iter(tags))
    attr_set = set(arg_to_iter(attrs))

    def wants_tag(candidate):
        return candidate in tag_set

    def wants_attr(candidate):
        return candidate in attr_set

    with warnings.catch_warnings():
        # Ignore ScrapyDeprecationWarning while the inner extractor is
        # being constructed; the warning above has already been issued.
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        extractor = BaseSgmlLinkExtractor(
            tag=wants_tag,
            attr=wants_attr,
            unique=unique,
            process_value=process_value,
            strip=strip,
            canonicalized=canonicalize,
        )

    super(SgmlLinkExtractor, self).__init__(
        extractor,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        restrict_css=restrict_css,
        canonicalize=canonicalize,
        deny_extensions=deny_extensions,
        restrict_text=restrict_text,
    )
Example #5
Source File: sgml.py From learn_python3_spider with MIT License | 6 votes |
def __init__(self, tag="a", attr="href", unique=False, process_value=None,
             strip=True, canonicalized=False):
    """Build a deprecated BaseSgmlLinkExtractor.

    ``tag`` and ``attr`` may each be a callable predicate or a plain value
    that is compared for equality.  ``process_value`` defaults to the
    identity function.  ``canonicalized`` selects whether ``link_key``
    returns the link URL as-is or passes it through ``canonicalize_url``.
    """
    warnings.warn(
        "BaseSgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning, stacklevel=2,
    )
    SGMLParser.__init__(self)

    def same_tag(candidate):
        return candidate == tag

    def same_attr(candidate):
        return candidate == attr

    def identity(value):
        return value

    def raw_url(link):
        return link.url

    def canonical_url(link):
        return canonicalize_url(link.url, keep_fragments=True)

    if callable(tag):
        self.scan_tag = tag
    else:
        self.scan_tag = same_tag

    if callable(attr):
        self.scan_attr = attr
    else:
        self.scan_attr = same_attr

    if process_value is None:
        self.process_value = identity
    else:
        self.process_value = process_value

    self.current_link = None
    self.unique = unique
    self.strip = strip
    # URLs flagged as already canonicalized are used verbatim as keys.
    self.link_key = raw_url if canonicalized else canonical_url
Example #6
Source File: BeautifulSoup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def reset(self):
    """Put the object back into its initial, empty parsing state.

    Re-runs ``Tag.__init__`` on this object (passing the object itself and
    ``ROOT_TAG_NAME``), marks it hidden, resets the SGMLParser base, clears
    the bookkeeping containers and finally pushes the object as a tag.
    """
    root_name = self.ROOT_TAG_NAME
    Tag.__init__(self, self, root_name)
    self.hidden = 1

    SGMLParser.reset(self)

    # Fresh, independent containers for every reset.
    self.currentData, self.currentTag = [], None
    self.tagStack, self.quoteStack = [], []

    # Must come last: runs after the stacks above have been recreated.
    self.pushTag(self)
Example #7
Source File: BeautifulSoup.py From yalih with Apache License 2.0 | 5 votes |
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Store the name, attribute and text criteria on the instance.

    A string ``attrs`` is treated as a CSS class: it is converted with
    ``_match_css_class`` and stored under the ``'class'`` keyword.  Extra
    keyword arguments are merged into a copy of ``attrs`` (or used on
    their own when ``attrs`` is empty).
    """
    self.name = name

    if isinstance(attrs, basestring):
        kwargs['class'] = _match_css_class(attrs)
        attrs = None

    if kwargs and attrs:
        # Merge into a copy so the caller's mapping is left untouched.
        criteria = attrs.copy()
        criteria.update(kwargs)
    elif kwargs:
        criteria = kwargs
    else:
        criteria = attrs

    self.attrs = criteria
    self.text = text
Example #8
Source File: BeautifulSoup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialise via ``BeautifulStoneSoup.__init__`` with HTML defaults.

    ``smartQuotesTo`` defaults to ``self.HTML_ENTITIES`` unless the caller
    supplied it, and ``isHTML`` is always forced to ``True``.  All other
    positional and keyword arguments are forwarded unchanged.
    """
    # The ``in`` operator replaces dict.has_key(), which no longer exists
    # on Python 3 dictionaries; the result is the same on Python 2.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
Example #9
Source File: BeautifulSoup.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, parser, name, attrs=None, parent=None, previous=None):
    """Basic constructor.

    ``parser`` supplies ``isSelfClosingTag(name)`` and the three entity
    conversion settings copied below.  ``attrs`` may be ``None``, a dict
    or a sequence of ``(key, value)`` pairs; it is stored as a list of
    pairs with entity references in each value run through
    ``self._convertEntities``.  ``parent`` and ``previous`` are passed to
    ``self.setup``.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs is None:
        attrs = []
    elif isinstance(attrs, dict):
        attrs = attrs.items()
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    # Convert any HTML, XML, or numeric entities in the attribute values.
    # A comprehension replaces the tuple-parameter lambda (a syntax error
    # on Python 3) and always yields a list; the raw string keeps the
    # regex escapes literal.
    entity_pattern = r"&(#\d+|#x[0-9a-fA-F]+|\w+);"
    self.attrs = [
        (key, re.sub(entity_pattern, self._convertEntities, val))
        for key, val in self.attrs
    ]
Example #10
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 5 votes |
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Store the name, attribute and text criteria on the instance.

    A string ``attrs`` is treated as a CSS class: it is converted with
    ``_match_css_class`` and stored under the ``'class'`` keyword.  Extra
    keyword arguments are merged into a copy of ``attrs`` (or used on
    their own when ``attrs`` is empty).
    """
    self.name = name

    if isinstance(attrs, basestring):
        kwargs['class'] = _match_css_class(attrs)
        attrs = None

    if kwargs and attrs:
        # Merge into a copy so the caller's mapping is left untouched.
        criteria = attrs.copy()
        criteria.update(kwargs)
    elif kwargs:
        criteria = kwargs
    else:
        criteria = attrs

    self.attrs = criteria
    self.text = text
Example #11
Source File: BeautifulSoup.py From yalih with Apache License 2.0 | 5 votes |
def __init__(self, source):
    """Create an empty list that remembers ``source``.

    ``source`` is stored unchanged on the ``source`` attribute.
    """
    # Initialise this instance rather than a throwaway ``[]``, which is
    # what the original call did.  Assumes the enclosing class derives
    # from ``list``, as the base-initialiser call implies.
    list.__init__(self)
    self.source = source

# Now, some helper functions.
Example #12
Source File: BeautifulSoup.py From yalih with Apache License 2.0 | 5 votes |
def reset(self):
    """Put the object back into its initial, empty parsing state.

    Re-runs ``Tag.__init__`` on this object (passing the object itself and
    ``ROOT_TAG_NAME``), marks it hidden, resets the SGMLParser base, clears
    the bookkeeping containers and finally pushes the object as a tag.
    """
    root_name = self.ROOT_TAG_NAME
    Tag.__init__(self, self, root_name)
    self.hidden = 1

    SGMLParser.reset(self)

    # Fresh, independent containers for every reset.
    self.currentData, self.currentTag = [], None
    self.tagStack, self.quoteStack = [], []

    # Must come last: runs after the stacks above have been recreated.
    self.pushTag(self)
Example #13
Source File: BeautifulSoup.py From yalih with Apache License 2.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialise via ``BeautifulStoneSoup.__init__`` with HTML defaults.

    ``smartQuotesTo`` defaults to ``self.HTML_ENTITIES`` unless the caller
    supplied it, and ``isHTML`` is always forced to ``True``.  All other
    positional and keyword arguments are forwarded unchanged.
    """
    # The ``in`` operator replaces dict.has_key(), which no longer exists
    # on Python 3 dictionaries; the result is the same on Python 2.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
Example #14
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 5 votes |
def __init__(self, parser, name, attrs=None, parent=None, previous=None):
    """Basic constructor.

    ``parser`` supplies ``isSelfClosingTag(name)`` and the three entity
    conversion settings copied below.  ``attrs`` may be ``None``, a dict
    or a sequence of ``(key, value)`` pairs; it is stored as a list of
    pairs with entity references in each value run through
    ``self._convertEntities``.  ``parent`` and ``previous`` are passed to
    ``self.setup``.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs is None:
        attrs = []
    elif isinstance(attrs, dict):
        attrs = attrs.items()
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    # Convert any HTML, XML, or numeric entities in the attribute values.
    # A comprehension replaces the tuple-parameter lambda (a syntax error
    # on Python 3) and always yields a list; the raw string keeps the
    # regex escapes literal.
    entity_pattern = r"&(#\d+|#x[0-9a-fA-F]+|\w+);"
    self.attrs = [
        (key, re.sub(entity_pattern, self._convertEntities, val))
        for key, val in self.attrs
    ]
Example #15
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 5 votes |
def reset(self):
    """Put the object back into its initial, empty parsing state.

    Re-runs ``Tag.__init__`` on this object (passing the object itself and
    ``ROOT_TAG_NAME``), marks it hidden, resets the SGMLParser base, clears
    the bookkeeping containers and finally pushes the object as a tag.
    """
    root_name = self.ROOT_TAG_NAME
    Tag.__init__(self, self, root_name)
    self.hidden = 1

    SGMLParser.reset(self)

    # Fresh, independent containers for every reset.
    self.currentData, self.currentTag = [], None
    self.tagStack, self.quoteStack = [], []

    # Must come last: runs after the stacks above have been recreated.
    self.pushTag(self)
Example #16
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 5 votes |
def __init__(self, parser, name, attrs=None, parent=None, previous=None):
    """Basic constructor.

    ``parser`` supplies ``isSelfClosingTag(name)`` and the three entity
    conversion settings copied below.  ``attrs`` may be ``None``, a dict
    or a sequence of ``(key, value)`` pairs; it is stored as a list of
    pairs with entity references in each value run through
    ``self._convertEntities``.  ``parent`` and ``previous`` are passed to
    ``self.setup``.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs is None:
        attrs = []
    elif isinstance(attrs, dict):
        attrs = attrs.items()
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    # Convert any HTML, XML, or numeric entities in the attribute values.
    # A comprehension replaces the tuple-parameter lambda (a syntax error
    # on Python 3) and always yields a list; the raw string keeps the
    # regex escapes literal.
    entity_pattern = r"&(#\d+|#x[0-9a-fA-F]+|\w+);"
    self.attrs = [
        (key, re.sub(entity_pattern, self._convertEntities, val))
        for key, val in self.attrs
    ]
Example #17
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialise via ``BeautifulStoneSoup.__init__`` with HTML defaults.

    ``smartQuotesTo`` defaults to ``self.HTML_ENTITIES`` unless the caller
    supplied it, and ``isHTML`` is always forced to ``True``.  All other
    positional and keyword arguments are forwarded unchanged.
    """
    # The ``in`` operator replaces dict.has_key(), which no longer exists
    # on Python 3 dictionaries; the result is the same on Python 2.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
Example #18
Source File: BeautifulSoup.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Store the name, attribute and text criteria on the instance.

    A string ``attrs`` is treated as a CSS class: it is converted with
    ``_match_css_class`` and stored under the ``'class'`` keyword.  Extra
    keyword arguments are merged into a copy of ``attrs`` (or used on
    their own when ``attrs`` is empty).
    """
    self.name = name

    if isinstance(attrs, basestring):
        kwargs['class'] = _match_css_class(attrs)
        attrs = None

    if kwargs and attrs:
        # Merge into a copy so the caller's mapping is left untouched.
        criteria = attrs.copy()
        criteria.update(kwargs)
    elif kwargs:
        criteria = kwargs
    else:
        criteria = attrs

    self.attrs = criteria
    self.text = text
Example #19
Source File: BeautifulSoup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def __init__(self, source):
    """Create an empty list that remembers ``source``.

    ``source`` is stored unchanged on the ``source`` attribute.
    """
    # Initialise this instance rather than a throwaway ``[]``, which is
    # what the original call did.  Assumes the enclosing class derives
    # from ``list``, as the base-initialiser call implies.
    list.__init__(self)
    self.source = source

# Now, some helper functions.
Example #20
Source File: BeautifulSoup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Store the name, attribute and text criteria on the instance.

    A string ``attrs`` is treated as a CSS class: it is converted with
    ``_match_css_class`` and stored under the ``'class'`` keyword.  Extra
    keyword arguments are merged into a copy of ``attrs`` (or used on
    their own when ``attrs`` is empty).
    """
    self.name = name

    if isinstance(attrs, basestring):
        kwargs['class'] = _match_css_class(attrs)
        attrs = None

    if kwargs and attrs:
        # Merge into a copy so the caller's mapping is left untouched.
        criteria = attrs.copy()
        criteria.update(kwargs)
    elif kwargs:
        criteria = kwargs
    else:
        criteria = attrs

    self.attrs = criteria
    self.text = text
Example #21
Source File: BeautifulSoup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 5 votes |
def __init__(self, parser, name, attrs=None, parent=None, previous=None):
    """Basic constructor.

    ``parser`` supplies ``isSelfClosingTag(name)`` and the three entity
    conversion settings copied below.  ``attrs`` may be ``None``, a dict
    or a sequence of ``(key, value)`` pairs; it is stored as a list of
    pairs with entity references in each value run through
    ``self._convertEntities``.  ``parent`` and ``previous`` are passed to
    ``self.setup``.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs is None:
        attrs = []
    elif isinstance(attrs, dict):
        attrs = attrs.items()
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    # Convert any HTML, XML, or numeric entities in the attribute values.
    # A comprehension replaces the tuple-parameter lambda (a syntax error
    # on Python 3) and always yields a list; the raw string keeps the
    # regex escapes literal.
    entity_pattern = r"&(#\d+|#x[0-9a-fA-F]+|\w+);"
    self.attrs = [
        (key, re.sub(entity_pattern, self._convertEntities, val))
        for key, val in self.attrs
    ]
Example #22
Source File: nuclesgmlparser.py From m2scorer with GNU General Public License v2.0 | 5 votes |
def __init__(self):
    """Initialise the SGMLParser base and start with an empty ``docs`` list."""
    SGMLParser.__init__(self)
    # Starts empty on every new instance.
    self.docs = []
Example #23
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialise via ``BeautifulStoneSoup.__init__`` with HTML defaults.

    ``smartQuotesTo`` defaults to ``self.HTML_ENTITIES`` unless the caller
    supplied it, and ``isHTML`` is always forced to ``True``.  All other
    positional and keyword arguments are forwarded unchanged.
    """
    # The ``in`` operator replaces dict.has_key(), which no longer exists
    # on Python 3 dictionaries; the result is the same on Python 2.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
Example #24
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def reset(self):
    """Put the object back into its initial, empty parsing state.

    Re-runs ``Tag.__init__`` on this object (passing the object itself and
    ``ROOT_TAG_NAME``), marks it hidden, resets the SGMLParser base, clears
    the bookkeeping containers and finally pushes the object as a tag.
    """
    root_name = self.ROOT_TAG_NAME
    Tag.__init__(self, self, root_name)
    self.hidden = 1

    SGMLParser.reset(self)

    # Fresh, independent containers for every reset.
    self.currentData, self.currentTag = [], None
    self.tagStack, self.quoteStack = [], []

    # Must come last: runs after the stacks above have been recreated.
    self.pushTag(self)
Example #25
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def __init__(self, source):
    """Create an empty list that remembers ``source``.

    ``source`` is stored unchanged on the ``source`` attribute.
    """
    # Initialise this instance rather than a throwaway ``[]``, which is
    # what the original call did.  Assumes the enclosing class derives
    # from ``list``, as the base-initialiser call implies.
    list.__init__(self)
    self.source = source

# Now, some helper functions.
Example #26
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    """Store the name, attribute and text criteria on the instance.

    A string ``attrs`` is treated as a CSS class: it is converted with
    ``_match_css_class`` and stored under the ``'class'`` keyword.  Extra
    keyword arguments are merged into a copy of ``attrs`` (or used on
    their own when ``attrs`` is empty).
    """
    self.name = name

    if isinstance(attrs, basestring):
        kwargs['class'] = _match_css_class(attrs)
        attrs = None

    if kwargs and attrs:
        # Merge into a copy so the caller's mapping is left untouched.
        criteria = attrs.copy()
        criteria.update(kwargs)
    elif kwargs:
        criteria = kwargs
    else:
        criteria = attrs

    self.attrs = criteria
    self.text = text
Example #27
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def __init__(self, parser, name, attrs=None, parent=None, previous=None):
    """Basic constructor.

    ``parser`` supplies ``isSelfClosingTag(name)`` and the three entity
    conversion settings copied below.  ``attrs`` may be ``None``, a dict
    or a sequence of ``(key, value)`` pairs; it is stored as a list of
    pairs with entity references in each value run through
    ``self._convertEntities``.  ``parent`` and ``previous`` are passed to
    ``self.setup``.
    """
    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs is None:
        attrs = []
    elif isinstance(attrs, dict):
        attrs = attrs.items()
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
    self.convertHTMLEntities = parser.convertHTMLEntities
    self.convertXMLEntities = parser.convertXMLEntities
    self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

    # Convert any HTML, XML, or numeric entities in the attribute values.
    # A comprehension replaces the tuple-parameter lambda (a syntax error
    # on Python 3) and always yields a list; the raw string keeps the
    # regex escapes literal.
    entity_pattern = r"&(#\d+|#x[0-9a-fA-F]+|\w+);"
    self.attrs = [
        (key, re.sub(entity_pattern, self._convertEntities, val))
        for key, val in self.attrs
    ]
Example #28
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def __init__(self, *args, **kwargs):
    """Initialise via ``BeautifulStoneSoup.__init__`` with HTML defaults.

    ``smartQuotesTo`` defaults to ``self.HTML_ENTITIES`` unless the caller
    supplied it, and ``isHTML`` is always forced to ``True``.  All other
    positional and keyword arguments are forwarded unchanged.
    """
    # The ``in`` operator replaces dict.has_key(), which no longer exists
    # on Python 3 dictionaries; the result is the same on Python 2.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    kwargs['isHTML'] = True
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
Example #29
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def reset(self):
    """Put the object back into its initial, empty parsing state.

    Re-runs ``Tag.__init__`` on this object (passing the object itself and
    ``ROOT_TAG_NAME``), marks it hidden, resets the SGMLParser base, clears
    the bookkeeping containers and finally pushes the object as a tag.
    """
    root_name = self.ROOT_TAG_NAME
    Tag.__init__(self, self, root_name)
    self.hidden = 1

    SGMLParser.reset(self)

    # Fresh, independent containers for every reset.
    self.currentData, self.currentTag = [], None
    self.tagStack, self.quoteStack = [], []

    # Must come last: runs after the stacks above have been recreated.
    self.pushTag(self)
Example #30
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 5 votes |
def __init__(self, source):
    """Create an empty list that remembers ``source``.

    ``source`` is stored unchanged on the ``source`` attribute.
    """
    # Initialise this instance rather than a throwaway ``[]``, which is
    # what the original call did.  Assumes the enclosing class derives
    # from ``list``, as the base-initialiser call implies.
    list.__init__(self)
    self.source = source

# Now, some helper functions.