Python six.moves.html_parser.HTMLParser() Examples
The following are 10 code examples of six.moves.html_parser.HTMLParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module six.moves.html_parser, or try the search function.
Example #1
Source File: bing.py From icrawler with MIT License | 5 votes |
def parse(self, response):
    """Yield a ``dict(file_url=...)`` for every ``.jpg`` URL found in *response*.

    The response body is decoded as UTF-8 (undecodable bytes are ignored) and
    parsed with BeautifulSoup using the ``lxml`` parser.  For each
    ``<div class="imgpt">`` the ``m`` attribute of its ``<a>`` child is
    HTML-unescaped and searched for a ``murl":"<url>.jpg`` entry.

    Args:
        response: Object exposing the raw body bytes as ``response.content``.

    Yields:
        dict: ``{'file_url': '<url>.jpg'}`` for each div whose metadata
        matches the pattern.
    """
    # HTMLParser.unescape() was removed in Python 3.9.  Prefer html.unescape()
    # and fall back to the legacy method where it is not available.
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    soup = BeautifulSoup(
        response.content.decode('utf-8', 'ignore'), 'lxml')
    image_divs = soup.find_all('div', class_='imgpt')
    pattern = re.compile(r'murl\":\"(.*?)\.jpg')
    for div in image_divs:
        href_str = unescape(div.a['m'])
        match = pattern.search(href_str)
        if match:
            # On Python 2 the captured text is encoded to a UTF-8 byte string.
            name = (match.group(1)
                    if six.PY3 else match.group(1).encode('utf-8'))
            img_url = '{}.jpg'.format(name)
            yield dict(file_url=img_url)
Example #2
Source File: helper.py From -Odoo--- with GNU General Public License v3.0 | 5 votes |
def __init__(self):
    """Initialise the base HTML parser, then set ``recording`` to 0 and ``data`` to an empty list."""
    html_parser.HTMLParser.__init__(self)
    self.recording, self.data = 0, []
Example #3
Source File: wikibot.py From python-docs-samples with Apache License 2.0 | 5 votes |
def message(self, msg):
    """Process incoming message stanzas.

    Be aware that this also includes MUC messages and error messages. It is
    usually a good idea to check the messages's type before processing or
    sending replies. If the message is the appropriate type, then the bot
    checks wikipedia to see if the message string exists as a page on the
    site. If so, it sends this link back to the sender in the reply.

    Arguments:
        msg -- The received message stanza. See the SleekXMPP documentation
               for stanza objects and the Message stanza to see how it may
               be used.
    """
    # Only 'chat' and 'normal' messages are answered; anything else is ignored.
    if msg['type'] not in ('chat', 'normal'):
        return

    # quote_plus lives in urllib.parse on Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus
    # HTMLParser.unescape() was removed in Python 3.9; prefer html.unescape().
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    msg_body = msg['body']
    encoded_body = quote_plus(msg_body)
    response = requests.get(
        'https://en.wikipedia.org/w/api.php?'
        'action=query&list=search&format=json&srprop=snippet&'
        'srsearch={}'.format(encoded_body))
    doc = json.loads(response.content)

    results = doc.get('query', {}).get('search')
    if not results:
        msg.reply('I wasn\'t able to locate info on "{}" Sorry'.format(
            msg_body)).send()
        return

    # Only the first search hit is used for the reply.
    snippet = results[0]['snippet']
    title = quote_plus(results[0]['title'])

    # Strip out html
    snippet = unescape(re.sub(r'<[^>]*>', '', snippet))
    msg.reply(u'{}...\n(http://en.wikipedia.org/w/?title={})'.format(
        snippet, title)).send()
Example #4
Source File: parser.py From syntribos with Apache License 2.0 | 5 votes |
def _string_data(data, data_type):
    """Replace various objects types with string representations.

    Args:
        data: The object to convert.
        data_type: ``'json'``, ``'xml'`` or ``'yaml'``; any other value
            leaves *data* untouched.

    Returns:
        ``json.dumps(data)`` for ``'json'``; for ``'xml'`` either *data*
        itself (when it is already a ``str``) or the HTML-unescaped output
        of ``ElementTree.tostring``; ``yaml.dump(data)`` for ``'yaml'``;
        otherwise *data* unchanged.
    """
    if data_type == 'json':
        return json.dumps(data)
    elif data_type == 'xml':
        if isinstance(data, str):
            return data
        str_data = ElementTree.tostring(data)
        # No way to stop tostring from HTML escaping even if we wanted
        # HTMLParser.unescape() was removed in Python 3.9; prefer
        # html.unescape() and fall back to the legacy method otherwise.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape
        return unescape(str_data.decode())
    elif data_type == 'yaml':
        return yaml.dump(data)
    else:
        return data
Example #5
Source File: info.py From yagocd with ISC License | 5 votes |
def __init__(self):
    """Initialise the base HTML parser, then set ``_in_td`` to False and ``data`` to an empty list."""
    html_parser.HTMLParser.__init__(self)
    self._in_td, self.data = False, []
Example #6
Source File: lp-aws-saml.py From lp-aws-saml with GNU General Public License v2.0 | 5 votes |
def get_saml_token(session, username, password, saml_cfg_id):
    """
    Log into LastPass and retrieve a SAML token for a given
    SAML configuration.

    Args:
        session: Object whose ``get(url, verify=...)`` returns a response
            exposing the page body as ``.text``.
        username: Not referenced in this function.
        password: Not referenced in this function.
        saml_cfg_id: Integer id formatted into the IdP-initiated login URL.

    Returns:
        The base64-decoded ``SAMLResponse`` field of the extracted form.

    Raises:
        ValueError: If the extracted form has no ``action``; the message
            includes the text of the last ``<h2>`` line found on the page,
            when there is one.
    """
    logger.debug("Getting SAML token")

    # now logged in, grab the SAML token from the IdP-initiated login
    idp_login = '%s/saml/launch/cfg/%d' % (LASTPASS_SERVER, saml_cfg_id)

    r = session.get(idp_login, verify=should_verify())

    form = extract_form(r.text)
    if not form['action']:
        # HTMLParser.unescape() was removed in Python 3.9; prefer
        # html.unescape() and fall back to the legacy method otherwise.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        # try to scrape the error message just to make it more user friendly
        error = ""
        for line in r.text.splitlines():
            match = re.search(r'<h2>(.*)</h2>', line)
            if match:
                msg = unescape(match.group(1))
                msg = msg.replace("<br/>", "\n")
                msg = msg.replace("<b>", "")
                msg = msg.replace("</b>", "")
                error = "\n" + msg

        raise ValueError("Unable to find SAML ACS" + error)

    return b64decode(form['fields']['SAMLResponse'])
Example #7
Source File: utils.py From figshare with MIT License | 5 votes |
def strip_html(html):
    """Return the text content of *html* with all markup removed.

    Args:
        html (str): Markup to strip.

    Returns:
        str: Concatenation of every text chunk the parser reported.
    """
    class MLStripper(HTMLParser):
        """HTML parser that collects only the text data it is fed."""

        def __init__(self):
            # The base initialiser must run: on Python 3 it sets
            # ``convert_charrefs`` (read by ``feed``) before calling
            # ``reset()``.  The explicit call form also works on Python 2,
            # where HTMLParser is an old-style class.
            HTMLParser.__init__(self)
            self.strict = False
            self.fed = []

        def handle_data(self, d):
            self.fed.append(d)

        def get_data(self):
            return ''.join(self.fed)

    p = MLStripper()
    p.feed(html)
    # Flush any text the parser is still buffering (e.g. a trailing,
    # unterminated character reference).
    p.close()
    return p.get_data()
Example #8
Source File: helper.py From wechat_mall with MIT License | 5 votes |
def __init__(self):
    """Run the base HTML parser initialiser, then start with ``recording`` at 0 and an empty ``data`` list."""
    html_parser.HTMLParser.__init__(self)
    self.recording, self.data = 0, []
Example #9
Source File: server.py From git-stacktrace with Apache License 2.0 | 5 votes |
def _get_field(self, field, default=''):
    """Return the HTML-unescaped value of *field* from ``self.params``.

    Args:
        field: Key to look up in ``self.params``.
        default: Value used when *field* is absent.

    Returns:
        The unescaped value.  When the stored value is a list, only its
        first element is used.
    """
    # HTMLParser.unescape() was removed in Python 3.9; prefer html.unescape()
    # and fall back to the legacy method where it is not available.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    val = self.params.get(field, [default])
    val = val[0] if isinstance(val, list) else val
    return unescape(val)
Example #10
Source File: markdown.py From readme_renderer with Apache License 2.0 | 4 votes |
def _highlight(html):
    """Syntax-highlights HTML-rendered Markdown.

    Plucks sections to highlight that conform to the GitHub fenced code info
    string as defined at https://github.github.com/gfm/#info-string.

    Args:
        html (str): The rendered HTML.

    Returns:
        str: The HTML with Pygments syntax highlighting applied to all code
        blocks.
    """
    # HTMLParser.unescape() was removed in Python 3.9; prefer html.unescape()
    # and fall back to the legacy method where it is not available.  The
    # import statement names the module, so the ``html`` parameter does not
    # interfere with it.
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    formatter = pygments.formatters.HtmlFormatter(nowrap=True)

    code_expr = re.compile(
        r'<pre><code class="language-(?P<lang>.+?)">(?P<code>.+?)'
        r'</code></pre>', re.DOTALL)

    def replacer(match):
        # An unrecognised language falls back to the plain-text lexer.
        try:
            lang = match.group('lang')
            lang = _LANG_ALIASES.get(lang, lang)
            lexer = pygments.lexers.get_lexer_by_name(lang)
        except ValueError:
            lexer = pygments.lexers.TextLexer()

        code = match.group('code')

        # Decode html entities in the code. cmark tries to be helpful and
        # translate '"' to '&quot;', but it confuses pygments. Pygments will
        # escape any html entities when re-writing the code, and we run
        # everything through bleach after.
        code = unescape(code)

        highlighted = pygments.highlight(code, lexer, formatter)

        return '<pre>{}</pre>'.format(highlighted)

    result = code_expr.sub(replacer, html)

    return result