Python htmlentitydefs.name2codepoint() Examples
The following are 30
code examples of htmlentitydefs.name2codepoint().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
htmlentitydefs
, or try the search function
.
Example #1
Source File: chanutils.py From blissflixx with GNU General Public License v2.0 | 6 votes |
def replace_entity(text): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text)
Example #2
Source File: feedparser.py From xbmc-addons-chinese with GNU General Public License v2.0 | 6 votes |
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref elif ref in self.entities: text = self.entities[ref] if text.startswith('&#') and text.endswith(';'): return self.handle_entityref(text) else: try: name2codepoint[ref] except KeyError: text = '&%s;' % ref else: text = unichr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text)
Example #3
Source File: feedparser.py From RSSNewsGAE with Apache License 2.0 | 6 votes |
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref elif ref in self.entities: text = self.entities[ref] if text.startswith('&#') and text.endswith(';'): return self.handle_entityref(text) else: try: name2codepoint[ref] except KeyError: text = '&%s;' % ref else: text = unichr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text)
Example #4
Source File: moduleBS.py From D-Tech with GNU General Public License v3.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #5
Source File: BeautifulSoup.py From xbmc-betaseries with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #6
Source File: BeautifulSoup.py From xbmc-betaseries with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #7
Source File: BeautifulSoup.py From nightmare with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #8
Source File: BeautifulSoup.py From ru with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #9
Source File: _bsoup.py From faces with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #10
Source File: WikiExtractor.py From comparable-text-miner with Apache License 2.0 | 6 votes |
def unescape(text): def fixup(m): text = m.group(0) code = m.group(1) try: if text[1] == "#": # character reference if text[2] == "x": return unichr(int(code[1:], 16)) else: return unichr(int(code)) else: # named entity return unichr(name2codepoint[code]) except: return text # leave as is return re.sub("&#?(\w+);", fixup, text) # Match HTML comments # The buggy template {{Template:T}} has a comment terminating with just "->"
Example #11
Source File: parole.py From zamia-speech with GNU Lesser General Public License v3.0 | 6 votes |
def handle_entityref(self, name): if self.in_par: c = '' if name == 'star': c = u'*' elif name == 'bquot': c = u'"' elif name == 'equot': c = u'"' elif name == 'lowbar': c = u'_' elif name == 'parole.tax': c = u'' else: if name in name2codepoint: c = unichr(name2codepoint[name]) else: logging.warning("unknown entityref: %s" % name) c = '' # print "Named ent:", c self.buf += c
Example #12
Source File: __init__.py From arnold-usd with Apache License 2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #13
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def unescape(text): """ Removes HTML or XML character references and entities from a text string. """ def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": return unichr(int(text[3:-1], 16)) else: return unichr(int(text[2:-1])) except ValueError: pass else: # named entity try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return re.sub("&#?\w+;", fixup, text)
Example #14
Source File: BeautifulSoup.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #15
Source File: BeautifulSoup.py From MARA_Framework with GNU Lesser General Public License v3.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #16
Source File: feedparser.py From pyrobotlab with Apache License 2.0 | 6 votes |
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref else: # entity resolution graciously donated by Aaron Swartz def name2cp(k): import htmlentitydefs if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 return htmlentitydefs.name2codepoint[k] k = htmlentitydefs.entitydefs[k] if k.startswith('&#') and k.endswith(';'): return int(k[2:-1]) # not in latin-1 return ord(k) try: name2cp(ref) except KeyError: text = '&%s;' % ref else: text = unichr(name2cp(ref)).encode('utf-8') self.elementstack[-1][2].append(text)
Example #17
Source File: scrapertools.py From addon with GNU General Public License v3.0 | 6 votes |
def decodeHtmlentities(string): string = entitiesfix(string) entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});") def substitute_entity(match): if PY3: from html.entities import name2codepoint as n2cp else: from htmlentitydefs import name2codepoint as n2cp ent = match.group(2) if match.group(1) == "#": return unichr(int(ent)).encode('utf-8') else: cp = n2cp.get(ent) if cp: return unichr(cp).encode('utf-8') else: return match.group() return entity_re.subn(substitute_entity, string)[0]
Example #18
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #19
Source File: BeautifulSoup.py From yalih with Apache License 2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #20
Source File: isvalid.py From panaroo with MIT License | 6 votes |
def unescape(text): """Replace XML character references with the referenced characters""" def fixup(m): text = m.group(0) if text[1] == '#': # Character reference if text[2] == 'x': code = int(text[3:-1], 16) else: code = int(text[2:-1]) else: # Named entity try: code = htmlentitydefs.name2codepoint[text[1:-1]] except KeyError: return text # leave unchanged try: return chr(code) if code < 256 else unichr(code) except (ValueError, OverflowError): return text # leave unchanged return re.sub("&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));", fixup, text)
Example #21
Source File: clientform.py From NoobSec-Toolkit with GNU General Public License v2.0 | 6 votes |
def get_entitydefs(): import htmlentitydefs from codecs import latin_1_decode entitydefs = {} try: htmlentitydefs.name2codepoint except AttributeError: entitydefs = {} for name, char in htmlentitydefs.entitydefs.items(): uc = latin_1_decode(char)[0] if uc.startswith("&#") and uc.endswith(";"): uc = unescape_charref(uc[2:-1], None) entitydefs["&%s;" % name] = uc else: for name, codepoint in htmlentitydefs.name2codepoint.items(): entitydefs["&%s;" % name] = unichr(codepoint) return entitydefs
Example #22
Source File: BeautifulSoup.py From dirigible-spreadsheet with MIT License | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #23
Source File: scrapertools.py From tvalacarta with GNU General Public License v3.0 | 6 votes |
def decodeHtmlentities(string): string = entitiesfix(string) entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});") def substitute_entity(match): from htmlentitydefs import name2codepoint as n2cp ent = match.group(2) if match.group(1) == "#": return unichr(int(ent)).encode('utf-8') else: cp = n2cp.get(ent) if cp: return unichr(cp).encode('utf-8') else: return match.group() return entity_re.subn(substitute_entity, string)[0]
Example #24
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #25
Source File: tweets.py From gazouilloire with GNU General Public License v3.0 | 6 votes |
def decode_entities(x): if x.group(1).startswith('#'): char = x.group(1)[1:] if char.startswith('x'): try: return unichr(int(x.group(1)[2:], 16)) except: pass try: return unichr(int(x.group(1)[1:])) except: pass try: return unichr(name2codepoint[x.group(1)]) except: return x.group(1)
Example #26
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #27
Source File: feedparser.py From telegram-robot-rss with Mozilla Public License 2.0 | 6 votes |
def handle_entityref(self, ref): # called for each entity reference, e.g. for '©', ref will be 'copy' if not self.elementstack: return if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): text = '&%s;' % ref elif ref in self.entities: text = self.entities[ref] if text.startswith('&#') and text.endswith(';'): return self.handle_entityref(text) else: try: name2codepoint[ref] except KeyError: text = '&%s;' % ref else: text = unichr(name2codepoint[ref]).encode('utf-8') self.elementstack[-1][2].append(text)
Example #28
Source File: beautifulsoup.py From NoobSec-Toolkit with GNU General Public License v2.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #29
Source File: moduleBS.py From D-TECT with GNU General Public License v3.0 | 6 votes |
def _convertEntities(self, match): """Used in a call to re.sub to replace HTML, XML, and numeric entities with the appropriate Unicode characters. If HTML entities are being converted, any unrecognized entities are escaped.""" x = match.group(1) if self.convertHTMLEntities and x in name2codepoint: return unichr(name2codepoint[x]) elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: if self.convertXMLEntities: return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] else: return u'&%s;' % x elif len(x) > 0 and x[0] == '#': # Handle numeric entities if len(x) > 1 and x[1] == 'x': return unichr(int(x[2:], 16)) else: return unichr(int(x[1:])) elif self.escapeUnrecognizedEntities: return u'&%s;' % x else: return u'&%s;' % x
Example #30
Source File: clientform.py From NoobSec-Toolkit with GNU General Public License v2.0 | 6 votes |
def get_entitydefs(): import htmlentitydefs from codecs import latin_1_decode entitydefs = {} try: htmlentitydefs.name2codepoint except AttributeError: entitydefs = {} for name, char in htmlentitydefs.entitydefs.items(): uc = latin_1_decode(char)[0] if uc.startswith("&#") and uc.endswith(";"): uc = unescape_charref(uc[2:-1], None) entitydefs["&%s;" % name] = uc else: for name, codepoint in htmlentitydefs.name2codepoint.items(): entitydefs["&%s;" % name] = unichr(codepoint) return entitydefs