Python html.entities.name2codepoint() Examples
The following are 28 code examples that use html.entities.name2codepoint, the standard-library dictionary that maps HTML entity names to Unicode code points.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module html.entities, or try the search function.
Example #1
Source File: feedparser.py From odoo12-x64 with GNU General Public License v3.0 | 6 votes |
def handle_entityref(self, ref):
    """Append the text for entity reference ``ref`` to the innermost element.

    Called for each entity reference; e.g. for '&copy;', ``ref`` is 'copy'.
    Nothing happens when the element stack is empty.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five references are appended in their escaped form.
        piece = '&%s;' % ref
    elif ref in self.entities:
        piece = self.entities[ref]
        if piece.startswith('&#') and piece.endswith(';'):
            # The mapped value is itself a character reference: hand it
            # back to this method and return whatever that call returns.
            return self.handle_entityref(piece)
    elif ref in name2codepoint:
        # Known HTML entity: append its UTF-8 encoded character.
        piece = chr(name2codepoint[ref]).encode('utf-8')
    else:
        # Unknown name: append the reference unchanged.
        piece = '&%s;' % ref
    self.elementstack[-1][2].append(piece)
Example #2
Source File: scrapertools.py From addon with GNU General Public License v3.0 | 6 votes |
def decodeHtmlentities(string):
    """Replace HTML character references and named entities in ``string``.

    The input is first passed through ``entitiesfix``. Decimal (``&#233;``)
    and hexadecimal (``&#xe9;``) character references and known entity names
    (``&eacute;``) are replaced by the character they denote. Anything that
    cannot be decoded (unknown name, malformed or out-of-range number) is
    left exactly as it appeared.

    On Python 2 each replacement is UTF-8 encoded, as before. On Python 3
    the replacement is a ``str``, because ``re`` rejects ``bytes``
    replacements when substituting into a ``str``.
    """
    string = entitiesfix(string)
    entity_re = re.compile(r"&(#?)(\d{1,5}|\w{1,8});")

    if PY3:
        from html.entities import name2codepoint as n2cp
    else:
        from htmlentitydefs import name2codepoint as n2cp

    def to_text(codepoint):
        # Build the replacement in the type the substitution expects.
        char = unichr(codepoint)
        return char if PY3 else char.encode('utf-8')

    def substitute_entity(match):
        ent = match.group(2)
        if match.group(1) == "#":
            try:
                # "\w{1,8}" also admits hex references such as "x41".
                if ent[:1] in ("x", "X"):
                    return to_text(int(ent[1:], 16))
                return to_text(int(ent))
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point: leave unchanged.
                return match.group()
        cp = n2cp.get(ent)
        if cp:
            return to_text(cp)
        return match.group()

    return entity_re.subn(substitute_entity, string)[0]
Example #3
Source File: gml.py From aws-kube-codesuite with Apache License 2.0 | 6 votes |
def unescape(text):
    """Replace XML character references with the referenced characters"""

    def fixup(m):
        ref = m.group(0)
        body = ref[1:-1]  # strip the leading '&' and trailing ';'
        if body.startswith('#'):
            # Character reference: decimal, or hexadecimal after an 'x'.
            digits = body[1:]
            if digits.startswith('x'):
                code = int(digits[1:], 16)
            else:
                code = int(digits)
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[body]
            except KeyError:
                return ref  # leave unchanged
        try:
            if code < 256:
                return chr(code)
            return unichr(code)
        except (ValueError, OverflowError):
            return ref  # leave unchanged

    pattern = "&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));"
    return re.sub(pattern, fixup, text)
Example #4
Source File: gml.py From Carnets with BSD 3-Clause "New" or "Revised" License | 6 votes |
def unescape(text):
    """Replace XML character references with the referenced characters"""

    def fixup(m):
        ref = m.group(0)
        body = ref[1:-1]  # strip the leading '&' and trailing ';'
        if body.startswith('#'):
            # Character reference: decimal, or hexadecimal after an 'x'.
            digits = body[1:]
            if digits.startswith('x'):
                code = int(digits[1:], 16)
            else:
                code = int(digits)
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[body]
            except KeyError:
                return ref  # leave unchanged
        try:
            if code < 256:
                return chr(code)
            return unichr(code)
        except (ValueError, OverflowError):
            return ref  # leave unchanged

    pattern = "&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));"
    return re.sub(pattern, fixup, text)
Example #5
Source File: gml.py From qgisSpaceSyntaxToolkit with GNU General Public License v3.0 | 6 votes |
def unescape(text):
    """Replace XML character references in a string with the referenced
    characters.
    """

    def fixup(m):
        ref = m.group(0)
        body = ref[1:-1]  # strip the leading '&' and trailing ';'
        if body.startswith('#'):
            # Character reference: decimal, or hexadecimal after an 'x'.
            digits = body[1:]
            if digits.startswith('x'):
                code = int(digits[1:], 16)
            else:
                code = int(digits)
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[body]
            except KeyError:
                return ref  # leave unchanged
        try:
            if code < 256:
                return chr(code)
            return unichr(code)
        except (ValueError, OverflowError):
            return ref  # leave unchanged

    pattern = "&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));"
    return re.sub(pattern, fixup, text)
Example #6
Source File: isvalid.py From panaroo with MIT License | 6 votes |
def unescape(text):
    """Replace XML character references with the referenced characters"""

    def fixup(m):
        ref = m.group(0)
        body = ref[1:-1]  # strip the leading '&' and trailing ';'
        if body.startswith('#'):
            # Character reference: decimal, or hexadecimal after an 'x'.
            digits = body[1:]
            if digits.startswith('x'):
                code = int(digits[1:], 16)
            else:
                code = int(digits)
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[body]
            except KeyError:
                return ref  # leave unchanged
        try:
            if code < 256:
                return chr(code)
            return unichr(code)
        except (ValueError, OverflowError):
            return ref  # leave unchanged

    pattern = "&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));"
    return re.sub(pattern, fixup, text)
Example #7
Source File: feedparser.py From odoo13-x64 with GNU General Public License v3.0 | 6 votes |
def handle_entityref(self, ref):
    """Append the text for entity reference ``ref`` to the innermost element.

    Called for each entity reference; e.g. for '&copy;', ``ref`` is 'copy'.
    Nothing happens when the element stack is empty.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five references are appended in their escaped form.
        piece = '&%s;' % ref
    elif ref in self.entities:
        piece = self.entities[ref]
        if piece.startswith('&#') and piece.endswith(';'):
            # The mapped value is itself a character reference: hand it
            # back to this method and return whatever that call returns.
            return self.handle_entityref(piece)
    elif ref in name2codepoint:
        # Known HTML entity: append its UTF-8 encoded character.
        piece = chr(name2codepoint[ref]).encode('utf-8')
    else:
        # Unknown name: append the reference unchanged.
        piece = '&%s;' % ref
    self.elementstack[-1][2].append(piece)
Example #8
Source File: feedparser.py From incremental-reading with ISC License | 6 votes |
def handle_entityref(self, ref):
    """Append the text for entity reference ``ref`` to the innermost element.

    Called for each entity reference; e.g. for '&copy;', ``ref`` is 'copy'.
    Nothing happens when the element stack is empty.
    """
    if not self.elementstack:
        return
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        # These five references are appended in their escaped form.
        piece = '&%s;' % ref
    elif ref in self.entities:
        piece = self.entities[ref]
        if piece.startswith('&#') and piece.endswith(';'):
            # The mapped value is itself a character reference: hand it
            # back to this method and return whatever that call returns.
            return self.handle_entityref(piece)
    elif ref in name2codepoint:
        # Known HTML entity: append its UTF-8 encoded character.
        piece = chr(name2codepoint[ref]).encode('utf-8')
    else:
        # Unknown name: append the reference unchanged.
        piece = '&%s;' % ref
    self.elementstack[-1][2].append(piece)
Example #9
Source File: utils.py From telegraph with MIT License | 5 votes |
def handle_entityref(self, name):
    """Add the character for entity ``name`` as a string node.

    Raises KeyError when ``name`` is not a key of ``name2codepoint``.
    """
    add_node = self.add_str_node
    codepoint = name2codepoint[name]
    add_node(chr(codepoint))
Example #10
Source File: feedparser.py From odoo12-x64 with GNU General Public License v3.0 | 5 votes |
def handle_entityref(self, ref):
    """Reconstruct the original entity reference and append it to ``pieces``.

    Called for each entity reference; e.g. for '&copy;', ``ref`` is 'copy'.
    Names found in ``name2codepoint`` (and 'apos') are written with the
    trailing ';'; any other name is written without it.
    """
    recognised = ref in name2codepoint or ref == 'apos'
    template = '&%s;' if recognised else '&%s'
    self.pieces.append(template % ref)
Example #11
Source File: htmlslacker.py From html-slacker with MIT License | 5 votes |
def handle_entityref(self, name):
    """Look up entity ``name`` and discard the resulting character.

    Nothing is stored or emitted; the only observable effect is the KeyError
    raised when ``name`` is not a key of ``name2codepoint``.
    """
    codepoint = name2codepoint[name]
    _ = chr(codepoint)  # computed and intentionally unused, as before
    return None
Example #12
Source File: html_parser.py From QualCoder with MIT License | 5 votes |
def handle_entityref(self, name):
    """Append the character for entity ``name`` to ``self._buf``.

    Unknown names are ignored, and nothing is appended while
    ``self.hide_output`` is true.
    """
    if name not in name2codepoint or self.hide_output:
        return
    self._buf.append(chr(name2codepoint[name]))
Example #13
Source File: WikiExtractor.py From word2vec-on-wikipedia with MIT License | 5 votes |
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references and
    names found in ``name2codepoint`` are replaced by the character they
    denote; anything that cannot be decoded is left as is.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                return chr(int(code))
            # named entity
            return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name; ValueError: malformed number or
            # code point out of range; OverflowError: number too large for
            # chr(). Other exceptions (e.g. KeyboardInterrupt) propagate.
            return text  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
Example #14
Source File: html2text.py From RedditBots with MIT License | 5 votes |
def name2cp(k):
    """Return the code point for the HTML entity name ``k``.

    'apos' is special-cased. Other names are looked up in
    ``htmlentitydefs.name2codepoint`` when that table exists, otherwise they
    are derived from ``htmlentitydefs.entitydefs``. Unknown names raise
    KeyError.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; fall back to entitydefs.
        entity = htmlentitydefs.entitydefs[k]
        is_charref = entity.startswith("&#") and entity.endswith(";")
        if is_charref:
            return int(entity[2:-1])  # not in latin-1
        decoded = codecs.latin_1_decode(entity)[0]
        return ord(decoded)
    return htmlentitydefs.name2codepoint[k]
Example #15
Source File: feedparser.py From odoo13-x64 with GNU General Public License v3.0 | 5 votes |
def handle_entityref(self, ref):
    """Reconstruct the original entity reference and append it to ``pieces``.

    Called for each entity reference; e.g. for '&copy;', ``ref`` is 'copy'.
    Names found in ``name2codepoint`` (and 'apos') are written with the
    trailing ';'; any other name is written without it.
    """
    recognised = ref in name2codepoint or ref == 'apos'
    template = '&%s;' if recognised else '&%s'
    self.pieces.append(template % ref)
Example #16
Source File: feedparser.py From incremental-reading with ISC License | 5 votes |
def handle_entityref(self, ref):
    """Reconstruct the original entity reference and append it to ``pieces``.

    Called for each entity reference; e.g. for '&copy;', ``ref`` is 'copy'.
    Names found in ``name2codepoint`` (and 'apos') are written with the
    trailing ';'; any other name is written without it.
    """
    recognised = ref in name2codepoint or ref == 'apos'
    template = '&%s;' if recognised else '&%s'
    self.pieces.append(template % ref)
Example #17
Source File: normalize.py From marko with MIT License | 5 votes |
def handle_entityref(self, name):
    """Emit the character for entity ``name`` via ``self.output_char``.

    ``output_char`` receives the decoded character (or None when ``name`` is
    not in ``name2codepoint``) together with the literal '&name;' text.
    ``self.last`` is then set to "ref".
    """
    codepoint = name2codepoint.get(name)
    c = None if codepoint is None else chr(codepoint)
    self.output_char(c, '&' + name + ';')
    self.last = "ref"
Example #18
Source File: WikiExtractor.py From embeddings with Apache License 2.0 | 5 votes |
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references and
    names found in ``name2codepoint`` are replaced by the character they
    denote; anything that cannot be decoded is left as is.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                return chr(int(code))
            # named entity
            return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name; ValueError: malformed number or
            # code point out of range; OverflowError: number too large for
            # chr(). Other exceptions (e.g. KeyboardInterrupt) propagate.
            return text  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
Example #19
Source File: common.py From cats-blender-plugin with MIT License | 5 votes |
def handle_entityref(self, name):
    """Append the character for entity ``name`` to ``self._buf``.

    Unknown names are ignored, and nothing is appended while
    ``self.hide_output`` is true.
    """
    if name not in name2codepoint or self.hide_output:
        return
    self._buf.append(chr(name2codepoint[name]))
Example #20
Source File: html2text.py From PyDataset with MIT License | 5 votes |
def name2cp(k):
    """Return the code point for the HTML entity name ``k``.

    'apos' is special-cased. Other names are looked up in
    ``htmlentitydefs.name2codepoint`` when that table exists, otherwise they
    are derived from ``htmlentitydefs.entitydefs``. Unknown names raise
    KeyError.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; fall back to entitydefs.
        entity = htmlentitydefs.entitydefs[k]
        is_charref = entity.startswith("&#") and entity.endswith(";")
        if is_charref:
            return int(entity[2:-1])  # not in latin-1
        decoded = codecs.latin_1_decode(entity)[0]
        return ord(decoded)
    return htmlentitydefs.name2codepoint[k]
Example #21
Source File: unhtml.py From MR with MIT License | 5 votes |
def handle_entityref(self, name):
    """Append the character for entity ``name`` to the text buffer.

    Unknown names are ignored, and nothing is appended while
    ``self.hide_output`` is true.
    """
    if name not in name2codepoint or self.hide_output:
        return
    self.__text.append(chr(name2codepoint[name]))
Example #22
Source File: html2text.py From arlo with Apache License 2.0 | 5 votes |
def name2cp(k):
    """Return the code point for the HTML entity name ``k``.

    'apos' is special-cased. Other names are looked up in
    ``htmlentitydefs.name2codepoint`` when that table exists, otherwise they
    are derived from ``htmlentitydefs.entitydefs``. Unknown names raise
    KeyError.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; fall back to entitydefs.
        entity = htmlentitydefs.entitydefs[k]
        is_charref = entity.startswith("&#") and entity.endswith(";")
        if is_charref:
            return int(entity[2:-1])  # not in latin-1
        decoded = codecs.latin_1_decode(entity)[0]
        return ord(decoded)
    return htmlentitydefs.name2codepoint[k]
Example #23
Source File: html.py From pyglet with BSD 3-Clause "New" or "Revised" License | 5 votes |
def handle_entityref(self, name):
    """Pass the character for entity ``name`` to ``self.handle_data``.

    Names missing from ``entities.name2codepoint`` are silently ignored.
    """
    codepoint = entities.name2codepoint.get(name)
    if codepoint is not None:
        self.handle_data(chr(codepoint))
Example #24
Source File: scrapertools.py From addon with GNU General Public License v3.0 | 5 votes |
def unescape(text):
    """Remove HTML or XML character references and entities from a text string.

    ``text`` is converted with ``str()`` and every ``&name;``, ``&#NNN;`` or
    ``&#xHH;`` reference that can be decoded is replaced by its character.
    References that cannot be decoded are left as is.

    On Python 2 each replacement is UTF-8 encoded, as before. On Python 3
    the replacement is a ``str``, because ``re`` rejects ``bytes``
    replacements when substituting into a ``str``.

    NOTE(review): the earlier docstring said "keep &amp;, &gt;, &lt; in the
    source code", but those names are in ``name2codepoint`` and are decoded
    like any other entity - confirm which behavior is wanted.

    from Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html
    """

    def to_text(codepoint):
        # Build the replacement in the type the substitution expects.
        char = unichr(codepoint)
        return char if PY3 else char.encode("utf-8")

    def fixup(m):
        ref = m.group(0)
        if ref[:2] == "&#":
            # character reference
            try:
                if ref[:3] == "&#x":
                    return to_text(int(ref[3:-1], 16))
                return to_text(int(ref[2:-1]))
            except (ValueError, OverflowError):
                # Malformed number, or a value unichr() cannot represent.
                logger.error("error de valor")
        else:
            # named entity
            try:
                if PY3:
                    import html.entities as htmlentitydefs
                else:
                    import htmlentitydefs
                return to_text(htmlentitydefs.name2codepoint[ref[1:-1]])
            except KeyError:
                logger.error("keyerror")
            except Exception:
                # Best effort: any other failure leaves the text untouched.
                pass
        return ref  # leave as is

    return re.sub(r"&#?\w+;", fixup, str(text))


# Converts HTML codes such as "&ntilde;" and replaces them with the
# corresponding "ñ" UTF-8 unicode character
Example #25
Source File: WikiExtractor.py From ERNIE with MIT License | 5 votes |
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references and
    names found in ``name2codepoint`` are replaced by the character they
    denote; anything that cannot be decoded is left as is.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                return chr(int(code))
            # named entity
            return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name; ValueError: malformed number or
            # code point out of range; OverflowError: number too large for
            # chr(). Other exceptions (e.g. KeyboardInterrupt) propagate.
            return text  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
Example #26
Source File: parsers.py From riko with MIT License | 5 votes |
def entity2text(entitydef):
    """Convert an HTML entity reference into unicode.

    Handles hexadecimal ('&#x41;'), decimal ('&#65;') and named ('&amp;')
    references. Input that does not start with '&' is logged at debug level
    and returned unchanged, as is a reference whose code point is 0.
    Unknown entity names raise KeyError.

    http://stackoverflow.com/a/58125/408556
    """
    if not entitydef.startswith('&'):
        logger.debug(entitydef)
        return entitydef

    if entitydef.startswith('&#x'):
        cp = int(entitydef[3:-1], 16)
    elif entitydef.startswith('&#'):
        cp = int(entitydef[2:-1])
    else:
        cp = name2codepoint[entitydef[1:-1]]

    if cp:
        return chr(cp)
    return entitydef
Example #27
Source File: WikiExtractor.py From SOQAL with MIT License | 5 votes |
def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    Decimal (``&#65;``) and hexadecimal (``&#x41;``) character references and
    names found in ``name2codepoint`` are replaced by the character they
    denote; anything that cannot be decoded is left as is.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                return chr(int(code))
            # named entity
            return chr(name2codepoint[code])
        except (KeyError, ValueError, OverflowError):
            # KeyError: unknown entity name; ValueError: malformed number or
            # code point out of range; OverflowError: number too large for
            # chr(). Other exceptions (e.g. KeyboardInterrupt) propagate.
            return text  # leave as is

    return re.sub(r"&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"
Example #28
Source File: extract_recipe.py From extract_recipe with Apache License 2.0 | 4 votes |
def name2cp(k):
    """Return the code point for the HTML entity name ``k``.

    'apos' is special-cased. Other names are looked up in
    ``htmlentitydefs.name2codepoint`` when that table exists, otherwise they
    are derived from ``htmlentitydefs.entitydefs``. Unknown names raise
    KeyError.
    """
    if k == 'apos':
        return ord("'")
    if not hasattr(htmlentitydefs, "name2codepoint"):
        # name2codepoint requires Python 2.3; fall back to entitydefs.
        entity = htmlentitydefs.entitydefs[k]
        is_charref = entity.startswith("&#") and entity.endswith(";")
        if is_charref:
            return int(entity[2:-1])  # not in latin-1
        decoded = codecs.latin_1_decode(entity)[0]
        return ord(decoded)
    return htmlentitydefs.name2codepoint[k]