Python Examples of html.entities.name2codepoint

Source File: feedparser.py From odoo12-x64 with GNU General Public License v3.0

6 votes

def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)

Source File: scrapertools.py From addon with GNU General Public License v3.0

6 votes

def decodeHtmlentities(string):
    string = entitiesfix(string)
    entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});")

    def substitute_entity(match):
        if PY3:
            from html.entities import name2codepoint as n2cp
        else:
            from htmlentitydefs import name2codepoint as n2cp
        ent = match.group(2)
        if match.group(1) == "#":
            return unichr(int(ent)).encode('utf-8')
        else:
            cp = n2cp.get(ent)

            if cp:
                return unichr(cp).encode('utf-8')
            else:
                return match.group()

    return entity_re.subn(substitute_entity, string)[0]

Source File: gml.py From aws-kube-codesuite with Apache License 2.0

6 votes

def unescape(text):
    """Replace XML character references with the referenced characters"""
    def fixup(m):
        text = m.group(0)
        if text[1] == '#':
            # Character reference
            if text[2] == 'x':
                code = int(text[3:-1], 16)
            else:
                code = int(text[2:-1])
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[text[1:-1]]
            except KeyError:
                return text  # leave unchanged
        try:
            return chr(code) if code < 256 else unichr(code)
        except (ValueError, OverflowError):
            return text  # leave unchanged

    return re.sub("&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));", fixup, text)

Source File: gml.py From Carnets with BSD 3-Clause "New" or "Revised" License

6 votes

def unescape(text):
    """Replace XML character references with the referenced characters"""
    def fixup(m):
        text = m.group(0)
        if text[1] == '#':
            # Character reference
            if text[2] == 'x':
                code = int(text[3:-1], 16)
            else:
                code = int(text[2:-1])
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[text[1:-1]]
            except KeyError:
                return text  # leave unchanged
        try:
            return chr(code) if code < 256 else unichr(code)
        except (ValueError, OverflowError):
            return text  # leave unchanged

    return re.sub("&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));", fixup, text)

Source File: gml.py From qgisSpaceSyntaxToolkit with GNU General Public License v3.0

6 votes

def unescape(text):
    """Replace XML character references in a string with the referenced
    characters.
    """
    def fixup(m):
        text = m.group(0)
        if text[1] == '#':
            # Character reference
            if text[2] == 'x':
                code = int(text[3:-1], 16)
            else:
                code = int(text[2:-1])
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[text[1:-1]]
            except KeyError:
                return text  # leave unchanged
        try:
            return chr(code) if code < 256 else unichr(code)
        except (ValueError, OverflowError):
            return text  # leave unchanged

    return re.sub("&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));", fixup, text)

Source File: isvalid.py From panaroo with MIT License

6 votes

def unescape(text):
    """Replace XML character references with the referenced characters"""
    def fixup(m):
        text = m.group(0)
        if text[1] == '#':
            # Character reference
            if text[2] == 'x':
                code = int(text[3:-1], 16)
            else:
                code = int(text[2:-1])
        else:
            # Named entity
            try:
                code = htmlentitydefs.name2codepoint[text[1:-1]]
            except KeyError:
                return text  # leave unchanged
        try:
            return chr(code) if code < 256 else unichr(code)
        except (ValueError, OverflowError):
            return text  # leave unchanged

    return re.sub("&(?:[0-9A-Za-z]+|#(?:[0-9]+|x[0-9A-Fa-f]+));", fixup, text)

Source File: feedparser.py From odoo13-x64 with GNU General Public License v3.0

6 votes

def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)

Source File: feedparser.py From incremental-reading with ISC License

6 votes

def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        elif ref in self.entities:
            text = self.entities[ref]
            if text.startswith('&#') and text.endswith(';'):
                return self.handle_entityref(text)
        else:
            try:
                name2codepoint[ref]
            except KeyError:
                text = '&%s;' % ref
            else:
                text = chr(name2codepoint[ref]).encode('utf-8')
        self.elementstack[-1][2].append(text)

Source File: utils.py From telegraph with MIT License

5 votes

def handle_entityref(self, name):
        self.add_str_node(chr(name2codepoint[name]))

Source File: feedparser.py From odoo12-x64 with GNU General Public License v3.0

5 votes

def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&amp;%s' % ref)

Source File: htmlslacker.py From html-slacker with MIT License

5 votes

def handle_entityref(self, name):
        c = chr(name2codepoint[name])
        pass

Source File: html_parser.py From QualCoder with MIT License

5 votes

def handle_entityref(self, name):
        if name in name2codepoint and not self.hide_output:
            c = chr(name2codepoint[name])
            self._buf.append(c)

Source File: WikiExtractor.py From word2vec-on-wikipedia with MIT License

5 votes

def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"

Source File: html2text.py From RedditBots with MIT License

5 votes

def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])

Source File: feedparser.py From odoo13-x64 with GNU General Public License v3.0

5 votes

def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&amp;%s' % ref)

Source File: feedparser.py From incremental-reading with ISC License

5 votes

def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        if ref in name2codepoint or ref == 'apos':
            self.pieces.append('&%s;' % ref)
        else:
            self.pieces.append('&amp;%s' % ref)

Source File: normalize.py From marko with MIT License

5 votes

def handle_entityref(self, name):
        try:
            c = chr(name2codepoint[name])
        except KeyError:
            c = None
        self.output_char(c, '&' + name + ';')
        self.last = "ref"

Source File: WikiExtractor.py From embeddings with Apache License 2.0

5 votes

def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"

Source File: common.py From cats-blender-plugin with MIT License

5 votes

def handle_entityref(self, name):
        if name in name2codepoint and not self.hide_output:
            c = chr(name2codepoint[name])
            self._buf.append(c)

Source File: html2text.py From PyDataset with MIT License

5 votes

def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])

Source File: unhtml.py From MR with MIT License

5 votes

def handle_entityref(self,name):
         if name in name2codepoint and not self.hide_output:
            c = chr(name2codepoint[name])
            self.__text.append(c)

Source File: html2text.py From arlo with Apache License 2.0

5 votes

def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])

Source File: html.py From pyglet with BSD 3-Clause "New" or "Revised" License

5 votes

def handle_entityref(self, name):
        if name in entities.name2codepoint:
            self.handle_data(chr(entities.name2codepoint[name]))

Source File: scrapertools.py From addon with GNU General Public License v3.0

5 votes

def unescape(text):
    """Removes HTML or XML character references
       and entities from a text string.
       keep &amp;, &gt;, &lt; in the source code.
    from Fredrik Lundh
    http://effbot.org/zone/re-sub.htm#unescape-html
    """

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return unichr(int(text[3:-1], 16)).encode("utf-8")
                else:
                    return unichr(int(text[2:-1])).encode("utf-8")

            except ValueError:
                logger.error("error de valor")
                pass
        else:
            # named entity
            try:
                if PY3:
                    import html.entities as htmlentitydefs
                else:
                    import htmlentitydefs
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]).encode("utf-8")
            except KeyError:
                logger.error("keyerror")
                pass
            except:
                pass
        return text  # leave as is

    return re.sub("&#?\w+;", fixup, str(text))

    # Convierte los codigos html "&ntilde;" y lo reemplaza por "ñ" caracter unicode utf-8

Source File: WikiExtractor.py From ERNIE with MIT License

5 votes

def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"

Source File: parsers.py From riko with MIT License

5 votes

def entity2text(entitydef):
    """Convert an HTML entity reference into unicode.
    http://stackoverflow.com/a/58125/408556
    """
    if entitydef.startswith('&#x'):
        cp = int(entitydef[3:-1], 16)
    elif entitydef.startswith('&#'):
        cp = int(entitydef[2:-1])
    elif entitydef.startswith('&'):
        cp = name2codepoint[entitydef[1:-1]]
    else:
        logger.debug(entitydef)
        cp = None

    return chr(cp) if cp else entitydef

Source File: WikiExtractor.py From SOQAL with MIT License

5 votes

def unescape(text):
    """
    Removes HTML or XML character references and entities from a text string.

    :param text The HTML (or XML) source text.
    :return The plain text, as a Unicode string, if necessary.
    """

    def fixup(m):
        text = m.group(0)
        code = m.group(1)
        try:
            if text[1] == "#":  # character reference
                if text[2] == "x":
                    return chr(int(code[1:], 16))
                else:
                    return chr(int(code))
            else:  # named entity
                return chr(name2codepoint[code])
        except:
            return text  # leave as is

    return re.sub("&#?(\w+);", fixup, text)


# Match HTML comments
# The buggy template {{Template:T}} has a comment terminating with just "->"

Source File: extract_recipe.py From extract_recipe with Apache License 2.0

4 votes

def name2cp(k):
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])

Python html.entities.name2codepoint() Examples