Python Examples of codecs.html

Source File: main.py From cvt2utf with MIT License

6 votes

def normalize_codec_name(chardet_name):
    """
    Translate a codec name reported by chardet into Python's canonical codec name.

    :param chardet_name: codec name as reported by chardet (e.g. ``'ISO-8859-1'``)
    :return: the canonical Python codec name, with ``gb2312`` widened to ``gb18030``.
        See: https://docs.python.org/3.7/library/codecs.html#standard-encodings
    :raises LookupError: if ``codecs.lookup`` does not know the rewritten name
    """
    # Rewrite the name into the spelling used before the registry lookup:
    # lowercase, no hyphen after "iso", remaining hyphens as underscores.
    candidate = chardet_name.lower()
    candidate = candidate.replace('iso-', 'iso')
    candidate = candidate.replace('-', '_')
    canonical = codecs.lookup(candidate).name

    # chardet reports every GB-based encoding as 'gb2312', so decoding fails when the
    # text file contains certain special characters. gb18030 is a larger character set
    # than gb2312, which makes decoding more tolerant of those characters.
    return 'gb18030' if canonical == 'gb2312' else canonical

Source File: shell.py From supersqlite with MIT License

6 votes

def pop_output(self):
        """Reinstates the output settings saved by the most recent push.

        Output is governed by many parameters such as nullvalue, mode
        (list/tcl/html/insert etc), column widths, header etc.  To
        change some of them temporarily call :meth:`push_output`,
        alter the settings, and then call this method to put the
        previous values back.

        A simple example is implementing a command like .dump.  Push
        the current output, switch the mode to insert so that SQL
        inserts are printed, and then pop to return to whatever was
        in effect before.

        """
        stack=self._output_stack
        # first item should always be present
        assert len(stack)
        # The bottom entry is only read, never removed, so the stack
        # cannot become empty through popping.
        saved=stack.pop() if len(stack)>1 else stack[0]
        for name,value in saved.items():
            setattr(self,name,value)

Source File: shell.py From supersqlite with MIT License

6 votes

def process_command(self, cmd):
        """Runs a single dot command.

        The text is broken into words with `shlex.split
        <http://docs.python.org/library/shlex.html#shlex.split>`__,
        which is roughly the same method used by Unix/POSIX shells.
        The first word (minus its leading dot) selects the
        ``command_<name>`` method, which is called with the list of
        remaining words.  ``self.Error`` is raised when no such method
        exists.
        """
        if self.echo:
            self.write(self.stderr, cmd+"\n")
        # broken with unicode on Python 2!!!
        if sys.version_info>=(3,0):
            words=shlex.split(cmd)
        else:
            # Python 2 shlex is handed utf8 bytes; decode each word afterwards
            words=[w.decode("utf8") for w in shlex.split(cmd.encode("utf8"))]
        assert words[0][0]=="."
        name=words[0][1:]
        handler=getattr(self, "command_"+name, None)
        if not handler:
            raise self.Error("Unknown command \"%s\".  Enter \".help\" for help" % (name,))
        handler(words[1:])

    ###
    ### Commands start here
    ###

Source File: surrogateescape.py From caterpillar with Apache License 2.0

6 votes

def error_handler(error):
  """Error handler for surrogateescape decoding.

  Should be used with an ASCII-compatible encoding (e.g., 'latin-1' or 'utf-8').
  Replaces any invalid byte sequences with surrogate code points: each
  offending byte ``b`` (which must be >= 128) becomes U+DC00 + ``b``.

  As specified in
  https://docs.python.org/2/library/codecs.html#codecs.register_error.

  Args:
    error: The exception raised by the codec.

  Returns:
    A ``(replacement, resume_position)`` tuple, where ``resume_position`` is
    ``error.end``.

  Raises:
    The given ``error`` itself, when it is not a UnicodeDecodeError or when
    one of the offending bytes is below 128.
  """
  # We can't use this with UnicodeEncodeError; the UTF-8 encoder doesn't raise
  # an error for surrogates. Instead, use encode.
  if not isinstance(error, UnicodeDecodeError):
    raise error

  # Python 2 names the code point constructor ``unichr``; Python 3 only has
  # ``chr``. Pick whichever exists so the handler runs on both.
  try:
    to_char = unichr
  except NameError:
    to_char = chr

  result = []
  # bytearray() iterates as ints for both Python 2 ``str`` and Python 3
  # ``bytes``, so no per-item ord() call is needed.
  for byte in bytearray(error.object[error.start:error.end]):
    if byte < 128:
      raise error
    result.append(to_char(0xdc00 + byte))

  return ''.join(result), error.end

Source File: shell.py From magnitude with MIT License

6 votes

def pop_output(self):
        """Reinstates the output settings saved by the most recent push.

        Output is governed by many parameters such as nullvalue, mode
        (list/tcl/html/insert etc), column widths, header etc.  To
        change some of them temporarily call :meth:`push_output`,
        alter the settings, and then call this method to put the
        previous values back.

        A simple example is implementing a command like .dump.  Push
        the current output, switch the mode to insert so that SQL
        inserts are printed, and then pop to return to whatever was
        in effect before.

        """
        stack=self._output_stack
        # first item should always be present
        assert len(stack)
        # The bottom entry is only read, never removed, so the stack
        # cannot become empty through popping.
        saved=stack.pop() if len(stack)>1 else stack[0]
        for name,value in saved.items():
            setattr(self,name,value)

Source File: shell.py From magnitude with MIT License

6 votes

def process_command(self, cmd):
        """Runs a single dot command.

        The text is broken into words with `shlex.split
        <http://docs.python.org/library/shlex.html#shlex.split>`__,
        which is roughly the same method used by Unix/POSIX shells.
        The first word (minus its leading dot) selects the
        ``command_<name>`` method, which is called with the list of
        remaining words.  ``self.Error`` is raised when no such method
        exists.
        """
        if self.echo:
            self.write(self.stderr, cmd+"\n")
        # broken with unicode on Python 2!!!
        if sys.version_info>=(3,0):
            words=shlex.split(cmd)
        else:
            # Python 2 shlex is handed utf8 bytes; decode each word afterwards
            words=[w.decode("utf8") for w in shlex.split(cmd.encode("utf8"))]
        assert words[0][0]=="."
        name=words[0][1:]
        handler=getattr(self, "command_"+name, None)
        if not handler:
            raise self.Error("Unknown command \"%s\".  Enter \".help\" for help" % (name,))
        handler(words[1:])

    ###
    ### Commands start here
    ###

Source File: unit_tests.py From roberteldersoftwarediff with Apache License 2.0

6 votes

def get_special_case_params():
    """Return one randomly selected command line argument list from a fixed table.

    Every entry starts with the two file paths to compare, followed by any
    extra flags for that case.
    """
    #  The windows and unix specific tests should be tested on both unix and Windows to detect crashes.
    candidates = [
        [u"noexist", u"noexist"],
        [u"tests/ascii/ex1", u"noexist"],
        [u"noexist", u"tests/ascii/ex1"],
        [u"tests/ascii/ex1", u"tests/ascii/ex1", "--outfile", "/dev/null"],
        [u"tests/ascii/ex1", u"tests/ascii/ex2"],
        [u"tests/utf_8/ex3", u"tests/utf_8/ex4"],
        [u"tests/utf_8/ex3", u"tests/utf_8/ex4", u"--oldfile-encoding", u"\"utf-8\"", u"--newfile-encoding", u"\"utf-8\""],
        [u"tests/utf_8/ex3", u"tests/utf_8/ex4", u"--oldfile-encoding", u"\"utf-8\"", u"--newfile-encoding", u"\"utf-8\"", u"--output-encoding", u"\"utf-8\""],
        [u"tests/ascii/ex5", u"tests/ascii/ex6"],
        [u"tests/ascii/ex7", u"tests/ascii/ex8"],
        [u"tests/ascii/a.json", u"tests/ascii/b.json"],
        [u"tests/ascii/a.json", u"tests/ascii/b.json", u"--push-delimiters", u"\"{\"", u"\"[\"", u"--pop-delimiters", u"\"}\"", u"\"]\"", u"--include-delimiters"],
        [u"tests/utf_8/fancy1", u"tests/utf_8/fancy2", u"--delimiters", u"日本国", u"--include-delimiters", u"--parameters-encoding", u"\"utf-8\"", u"--output-encoding", u"\"utf-8\"", u"--newfile-encoding", u"\"utf-8\"", u"--oldfile-encoding", u"\"utf-8\""],
        [u"tests/utf_8/fancy1", u"tests/utf_8/fancy2", u"--delimiters", u"\"\\u65e5\\u672c\\u56fd\"", u"--include-delimiters", u"--parameters-encoding", u"\"utf-8\"", u"--output-encoding", u"\"utf-8\"", u"--newfile-encoding", u"\"utf-8\"", u"--oldfile-encoding", u"\"utf-8\""],
        [u"tests/utf_8/this-is-encoded-in-utf-8", u"tests/utf_16/this-is-encoded-in-utf-16", u"--output-encoding", u"\"utf-8\"", u"--newfile-encoding", u"\"utf-16\"", u"--oldfile-encoding", u"\"utf-8\"", u"--enable-mark"],
        [u"tests/ascii/a.html", u"tests/ascii/b.html", u"-m", u"html"]
    ]
    # Uniform pick of one entry from the table.
    return random.choice(candidates)

Source File: codec.py From naz with MIT License

5 votes

def decode(input: bytes, errors: str = "strict") -> typing.Tuple[str, int]:
        """
        Decode big-endian UTF-16 bytes into a string.

        Parameters:
            input: the bytes to decode
            errors:	same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method

        Returns:
            the decoded string together with the count reported by ``codecs.utf_16_be_decode``
        """
        text, consumed = codecs.utf_16_be_decode(input, errors)
        return text, consumed

Source File: codec.py From naz with MIT License

5 votes

def register_codecs(custom_codecs: typing.Union[None, typing.Dict[str, codecs.CodecInfo]] = None):
    """
    Install a codec search function covering both custom codecs and naz's inbuilt ones.
    A custom codec registered under the same encoding name as an inbuilt one takes precedence.
    Users should never have to use this directly,
    instead; use `naz.Client(custom_codecs={"my_encoding": codecs.CodecInfo(name="my_encoding", encode=..., decode=...)})`

    Parameters:
        custom_codecs: a mapping of encoding name to the ``codecs.CodecInfo`` to register for it.
            ``None`` means no custom codecs.
    """
    if custom_codecs is None:
        custom_codecs = {}

    # Caveats:
    # 1. A registered search function cannot currently be removed again, which can
    #    cause trouble in situations such as unit testing or module reloading.
    #    https://docs.python.org/3.7/library/codecs.html#codecs.register
    # 2. Lookups are served from the registry's cache first. So if `register_codecs`
    #    is called a second time with different codecs, the later codecs may not take
    #    effect; codecs.lookup(encoding) keeps returning the codec that was cached first.
    #    There doesn't appear to be a way to clear the codec cache at runtime.
    #    see: https://docs.python.org/3/library/codecs.html#codecs.lookup

    def _codec_search_function(_encoding):
        """
        Consult custom_codecs before the inbuilt table, so that a user's
        override of an inbuilt codec is the implementation that gets
        chosen (and cached).
        """
        return custom_codecs.get(_encoding) or _INBUILT_CODECS.get(_encoding)

    codecs.register(_codec_search_function)

Source File: shell.py From magnitude with MIT License

5 votes

def usage(self):
        """Returns the command line usage text.

        The text must end with a newline.
        """

        # The literal begins with a line break purely for layout; the
        # lstrip() below removes it from the returned text.
        text="""
Usage: program [OPTIONS] FILENAME [SQL|CMD] [SQL|CMD]...
FILENAME is the name of a SQLite database. A new database is
created if the file does not exist.
OPTIONS include:
   -init filename       read/process named file
   -echo                print commands before execution
   -[no]header          turn headers on or off
   -bail                stop after hitting an error
   -interactive         force interactive I/O
   -batch               force batch I/O
   -column              set output mode to 'column'
   -csv                 set output mode to 'csv'
   -html                set output mode to 'html'
   -line                set output mode to 'line'
   -list                set output mode to 'list'
   -python              set output mode to 'python'
   -separator 'x'       set output field separator (|)
   -nullvalue 'text'    set text string for NULL values
   -version             show SQLite version
   -encoding 'name'     the encoding to use for files
                        opened via .import, .read & .output
   -nocolour            disables colour output to screen
"""
        return text.lstrip()

    ###
    ### Value formatting routines.  They take a value and return a
    ### text formatting of it.  Mostly used by the various output modes
    ### but also by random other pieces of code.
    ###

Source File: codec.py From naz with MIT License

5 votes

def encode(input: str, errors: str = "strict") -> typing.Tuple[bytes, int]:
        """
        Encode a string as big-endian UTF-16 bytes.

        Parameters:
            input: the string to encode
            errors:	same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method

        Returns:
            the encoded bytes together with the count reported by ``codecs.utf_16_be_encode``
        """
        # https://github.com/google/pytype/issues/348
        payload, consumed = codecs.utf_16_be_encode(input, errors)
        return payload, consumed

Source File: codec.py From naz with MIT License

5 votes

def decode(input: bytes, errors: str = "strict") -> typing.Tuple[str, int]:
        """
        Decode GSM 7-bit bytes into a string.

        A byte with value 27 is treated as an escape: the byte after it is looked up
        in ``GSM7BitCodec.gsm_extension``. Every other byte is looked up in
        ``GSM7BitCodec.gsm_basic_charset``. A lookup that raises ``IndexError`` is
        delegated to ``GSM7BitCodec._handle_decode_error``.

        Parameters:
            input: the bytes to decode
            errors:	same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method

        Returns:
            the decoded string and the length of that string
        """
        byte_stream = iter(input)
        decoded_chars = []
        # NOTE(review): `position` counts loop iterations, not byte offsets; the two
        # drift apart after each escape byte because next() below consumes an extra
        # item. An escape byte at the very end makes next() raise StopIteration,
        # which is not caught here. Confirm whether that is intended.
        for position, c in enumerate(byte_stream):
            try:
                if c != 27:
                    decoded_chars.append(GSM7BitCodec.gsm_basic_charset[c])
                else:
                    c = next(byte_stream)
                    decoded_chars.append(GSM7BitCodec.gsm_extension[c])
            except IndexError as indexErrorException:
                fallback = GSM7BitCodec._handle_decode_error(
                    c, errors, position, input, indexErrorException
                )
                decoded_chars.append(fallback)

        text = "".join(decoded_chars)
        return (text, len(text))

Source File: _util.py From shadowsocks with Apache License 2.0

5 votes

def find_encodings(enc=None, system=False):
    """Find functions for encoding translations for a specific codec.

    :param str enc: The codec to find translation functions for. It will be
                    normalized by converting to lowercase and passing it
                    through ``encodings.normalize_encoding``. Defaults to
                    utf-8 when empty or None.

    :param bool system: If True, ignore ``enc`` and use the encoding of the
                        system's stdin, falling back to ascii when stdin
                        reports no encoding.

    :returns: The result of ``codecs.lookup`` for the normalized codec name.

    :raises: :exc:LookupError if the normalized codec, ``enc``, cannot be
             found in Python's encoding translation map.
    """
    if not enc:
        enc = 'utf-8'

    if system:
        stdin_encoding = getattr(sys.stdin, 'encoding', None)
        # Only trust stdin when it actually reports an encoding. The earlier
        # check was inverted: it assigned None (crashing on .lower() below)
        # and discarded a real stdin encoding in favour of ascii.
        if stdin_encoding is not None:
            enc = stdin_encoding
            log.debug("Obtained encoding from stdin: %s" % enc)
        else:
            enc = 'ascii'

    ## have to have lowercase to work, see
    ## http://docs.python.org/dev/library/codecs.html#standard-encodings
    enc = enc.lower()
    codec_alias = encodings.normalize_encoding(enc)

    # NOTE(review): this registers the search function again on every call;
    # confirm whether a single registration would be enough.
    codecs.register(encodings.search_function)
    coder = codecs.lookup(codec_alias)

    return coder

Source File: _util.py From shadowsocks with Apache License 2.0

5 votes

def b(x):
        """Encode ``x`` with the codec returned by ``find_encodings()``.

        A ``bytes`` argument is first decoded with that codec and then
        encoded again; any other argument is encoded directly.

        See http://python3porting.com/problems.html#nicer-solutions
        """
        codec = find_encodings()
        text = x.decode(codec.name) if isinstance(x, bytes) else x
        return codec.encode(text)[0]

Source File: _util.py From shadowsocks with Apache License 2.0

5 votes

def b(x):
        """Return ``x`` unchanged.

        See http://python3porting.com/problems.html#nicer-solutions
        """
        return x

Source File: codec.py From naz with MIT License

5 votes

def encode(input: str, errors: str = "strict") -> typing.Tuple[bytes, int]:
        """
        Encode a string as GSM 7-bit bytes.

        Each character is looked up in ``GSM7BitCodec.gsm_basic_charset_map`` first and
        then in ``GSM7BitCodec.gsm_extension_map``; an extension character is emitted as
        the value 27 followed by its index. A character found in neither map is
        delegated to ``GSM7BitCodec._handle_encode_error``.

        Parameters:
            input: the string to encode
            errors:	same meaning as the errors argument to pythons' `encode <https://docs.python.org/3/library/codecs.html#codecs.encode>`_ method

        Returns:
            the encoded bytes and the length of those bytes
        """
        # for the types of this method,
        # see: https://github.com/python/typeshed/blob/f7d240f06e5608a20b2daac4e96fe085c0577239/stdlib/2and3/codecs.pyi#L21-L22
        pieces = []
        for position, c in enumerate(input):
            basic_idx = GSM7BitCodec.gsm_basic_charset_map.get(c)
            if basic_idx is not None:
                piece = chr(basic_idx)
            else:
                extension_idx = GSM7BitCodec.gsm_extension_map.get(c)
                if extension_idx is not None:
                    piece = chr(27) + chr(extension_idx)
                else:
                    piece = GSM7BitCodec._handle_encode_error(c, errors, position, input)
            pieces.append(piece)

        # Encoding the joined text as latin-1 is equivalent to;
        # import six; six.b('someString')
        # see:
        # https://github.com/benjaminp/six/blob/68112f3193c7d4bef5ad86ed1b6ed528edd9093d/six.py#L625
        payload = "".join(pieces).encode("latin-1")
        return (payload, len(payload))

Source File: shell.py From supersqlite with MIT License

5 votes

def usage(self):
        """Returns the command line usage text.

        The text must end with a newline.
        """

        # The literal begins with a line break purely for layout; the
        # lstrip() below removes it from the returned text.
        text="""
Usage: program [OPTIONS] FILENAME [SQL|CMD] [SQL|CMD]...
FILENAME is the name of a SQLite database. A new database is
created if the file does not exist.
OPTIONS include:
   -init filename       read/process named file
   -echo                print commands before execution
   -[no]header          turn headers on or off
   -bail                stop after hitting an error
   -interactive         force interactive I/O
   -batch               force batch I/O
   -column              set output mode to 'column'
   -csv                 set output mode to 'csv'
   -html                set output mode to 'html'
   -line                set output mode to 'line'
   -list                set output mode to 'list'
   -python              set output mode to 'python'
   -separator 'x'       set output field separator (|)
   -nullvalue 'text'    set text string for NULL values
   -version             show SQLite version
   -encoding 'name'     the encoding to use for files
                        opened via .import, .read & .output
   -nocolour            disables colour output to screen
"""
        return text.lstrip()

    ###
    ### Value formatting routines.  They take a value and return a
    ### text formatting of it.  Mostly used by the various output modes
    ### but also by random other pieces of code.
    ###

Source File: jproperties.py From community-edition-setup with MIT License

5 votes

def _jbackslashreplace_error_handler(err):
    """
    Encoding error handler which passes the unencodable part of the input through ``_escape_non_ascii``
    (intended to produce Java-compliant Unicode escape sequences).

    :param err: An `:exc:UnicodeEncodeError` instance. Any other exception is re-raised unchanged.
    :return: A ``(replacement, resume_position)`` tuple, where ``resume_position`` is ``err.end``.
        See https://docs.python.org/2/library/codecs.html?highlight=codecs#codecs.register_error
    """
    if isinstance(err, UnicodeEncodeError):
        offending = err.object[err.start:err.end]
        return _escape_non_ascii(offending), err.end

    # Only encoding errors can be handled here.
    raise err

Source File: formats.py From fuel with MIT License

5 votes

def open_(filename, mode='r', encoding=None):
    """Open a file, optionally gzip-compressed, with an optional encoding.

    A filename ending in ``.gz`` is opened through ``gzip``; anything else
    is opened as a regular file.

    Note that on legacy Python any encoding other than ``None`` or opening
    GZipped files will return an unpicklable file-like object.

    Parameters
    ----------
    filename : str
        The filename to read.
    mode : str, optional
        The mode with which to open the file. Defaults to `r`.
    encoding : str, optional
        The encoding to use (see the codecs documentation_ for supported
        values). Defaults to ``None``.

    .. _documentation:
    https://docs.python.org/3/library/codecs.html#standard-encodings

    """
    is_gzipped = filename.endswith('.gz')

    if not six.PY2:
        if is_gzipped:
            # NOTE(review): on Python 3, gzip.open() rejects an `encoding`
            # argument for binary modes (such as the default 'r') -- confirm
            # that callers pass a text mode whenever they give an encoding.
            return io.BufferedReader(gzip.open(filename, mode,
                                               encoding=encoding))
        return open(filename, mode, encoding=encoding)

    # Legacy Python: the built-in open() takes no encoding, so decoding is
    # layered on with the codecs module when an encoding was requested.
    if is_gzipped:
        zf = io.BufferedReader(gzip.open(filename, mode))
        return codecs.getreader(encoding)(zf) if encoding else zf
    if encoding:
        return codecs.open(filename, mode, encoding=encoding)
    return open(filename, mode)

Source File: codecs.py From Emoji-Tools with GNU General Public License v3.0

5 votes

def encode(self, input, errors='strict'):
		"""Encode ``input`` with ``self.base_encoding``, using ``self.error`` for the rest.

		Whenever the base encoding raises UnicodeEncodeError, ``self.error`` is
		called with that exception; the bytes it returns are appended to the
		output and encoding resumes at the position it returns.

		Only ``errors='strict'`` is supported.  Returns ``(encoded_bytes,
		len(input))``.
		"""
		assert errors == 'strict'
		#return codecs.encode(input, self.base_encoding, self.name), len(input)

		# The commented-out line above could have been the whole implementation,
		# relying on the error handling to replace the unencodable Unicode
		# characters with our extended byte sequences.
		#
		# However, there seems to be a design bug in Python (probably intentional):
		# the error handler for encoding is supposed to return a **Unicode** character,
		# that then needs to be encodable itself...  Ugh.
		#
		# So this does what codecs.encode() should have been doing: it expects the
		# error handler to return bytes() to be added to the output.
		#
		# This seems to have been fixed in Python 3.3.  We should try using that and
		# use fallback only if that failed.
		# https://docs.python.org/3.3/library/codecs.html#codecs.register_error

		total = len(input)
		encoded = b''
		remaining = input
		while remaining:
			try:
				encoded += codecs.encode(remaining, self.base_encoding)
				break  # everything left was encodable
			except UnicodeEncodeError as exc:
				# Keep the part before the failure, then let the handler
				# supply bytes for the failing part.
				encoded += codecs.encode(remaining[:exc.start], self.base_encoding)
				substitute, resume_at = self.error(exc)
				encoded += substitute
				remaining = remaining[resume_at:]
		return encoded, total

Source File: basic.py From EasY_HaCk with Apache License 2.0

4 votes

def decodePage(page, contentEncoding, contentType):
    """
    Decode compressed/charset HTTP response

    Only the decompression stage is present in this excerpt. What is visible:

    * An empty page, or (with conf.nullConnection set) a page shorter than
      two characters, is returned via getUnicode() without further work.
    * For a "gzip", "x-gzip" or "deflate" content encoding the page is
      decompressed, unless kb.pageCompress is false, in which case None is
      returned.
    * When decompression fails and the raw page does not contain "<html",
      the problem is logged, kb.pageCompress is switched off and
      SqlmapCompressionException is raised.

    NOTE(review): the excerpt ends right after the compression error
    handling and never returns the decompressed page; the remainder of the
    function appears to be cut off -- check the full source.
    NOTE(review): Python 2 only (``except Exception, msg``, ``basestring``,
    ``StringIO.StringIO``).
    """

    if not page or (conf.nullConnection and len(page) < 2):
        return getUnicode(page)

    # Normalize both header values to lowercase strings; anything that is
    # not a non-empty string becomes "".
    if isinstance(contentEncoding, basestring) and contentEncoding:
        contentEncoding = contentEncoding.lower()
    else:
        contentEncoding = ""

    # contentType is normalized here but not used in the visible excerpt.
    if isinstance(contentType, basestring) and contentType:
        contentType = contentType.lower()
    else:
        contentType = ""

    if contentEncoding in ("gzip", "x-gzip", "deflate"):
        if not kb.pageCompress:
            return None

        try:
            if contentEncoding == "deflate":
                # wbits=-15 makes zlib treat the data as a raw deflate stream
                data = StringIO.StringIO(zlib.decompress(page, -15))  # Reference: http://stackoverflow.com/questions/1089662/python-inflate-and-deflate-implementations
            else:
                data = gzip.GzipFile("", "rb", 9, StringIO.StringIO(page))
                # The last four bytes are read as a little-endian signed
                # integer and checked against the size limit before any
                # data is decompressed.
                size = struct.unpack("<l", page[-4:])[0]  # Reference: http://pydoc.org/get.cgi/usr/local/lib/python2.5/gzip.py
                if size > MAX_CONNECTION_TOTAL_SIZE:
                    raise Exception("size too large")

            page = data.read()
        except Exception, msg:
            # This also catches the "size too large" exception raised above.
            # If the page does contain "<html", the failure is silently
            # ignored and the page is left as it was.
            if "<html" not in page:  # in some cases, invalid "Content-Encoding" appears for plain HTML (should be ignored)
                errMsg = "detected invalid data for declared content "
                errMsg += "encoding '%s' ('%s')" % (contentEncoding, msg)
                singleTimeLogMessage(errMsg, logging.ERROR)

                warnMsg = "turning off page compression"
                singleTimeWarnMessage(warnMsg)

                kb.pageCompress = False
                raise SqlmapCompressionException

Python codecs.html() Examples