Python tokenize.detect_encoding() Examples

The following are 22 code examples of tokenize.detect_encoding(), collected from open-source projects. You can go to the original project or source file by following the links above each example, or browse all available functions and classes of the tokenize module.
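As a quick orientation before the examples: detect_encoding() takes a readline callable that yields bytes and returns an (encoding, consumed_lines) pair. A minimal sketch:

import io
import tokenize

source = b"# -*- coding: utf-8 -*-\nprint('hi')\n"
encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
# encoding == 'utf-8'; consumed is the list of raw lines read while
# looking for a BOM or a PEP 263 coding cookie (here, the first line).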
Example #1
Source File: _bootstrap.py    From jawfish with MIT License
def get_source(self, fullname):
        """Concrete implementation of InspectLoader.get_source."""
        import tokenize
        path = self.get_filename(fullname)
        try:
            source_bytes = self.get_data(path)
        except IOError as exc:
            raise ImportError("source not available through get_data()",
                              name=fullname) from exc
        # _io is the frozen io module injected into importlib._bootstrap's
        # namespace at interpreter startup.
        readsource = _io.BytesIO(source_bytes).readline
        try:
            # detect_encoding() returns an (encoding, consumed_lines) tuple.
            encoding = tokenize.detect_encoding(readsource)
        except SyntaxError as exc:
            raise ImportError("Failed to detect encoding",
                              name=fullname) from exc
        newline_decoder = _io.IncrementalNewlineDecoder(None, True)
        try:
            return newline_decoder.decode(source_bytes.decode(encoding[0]))
        except UnicodeDecodeError as exc:
            raise ImportError("Failed to decode source file",
                              name=fullname) from exc 
Example #2
Source File: imp.py    From scylla with Apache License 2.0
def get_data(self, path):
        """Gross hack to contort loader to deal w/ load_*()'s bad API."""
        if self.file and path == self.path:
            if not self.file.closed:
                file = self.file
            else:
                self.file = file = open(self.path, 'r')

            with file:
                # Technically this should return bytes, but
                # SourceLoader.get_code() just passes what is returned to
                # compile(), which can handle str. And converting to bytes
                # would require figuring out the encoding to decode to, and
                # tokenize.detect_encoding() only accepts bytes.
                return file.read()
        else:
            return super().get_data(path) 
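The bytes-only constraint mentioned in the comment above is easy to verify; a small sketch (not from the project above):

import io
import tokenize

# detect_encoding() insists on bytes: feeding it str lines fails.
try:
    tokenize.detect_encoding(io.StringIO("x = 1\n").readline)
except TypeError:
    pass  # str.startswith() is handed a bytes BOM prefix and rejects it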
Example #3
Source File: openpy.py    From Computable with MIT License
# Python 2 code: relies on the `unicode` builtin, plus BytesIO and
# TextIOWrapper from io, detect_encoding from tokenize, and a module-level
# strip_encoding_cookie() helper.
def source_to_unicode(txt, errors='replace', skip_encoding_cookie=True):
    """Convert a bytes string with Python source code to unicode.

    Unicode strings are passed through unchanged. Byte strings are checked
    for the Python source file encoding cookie to determine encoding.
    txt can be either a bytes buffer or a string containing the source
    code.
    """
    if isinstance(txt, unicode):
        return txt
    if isinstance(txt, bytes):
        buffer = BytesIO(txt)
    else:
        buffer = txt
    try:
        encoding, _ = detect_encoding(buffer.readline)
    except SyntaxError:
        encoding = "ascii"
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, errors=errors, line_buffering=True)
    text.mode = 'r'
    if skip_encoding_cookie:
        return u"".join(strip_encoding_cookie(text))
    else:
        return text.read() 
Example #4
Source File: _pydev_execfile.py    From PyDev.Debugger with Eclipse Public License 1.0
def execfile(file, glob=None, loc=None):
    if glob is None:
        import sys
        glob = sys._getframe().f_back.f_globals
    if loc is None:
        loc = glob

    # It seems that the best way is using tokenize.open(): http://code.activestate.com/lists/python-dev/131251/
    # (but tokenize.open() is only available since Python 3.2)
    import tokenize
    if hasattr(tokenize, 'open'):
        # Python 3.2+
        stream = tokenize.open(file)  # @UndefinedVariable
    else:
        # Python 3.0 or 3.1
        encoding, _ = tokenize.detect_encoding(open(file, mode="rb").readline)
        stream = open(file, encoding=encoding)
    try:
        contents = stream.read()
    finally:
        stream.close()

    # Execute the script (note: it's important to compile first so the filename is set in debug mode)
    exec(compile(contents+"\n", file, 'exec'), glob, loc) 
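On Python 3.2+, the fallback branch above is unnecessary: tokenize.open() performs the detect_encoding() dance itself. A minimal sketch (the path is hypothetical):

import tokenize

# tokenize.open() applies detect_encoding() and returns a text stream
# already decoded with the detected encoding.
with tokenize.open('some_script.py') as stream:
    contents = stream.read()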
Example #5
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)
Example #6
Source File: phystokens.py    From coveragepy with Apache License 2.0
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    # iternext() is a coverage.py helper that returns the bound __next__
    # method of iter(...), i.e. a readline-style callable over the lines.
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]
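The same trick works with only the standard library; a sketch of feeding detect_encoding() lines from an in-memory bytes object:

import tokenize

source = b"# coding: utf-8\nx = 1\n"
readline = iter(source.splitlines(True)).__next__
print(tokenize.detect_encoding(readline)[0])  # 'utf-8'
# detect_encoding() swallows the StopIteration from an exhausted
# iterator, so short inputs are safe.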
Example #7
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8") 
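The normalization under test can be observed directly; a sketch:

import io
import tokenize

src = b"#!/usr/bin/python\n# coding: utf_8\n1 + 3\n"
enc, _ = tokenize.detect_encoding(io.BytesIO(src).readline)
assert enc == "utf-8"  # the 'utf_8' spelling is normalized to 'utf-8'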
Example #8
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline) 
Example #9
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")']) 
Example #10
Source File: pycodestyle.py    From blackmamba with MIT License
def readlines(filename):
        """Read the source code."""
        try:
            with open(filename, 'rb') as f:
                (coding, lines) = tokenize.detect_encoding(f.readline)
                f = TextIOWrapper(f, coding, line_buffering=True)
                return [line.decode(coding) for line in lines] + f.readlines()
        except (LookupError, SyntaxError, UnicodeError):
            # Fall back if file encoding is improperly declared
            with open(filename, encoding='latin-1') as f:
                return f.readlines() 
Example #11
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected) 
Example #12
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline) 
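The same failure is easy to reproduce outside the test harness; a sketch:

import io
import tokenize

src = b'\xef\xbb\xbf# coding: ascii\nx = 1\n'
try:
    tokenize.detect_encoding(io.BytesIO(src).readline)
except SyntaxError:
    pass  # the UTF-8 BOM contradicts the ascii cookie, so it refuses to guess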
Example #13
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) 
Example #14
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n']) 
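Outside the test harness, the BOM-only case looks like this; a sketch:

import io
import tokenize

src = b'\xef\xbb\xbfprint("hi")\n'
enc, _ = tokenize.detect_encoding(io.BytesIO(src).readline)
assert enc == 'utf-8-sig'  # a BOM alone selects the signature-aware codec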
Example #15
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2])) 
Example #16
Source File: test_unparse.py    From ironpython3 with Apache License 2.0
def read_pyfile(filename):
    """Read and return the contents of a Python source file (as a
    string), taking into account the file encoding."""
    with open(filename, "rb") as pyfile:
        encoding = tokenize.detect_encoding(pyfile.readline)[0]
    with open(filename, "r", encoding=encoding) as pyfile:
        source = pyfile.read()
    return source 
Example #17
Source File: executing.py    From executing with MIT License
def decode_source(source):
        if isinstance(source, bytes):
            encoding, _ = detect_encoding(io.BytesIO(source).readline)
            source = source.decode(encoding)
        return source 
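A usage sketch of decode_source() above (assuming the module's io and detect_encoding imports; the source text is made up):

src_bytes = "# -*- coding: utf-8 -*-\nname = 'café'\n".encode('utf-8')
text = decode_source(src_bytes)
# text is a str decoded with the declared encoding; a str argument
# would be returned unchanged.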
Example #18
Source File: executing.py    From executing with MIT License
def decode_source(source):
        if isinstance(source, bytes):
            encoding = Source.detect_encoding(source)
            source = source.decode(encoding)
        return source 
Example #19
Source File: builder.py    From pySINDy with MIT License
def file_build(self, path, modname=None):
        """Build astroid from a source code file (i.e. from an ast)

        *path* is expected to be a python source file
        """
        try:
            stream, encoding, data = open_source_file(path)
        except IOError as exc:
            raise exceptions.AstroidBuildingError(
                "Unable to load file {path}:\n{error}",
                modname=modname,
                path=path,
                error=exc,
            ) from exc
        except (SyntaxError, LookupError) as exc:
            raise exceptions.AstroidSyntaxError(
                "Python 3 encoding specification error or unknown encoding:\n"
                "{error}",
                modname=modname,
                path=path,
                error=exc,
            ) from exc
        except UnicodeError as exc:  # wrong encoding
            # detect_encoding returns utf-8 if no encoding specified
            raise exceptions.AstroidBuildingError(
                "Wrong or no encoding specified for {filename}.", filename=path
            ) from exc
        with stream:
            # get module name if necessary
            if modname is None:
                try:
                    modname = ".".join(modutils.modpath_from_file(path))
                except ImportError:
                    modname = os.path.splitext(os.path.basename(path))[0]
            # build astroid representation
            module = self._data_build(data, modname, path)
            return self._post_build(module, encoding) 
Example #20
Source File: builder.py    From pySINDy with MIT License
def open_source_file(filename):
    with open(filename, "rb") as byte_stream:
        encoding = detect_encoding(byte_stream.readline)[0]
    stream = open(filename, "r", newline=None, encoding=encoding)
    data = stream.read()
    return stream, encoding, data 
Example #21
Source File: test_tokenize.py    From Fluid-Designer with GNU General Public License v3.0
def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
            )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(self._lines):
                    raise StopIteration
                line = self._lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline) 
Example #22
Source File: test_tokenize.py    From ironpython3 with Apache License 2.0
def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'# coding=utf-8\n'])