Python tokenize.tokenize() Examples
The following are 30 code examples of tokenize.tokenize().
The original project and source file for each example are noted in the header above it.
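Before the examples, here is a minimal sketch of the call pattern they all share: tokenize.tokenize() takes a readline callable that returns bytes and yields TokenInfo tuples. The sample source string below is made up for illustration.

import io
import tokenize

# Made-up sample source; tokenize.tokenize() expects a readline that returns *bytes*.
source = b"def add(a, b):\n    return a + b\n"

for tok in tokenize.tokenize(io.BytesIO(source).readline):
    # Each item is a TokenInfo namedtuple: (type, string, start, end, line).
    print(tokenize.tok_name[tok.type], repr(tok.string))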
Example #1
Source File: pythondoc.py From InternationalizationScript-iOS with MIT License | 6 votes |
def skip_decorator(self, type, token, start, end, line):
    if token == "(":
        self.decorator_parens = self.decorator_parens + 1
    elif token == ")":
        self.decorator_parens = self.decorator_parens - 1
    if self.decorator_parens or type != tokenize.NEWLINE:
        return self.skip_decorator
    return self.process_subject

##
# (Token handler helper) Processes a PythonDoc comment.  This
# method creates an "info" element based on the current comment,
# and attaches it to the current subject element.
#
# @param subject_name Subject name (or None if the name is not known).
# @param subject_elem The current subject element.
# @return The info element.  Note that this element has already
#     been attached to the subject element.
# @defreturn Element
Example #2
Source File: AutoIndent.py From ironpython2 with Apache License 2.0 | 6 votes |
def run(self):
    OPENERS = ('class', 'def', 'for', 'if', 'try', 'while')
    INDENT = tokenize.INDENT
    NAME = tokenize.NAME

    save_tabsize = tokenize.tabsize
    tokenize.tabsize = self.tabwidth
    try:
        try:
            for (typ, token, start, end, line) in token_generator(self.readline):
                if typ == NAME and token in OPENERS:
                    self.blkopenline = line
                elif typ == INDENT and self.blkopenline:
                    self.indentedline = line
                    break
        except (tokenize.TokenError, IndentationError):
            # since we cut off the tokenizer early, we can trigger
            # spurious errors
            pass
    finally:
        tokenize.tabsize = save_tabsize
    return self.blkopenline, self.indentedline
Example #3
Source File: config_scope.py From sacred with MIT License | 6 votes |
def find_doc_for(ast_entry, body_lines):
    lineno = ast_entry.lineno - 1
    line_io = io.BytesIO(body_lines[lineno].encode())
    try:
        tokens = tokenize(line_io.readline) or []
        line_comments = [t.string for t in tokens if t.type == COMMENT]
        if line_comments:
            formatted_lcs = [l[1:].strip() for l in line_comments]
            filtered_lcs = [l for l in formatted_lcs if not is_ignored(l)]
            if filtered_lcs:
                return filtered_lcs[0]
    except TokenError:
        pass

    lineno -= 1
    while lineno >= 0:
        if iscomment(body_lines[lineno]):
            comment = body_lines[lineno].strip("# ")
            if not is_ignored(comment):
                return comment
        if not body_lines[lineno].strip() == "":
            return None
        lineno -= 1
    return None
Example #4
Source File: gftools-rangify.py From gftools with Apache License 2.0 | 6 votes |
def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: rangify <nam file>")

    codepoints_data = list(tokenize.tokenize(open(sys.argv[1], 'rb').readline))
    codepoints = get_codepoints(codepoints_data)
    codepoints.sort()

    seqs = []
    seq = (None,)
    for cp in codepoints:
        if seq[0] is None:
            seq = (cp, cp)
        elif seq[1] == cp - 1:
            seq = (seq[0], cp)
        else:
            seqs.append(seq)
            seq = (None,)

    for seq in seqs:
        print(seq)
Example #5
Source File: reindent.py From D-VAE with MIT License | 6 votes |
def __init__(self, f):
    self.find_stmt = 1   # next token begins a fresh stmt?
    self.level = 0       # current indent level

    # Raw file lines.
    self.raw = f.readlines()

    # File lines, rstripped & tab-expanded.  Dummy at start is so
    # that we can use tokenize's 1-based line numbering easily.
    # Note that a line is all-blank iff it's "\n".
    self.lines = [_rstrip(line).expandtabs() + "\n"
                  for line in self.raw]
    self.lines.insert(0, None)
    self.index = 1  # index into self.lines of next line

    # List of (lineno, indentlevel) pairs, one for each stmt and
    # comment line.  indentlevel is -1 for comment lines, as a
    # signal that tokenize doesn't know what to do about them;
    # indeed, they're our headache!
    self.stats = []
Example #6
Source File: pygettext.py From oss-ftp with MIT License | 6 votes |
def __waiting(self, ttype, tstring, lineno):
    opts = self.__options
    # Do docstring extractions, if enabled
    if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
        # module docstring?
        if self.__freshmodule:
            if ttype == tokenize.STRING:
                self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                self.__freshmodule = 0
            elif ttype not in (tokenize.COMMENT, tokenize.NL):
                self.__freshmodule = 0
            return
        # class docstring?
        if ttype == tokenize.NAME and tstring in ('class', 'def'):
            self.__state = self.__suiteseen
            return
    if ttype == tokenize.NAME and tstring in opts.keywords:
        self.__state = self.__keywordseen
Example #7
Source File: pygettext.py From oss-ftp with MIT License | 6 votes |
def __openseen(self, ttype, tstring, lineno):
    if ttype == tokenize.OP and tstring == ')':
        # We've seen the last of the translatable strings.  Record the
        # line number of the first line of the strings and update the list
        # of messages seen.  Reset state for the next batch.  If there
        # were no strings inside _(), then just ignore this entry.
        if self.__data:
            self.__addentry(EMPTYSTRING.join(self.__data))
        self.__state = self.__waiting
    elif ttype == tokenize.STRING:
        self.__data.append(safe_eval(tstring))
    elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                       token.NEWLINE, tokenize.NL]:
        # warn if we see anything else than STRING or whitespace
        print >> sys.stderr, _(
            '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
            ) % {
            'token': tstring,
            'file': self.__curfile,
            'lineno': self.__lineno
            }
        self.__state = self.__waiting
Example #8
Source File: evaluate.py From python_autocomplete with MIT License | 6 votes |
def __init__(self, model, lstm_layers, lstm_size):
    self.__model = model

    # Initial state
    self._h0 = torch.zeros((lstm_layers, 1, lstm_size), device=device)
    self._c0 = torch.zeros((lstm_layers, 1, lstm_size), device=device)

    # Last line of source code read
    self._last_line = ""

    self._tokens: List[tokenize.TokenInfo] = []

    # Last token, because we need to input that to the model for inference
    self._last_token = 0

    # Last bit of the input string
    self._untokenized = ""

    # For timing
    self.time_add = 0
    self.time_predict = 0
    self.time_check = 0
Example #9
Source File: reader.py From py2nb with BSD 3-Clause "New" or "Revised" License | 6 votes |
def read(filename):
    """
    Read a regular Python file with special formatting and performance
    preprocessing on it.  The result is a string that conforms to the
    IPython notebook version 3 python script format.
    """
    with open(filename, 'rb') as fin:
        token_gen = _generate_tokens(fin.readline)
        cvt_docstr_gen = convert_toplevel_docstring(token_gen)
        nl_gen = fix_newlines(cvt_docstr_gen)
        out = list(nl_gen)
        formatted = tokenize.untokenize(out).decode('utf-8')
    return fix_empty_lines(formatted)


# =============================================================================
# Helpers
# =============================================================================
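As a quick illustration of the round trip Example #9 relies on, the sketch below tokenizes a made-up one-line snippet and rebuilds it with tokenize.untokenize(); because the token stream starts with an ENCODING token, untokenize() returns bytes.

import io
import tokenize

src = b"x = 1  # keep this comment\n"   # made-up snippet, not from py2nb

tokens = list(tokenize.tokenize(io.BytesIO(src).readline))
rebuilt = tokenize.untokenize(tokens)   # bytes, since the stream begins with ENCODING
print(rebuilt.decode("utf-8"))          # x = 1  # keep this comment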
Example #10
Source File: reader.py From py2nb with BSD 3-Clause "New" or "Revised" License | 6 votes |
def convert_toplevel_docstring(tokens):
    for token in tokens:
        # For each string
        if token.type == tokenize.STRING:
            text = token.string
            # Must be a docstring
            if text.startswith('"""') or text.startswith("'''"):
                startline, startcol = token.start
                # Starting column MUST be 0
                if startcol == 0:
                    endline, endcol = token.end
                    lines = ['# ' + line
                             for line in text.strip('"\' \n').split('\n')]
                    text = '\n'.join(lines)
                    fmt = '# <markdowncell>\n{0}\n# <codecell>'.format(text)
                    yield TokenInfo(type=tokenize.COMMENT,
                                    start=(startline, startcol),
                                    end=(endline, endcol),
                                    string=fmt,
                                    line='#')
                    # To next token
                    continue
        # Return untouched
        yield token
Example #11
Source File: reindent.py From attention-lvcsr with MIT License | 6 votes |
def __init__(self, f):
    self.find_stmt = 1   # next token begins a fresh stmt?
    self.level = 0       # current indent level

    # Raw file lines.
    self.raw = f.readlines()

    # File lines, rstripped & tab-expanded.  Dummy at start is so
    # that we can use tokenize's 1-based line numbering easily.
    # Note that a line is all-blank iff it's "\n".
    self.lines = [_rstrip(line).expandtabs() + "\n"
                  for line in self.raw]
    self.lines.insert(0, None)
    self.index = 1  # index into self.lines of next line

    # List of (lineno, indentlevel) pairs, one for each stmt and
    # comment line.  indentlevel is -1 for comment lines, as a
    # signal that tokenize doesn't know what to do about them;
    # indeed, they're our headache!
    self.stats = []
Example #12
Source File: split.py From flynt with MIT License | 6 votes |
def get_chunks(code) -> Generator[Chunk, None, None]:
    g = tokenize.tokenize(io.BytesIO(code.encode("utf-8")).readline)
    chunk = Chunk()

    try:
        for item in g:
            t = PyToken(item)
            reuse = chunk.append(t)

            if chunk.complete:
                yield chunk
                chunk = Chunk()
                if reuse:
                    reuse = chunk.append(t)
                    # assert not reuse
                    if chunk.complete:
                        yield chunk
                        chunk = Chunk()

        yield chunk
    except tokenize.TokenError as e:
        if state.verbose:
            traceback.print_exc()
            print(e)
Example #13
Source File: aot.py From learn_python3_spider with MIT License | 6 votes |
def indentify(s):
    out = []
    stack = []
    l = ['', s]
    for (tokenType, tokenString, (startRow, startColumn),
         (endRow, endColumn), logicalLine) in tokenize(l.pop):
        if tokenString in ['[', '(', '{']:
            stack.append(tokenString)
        elif tokenString in [']', ')', '}']:
            stack.pop()
        if tokenString == '\0':
            out.append(' ' * len(stack))
        else:
            out.append(tokenString)
    return ''.join(out)


###########
# Unjelly #
###########
Example #14
Source File: recipe-491274.py From code with MIT License | 6 votes |
def format(self):
    """ Parse and send the colored source.
    """
    # store line offsets in self.lines
    self.lines = [0, 0]
    pos = 0
    while 1:
        pos = string.find(self.raw, '\n', pos) + 1
        if not pos:
            break
        self.lines.append(pos)
    self.lines.append(len(self.raw))

    # parse the source and write it
    self.pos = 0
    text = cStringIO.StringIO(self.raw)
    self.out.write('<pre class="code">\n')
    try:
        tokenize.tokenize(text.readline, self)
    except tokenize.TokenError, ex:
        msg = ex[0]
        line = ex[1][0]
        self.out.write("<h3>ERROR: %s</h3>%s\n" % (
            msg, self.raw[self.lines[line]:]))
Example #15
Source File: pythondoc.py From InternationalizationScript-iOS with MIT License | 6 votes |
def handle_token(self, *args):
    # dispatch incoming tokens to the current handler
    if DEBUG > 1:
        print self.handler.im_func.func_name, self.indent,
        print tokenize.tok_name[args[0]], repr(args[1])
    if args[0] == tokenize.DEDENT:
        self.indent = self.indent - 1
        while self.scope and self.scope[-1][0] >= self.indent:
            del self.scope[-1]
            del self.stack[-1]
    self.handler = apply(self.handler, args)
    if args[0] == tokenize.INDENT:
        self.indent = self.indent + 1

##
# (Token handler) Scans for encoding directive.
Example #16
Source File: pythondoc.py From InternationalizationScript-iOS with MIT License | 6 votes |
def look_for_pythondoc(self, type, token, start, end, line):
    if type == tokenize.COMMENT and string.rstrip(token) == "##":
        # found a comment: set things up for comment processing
        self.comment_start = start
        self.comment = []
        return self.process_comment_body
    else:
        # deal with "bare" subjects
        if token == "def" or token == "class":
            self.subject_indent = self.indent
            self.subject_parens = 0
            self.subject_start = self.comment_start = None
            self.subject = []
            return self.process_subject(type, token, start, end, line)
        return self.look_for_pythondoc

##
# (Token handler) Processes a comment body.  This handler adds
# comment lines to the current comment.
Example #17
Source File: pythondoc.py From InternationalizationScript-iOS with MIT License | 6 votes |
def handle_token(self, *args):
    # dispatch incoming tokens to the current handler
    if DEBUG > 1:
        print self.handler.im_func.func_name, self.indent,
        print tokenize.tok_name[args[0]], repr(args[1])
    if args[0] == tokenize.DEDENT:
        self.indent = self.indent - 1
        while self.scope and self.scope[-1][0] >= self.indent:
            del self.scope[-1]
            del self.stack[-1]
    self.handler = apply(self.handler, args)
    if args[0] == tokenize.INDENT:
        self.indent = self.indent + 1

##
# (Token handler) Scans for encoding directive.
Example #18
Source File: pythondoc.py From InternationalizationScript-iOS with MIT License | 6 votes |
def look_for_pythondoc(self, type, token, start, end, line):
    if type == tokenize.COMMENT and string.rstrip(token) == "##":
        # found a comment: set things up for comment processing
        self.comment_start = start
        self.comment = []
        return self.process_comment_body
    else:
        # deal with "bare" subjects
        if token == "def" or token == "class":
            self.subject_indent = self.indent
            self.subject_parens = 0
            self.subject_start = self.comment_start = None
            self.subject = []
            return self.process_subject(type, token, start, end, line)
        return self.look_for_pythondoc

##
# (Token handler) Processes a comment body.  This handler adds
# comment lines to the current comment.
Example #19
Source File: reader.py From py2nb with BSD 3-Clause "New" or "Revised" License | 6 votes |
def fix_newlines(tokens):
    first = True
    curline = 1
    for token in tokens:
        if first:
            first = False
            curline = token.end[0] + 1
        else:
            # Fill NEWLINE token in between
            while curline < token.start[0]:
                yield TokenInfo(type=tokenize.NEWLINE,
                                string='\n',
                                start=(curline, 0),
                                end=(curline, 0),
                                line='\n', )
                curline += 1
            curline = token.end[0] + 1
        yield token
Example #20
Source File: aot.py From Safejumper-for-Desktop with GNU General Public License v2.0 | 6 votes |
def indentify(s):
    out = []
    stack = []
    l = ['', s]
    for (tokenType, tokenString, (startRow, startColumn),
         (endRow, endColumn), logicalLine) in tokenize(l.pop):
        if tokenString in ['[', '(', '{']:
            stack.append(tokenString)
        elif tokenString in [']', ')', '}']:
            stack.pop()
        if tokenString == '\0':
            out.append(' ' * len(stack))
        else:
            out.append(tokenString)
    return ''.join(out)


###########
# Unjelly #
###########
Example #21
Source File: inspect.py From ironpython2 with Apache License 2.0 | 5 votes |
def tokeneater(self, type, token, srow_scol, erow_ecol, line):
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    if not self.started:
        # look for the first "def", "class" or "lambda"
        if token in ("def", "class", "lambda"):
            if token == "lambda":
                self.islambda = True
            self.started = True
        self.passline = True    # skip to the end of the line
    elif type == tokenize.NEWLINE:
        self.passline = False   # stop skipping when a NEWLINE is seen
        self.last = srow
        if self.islambda:       # lambdas always end at the first NEWLINE
            raise EndOfBlock
    elif self.passline:
        pass
    elif type == tokenize.INDENT:
        self.indent = self.indent + 1
        self.passline = True
    elif type == tokenize.DEDENT:
        self.indent = self.indent - 1
        # the end of matching indent/dedent pairs end a block
        # (note that this only works for "def"/"class" blocks,
        # not e.g. for "if: else:" or "try: finally:" blocks)
        if self.indent <= 0:
            raise EndOfBlock
    elif self.indent == 0 and type not in (tokenize.COMMENT, tokenize.NL):
        # any other token on the same indentation level end the previous
        # block as well, except the pseudo-tokens COMMENT and NL.
        raise EndOfBlock
Example #22
Source File: inspect.py From meddle with MIT License | 5 votes |
def getblock(lines):
    """Extract the block of code at the top of the given list of lines."""
    blockfinder = BlockFinder()
    try:
        tokenize.tokenize(iter(lines).next, blockfinder.tokeneater)
    except (EndOfBlock, IndentationError):
        pass
    return lines[:blockfinder.last]
Example #23
Source File: reader.py From py2nb with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _generate_tokens(readline):
    return map(lambda x: TokenInfo(*x), tokenize.generate_tokens(readline))
Example #24
Source File: reindent.py From attention-lvcsr with MIT License | 5 votes |
def getline(self):
    if self.index >= len(self.lines):
        line = ""
    else:
        line = self.lines[self.index]
        self.index += 1
    return line

# Line-eater for tokenize.
Example #25
Source File: reindent.py From attention-lvcsr with MIT License | 5 votes |
def tokeneater(self, type, token, pos, end, line,
               INDENT=tokenize.INDENT,
               DEDENT=tokenize.DEDENT,
               NEWLINE=tokenize.NEWLINE,
               COMMENT=tokenize.COMMENT,
               NL=tokenize.NL):
    sline, scol = pos
    if type == NEWLINE:
        # A program statement, or ENDMARKER, will eventually follow,
        # after some (possibly empty) run of tokens of the form
        #     (NL | COMMENT)* (INDENT | DEDENT+)?
        self.find_stmt = 1

    elif type == INDENT:
        self.find_stmt = 1
        self.level += 1

    elif type == DEDENT:
        self.find_stmt = 1
        self.level -= 1

    elif type == COMMENT:
        if self.find_stmt:
            self.stats.append((sline, -1))
            # but we're still looking for a new stmt, so leave
            # find_stmt alone

    elif type == NL:
        pass

    elif self.find_stmt:
        # This is the first "real token" following a NEWLINE, so it
        # must be the first token of the next program statement, or an
        # ENDMARKER.
        self.find_stmt = 0
        if line:   # not endmarker
            self.stats.append((sline, self.level))

# Count number of leading blanks.
Example #26
Source File: inspect.py From Fluid-Designer with GNU General Public License v3.0 | 5 votes |
def getblock(lines):
    """Extract the block of code at the top of the given list of lines."""
    blockfinder = BlockFinder()
    try:
        tokens = tokenize.generate_tokens(iter(lines).__next__)
        for _token in tokens:
            blockfinder.tokeneater(*_token)
    except (EndOfBlock, IndentationError):
        pass
    return lines[:blockfinder.last]
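Example #26 uses tokenize.generate_tokens(), which takes a readline returning str, in contrast to tokenize.tokenize(), which needs bytes. Below is a minimal sketch of the same pattern over a made-up list of source lines.

import tokenize

lines = ["def f():\n", "    return 1\n"]   # made-up source lines

# iter(lines).__next__ serves as the readline callable; generate_tokens() accepts str input.
for tok in tokenize.generate_tokens(iter(lines).__next__):
    print(tokenize.tok_name[tok.type], repr(tok.string))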
Example #27
Source File: inspect.py From Imogen with MIT License | 5 votes |
def getblock(lines):
    """Extract the block of code at the top of the given list of lines."""
    blockfinder = BlockFinder()
    try:
        tokens = tokenize.generate_tokens(iter(lines).__next__)
        for _token in tokens:
            blockfinder.tokeneater(*_token)
    except (EndOfBlock, IndentationError):
        pass
    return lines[:blockfinder.last]
Example #28
Source File: manager.py From bandit with Apache License 2.0 | 5 votes |
def _parse_file(self, fname, fdata, new_files_list):
    try:
        # parse the current file
        data = fdata.read()
        lines = data.splitlines()
        self.metrics.begin(fname)
        self.metrics.count_locs(lines)
        if self.ignore_nosec:
            nosec_lines = set()
        else:
            try:
                fdata.seek(0)
                if six.PY2:
                    tokens = tokenize.generate_tokens(fdata.readline)
                else:
                    tokens = tokenize.tokenize(fdata.readline)
                nosec_lines = set(
                    lineno for toktype, tokval, (lineno, _), _, _ in tokens
                    if toktype == tokenize.COMMENT and
                    '#nosec' in tokval or '# nosec' in tokval)
            except tokenize.TokenError:
                nosec_lines = set()
        score = self._execute_ast_visitor(fname, data, nosec_lines)
        self.scores.append(score)
        self.metrics.count_issues([score, ])
    except KeyboardInterrupt:
        sys.exit(2)
    except SyntaxError:
        self.skipped.append((fname,
                             "syntax error while parsing AST from file"))
        new_files_list.remove(fname)
    except Exception as e:
        LOG.error("Exception occurred when executing tests against "
                  "%s. Run \"bandit --debug %s\" to see the full "
                  "traceback.", fname, fname)
        self.skipped.append((fname, 'exception while scanning file'))
        new_files_list.remove(fname)
        LOG.debug("  Exception string: %s", e)
        LOG.debug("  Exception traceback: %s", traceback.format_exc())
Example #29
Source File: tokenizer.py From python_autocomplete with MIT License | 5 votes |
def parse_string(content: str) -> List[ParsedToken]:
    """
    Encode source code
    """
    g = tokenize.tokenize(BytesIO(content.encode('utf-8')).readline)
    return parse(g)
Example #30
Source File: reindent.py From attention-lvcsr with MIT License | 5 votes |
def write(self, f):
    f.writelines(self.after)

# Line-getter for tokenize.