[config.git] / djavu-asus / elpy / rpc-venv / lib / python3.11 / site-packages / blib2to3 / pgen2 / tokenize.py

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

import sys
from typing import (
    Callable,
    Final,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Union,
    cast,
)

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    ERRORTOKEN,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token


def group(*choices: str) -> str:
    """Return a regex capturing group matching any one of *choices*.

    The alternatives are joined with ``|`` and wrapped in parentheses.  They
    are inserted verbatim, so each one must already be a valid regex fragment.
    """
    alternation = "|".join(choices)
    return f"({alternation})"


def any(*choices: str) -> str:
    """Return a regex matching zero or more repetitions of any of *choices*.

    NOTE: inside this module this name shadows the builtin ``any``.
    """
    repeated = group(*choices)
    return f"{repeated}*"


def maybe(*choices: str) -> str:
    """Return a regex matching at most one occurrence of any of *choices*."""
    optional = group(*choices)
    return f"{optional}?"


def _combinations(*l: str) -> Set[str]:
    return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}


# --- Regex fragments (plain strings) that are combined into PseudoToken ---

# Horizontal whitespace: any run of spaces, form feeds and tabs.
Whitespace = r"[ \f\t]*"
# A comment: "#" up to, but not including, the end of the line.
Comment = r"#[^\r\n]*"
# Ignorable text: whitespace, any number of backslash-newline continuations
# (each followed by more whitespace), then an optional comment.
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
# A name is approximated as any run of characters that are neither whitespace
# nor one of the listed punctuation characters.  That also accepts runs that
# start with a digit, which is harmless because Number is tried before Name.
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

# Integer literals.  Digits may be separated by single underscores, and the
# hex/octal/decimal forms also accept a trailing "l"/"L" suffix.
Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
# The "o"/"O" marker is optional, so a bare leading-zero form such as 0777
# is matched here as well.
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
# Float literals: either a decimal point (digits on at least one side) with an
# optional exponent, or plain digits followed by a mandatory exponent.
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by "j"/"J".
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
# Alternatives are ordered so the more specific forms are tried first.
Number = group(Imagnumber, Floatnumber, Intnumber)

# The four "tail" patterns match the remainder of a string literal up to and
# including its closing quote(s); a backslash escapes the character after it.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.  A lone quote is allowed inside as long as it is not
# the start of the closing triple quote.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string prefix: one of u/r/b/f (either case), or a two-letter
# combination of r with f or b (either order) or of f/b/u followed by r.
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening of a triple-quoted string, including any prefix.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

# Any single opening or closing bracket character.
Bracket = "[][(){}]"
# A newline (optionally preceded by CR) or one of the remaining punctuation
# characters.
Special = group(r"\r?\n", r"[:;.,`@]")
# Everything that is neither a name, a number nor a string.
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.  The match ends either at the
# closing quote or at a backslash-newline, which means the string continues
# on the next line.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
# Backslash-newline continuation, a comment, or the opening of a
# triple-quoted string.
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
# One token preceded by optional whitespace.  The token itself is capture
# group 1, which is what generate_tokens() reads via span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled form of PseudoToken; generate_tokens() matches it repeatedly
# against each line.
pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
# Compiled tail patterns for triple-quoted strings.
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Every string prefix the tables below are built for: r/f and r/b alone or
# combined in either order and case, plus u/U alone or followed by r/R.
_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

# Maps a string opener to the compiled pattern matching the rest of that
# string.  Prefixed keys exist only for the triple-quoted openers; for
# single-quoted strings only the bare quote characters are present.
endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

# All openers of triple-quoted strings, with and without prefix.
triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
# All openers of single-quoted strings, with and without prefix.
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

# Tab stop width used by generate_tokens() when measuring indentation: a tab
# advances the column to the next multiple of this value.
tabsize = 8


class TokenError(Exception):
    """Raised by generate_tokens() when input ends in the middle of a token.

    The generator raises it with a message and a ``(row, column)`` position
    when EOF is reached inside a multi-line string or a multi-line statement.
    """


class StopTokenizing(Exception):
    """Signal that tokenize() should stop early.

    tokenize() catches this exception and returns quietly.  Nothing in this
    module raises it; presumably a tokeneater callback is expected to.
    """


# A (row, column) position in the source, used for token start/end coordinates.
Coord = Tuple[int, int]


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    """Print one token as ``srow,scol-erow,ecol:<TAB>TYPE<TAB>repr(token)``.

    This is the default tokeneater for tokenize().  ``type`` is looked up in
    ``tok_name`` to get a readable token type; ``line`` is accepted to fit
    the tokeneater signature but is not printed.
    """
    start_row, start_col = srow_col
    end_row, end_col = erow_col
    type_name = tok_name[type]
    fields = (start_row, start_col, end_row, end_col, type_name, repr(token))
    print("%d,%d-%d,%d:\t%s\t%s" % fields)


# Callback signature used by tokenize()/tokenize_loop(): it receives the five
# token fields (type, string, start, end, line) as separate arguments.
TokenEater = Callable[[int, str, Coord, Coord, str], None]


def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
    """
    Tokenize the input supplied by *readline*, feeding each token to
    *tokeneater*.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: every call returns one line of input
    as a string.

    tokeneater must also be a callable.  It is invoked once per token with
    five arguments, matching the 5-tuples produced by generate_tokens().
    It defaults to printtoken, which prints each token.

    A StopTokenizing exception raised while tokenizing ends the run quietly.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Deliberate: this exception is the "stop early" signal, not an error.
        return


# backwards compatible interface
def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
    """Call *tokeneater* with the five fields of every token from *readline*.

    Unlike tokenize(), this does not catch StopTokenizing.
    """
    for tok_type, tok_str, start, end, src_line in generate_tokens(readline):
        tokeneater(tok_type, tok_str, start, end, src_line)


# A full token: (type, string, start coordinate, end coordinate, source line).
GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
# What untokenize() accepts: either a full token or a bare (type, string) pair.
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Full 5-tuple tokens are placed using their recorded coordinates.  As soon
    as a 2-element ``(type, string)`` token is seen, the rest of the stream
    is rendered in "compat" mode, which only approximates the spacing.
    """

    # Output fragments, joined together at the end of untokenize().
    tokens: List[str]
    # Row and column at which the next token is expected to start.
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        """Append the spaces needed to move from the previous column to *start*.

        Only horizontal padding is produced; a token that starts on a later
        row than the one being tracked trips the assertion below.
        """
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
        """Return the source text for the tokens in *iterable*.

        Tokens are consumed through a single shared iterator, so that when
        compat() takes over it continues exactly where this loop stopped.
        Handing it the original iterable instead would re-read a list from
        its beginning and duplicate the tokens already emitted.
        """
        token_iter = iter(iterable)
        for t in token_iter:
            if len(t) == 2:
                # Position data is missing from here on: switch to compat
                # mode for this token and everything that follows it.
                self.compat(cast(Tuple[int, str], t), token_iter)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, str, Coord, Coord, str], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                # The next token starts at the beginning of the next row.
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
        """Render *token* followed by the tokens of *iterable* without positions.

        *token* is the first token to emit and *iterable* supplies the ones
        after it.  Names, numbers and async/await keywords get a trailing
        space; INDENT/DEDENT tokens maintain an indentation stack whose top
        is written before the first token of each new line.
        """
        startline = token[0] in (NEWLINE, NL)
        indents: List[str] = []
        toks_append = self.tokens.append

        def pending() -> Iterator[TokenInfo]:
            # Process the triggering token first: it used to be silently
            # dropped whenever *iterable* was a one-shot iterator.
            yield token
            yield from iterable

        for tok in pending():
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


# Matches an encoding declaration ("coding:" or "coding=") inside a comment
# line; group 1 is the declared encoding name.  This is a str pattern:
# detect_encoding() applies it to the ASCII-decoded line.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# Bytes pattern matching a line that is empty, whitespace-only, or whose first
# non-whitespace character starts a comment or a line ending.  detect_encoding()
# only inspects a second line when the first one matches this.
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    Work out which encoding should be used to decode a Python source file.

    readline is called at most twice and must return one line of the file as
    bytes per call; a StopIteration raised by it is treated as end of input.

    Returns a tuple of the encoding name and the list of lines (still bytes,
    with any leading utf-8 bom removed from the first one) that were read.

    The encoding comes from a utf-8 bom and/or an encoding cookie as
    specified in pep-0263.  A SyntaxError is raised when the cookie names an
    unknown charset, or when a bom is present but the cookie names something
    other than utf-8.  When a bom is found the result is 'utf-8-sig'.

    Without bom or cookie the default 'utf-8' is returned.
    """
    saw_bom = False
    fallback = "utf-8"

    def next_line() -> bytes:
        # A readline that signals EOF with StopIteration behaves like one
        # returning an empty line.
        try:
            return readline()
        except StopIteration:
            return b""

    def cookie_encoding(raw: bytes) -> Optional[str]:
        # A line that is not pure ASCII is treated as having no cookie.
        try:
            text = raw.decode("ascii")
        except UnicodeDecodeError:
            return None
        found = cookie_re.match(text)
        if found is None:
            return None
        name = _get_normal_name(found.group(1))
        try:
            codec = lookup(name)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + name)

        if not saw_bom:
            return name
        if codec.name != "utf-8":
            # This behaviour mimics the Python interpreter
            raise SyntaxError("encoding problem: utf-8")
        return name + "-sig"

    first = next_line()
    if first.startswith(BOM_UTF8):
        saw_bom = True
        first = first[len(BOM_UTF8) :]
        fallback = "utf-8-sig"
    if not first:
        return fallback, []

    declared = cookie_encoding(first)
    if declared:
        return declared, [first]
    # Only a blank or comment first line allows the cookie on line two.
    if not blank_re.match(first):
        return fallback, [first]

    second = next_line()
    if not second:
        return fallback, [first]

    declared = cookie_encoding(second)
    if declared:
        return declared, [first, second]
    return fallback, [first, second]


def untokenize(iterable: Iterable[TokenInfo]) -> str:
    """Transform tokens back into Python source code.

    Each element produced by the iterable must be a token sequence with at
    least two elements: a token number and a token value.  When only those
    two elements are supplied, the resulting output is poor.

    Round-trip invariant for full input:
        The untokenized source is meant to match the input source exactly.

    Round-trip invariant for limited input:
        # The output text tokenizes back to the input tokens
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(keepends=True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)


def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string (an empty string at EOF).
    Alternately, readline can be a callable that signals the end of input by
    raising StopIteration:
        readline = iter(lines).__next__    # Example of alternate readline

    The optional grammar argument is only consulted for its async_keywords
    attribute: when that is true, `async` and `await` are always emitted as
    ASYNC/AWAIT tokens. Otherwise they are treated as keywords only in the
    context of an `async def` / `async for`.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line being scanned; for a string that spans several lines it is
    the concatenation of all the lines the string covers.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not return
    to any enclosing indentation level.
    """
    # lnum: 1-based number of the line being scanned.
    # parenlev: current nesting depth of (), [] and {}.
    # continued: set to 1 after a backslash continuation, so the next line is
    #   treated as part of the same statement.
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    # contstr: text collected so far for a string that spans several lines.
    # needcont: 1 when that string is single-quoted and therefore needs a
    #   backslash at the end of every line it continues over.
    contstr, needcont = "", 0
    # contline: the source lines collected so far for that string.
    contline: Optional[str] = None
    # Stack of indentation columns; the bottom entry is column 0.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds back a NAME token "async" until the following token shows
    # whether it should be emitted as ASYNC (before `def`/`for`) or as a NAME.
    stashed: Optional[GoodTokenInfo] = None
    # async_def: currently inside an `async def`.
    # async_def_indent: indentation level at which that `async def` started.
    # async_def_nl: a NEWLINE has been seen since the `async def` began.
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Start position and end pattern of the multi-line string in progress.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # Treated exactly like readline() returning "" at EOF.
            line = ""
        lnum += 1
        # NOTE: `max` shadows the builtin for the rest of this function.
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line: emit it in one piece and
                # carry on scanning the rest of the line below.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string was continued onto a line that
                # neither closes it nor ends in a backslash-newline.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next tab stop.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    # A form feed resets the measured indentation.
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # Whitespace only and no line ending: treated as end of input.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                # A comment-only line yields COMMENT then NL and does not
                # affect the indentation stack.
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level of the `async def`
                # ends the async context.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # Same check for a statement that starts at the `async def`'s own
            # level once the def line has ended (e.g. a one-line async def).
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 is the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break is a non-logical NL;
                    # otherwise it ends the statement with NEWLINE.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # Only the opener was matched; look for the end of the
                    # triple-quoted string on the rest of this line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # endprogs has no prefixed single-quote keys, so try
                        # the first three characters until the quote is found.
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        # Keywords when the grammar says so or while inside
                        # an `async def`.
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        # Hold this token back until the next one is known.
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the stashed "async" as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Operators and punctuation; brackets adjust the depth.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: report the character and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for _indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")


if __name__ == "__main__":  # testing
    # Print the tokens of the file named on the command line, or of stdin
    # when no argument is given.
    if len(sys.argv) > 1:
        # The context manager closes the file once tokenizing is done (or
        # fails) instead of leaking the open handle.
        with open(sys.argv[1]) as source_file:
            tokenize(source_file.readline)
    else:
        tokenize(sys.stdin.readline)
Commit	Line	Data
53e6db90 DC	1	# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
	2	# All rights reserved.
	3
	4	# mypy: allow-untyped-defs, allow-untyped-calls
	5
	6	"""Tokenization help for Python programs.
	7
	8	generate_tokens(readline) is a generator that breaks a stream of
	9	text into Python tokens. It accepts a readline-like method which is called
	10	repeatedly to get the next line of input (or "" for EOF). It generates
	11	5-tuples with these members:
	12
	13	the token type (see token.py)
	14	the token (a string)
	15	the starting (row, column) indices of the token (a 2-tuple of ints)
	16	the ending (row, column) indices of the token (a 2-tuple of ints)
	17	the original line (string)
	18
	19	It is designed to match the working of the Python tokenizer exactly, except
	20	that it produces COMMENT tokens for comments and gives type OP for all
	21	operators
	22
	23	Older entry points
	24	tokenize_loop(readline, tokeneater)
	25	tokenize(readline, tokeneater=printtoken)
	26	are the same, except instead of generating tokens, tokeneater is a callback
	27	function to which the 5 fields described above are passed as 5 arguments,
	28	each time a new token is found."""
	29
	30	import sys
	31	from typing import (
	32	Callable,
	33	Final,
	34	Iterable,
	35	Iterator,
	36	List,
	37	Optional,
	38	Pattern,
	39	Set,
	40	Tuple,
	41	Union,
	42	cast,
	43	)
	44
	45	from blib2to3.pgen2.grammar import Grammar
	46	from blib2to3.pgen2.token import (
	47	ASYNC,
	48	AWAIT,
	49	COMMENT,
	50	DEDENT,
	51	ENDMARKER,
	52	ERRORTOKEN,
	53	INDENT,
	54	NAME,
	55	NEWLINE,
	56	NL,
	57	NUMBER,
	58	OP,
	59	STRING,
	60	tok_name,
	61	)
	62
	63	__author__ = "Ka-Ping Yee <ping@lfw.org>"
	64	__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
65
66	import re
67	from codecs import BOM_UTF8, lookup
68
69	from . import token
70
71	__all__ = [x for x in dir(token) if x[0] != "_"] + [
72	"tokenize",
73	"generate_tokens",
74	"untokenize",
75	]
76	del token
77
78
79	def group(*choices: str) -> str:
80	return "(" + "\|".join(choices) + ")"
81
82
83	def any(*choices: str) -> str:
84	return group(choices) + ""
85
86
87	def maybe(*choices: str) -> str:
88	return group(*choices) + "?"
89
90
91	def _combinations(*l: str) -> Set[str]:
92	return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
93
94
95	Whitespace = r"[ \f\t]*"
96	Comment = r"#[^\r\n]*"
97	Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
98	Name = ( # this is invalid but it's fine because Name comes after Number in all groups
99	r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=\|;:'\",\.<>/?`~\\]+"
100	)
101
102	Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
103	Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
104	Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
105	Decnumber = group(r"[1-9]\d(?:_\d+)[lL]?", "0[lL]?")
106	Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
107	Exponent = r"[eE][-+]?\d+(?:_\d+)*"
108	Pointfloat = group(r"\d+(?:_\d+)\.(?:\d+(?:_\d+))?", r"\.\d+(?:_\d+)*") + maybe(
109	Exponent
110	)
111	Expfloat = r"\d+(?:_\d+)*" + Exponent
112	Floatnumber = group(Pointfloat, Expfloat)
113	Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
114	Number = group(Imagnumber, Floatnumber, Intnumber)
115
116	# Tail end of ' string.
117	Single = r"[^'\\](?:\\.[^'\\])*'"
118	# Tail end of " string.
119	Double = r'[^"\\](?:\\.[^"\\])*"'
120	# Tail end of ''' string.
121	Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
122	# Tail end of """ string.
123	Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
124	_litprefix = r"(?:[uUrRbBfF]\|[rR][fFbB]\|[fFbBuU][rR])?"
125	Triple = group(_litprefix + "'''", _litprefix + '"""')
126	# Single-line ' or " string.
127	String = group(
128	_litprefix + r"'[^\n'\\](?:\\.[^\n'\\])*'",
129	_litprefix + r'"[^\n"\\](?:\\.[^\n"\\])*"',
130	)
131
132	# Because of leftmost-then-longest match semantics, be sure to put the
133	# longest operators first (e.g., if = came before ==, == would get
134	# recognized as two instances of =).
135	Operator = group(
136	r"\\=?",
137	r">>=?",
138	r"<<=?",
139	r"<>",
140	r"!=",
141	r"//=?",
142	r"->",
143	r"[+\-*/%&@\|^=<>:]=?",
144	r"~",
145	)
146
147	Bracket = "[][(){}]"
148	Special = group(r"\r?\n", r"[:;.,`@]")
149	Funny = group(Operator, Bracket, Special)
150
151	# First (or only) line of ' or " string.
152	ContStr = group(
153	_litprefix + r"'[^\n'\\](?:\\.[^\n'\\])*" + group("'", r"\\\r?\n"),
154	_litprefix + r'"[^\n"\\](?:\\.[^\n"\\])*' + group('"', r"\\\r?\n"),
155	)
156	PseudoExtras = group(r"\\\r?\n", Comment, Triple)
157	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
158
159	pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
160	single3prog = re.compile(Single3)
161	double3prog = re.compile(Double3)
162
163	_strprefixes = (
164	_combinations("r", "R", "f", "F")
165	\| _combinations("r", "R", "b", "B")
166	\| {"u", "U", "ur", "uR", "Ur", "UR"}
167	)
168
169	endprogs: Final = {
170	"'": re.compile(Single),
171	'"': re.compile(Double),
172	"'''": single3prog,
173	'"""': double3prog,
174	**{f"{prefix}'''": single3prog for prefix in _strprefixes},
175	**{f'{prefix}"""': double3prog for prefix in _strprefixes},
176	}
177
178	triple_quoted: Final = (
179	{"'''", '"""'}
180	\| {f"{prefix}'''" for prefix in _strprefixes}
181	\| {f'{prefix}"""' for prefix in _strprefixes}
182	)
183	single_quoted: Final = (
184	{"'", '"'}
185	\| {f"{prefix}'" for prefix in _strprefixes}
186	\| {f'{prefix}"' for prefix in _strprefixes}
187	)
188
189	tabsize = 8
190
191
192	class TokenError(Exception):
193	pass
194
195
196	class StopTokenizing(Exception):
197	pass
198
199
200	Coord = Tuple[int, int]
201
202
203	def printtoken(
204	type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
205	) -> None: # for testing
206	(srow, scol) = srow_col
207	(erow, ecol) = erow_col
208	print(
209	"%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
210	)
211
212
213	TokenEater = Callable[[int, str, Coord, Coord, str], None]
214
215
216	def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
217	"""
218	The tokenize() function accepts two parameters: one representing the
219	input stream, and one providing an output mechanism for tokenize().
220
221	The first parameter, readline, must be a callable object which provides
222	the same interface as the readline() method of built-in file objects.
223	Each call to the function should return one line of input as a string.
224
225	The second parameter, tokeneater, must also be a callable object. It is
226	called once for each token, with five arguments, corresponding to the
227	tuples generated by generate_tokens().
228	"""
229	try:
230	tokenize_loop(readline, tokeneater)
231	except StopTokenizing:
232	pass
233
234
235	# backwards compatible interface
236	def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
237	for token_info in generate_tokens(readline):
238	tokeneater(*token_info)
239
240
241	GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
242	TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
243
244
245	class Untokenizer:
246	tokens: List[str]
247	prev_row: int
248	prev_col: int
249
250	def __init__(self) -> None:
251	self.tokens = []
252	self.prev_row = 1
253	self.prev_col = 0
254
255	def add_whitespace(self, start: Coord) -> None:
256	row, col = start
257	assert row <= self.prev_row
258	col_offset = col - self.prev_col
259	if col_offset:
260	self.tokens.append(" " * col_offset)
261
262	def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
263	for t in iterable:
264	if len(t) == 2:
265	self.compat(cast(Tuple[int, str], t), iterable)
266	break
267	tok_type, token, start, end, line = cast(
268	Tuple[int, str, Coord, Coord, str], t
269	)
270	self.add_whitespace(start)
271	self.tokens.append(token)
272	self.prev_row, self.prev_col = end
273	if tok_type in (NEWLINE, NL):
274	self.prev_row += 1
275	self.prev_col = 0
276	return "".join(self.tokens)
277
278	def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
279	startline = False
280	indents = []
281	toks_append = self.tokens.append
282	toknum, tokval = token
283	if toknum in (NAME, NUMBER):
284	tokval += " "
285	if toknum in (NEWLINE, NL):
286	startline = True
287	for tok in iterable:
288	toknum, tokval = tok[:2]
289
290	if toknum in (NAME, NUMBER, ASYNC, AWAIT):
291	tokval += " "
292
293	if toknum == INDENT:
294	indents.append(tokval)
295	continue
296	elif toknum == DEDENT:
297	indents.pop()
298	continue
299	elif toknum in (NEWLINE, NL):
300	startline = True
301	elif startline and indents:
302	toks_append(indents[-1])
303	startline = False
304	toks_append(tokval)
305
306
# PEP 263 coding cookie: optional leading whitespace, "#", any text (matched
# lazily), then "coding:" or "coding=" followed by the encoding name, which is
# captured in group 1.
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
# A line holding nothing but optional whitespace followed by a comment, a line
# terminator, or the end of the data.
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)
309
310
311	def _get_normal_name(orig_enc: str) -> str:
312	"""Imitates get_normal_name in tokenizer.c."""
313	# Only care about the first 12 characters.
314	enc = orig_enc[:12].lower().replace("_", "-")
315	if enc == "utf-8" or enc.startswith("utf-8-"):
316	return "utf-8"
317	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
318	("latin-1-", "iso-8859-1-", "iso-latin-1-")
319	):
320	return "iso-8859-1"
321	return orig_enc
322
323
def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    Work out which encoding should be used to decode a Python source file.

    *readline* is called at most twice.  The result is a pair: the encoding
    name, and the list of lines (still bytes) that were read while looking.
    A leading UTF-8 BOM is removed from the first returned line.

    The encoding is taken from a UTF-8 BOM and/or a PEP 263 coding cookie on
    one of the first two lines.  The second line is only inspected when the
    first one is blank or a comment.  With a BOM the reported name is
    'utf-8-sig'; with neither BOM nor cookie it is 'utf-8'.

    Raises SyntaxError when the cookie names an unknown codec, or when a BOM
    is present but the cookie names something other than UTF-8.
    """
    saw_bom = False
    fallback = "utf-8"

    def next_line() -> bytes:
        # A readline that signals exhaustion with StopIteration is treated
        # the same as one returning empty bytes.
        try:
            return readline()
        except StopIteration:
            return b""

    def cookie_encoding(raw: bytes) -> Optional[str]:
        # Lines that are not pure ASCII are not searched for a cookie.
        try:
            text = raw.decode("ascii")
        except UnicodeDecodeError:
            return None
        found = cookie_re.match(text)
        if not found:
            return None
        name = _get_normal_name(found.group(1))
        try:
            codec_info = lookup(name)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + name)

        if not saw_bom:
            return name
        if codec_info.name != "utf-8":
            # This behaviour mimics the Python interpreter
            raise SyntaxError("encoding problem: utf-8")
        return name + "-sig"

    first = next_line()
    if first.startswith(BOM_UTF8):
        saw_bom = True
        first = first[3:]
        fallback = "utf-8-sig"
    if not first:
        return fallback, []

    declared = cookie_encoding(first)
    if declared:
        return declared, [first]
    if not blank_re.match(first):
        # Real code on line one: a cookie on line two would not count.
        return fallback, [first]

    second = next_line()
    if not second:
        return fallback, [first]

    declared = cookie_encoding(second)
    return declared or fallback, [first, second]
397
398
def untokenize(iterable: Iterable[TokenInfo]) -> str:
    """Convert a stream of tokens back into Python source text.

    Every item of *iterable* must be a sequence of at least two elements:
    the token number and the token value.  When items carry only those two
    elements the layout of the result is poor.

    Round-trip invariant for full input:
        The untokenized source matches the input source exactly.

    Round-trip invariant for limited input:
        # The output text tokenizes back to the input tokens.
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
419
420
def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    Tokenize the text obtained from repeated calls to *readline*.

    *readline* must be a callable with the same interface as the readline()
    method of built-in file objects: each call returns one line of input as a
    string, and an empty string at end of input.  Alternatively it may signal
    end of input by raising StopIteration:
        readline = iter(open(myfile)).__next__    # Example of alternate readline

    *grammar*, when given, supplies ``async_keywords``; if that is true,
    ``async`` and ``await`` are always emitted as ASYNC/AWAIT tokens.  Without
    a grammar they are only recognised contextually (see 'stashed' below).

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line dedents to a column
    that matches no enclosing indentation level.
    """
    # lnum: number of the line being scanned; parenlev: bracket nesting depth;
    # continued: set when the previous line ended with a backslash.
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    # contstr accumulates the text of a string that spans several lines;
    # needcont is set when that string is single-quoted, in which case every
    # line but the last must end with a backslash.
    contstr, needcont = "", 0
    # Source lines covered so far by the string held in contstr.
    contline: Optional[str] = None
    # Stack of indentation columns currently open.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # 'stashed' holds an `async` NAME token back until the following token
    # shows whether it must be re-emitted as ASYNC (before `def` or `for`).
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Start position and closing pattern of the string held in contstr.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; scanning resumes after it
                # in the token loop below.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string neither closed nor continued with a
                # backslash: report everything collected as an error token.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next multiple of tabsize.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # The line holds nothing but whitespace and has no line
                # terminator: stop reading input.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level recorded for the
                # `async def` ends the contextual async/await handling.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            # Same reset for a statement that starts at or left of the
            # recorded level once a NEWLINE has been seen inside the
            # `async def` context.
            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break is a non-logical NL.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # The quote character follows zero, one or two
                        # prefix characters; try each position in turn.
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        # Hold the token back until the next one is known.
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the held-back `async` as an ASYNC token.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Everything else is an operator; track bracket depth.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # No pattern matched here: emit the single character as an
                # error token and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for _indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
695
696
if __name__ == "__main__":  # testing
    # Print the tokens of the file named on the command line, or of standard
    # input when no file is given.
    if len(sys.argv) > 1:
        # The context manager closes the file once tokenizing finishes or
        # fails, instead of leaving the handle open.
        with open(sys.argv[1]) as source_file:
            tokenize(source_file.readline)
    else:
        tokenize(sys.stdin.readline)