crepu.dev Git - config.git/blame_incremental - djavu-asus/elpy/rpc-venv/lib/python3.11/site-packages/blib2to3/pgen2/tokenize.py

... / ...

Commit	Line	Data
	1	# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
	2	# All rights reserved.
	3
	4	# mypy: allow-untyped-defs, allow-untyped-calls
	5
	6	"""Tokenization help for Python programs.
	7
	8	generate_tokens(readline) is a generator that breaks a stream of
	9	text into Python tokens. It accepts a readline-like method which is called
	10	repeatedly to get the next line of input (or "" for EOF). It generates
	11	5-tuples with these members:
	12
	13	the token type (see token.py)
	14	the token (a string)
	15	the starting (row, column) indices of the token (a 2-tuple of ints)
	16	the ending (row, column) indices of the token (a 2-tuple of ints)
	17	the original line (string)
	18
	19	It is designed to match the working of the Python tokenizer exactly, except
	20	that it produces COMMENT tokens for comments and gives type OP for all
	21	operators
	22
	23	Older entry points
	24	tokenize_loop(readline, tokeneater)
	25	tokenize(readline, tokeneater=printtoken)
	26	are the same, except instead of generating tokens, tokeneater is a callback
	27	function to which the 5 fields described above are passed as 5 arguments,
	28	each time a new token is found."""
	29
	30	import sys
	31	from typing import (
	32	Callable,
	33	Final,
	34	Iterable,
	35	Iterator,
	36	List,
	37	Optional,
	38	Pattern,
	39	Set,
	40	Tuple,
	41	Union,
	42	cast,
	43	)
	44
	45	from blib2to3.pgen2.grammar import Grammar
	46	from blib2to3.pgen2.token import (
	47	ASYNC,
	48	AWAIT,
	49	COMMENT,
	50	DEDENT,
	51	ENDMARKER,
	52	ERRORTOKEN,
	53	INDENT,
	54	NAME,
	55	NEWLINE,
	56	NL,
	57	NUMBER,
	58	OP,
	59	STRING,
	60	tok_name,
	61	)
	62
	63	__author__ = "Ka-Ping Yee <ping@lfw.org>"
	64	__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
	65
	66	import re
	67	from codecs import BOM_UTF8, lookup
	68
	69	from . import token
	70
	71	__all__ = [x for x in dir(token) if x[0] != "_"] + [
	72	"tokenize",
	73	"generate_tokens",
	74	"untokenize",
	75	]
	76	del token
	77
	78
	79	def group(*choices: str) -> str:
	80	return "(" + "\|".join(choices) + ")"
	81
	82
	83	def any(*choices: str) -> str:
	84	return group(choices) + ""
	85
	86
	87	def maybe(*choices: str) -> str:
	88	return group(*choices) + "?"
	89
	90
	91	def _combinations(*l: str) -> Set[str]:
	92	return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}
	93
	94
	95	Whitespace = r"[ \f\t]*"
	96	Comment = r"#[^\r\n]*"
	97	Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
	98	Name = ( # this is invalid but it's fine because Name comes after Number in all groups
	99	r"[^\s#\[\]\{\}+\-*/!@$%^&=\|;:'\",\.<>/?`~\\]+"
	100	)
	101
	102	Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
	103	Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
	104	Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
	105	Decnumber = group(r"[1-9]\d(?:_\d+)[lL]?", "0[lL]?")
	106	Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
	107	Exponent = r"[eE][-+]?\d+(?:_\d+)*"
	108	Pointfloat = group(r"\d+(?:_\d+)\.(?:\d+(?:_\d+))?", r"\.\d+(?:_\d+)*") + maybe(
	109	Exponent
	110	)
	111	Expfloat = r"\d+(?:_\d+)*" + Exponent
	112	Floatnumber = group(Pointfloat, Expfloat)
	113	Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
	114	Number = group(Imagnumber, Floatnumber, Intnumber)
	115
	116	# Tail end of ' string.
	117	Single = r"[^'\\](?:\\.[^'\\])*'"
	118	# Tail end of " string.
	119	Double = r'[^"\\](?:\\.[^"\\])*"'
	120	# Tail end of ''' string.
	121	Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
	122	# Tail end of """ string.
	123	Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
	124	_litprefix = r"(?:[uUrRbBfF]\|[rR][fFbB]\|[fFbBuU][rR])?"
	125	Triple = group(_litprefix + "'''", _litprefix + '"""')
	126	# Single-line ' or " string.
	127	String = group(
	128	_litprefix + r"'[^\n'\\](?:\\.[^\n'\\])*'",
	129	_litprefix + r'"[^\n"\\](?:\\.[^\n"\\])*"',
	130	)
	131
	132	# Because of leftmost-then-longest match semantics, be sure to put the
	133	# longest operators first (e.g., if = came before ==, == would get
	134	# recognized as two instances of =).
	135	Operator = group(
	136	r"\\=?",
	137	r">>=?",
	138	r"<<=?",
	139	r"<>",
	140	r"!=",
	141	r"//=?",
	142	r"->",
	143	r"[+\-*/%&@\|^=<>:]=?",
	144	r"~",
	145	)
	146
	147	Bracket = "[][(){}]"
	148	Special = group(r"\r?\n", r"[:;.,`@]")
	149	Funny = group(Operator, Bracket, Special)
	150
	151	# First (or only) line of ' or " string.
	152	ContStr = group(
	153	_litprefix + r"'[^\n'\\](?:\\.[^\n'\\])*" + group("'", r"\\\r?\n"),
	154	_litprefix + r'"[^\n"\\](?:\\.[^\n"\\])*' + group('"', r"\\\r?\n"),
	155	)
	156	PseudoExtras = group(r"\\\r?\n", Comment, Triple)
	157	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
	158
	159	pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
	160	single3prog = re.compile(Single3)
	161	double3prog = re.compile(Double3)
	162
	163	_strprefixes = (
	164	_combinations("r", "R", "f", "F")
	165	\| _combinations("r", "R", "b", "B")
	166	\| {"u", "U", "ur", "uR", "Ur", "UR"}
	167	)
	168
	169	endprogs: Final = {
	170	"'": re.compile(Single),
	171	'"': re.compile(Double),
	172	"'''": single3prog,
	173	'"""': double3prog,
	174	**{f"{prefix}'''": single3prog for prefix in _strprefixes},
	175	**{f'{prefix}"""': double3prog for prefix in _strprefixes},
	176	}
	177
	178	triple_quoted: Final = (
	179	{"'''", '"""'}
	180	\| {f"{prefix}'''" for prefix in _strprefixes}
	181	\| {f'{prefix}"""' for prefix in _strprefixes}
	182	)
	183	single_quoted: Final = (
	184	{"'", '"'}
	185	\| {f"{prefix}'" for prefix in _strprefixes}
	186	\| {f'{prefix}"' for prefix in _strprefixes}
	187	)
	188
	189	tabsize = 8
	190
	191
	192	class TokenError(Exception):
	193	pass
	194
	195
	196	class StopTokenizing(Exception):
	197	pass
	198
	199
	200	Coord = Tuple[int, int]
	201
	202
	203	def printtoken(
	204	type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
	205	) -> None: # for testing
	206	(srow, scol) = srow_col
	207	(erow, ecol) = erow_col
	208	print(
	209	"%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
	210	)
	211
	212
	213	TokenEater = Callable[[int, str, Coord, Coord, str], None]
	214
	215
	216	def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
	217	"""
	218	The tokenize() function accepts two parameters: one representing the
	219	input stream, and one providing an output mechanism for tokenize().
	220
	221	The first parameter, readline, must be a callable object which provides
	222	the same interface as the readline() method of built-in file objects.
	223	Each call to the function should return one line of input as a string.
	224
	225	The second parameter, tokeneater, must also be a callable object. It is
	226	called once for each token, with five arguments, corresponding to the
	227	tuples generated by generate_tokens().
	228	"""
	229	try:
	230	tokenize_loop(readline, tokeneater)
	231	except StopTokenizing:
	232	pass
	233
	234
	235	# backwards compatible interface
	236	def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
	237	for token_info in generate_tokens(readline):
	238	tokeneater(*token_info)
	239
	240
	241	GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
	242	TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
	243
	244
	245	class Untokenizer:
	246	tokens: List[str]
	247	prev_row: int
	248	prev_col: int
	249
	250	def __init__(self) -> None:
	251	self.tokens = []
	252	self.prev_row = 1
	253	self.prev_col = 0
	254
	255	def add_whitespace(self, start: Coord) -> None:
	256	row, col = start
	257	assert row <= self.prev_row
	258	col_offset = col - self.prev_col
	259	if col_offset:
	260	self.tokens.append(" " * col_offset)
	261
	262	def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
	263	for t in iterable:
	264	if len(t) == 2:
	265	self.compat(cast(Tuple[int, str], t), iterable)
	266	break
	267	tok_type, token, start, end, line = cast(
	268	Tuple[int, str, Coord, Coord, str], t
	269	)
	270	self.add_whitespace(start)
	271	self.tokens.append(token)
	272	self.prev_row, self.prev_col = end
	273	if tok_type in (NEWLINE, NL):
	274	self.prev_row += 1
	275	self.prev_col = 0
	276	return "".join(self.tokens)
	277
	278	def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
	279	startline = False
	280	indents = []
	281	toks_append = self.tokens.append
	282	toknum, tokval = token
	283	if toknum in (NAME, NUMBER):
	284	tokval += " "
	285	if toknum in (NEWLINE, NL):
	286	startline = True
	287	for tok in iterable:
	288	toknum, tokval = tok[:2]
	289
	290	if toknum in (NAME, NUMBER, ASYNC, AWAIT):
	291	tokval += " "
	292
	293	if toknum == INDENT:
	294	indents.append(tokval)
	295	continue
	296	elif toknum == DEDENT:
	297	indents.pop()
	298	continue
	299	elif toknum in (NEWLINE, NL):
	300	startline = True
	301	elif startline and indents:
	302	toks_append(indents[-1])
	303	startline = False
	304	toks_append(tokval)
	305
	306
	307	cookie_re = re.compile(r"^[ \t\f]#.?coding[:=][ \t]*([-\w.]+)", re.ASCII)
	308	blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]\|$)", re.ASCII)
	309
	310
	311	def _get_normal_name(orig_enc: str) -> str:
	312	"""Imitates get_normal_name in tokenizer.c."""
	313	# Only care about the first 12 characters.
	314	enc = orig_enc[:12].lower().replace("_", "-")
	315	if enc == "utf-8" or enc.startswith("utf-8-"):
	316	return "utf-8"
	317	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
	318	("latin-1-", "iso-8859-1-", "iso-latin-1-")
	319	):
	320	return "iso-8859-1"
	321	return orig_enc
	322
	323
	324	def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
	325	"""
	326	The detect_encoding() function is used to detect the encoding that should
	327	be used to decode a Python source file. It requires one argument, readline,
	328	in the same way as the tokenize() generator.
	329
	330	It will call readline a maximum of twice, and return the encoding used
	331	(as a string) and a list of any lines (left as bytes) it has read
	332	in.
	333
	334	It detects the encoding from the presence of a utf-8 bom or an encoding
	335	cookie as specified in pep-0263. If both a bom and a cookie are present, but
	336	disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
	337	charset, raise a SyntaxError. Note that if a utf-8 bom is found,
	338	'utf-8-sig' is returned.
	339
	340	If no encoding is specified, then the default of 'utf-8' will be returned.
	341	"""
	342	bom_found = False
	343	encoding = None
	344	default = "utf-8"
	345
	346	def read_or_stop() -> bytes:
	347	try:
	348	return readline()
	349	except StopIteration:
	350	return b""
	351
	352	def find_cookie(line: bytes) -> Optional[str]:
	353	try:
	354	line_string = line.decode("ascii")
	355	except UnicodeDecodeError:
	356	return None
	357	match = cookie_re.match(line_string)
	358	if not match:
	359	return None
	360	encoding = _get_normal_name(match.group(1))
	361	try:
	362	codec = lookup(encoding)
	363	except LookupError:
	364	# This behaviour mimics the Python interpreter
	365	raise SyntaxError("unknown encoding: " + encoding)
	366
	367	if bom_found:
	368	if codec.name != "utf-8":
	369	# This behaviour mimics the Python interpreter
	370	raise SyntaxError("encoding problem: utf-8")
	371	encoding += "-sig"
	372	return encoding
	373
	374	first = read_or_stop()
	375	if first.startswith(BOM_UTF8):
	376	bom_found = True
	377	first = first[3:]
	378	default = "utf-8-sig"
	379	if not first:
	380	return default, []
	381
	382	encoding = find_cookie(first)
	383	if encoding:
	384	return encoding, [first]
	385	if not blank_re.match(first):
	386	return default, [first]
	387
	388	second = read_or_stop()
	389	if not second:
	390	return default, [first]
	391
	392	encoding = find_cookie(second)
	393	if encoding:
	394	return encoding, [first, second]
	395
	396	return default, [first, second]
	397
	398
	399	def untokenize(iterable: Iterable[TokenInfo]) -> str:
	400	"""Transform tokens back into Python source code.
	401
	402	Each element returned by the iterable must be a token sequence
	403	with at least two elements, a token number and token value. If
	404	only two tokens are passed, the resulting output is poor.
	405
	406	Round-trip invariant for full input:
	407	Untokenized source will match input source exactly
	408
	409	Round-trip invariant for limited input:
	410	# Output text will tokenize the back to the input
	411	t1 = [tok[:2] for tok in generate_tokens(f.readline)]
	412	newcode = untokenize(t1)
	413	readline = iter(newcode.splitlines(1)).next
	414	t2 = [tok[:2] for tokin generate_tokens(readline)]
	415	assert t1 == t2
	416	"""
	417	ut = Untokenizer()
	418	return ut.untokenize(iterable)
	419
	420
def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    Break the text supplied by readline into a stream of tokens.

    readline must be a callable object which provides the same interface as
    the readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration (which is
    treated here the same as returning an empty line):
        readline = iter(open(myfile)).__next__  # Example of alternate readline

    grammar is optional; when given, its `async_keywords` attribute decides
    whether `async` and `await` are always emitted as ASYNC/AWAIT tokens.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    # lnum: number of the line being processed (first line is 1).
    # parenlev: current nesting depth of ( [ { brackets.
    # continued: set to 1 after a backslash line continuation.
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    # contstr: text so far of a string literal that spans lines.
    # needcont: 1 when that string is single-quoted and therefore only
    # continues if each line ends with a backslash-newline.
    contstr, needcont = "", 0
    # contline: the source lines accumulated for the multi-line string.
    contline: Optional[str] = None
    # Stack of indentation columns currently open; column 0 is always present.
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    # stashed holds back an `async` NAME token until the following token
    # shows whether it must be re-emitted as ASYNC (before `def`/`for`).
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    # Declared here for the type checker; assigned when a string that
    # continues onto later lines is first encountered.
    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; the rest of the line is
                # tokenized by the scanning loop below, starting at pos.
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                # A single-quoted string was not continued with a backslash:
                # report everything collected so far as an error token.
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                # Still inside the string: accumulate and read the next line.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    # Advance to the next multiple of tabsize.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                # The line consisted only of whitespace with no line ending;
                # this leaves the line loop.
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                # A comment-only line yields COMMENT followed by NL and does
                # not affect the indentation stack.
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                # Dedenting back to (or past) the level where an `async def`
                # started ends the async-def state.
                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                # Group 1 spans the token itself, after any leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    # Inside brackets a line break is NL, otherwise NEWLINE.
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        # The quote character is the first, second or third
                        # character of the token, depending on prefix length.
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        # Hold `async` back until the next token is known.
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            # Re-emit the stashed `async` NAME as ASYNC.
                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    # Everything else is an operator; track bracket depth.
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched at this position: emit the single
                # character as an error token and move on.
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for _indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
	695
	696
	697	if __name__ == "__main__": # testing
	698	if len(sys.argv) > 1:
	699	tokenize(open(sys.argv[1]).readline)
	700	else:
	701	tokenize(sys.stdin.readline)