# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

import sys
from typing import (
    Callable,
    Final,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Union,
    cast,
)

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    ERRORTOKEN,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}


Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, str, Coord, Coord, str], None]


def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    tokens: List[str]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, str, Coord, Coord, str], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return b""

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def untokenize(iterable: Iterable[TokenInfo]) -> str:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)


def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable that signals end of input by raising StopIteration:
        readline = open(myfile).__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for _indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)