# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
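
# Illustrative usage sketch of the generate_tokens() API described above
# (comment only, not executed; the toy source string is made up):
#
#     import io
#     for tok_type, tok_str, start, end, logical_line in generate_tokens(
#         io.StringIO("x = 1\n").readline
#     ):
#         print(tok_name[tok_type], repr(tok_str), start, end)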

import sys
from typing import (
    Callable,
    Final,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Set,
    Tuple,
    Union,
    cast,
)

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    ERRORTOKEN,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return {x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()}


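# The helpers above assemble the regular expressions below: for example,
# group("a", "b") produces "(a|b)", maybe("x") produces "(x)?", and
# any("x") produces "(x)*".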
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
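# The two sets above enumerate every accepted prefix/quote combination, e.g.
# triple_quoted contains entries such as "'''", 'r"""', and "Rb'''".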

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, str, Coord, Coord, str], None]


def tokenize(readline: Callable[[], str], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

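# A minimal usage sketch (comment only, not executed): with the default
# printtoken() tokeneater, tokenize() prints every token it sees.
#
#     import io
#     tokenize(io.StringIO("x = 1\n").readline)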

# backwards compatible interface
def tokenize_loop(readline: Callable[[], str], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


GoodTokenInfo = Tuple[int, str, Coord, Coord, str]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    tokens: List[str]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> str:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, str, Coord, Coord, str], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

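    # compat() handles the degenerate case where the caller supplies bare
    # (type, string) 2-tuples: exact columns are unknown, so spacing is
    # reconstructed with simple heuristics instead of add_whitespace().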
    def compat(self, token: Tuple[int, str], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return b""

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

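# Illustrative sketch (comment only; the path is hypothetical): detect the
# encoding of a source file before decoding it as text.
#
#     with open("some_module.py", "rb") as f:
#         encoding, first_lines = detect_encoding(f.readline)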

def untokenize(iterable: Iterable[TokenInfo]) -> str:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)


def generate_tokens(
    readline: Callable[[], str], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

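    # Each iteration of the loop below consumes one physical line: `contstr`
    # carries the pending text of an unterminated (multi-line) string,
    # `parenlev` tracks open brackets (newlines inside brackets become NL
    # rather than NEWLINE), and `indents` is the stack of indentation columns.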
    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for _indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")


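# When executed as a script (e.g. `python -m blib2to3.pgen2.tokenize <file>`),
# dump the token stream of the named file, or of stdin when no path is given.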
if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)