# -*- coding: utf-8 -*-
"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason was simple: The standard library tokenizer fails
if the indentation is not right. To make it possible to do error recovery the
tokenizer needed to be rewritten.

Basically this is a stripped down version of the standard library module, so
you can read the documentation there. Additionally we included some speed and
memory optimizations here.
"""
from __future__ import absolute_import

import sys
import re
import itertools as _itertools
from codecs import BOM_UTF8
from typing import NamedTuple, Tuple, Iterator, Iterable, List, Dict, \
    Pattern, Set

from parso.python.token import PythonTokenTypes
from parso.utils import split_lines, PythonVersionInfo, parse_version_string

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = '\U0010ffff'

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


class TokenCollection(NamedTuple):
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Tuple[str]


BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}

def group(*choices, capture=False, **kwargs):
    assert not kwargs

    start = '('
    if not capture:
        start += '?:'
    return start + '|'.join(choices) + ')'


def maybe(*choices):
    return group(*choices) + '?'


# Return the empty string, plus all of the valid string prefixes.
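# For example (illustrative), the default result contains '', 'b', 'B', 'r',
# 'R', 'u', 'U', 'br', 'bR', 'Br', 'BR', 'rb', 'rB', ... and, with
# include_fstring=True, additionally 'f', 'F', 'fr', 'fR', ...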
def _all_string_prefixes(*, include_fstring=False, only_fstring=False):
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield ''.join(s)
    # The valid string prefixes. Only the lower case versions are listed here,
    # and no permutations are included ('fr' is listed, but not 'rf'). The
    # various case/order permutations will be generated below.
    valid_string_prefixes = ['b', 'r', 'u', 'br']

    result = {''}
    if include_fstring:
        f = ['f', 'fr']
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    return result
99 | ||
100 | ||
101 | def _compile(expr): | |
102 | return re.compile(expr, re.UNICODE) | |
103 | ||
104 | ||
105 | def _get_token_collection(version_info): | |
106 | try: | |
107 | return _token_collection_cache[tuple(version_info)] | |
108 | except KeyError: | |
109 | _token_collection_cache[tuple(version_info)] = result = \ | |
110 | _create_token_collection(version_info) | |
111 | return result | |
112 | ||
113 | ||
114 | unicode_character_name = r'[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*' | |
115 | fstring_string_single_line = _compile( | |
116 | r'(?:\{\{|\}\}|\\N\{' + unicode_character_name | |
117 | + r'\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+' | |
118 | ) | |
119 | fstring_string_multi_line = _compile( | |
120 | r'(?:\{\{|\}\}|\\N\{' + unicode_character_name + r'\}|\\[^N]|[^{}\\])+' | |
121 | ) | |
122 | fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+') | |
123 | fstring_format_spec_multi_line = _compile(r'[^{}]+') | |
124 | ||
125 | ||
126 | def _create_token_collection(version_info): | |
127 | # Note: we use unicode matching for names ("\w") but ascii matching for | |
128 | # number literals. | |
129 | Whitespace = r'[ \f\t]*' | |
130 | whitespace = _compile(Whitespace) | |
131 | Comment = r'#[^\r\n]*' | |
132 | Name = '([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)' | |
133 | ||
134 | Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' | |
135 | Binnumber = r'0[bB](?:_?[01])+' | |
136 | Octnumber = r'0[oO](?:_?[0-7])+' | |
137 | Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' | |
138 | Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) | |
139 | Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' | |
140 | Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', | |
141 | r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) | |
142 | Expfloat = r'[0-9](?:_?[0-9])*' + Exponent | |
143 | Floatnumber = group(Pointfloat, Expfloat) | |
144 | Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') | |
145 | Number = group(Imagnumber, Floatnumber, Intnumber) | |
146 | ||
147 | # Note that since _all_string_prefixes includes the empty string, | |
148 | # StringPrefix can be the empty string (making it optional). | |
149 | possible_prefixes = _all_string_prefixes() | |
150 | StringPrefix = group(*possible_prefixes) | |
151 | StringPrefixWithF = group(*_all_string_prefixes(include_fstring=True)) | |
152 | fstring_prefixes = _all_string_prefixes(include_fstring=True, only_fstring=True) | |
153 | FStringStart = group(*fstring_prefixes) | |
154 | ||
155 | # Tail end of ' string. | |
156 | Single = r"(?:\\.|[^'\\])*'" | |
157 | # Tail end of " string. | |
158 | Double = r'(?:\\.|[^"\\])*"' | |
159 | # Tail end of ''' string. | |
160 | Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''" | |
161 | # Tail end of """ string. | |
162 | Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""' | |
163 | Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""') | |
164 | ||
165 | # Because of leftmost-then-longest match semantics, be sure to put the | |
166 | # longest operators first (e.g., if = came before ==, == would get | |
167 | # recognized as two instances of =). | |
168 | Operator = group(r"\*\*=?", r">>=?", r"<<=?", | |
169 | r"//=?", r"->", | |
170 | r"[+\-*/%&@`|^!=<>]=?", | |
171 | r"~") | |
172 | ||
173 | Bracket = '[][(){}]' | |
174 | ||
175 | special_args = [r'\.\.\.', r'\r\n?', r'\n', r'[;.,@]'] | |
176 | if version_info >= (3, 8): | |
177 | special_args.insert(0, ":=?") | |
178 | else: | |
179 | special_args.insert(0, ":") | |
180 | Special = group(*special_args) | |
181 | ||
182 | Funny = group(Operator, Bracket, Special) | |
183 | ||
184 | # First (or only) line of ' or " string. | |
185 | ContStr = group(StringPrefix + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*" | |
186 | + group("'", r'\\(?:\r\n?|\n)'), | |
187 | StringPrefix + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*' | |
188 | + group('"', r'\\(?:\r\n?|\n)')) | |
189 | pseudo_extra_pool = [Comment, Triple] | |
190 | all_quotes = '"', "'", '"""', "'''" | |
191 | if fstring_prefixes: | |
192 | pseudo_extra_pool.append(FStringStart + group(*all_quotes)) | |
193 | ||
194 | PseudoExtras = group(r'\\(?:\r\n?|\n)|\Z', *pseudo_extra_pool) | |
195 | PseudoToken = group(Whitespace, capture=True) + \ | |
196 | group(PseudoExtras, Number, Funny, ContStr, Name, capture=True) | |
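    # In the compiled pattern above, group(1) captures the whitespace prefix,
    # group(2) captures the token itself, and group(3) (the capture inside
    # Name) is non-None exactly when the token matched as a name.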
197 | ||
198 | # For a given string prefix plus quotes, endpats maps it to a regex | |
199 | # to match the remainder of that string. _prefix can be empty, for | |
200 | # a normal single or triple quoted string (with no prefix). | |
201 | endpats = {} | |
202 | for _prefix in possible_prefixes: | |
203 | endpats[_prefix + "'"] = _compile(Single) | |
204 | endpats[_prefix + '"'] = _compile(Double) | |
205 | endpats[_prefix + "'''"] = _compile(Single3) | |
206 | endpats[_prefix + '"""'] = _compile(Double3) | |
207 | ||
208 | # A set of all of the single and triple quoted string prefixes, | |
209 | # including the opening quotes. | |
210 | single_quoted = set() | |
211 | triple_quoted = set() | |
212 | fstring_pattern_map = {} | |
213 | for t in possible_prefixes: | |
214 | for quote in '"', "'": | |
215 | single_quoted.add(t + quote) | |
216 | ||
217 | for quote in '"""', "'''": | |
218 | triple_quoted.add(t + quote) | |
219 | ||
220 | for t in fstring_prefixes: | |
221 | for quote in all_quotes: | |
222 | fstring_pattern_map[t + quote] = quote | |
223 | ||
224 | ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except', | |
225 | 'finally', 'while', 'with', 'return', 'continue', | |
226 | 'break', 'del', 'pass', 'global', 'assert', 'nonlocal') | |
227 | pseudo_token_compiled = _compile(PseudoToken) | |
228 | return TokenCollection( | |
229 | pseudo_token_compiled, single_quoted, triple_quoted, endpats, | |
230 | whitespace, fstring_pattern_map, set(ALWAYS_BREAK_TOKENS) | |
231 | ) | |
232 | ||
233 | ||
234 | class Token(NamedTuple): | |
235 | type: PythonTokenTypes | |
236 | string: str | |
237 | start_pos: Tuple[int, int] | |
238 | prefix: str | |
239 | ||
    @property
    def end_pos(self) -> Tuple[int, int]:
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)


class PythonToken(Token):
    def __repr__(self):
        return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' %
                self._replace(type=self.type.name))


class FStringNode:
    def __init__(self, quote):
        self.quote = quote
        self.parentheses_count = 0
        self.previous_lines = ''
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count


def _close_fstring_if_necessary(fstring_stack, string, line_nr, column, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        lstripped_string = string.lstrip()
        len_lstrip = len(string) - len(lstripped_string)
        if lstripped_string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END,
                node.quote,
                (line_nr, column + len_lstrip),
                prefix=additional_prefix+string[:len_lstrip],
            )
            additional_prefix = ''
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, '', len(node.quote) + len_lstrip
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[:-len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith('\n') or string.endswith('\r'):
        tos.previous_lines += string
        string = ''
    else:
        string = tos.previous_lines + string

    return string, new_pos


def tokenize(
    code: str, *, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Iterator[PythonToken]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info=version_info, start_pos=start_pos)
348 | ||
349 | ||
350 | def _print_tokens(func): | |
351 | """ | |
352 | A small helper function to help debug the tokenize_lines function. | |
353 | """ | |
354 | def wrapper(*args, **kwargs): | |
355 | for token in func(*args, **kwargs): | |
356 | print(token) # This print is intentional for debugging! | |
357 | yield token | |
358 | ||
359 | return wrapper | |
360 | ||
361 | ||
362 | # @_print_tokens | |
363 | def tokenize_lines( | |
364 | lines: Iterable[str], | |
365 | *, | |
366 | version_info: PythonVersionInfo, | |
367 | indents: List[int] = None, | |
368 | start_pos: Tuple[int, int] = (1, 0), | |
369 | is_first_token=True, | |
370 | ) -> Iterator[PythonToken]: | |
371 | """ | |
372 | A heavily modified Python standard library tokenizer. | |
373 | ||
374 | Additionally to the default information, yields also the prefix of each | |
375 | token. This idea comes from lib2to3. The prefix contains all information | |
376 | that is irrelevant for the parser like newlines in parentheses or comments. | |
377 | """ | |
    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
                indents[-1] = start
                break
            indents.pop()
            yield PythonToken(DEDENT, '', spos, '')

    pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
        fstring_pattern_map, always_break_tokens, = \
        _get_token_collection(version_info)
    paren_level = 0  # count parentheses
    if indents is None:
        indents = [0]
    max_ = 0
    numchars = '0123456789'
    contstr = ''
    contline: str
    contstr_start: Tuple[int, int]
    endprog: Pattern
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ''  # Should never be required, but here for safety
    additional_prefix = ''
    lnum = start_pos[0] - 1
    fstring_stack: List[FStringNode] = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max_ = len(line)
        if is_first_token:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max_ = len(line)

            # Fake that the part before was already parsed.
            line = '^' * start_pos[1] + line
            pos = start_pos[1]
            max_ += start_pos[1]

            is_first_token = False

        if contstr:  # continued string
            endmatch = endprog.match(line)  # noqa: F821
            if endmatch:
                pos = endmatch.end(0)
                yield PythonToken(
                    STRING, contstr + line[:pos],
                    contstr_start, prefix)  # noqa: F821
                contstr = ''
                contline = ''
            else:
                contstr = contstr + line
                contline = contline + line
                continue
438 | ||
439 | while pos < max_: | |
440 | if fstring_stack: | |
441 | tos = fstring_stack[-1] | |
442 | if not tos.is_in_expr(): | |
443 | string, pos = _find_fstring_string(endpats, fstring_stack, line, lnum, pos) | |
444 | if string: | |
445 | yield PythonToken( | |
446 | FSTRING_STRING, string, | |
447 | tos.last_string_start_pos, | |
448 | # Never has a prefix because it can start anywhere and | |
449 | # include whitespace. | |
450 | prefix='' | |
451 | ) | |
452 | tos.previous_lines = '' | |
453 | continue | |
454 | if pos == max_: | |
455 | break | |
456 | ||
457 | rest = line[pos:] | |
458 | fstring_end_token, additional_prefix, quote_length = _close_fstring_if_necessary( | |
459 | fstring_stack, | |
460 | rest, | |
461 | lnum, | |
462 | pos, | |
463 | additional_prefix, | |
464 | ) | |
465 | pos += quote_length | |
466 | if fstring_end_token is not None: | |
467 | yield fstring_end_token | |
468 | continue | |
469 | ||
470 | # in an f-string, match until the end of the string | |
471 | if fstring_stack: | |
472 | string_line = line | |
473 | for fstring_stack_node in fstring_stack: | |
474 | quote = fstring_stack_node.quote | |
475 | end_match = endpats[quote].match(line, pos) | |
476 | if end_match is not None: | |
477 | end_match_string = end_match.group(0) | |
478 | if len(end_match_string) - len(quote) + pos < len(string_line): | |
479 | string_line = line[:pos] + end_match_string[:-len(quote)] | |
480 | pseudomatch = pseudo_token.match(string_line, pos) | |
481 | else: | |
482 | pseudomatch = pseudo_token.match(line, pos) | |
483 | ||
484 | if pseudomatch: | |
485 | prefix = additional_prefix + pseudomatch.group(1) | |
486 | additional_prefix = '' | |
487 | start, pos = pseudomatch.span(2) | |
488 | spos = (lnum, start) | |
489 | token = pseudomatch.group(2) | |
490 | if token == '': | |
491 | assert prefix | |
492 | additional_prefix = prefix | |
493 | # This means that we have a line with whitespace/comments at | |
494 | # the end, which just results in an endmarker. | |
495 | break | |
496 | initial = token[0] | |
497 | else: | |
498 | match = whitespace.match(line, pos) | |
499 | initial = line[match.end()] | |
500 | start = match.end() | |
501 | spos = (lnum, start) | |
502 | ||
503 | if new_line and initial not in '\r\n#' and (initial != '\\' or pseudomatch is None): | |
504 | new_line = False | |
505 | if paren_level == 0 and not fstring_stack: | |
506 | indent_start = start | |
507 | if indent_start > indents[-1]: | |
508 | yield PythonToken(INDENT, '', spos, '') | |
509 | indents.append(indent_start) | |
510 | yield from dedent_if_necessary(indent_start) | |
511 | ||
512 | if not pseudomatch: # scan for tokens | |
513 | match = whitespace.match(line, pos) | |
514 | if new_line and paren_level == 0 and not fstring_stack: | |
515 | yield from dedent_if_necessary(match.end()) | |
516 | pos = match.end() | |
517 | new_line = False | |
518 | yield PythonToken( | |
519 | ERRORTOKEN, line[pos], (lnum, pos), | |
520 | additional_prefix + match.group(0) | |
521 | ) | |
522 | additional_prefix = '' | |
523 | pos += 1 | |
524 | continue | |
525 | ||
526 | if (initial in numchars # ordinary number | |
527 | or (initial == '.' and token != '.' and token != '...')): | |
528 | yield PythonToken(NUMBER, token, spos, prefix) | |
529 | elif pseudomatch.group(3) is not None: # ordinary name | |
530 | if token in always_break_tokens and (fstring_stack or paren_level): | |
531 | fstring_stack[:] = [] | |
532 | paren_level = 0 | |
533 | # We only want to dedent if the token is on a new line. | |
534 | m = re.match(r'[ \f\t]*$', line[:start]) | |
535 | if m is not None: | |
536 | yield from dedent_if_necessary(m.end()) | |
537 | if token.isidentifier(): | |
538 | yield PythonToken(NAME, token, spos, prefix) | |
539 | else: | |
540 | yield from _split_illegal_unicode_name(token, spos, prefix) | |
541 | elif initial in '\r\n': | |
542 | if any(not f.allow_multiline() for f in fstring_stack): | |
543 | fstring_stack.clear() | |
544 | ||
545 | if not new_line and paren_level == 0 and not fstring_stack: | |
546 | yield PythonToken(NEWLINE, token, spos, prefix) | |
547 | else: | |
548 | additional_prefix = prefix + token | |
549 | new_line = True | |
550 | elif initial == '#': # Comments | |
551 | assert not token.endswith("\n") and not token.endswith("\r") | |
552 | if fstring_stack and fstring_stack[-1].is_in_expr(): | |
553 | # `#` is not allowed in f-string expressions | |
554 | yield PythonToken(ERRORTOKEN, initial, spos, prefix) | |
555 | pos = start + 1 | |
556 | else: | |
557 | additional_prefix = prefix + token | |
558 | elif token in triple_quoted: | |
559 | endprog = endpats[token] | |
560 | endmatch = endprog.match(line, pos) | |
561 | if endmatch: # all on one line | |
562 | pos = endmatch.end(0) | |
563 | token = line[start:pos] | |
564 | yield PythonToken(STRING, token, spos, prefix) | |
565 | else: | |
566 | contstr_start = spos # multiple lines | |
567 | contstr = line[start:] | |
568 | contline = line | |
569 | break | |
570 | ||
571 | # Check up to the first 3 chars of the token to see if | |
572 | # they're in the single_quoted set. If so, they start | |
573 | # a string. | |
574 | # We're using the first 3, because we're looking for | |
575 | # "rb'" (for example) at the start of the token. If | |
576 | # we switch to longer prefixes, this needs to be | |
577 | # adjusted. | |
578 | # Note that initial == token[:1]. | |
579 | # Also note that single quote checking must come after | |
580 | # triple quote checking (above). | |
581 | elif initial in single_quoted or \ | |
582 | token[:2] in single_quoted or \ | |
583 | token[:3] in single_quoted: | |
584 | if token[-1] in '\r\n': # continued string | |
585 | # This means that a single quoted string ends with a | |
586 | # backslash and is continued. | |
587 | contstr_start = lnum, start | |
588 | endprog = (endpats.get(initial) or endpats.get(token[1]) | |
589 | or endpats.get(token[2])) | |
590 | contstr = line[start:] | |
591 | contline = line | |
592 | break | |
593 | else: # ordinary string | |
594 | yield PythonToken(STRING, token, spos, prefix) | |
595 | elif token in fstring_pattern_map: # The start of an fstring. | |
596 | fstring_stack.append(FStringNode(fstring_pattern_map[token])) | |
597 | yield PythonToken(FSTRING_START, token, spos, prefix) | |
598 | elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'): # continued stmt | |
599 | additional_prefix += prefix + line[start:] | |
600 | break | |
601 | else: | |
602 | if token in '([{': | |
603 | if fstring_stack: | |
604 | fstring_stack[-1].open_parentheses(token) | |
605 | else: | |
606 | paren_level += 1 | |
607 | elif token in ')]}': | |
608 | if fstring_stack: | |
609 | fstring_stack[-1].close_parentheses(token) | |
610 | else: | |
611 | if paren_level: | |
612 | paren_level -= 1 | |
613 | elif token.startswith(':') and fstring_stack \ | |
614 | and fstring_stack[-1].parentheses_count \ | |
615 | - fstring_stack[-1].format_spec_count == 1: | |
616 | # `:` and `:=` both count | |
617 | fstring_stack[-1].format_spec_count += 1 | |
618 | token = ':' | |
619 | pos = start + 1 | |
620 | ||
621 | yield PythonToken(OP, token, spos, prefix) | |
622 | ||
    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith('\n') or contstr.endswith('\r'):
            new_line = True

    if fstring_stack:
        tos = fstring_stack[-1]
        if tos.previous_lines:
            yield PythonToken(
                FSTRING_STRING, tos.previous_lines,
                tos.last_string_start_pos,
                # Never has a prefix because it can start anywhere and
                # include whitespace.
                prefix=''
            )

    end_pos = lnum, max_
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        indents.pop()
        yield PythonToken(DEDENT, '', end_pos, '')
    yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)


def _split_illegal_unicode_name(token, start_pos, prefix):
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ''
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if char.isidentifier():
                yield create_token()
                found = char
                is_illegal = False
                prefix = ''
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if new_found.isidentifier():
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ''
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()


if __name__ == "__main__":
    path = sys.argv[1]
    with open(path) as f:
        code = f.read()

    for token in tokenize(code, version_info=parse_version_string('3.10')):
        print(token)