# -*- coding: utf-8 -*-
"""
This tokenizer has been copied from the ``tokenize.py`` standard library
tokenizer. The reason was simple: The standard library tokenizer fails
if the indentation is not right. To make it possible to do error recovery the
tokenizer needed to be rewritten.

Basically this is a stripped down version of the standard library module, so
you can read the documentation there. Additionally we included some speed and
memory optimizations here.
"""
from __future__ import absolute_import

import sys
import re
import itertools as _itertools
from codecs import BOM_UTF8
from typing import NamedTuple, Tuple, Iterator, Iterable, List, Dict, \
    Pattern, Set

from parso.python.token import PythonTokenTypes
from parso.utils import split_lines, PythonVersionInfo, parse_version_string


# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = '\U0010ffff'

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


class TokenCollection(NamedTuple):
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Tuple[str]


BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices, capture=False, **kwargs):
    assert not kwargs

    start = '('
    if not capture:
        start += '?:'
    return start + '|'.join(choices) + ')'


def maybe(*choices):
    return group(*choices) + '?'


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(*, include_fstring=False, only_fstring=False):
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield ''.join(s)
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    valid_string_prefixes = ['b', 'r', 'u', 'br']

    result = {''}
    if include_fstring:
        f = ['f', 'fr']
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each character
            result.update(different_case_versions(t))
    return result
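
# For example, _all_string_prefixes() evaluates to {'', 'b', 'B', 'r', 'R',
# 'u', 'U', 'br', 'bR', 'Br', 'BR', 'rb', 'rB', 'Rb', 'RB'}.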


def _compile(expr):
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info):
    try:
        return _token_collection_cache[tuple(version_info)]
    except KeyError:
        _token_collection_cache[tuple(version_info)] = result = \
            _create_token_collection(version_info)
        return result
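
# Creating a token collection means compiling a fair number of regular
# expressions, which is why _get_token_collection caches the result per
# version_info.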


unicode_character_name = r'[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*'
fstring_string_single_line = _compile(
    r'(?:\{\{|\}\}|\\N\{' + unicode_character_name
    + r'\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+'
)
fstring_string_multi_line = _compile(
    r'(?:\{\{|\}\}|\\N\{' + unicode_character_name + r'\}|\\[^N]|[^{}\\])+'
)
fstring_format_spec_single_line = _compile(r'(?:\\(?:\r\n?|\n)|[^{}\r\n])+')
fstring_format_spec_multi_line = _compile(r'[^{}]+')
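# These four patterns match the literal-text parts of an f-string (everything
# except the {...} expressions), with separate variants for text inside a
# format spec and for single-line vs. triple-quoted (multi-line) strings.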


def _create_token_collection(version_info):
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r'[ \f\t]*'
    whitespace = _compile(Whitespace)
    Comment = r'#[^\r\n]*'
    Name = '([A-Za-z_0-9\u0080-' + MAX_UNICODE + ']+)'

    Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
    Binnumber = r'0[bB](?:_?[01])+'
    Octnumber = r'0[oO](?:_?[0-7])+'
    Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
    Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
    Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
    Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                       r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
    Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
    Floatnumber = group(Pointfloat, Expfloat)
    Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
    Number = group(Imagnumber, Floatnumber, Intnumber)
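    # A few examples of literals these patterns accept: 0x_1f (Hexnumber),
    # 1_000.5e-3 (Floatnumber) and 10j (Imagnumber).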

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes()
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(include_fstring=True))
    fstring_prefixes = _all_string_prefixes(include_fstring=True, only_fstring=True)
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(r"\*\*=?", r">>=?", r"<<=?",
                     r"//=?", r"->",
                     r"[+\-*/%&@`|^!=<>]=?",
                     r"~")

    Bracket = '[][(){}]'

    special_args = [r'\.\.\.', r'\r\n?', r'\n', r'[;.,@]']
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
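    # On 3.8+, ':=?' makes the walrus operator ':=' a single token; on older
    # versions only ':' is matched, so ':=' falls apart into ':' and '='.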
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(StringPrefix + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
                    + group("'", r'\\(?:\r\n?|\n)'),
                    StringPrefix + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
                    + group('"', r'\\(?:\r\n?|\n)'))
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r'\\(?:\r\n?|\n)|\Z', *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + \
        group(PseudoExtras, Number, Funny, ContStr, Name, capture=True)

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    ALWAYS_BREAK_TOKENS = (';', 'import', 'class', 'def', 'try', 'except',
                           'finally', 'while', 'with', 'return', 'continue',
                           'break', 'del', 'pass', 'global', 'assert', 'nonlocal')
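    # When one of these tokens appears inside an unclosed bracket or f-string,
    # error recovery assumes the enclosing construct was never closed and
    # breaks out of it (see the always_break_tokens handling in tokenize_lines
    # below).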
    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled, single_quoted, triple_quoted, endpats,
        whitespace, fstring_pattern_map, set(ALWAYS_BREAK_TOKENS)
    )


class Token(NamedTuple):
    type: PythonTokenTypes
    string: str
    start_pos: Tuple[int, int]
    prefix: str

    @property
    def end_pos(self) -> Tuple[int, int]:
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)
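        # Note that a token spanning multiple lines reports column 0 as its
        # end column.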


class PythonToken(Token):
    def __repr__(self):
        return ('TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)' %
                self._replace(type=self.type.name))


class FStringNode:
    def __init__(self, quote):
        self.quote = quote
        self.parentheses_count = 0
        self.previous_lines = ''
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count
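    # Example: in f"{x:{y}}" the first '{' makes is_in_expr() true; the ':'
    # bumps format_spec_count, so is_in_format_spec() becomes true until the
    # nested '{y}' opens another expression.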


def _close_fstring_if_necessary(fstring_stack, string, line_nr, column, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        lstripped_string = string.lstrip()
        len_lstrip = len(string) - len(lstripped_string)
        if lstripped_string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END,
                node.quote,
                (line_nr, column + len_lstrip),
                prefix=additional_prefix + string[:len_lstrip],
            )
            additional_prefix = ''
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, '', len(node.quote) + len_lstrip
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line
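    # The single-line variants stop at \r and \n, because a single-quote
    # f-string must not span lines (except via escaped line continuations).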

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[:-len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith('\n') or string.endswith('\r'):
        tos.previous_lines += string
        string = ''
    else:
        string = tos.previous_lines + string

    return string, new_pos


def tokenize(
    code: str, *, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Iterator[PythonToken]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info=version_info, start_pos=start_pos)
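
# A minimal usage sketch (the version string is illustrative):
#
#     for token in tokenize('x = 1\n', version_info=parse_version_string('3.8')):
#         print(token.type, repr(token.string), token.start_pos)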


def _print_tokens(func):
    """
    A small helper function to help debug the tokenize_lines function.
    """
    def wrapper(*args, **kwargs):
        for token in func(*args, **kwargs):
            print(token)  # This print is intentional for debugging!
            yield token

    return wrapper


# @_print_tokens
def tokenize_lines(
    lines: Iterable[str],
    *,
    version_info: PythonVersionInfo,
    indents: List[int] = None,
    start_pos: Tuple[int, int] = (1, 0),
    is_first_token=True,
) -> Iterator[PythonToken]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this tokenizer also yields the
    prefix of each token. This idea comes from lib2to3. The prefix contains
    all information that is irrelevant for the parser, like newlines in
    parentheses or comments.
    """
    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, '', (lnum, start), '')
                indents[-1] = start
                break
            indents.pop()
            yield PythonToken(DEDENT, '', spos, '')
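    # ERROR_DEDENT marks a dedent that does not land on any previous
    # indentation level; unlike the stdlib tokenizer, we recover and keep
    # tokenizing.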

    pseudo_token, single_quoted, triple_quoted, endpats, whitespace, \
        fstring_pattern_map, always_break_tokens, = \
        _get_token_collection(version_info)
    paren_level = 0  # count parentheses
    if indents is None:
        indents = [0]
    max_ = 0
    numchars = '0123456789'
    contstr = ''
    contline: str
    contstr_start: Tuple[int, int]
    endprog: Pattern
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ''  # Should never be required, but here for safety
    additional_prefix = ''
    lnum = start_pos[0] - 1
    fstring_stack: List[FStringNode] = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max_ = len(line)
        if is_first_token:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max_ = len(line)

            # Fake that the part before was already parsed.
            line = '^' * start_pos[1] + line
            pos = start_pos[1]
            max_ += start_pos[1]

            is_first_token = False

        if contstr:  # continued string
            endmatch = endprog.match(line)  # noqa: F821
            if endmatch:
                pos = endmatch.end(0)
                yield PythonToken(
                    STRING, contstr + line[:pos],
                    contstr_start, prefix)  # noqa: F821
                contstr = ''
                contline = ''
            else:
                contstr = contstr + line
                contline = contline + line
                continue
= fstring_stack
[-1]
442 if not tos
.is_in_expr():
443 string
, pos
= _find_fstring_string(endpats
, fstring_stack
, line
, lnum
, pos
)
446 FSTRING_STRING
, string
,
447 tos
.last_string_start_pos
,
448 # Never has a prefix because it can start anywhere and
449 # include whitespace.
452 tos
.previous_lines
= ''
458 fstring_end_token
, additional_prefix
, quote_length
= _close_fstring_if_necessary(
466 if fstring_end_token
is not None:
467 yield fstring_end_token

            # in an f-string, match until the end of the string
            if fstring_stack:
                string_line = line
                for fstring_stack_node in fstring_stack:
                    quote = fstring_stack_node.quote
                    end_match = endpats[quote].match(line, pos)
                    if end_match is not None:
                        end_match_string = end_match.group(0)
                        if len(end_match_string) - len(quote) + pos < len(string_line):
                            string_line = line[:pos] + end_match_string[:-len(quote)]
                pseudomatch = pseudo_token.match(string_line, pos)
            else:
                pseudomatch = pseudo_token.match(line, pos)

            if pseudomatch:
                prefix = additional_prefix + pseudomatch.group(1)
                additional_prefix = ''
                start, pos = pseudomatch.span(2)
                spos = (lnum, start)
                token = pseudomatch.group(2)

                if token == '':
                    additional_prefix = prefix
                    # This means that we have a line with whitespace/comments at
                    # the end, which just results in an endmarker.
                    break
                initial = token[0]
            else:
                match = whitespace.match(line, pos)
                initial = line[match.end()]
                start = match.end()
                spos = (lnum, start)

            if new_line and initial not in '\r\n#' and (initial != '\\' or pseudomatch is None):
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    indent_start = start
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, '', spos, '')
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if not pseudomatch:  # scan for tokens
                match = whitespace.match(line, pos)
                if new_line and paren_level == 0 and not fstring_stack:
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN, line[pos], (lnum, pos),
                    additional_prefix + match.group(0)
                )
                additional_prefix = ''
                pos += 1
                continue

            if (initial in numchars  # ordinary number
                    or (initial == '.' and token != '.' and token != '...')):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in always_break_tokens and (fstring_stack or paren_level):
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    m = re.match(r'[ \f\t]*$', line[:start])
                    if m is not None:
                        yield from dedent_if_necessary(m.end())
                if token.isidentifier():
                    yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in '\r\n':
                if any(not f.allow_multiline() for f in fstring_stack):
                    fstring_stack.clear()

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                new_line = True
            elif initial == '#':  # Comments
                assert not token.endswith("\n") and not token.endswith("\r")
                if fstring_stack and fstring_stack[-1].is_in_expr():
                    # `#` is not allowed in f-string expressions
                    yield PythonToken(ERRORTOKEN, initial, spos, prefix)
                    pos = start + 1
                else:
                    additional_prefix = prefix + token
            elif token in triple_quoted:
                endprog = endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = spos  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                if token[-1] in '\r\n':  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (endpats.get(initial) or endpats.get(token[1])
                               or endpats.get(token[2]))
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif token in fstring_pattern_map:  # The start of an fstring.
                fstring_stack.append(FStringNode(fstring_pattern_map[token]))
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == '\\' and line[start:] in ('\\\n', '\\\r\n', '\\\r'):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in '([{':
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ')]}':
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif token.startswith(':') and fstring_stack \
                        and fstring_stack[-1].parentheses_count \
                        - fstring_stack[-1].format_spec_count == 1:
                    # `:` and `:=` both count
                    fstring_stack[-1].format_spec_count += 1
                    token = ':'
                    pos = start + 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith('\n') or contstr.endswith('\r'):
            new_line = True

    if fstring_stack:
        tos = fstring_stack[-1]
        if tos.previous_lines:
            yield PythonToken(
                FSTRING_STRING, tos.previous_lines,
                tos.last_string_start_pos,
                # Never has a prefix because it can start anywhere and
                # include whitespace.
                prefix=''
            )

    end_pos = lnum, max_
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        indents.pop()
        yield PythonToken(DEDENT, '', end_pos, '')
    yield PythonToken(ENDMARKER, '', end_pos, additional_prefix)


def _split_illegal_unicode_name(token, start_pos, prefix):
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)
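    # For example, the token 'a€b' is split into NAME 'a', ERRORTOKEN '€' and
    # NAME 'b', since '€' is not a valid identifier character.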

    found = ''
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if char.isidentifier():
                yield create_token()
                found = char
                is_illegal = False
                prefix = ''
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if new_found.isidentifier():
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ''
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()


if __name__ == "__main__":
    path = sys.argv[1]
    with open(path) as f:
        code = f.read()

    for token in tokenize(code, version_info=parse_version_string('3.10')):
        print(token)