]>
Commit | Line | Data |
---|---|---|
53e6db90 DC |
1 | """ |
2 | The diff parser is trying to be a faster version of the normal parser by trying | |
3 | to reuse the nodes of a previous pass over the same file. This is also called | |
4 | incremental parsing in parser literature. The difference is mostly that with | |
5 | incremental parsing you get a range that needs to be reparsed. Here we | |
6 | calculate that range ourselves by using difflib. After that it's essentially | |
7 | incremental parsing. | |
8 | ||
9 | The biggest issue of this approach is that we reuse nodes in a mutable way. The | |
initial design and idea is quite problematic for this parser, but it is also
11 | pretty fast. Measurements showed that just copying nodes in Python is simply | |
12 | quite a bit slower (especially for big files >3 kLOC). Therefore we did not | |
13 | want to get rid of the mutable nodes, since this is usually not an issue. | |
14 | ||
15 | This is by far the hardest software I ever wrote, exactly because the initial | |
16 | design is crappy. When you have to account for a lot of mutable state, it | |
17 | creates a ton of issues that you would otherwise not have. This file took | |
18 | probably 3-6 months to write, which is insane for a parser. | |
19 | ||
There is a fuzzer that helps test this whole thing. Please use it if you
21 | make changes here. If you run the fuzzer like:: | |
22 | ||
23 | test/fuzz_diff_parser.py random -n 100000 | |
24 | ||
25 | you can be pretty sure that everything is still fine. I sometimes run the | |
26 | fuzzer up to 24h to make sure everything is still ok. | |
27 | """ | |
28 | import re | |
29 | import difflib | |
30 | from collections import namedtuple | |
31 | import logging | |
32 | ||
33 | from parso.utils import split_lines | |
34 | from parso.python.parser import Parser | |
35 | from parso.python.tree import EndMarker | |
36 | from parso.python.tokenize import PythonToken, BOM_UTF8_STRING | |
37 | from parso.python.token import PythonTokenTypes | |
38 | ||
39 | LOG = logging.getLogger(__name__) | |
40 | DEBUG_DIFF_PARSER = False | |
41 | ||
42 | _INDENTATION_TOKENS = 'INDENT', 'ERROR_DEDENT', 'DEDENT' | |
43 | ||
44 | NEWLINE = PythonTokenTypes.NEWLINE | |
45 | DEDENT = PythonTokenTypes.DEDENT | |
46 | NAME = PythonTokenTypes.NAME | |
47 | ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT | |
48 | ENDMARKER = PythonTokenTypes.ENDMARKER | |
49 | ||
50 | ||
def _is_indentation_error_leaf(node):
    """Return True for error leaves produced by INDENT/DEDENT handling."""
    if node.type != 'error_leaf':
        return False
    return node.token_type in _INDENTATION_TOKENS
53 | ||
54 | ||
def _get_previous_leaf_if_indentation(leaf):
    """Skip indentation error leaves backwards; return the first real leaf."""
    while leaf is not None and _is_indentation_error_leaf(leaf):
        leaf = leaf.get_previous_leaf()
    return leaf
59 | ||
60 | ||
def _get_next_leaf_if_indentation(leaf):
    """Skip indentation error leaves forwards; return the first real leaf."""
    while leaf is not None and _is_indentation_error_leaf(leaf):
        leaf = leaf.get_next_leaf()
    return leaf
65 | ||
66 | ||
def _get_suite_indentation(tree_node):
    """Return the indentation of the first statement inside a suite node."""
    first_statement = tree_node.children[1]
    return _get_indentation(first_statement)
69 | ||
70 | ||
71 | def _get_indentation(tree_node): | |
72 | return tree_node.start_pos[1] | |
73 | ||
74 | ||
def _assert_valid_graph(node):
    """
    Checks if the parent/children relationship is correct.

    This is a check that only runs during debugging/testing.
    """
    try:
        children = node.children
    except AttributeError:
        # Ignore INDENT is necessary, because indent/dedent tokens don't
        # contain value/prefix and are just around, because of the tokenizer.
        if node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS:
            assert not node.value
            assert not node.prefix
            return

        # Calculate the content between two start positions.
        previous_leaf = _get_previous_leaf_if_indentation(node.get_previous_leaf())
        if previous_leaf is None:
            # First leaf of the file: everything before it is its own prefix.
            content = node.prefix
            previous_start_pos = 1, 0
        else:
            assert previous_leaf.end_pos <= node.start_pos, \
                (previous_leaf, node)

            content = previous_leaf.value + node.prefix
            previous_start_pos = previous_leaf.start_pos

        if '\n' in content or '\r' in content:
            # The expected position is relative to the last newline in the
            # intermediate content.
            splitted = split_lines(content)
            line = previous_start_pos[0] + len(splitted) - 1
            actual = line, len(splitted[-1])
        else:
            actual = previous_start_pos[0], previous_start_pos[1] + len(content)
            if content.startswith(BOM_UTF8_STRING) \
                    and node.get_start_pos_of_prefix() == (1, 0):
                # Remove the byte order mark
                actual = actual[0], actual[1] - 1

        assert node.start_pos == actual, (node.start_pos, actual)
    else:
        for child in children:
            assert child.parent == node, (node, child)
            _assert_valid_graph(child)
119 | ||
120 | ||
121 | def _assert_nodes_are_equal(node1, node2): | |
122 | try: | |
123 | children1 = node1.children | |
124 | except AttributeError: | |
125 | assert not hasattr(node2, 'children'), (node1, node2) | |
126 | assert node1.value == node2.value, (node1, node2) | |
127 | assert node1.type == node2.type, (node1, node2) | |
128 | assert node1.prefix == node2.prefix, (node1, node2) | |
129 | assert node1.start_pos == node2.start_pos, (node1, node2) | |
130 | return | |
131 | else: | |
132 | try: | |
133 | children2 = node2.children | |
134 | except AttributeError: | |
135 | assert False, (node1, node2) | |
136 | for n1, n2 in zip(children1, children2): | |
137 | _assert_nodes_are_equal(n1, n2) | |
138 | assert len(children1) == len(children2), '\n' + repr(children1) + '\n' + repr(children2) | |
139 | ||
140 | ||
def _get_debug_error_message(module, old_lines, new_lines):
    """Compose the bug-report text used when the diff parser misbehaves."""
    current_lines = split_lines(module.get_code(), keepends=True)
    actual_diff = ''.join(difflib.unified_diff(new_lines, current_lines))
    old_new_diff = ''.join(difflib.unified_diff(old_lines, new_lines))
    import parso
    return (
        "There's an issue with the diff parser. Please "
        "report (parso v%s) - Old/New:\n%s\nActual Diff (May be empty):\n%s"
        % (parso.__version__, old_new_diff, actual_diff)
    )
151 | ||
152 | ||
def _get_last_line(node_or_leaf):
    """Return the number of the last line that *node_or_leaf* fully covers."""
    last_leaf = node_or_leaf.get_last_leaf()
    if _ends_with_newline(last_leaf):
        # The trailing newline "belongs" to the line it terminates.
        return last_leaf.start_pos[0]

    next_leaf = last_leaf.get_next_leaf()
    if next_leaf.type == 'endmarker' and '\n' in next_leaf.prefix:
        # This is a very special case and has to do with error recovery in
        # Parso. The problem is basically that there's no newline leaf at
        # the end sometimes (it's required in the grammar, but not needed
        # actually before endmarker, CPython just adds a newline to make
        # source code pass the parser, to account for that Parso error
        # recovery allows small_stmt instead of simple_stmt).
        return last_leaf.end_pos[0] + 1
    return last_leaf.end_pos[0]
168 | ||
169 | ||
170 | def _skip_dedent_error_leaves(leaf): | |
171 | while leaf is not None and leaf.type == 'error_leaf' and leaf.token_type == 'DEDENT': | |
172 | leaf = leaf.get_previous_leaf() | |
173 | return leaf | |
174 | ||
175 | ||
def _ends_with_newline(leaf, suffix=''):
    """True if *leaf* (after skipping DEDENT errors) or *suffix* ends a line."""
    leaf = _skip_dedent_error_leaves(leaf)

    if leaf.type == 'error_leaf':
        typ = leaf.token_type.lower()
    else:
        typ = leaf.type

    if typ == 'newline':
        return True
    return suffix.endswith('\n') or suffix.endswith('\r')
185 | ||
186 | ||
187 | def _flows_finished(pgen_grammar, stack): | |
188 | """ | |
189 | if, while, for and try might not be finished, because another part might | |
190 | still be parsed. | |
191 | """ | |
192 | for stack_node in stack: | |
193 | if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'): | |
194 | return False | |
195 | return True | |
196 | ||
197 | ||
198 | def _func_or_class_has_suite(node): | |
199 | if node.type == 'decorated': | |
200 | node = node.children[-1] | |
201 | if node.type in ('async_funcdef', 'async_stmt'): | |
202 | node = node.children[-1] | |
203 | return node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite' | |
204 | ||
205 | ||
def _suite_or_file_input_is_valid(pgen_grammar, stack):
    """True when the parser stack could be cut off cleanly at this point."""
    if not _flows_finished(pgen_grammar, stack):
        return False

    for stack_node in reversed(stack):
        nonterminal = stack_node.nonterminal
        if nonterminal == 'decorator':
            # A decorator is only valid with the upcoming function.
            return False
        if nonterminal == 'suite':
            # If only a newline is in the suite, the suite is not valid, yet.
            return len(stack_node.nodes) > 1
    # Not reaching a suite means that we're dealing with file_input levels
    # where there's no need for a valid statement in it. It can also be empty.
    return True
221 | ||
222 | ||
223 | def _is_flow_node(node): | |
224 | if node.type == 'async_stmt': | |
225 | node = node.children[1] | |
226 | try: | |
227 | value = node.children[0].value | |
228 | except AttributeError: | |
229 | return False | |
230 | return value in ('if', 'for', 'while', 'try', 'with') | |
231 | ||
232 | ||
class _PositionUpdatingFinished(Exception):
    """Control-flow signal: ``_update_positions`` reached its last leaf."""
    pass
235 | ||
236 | ||
def _update_positions(nodes, line_offset, last_leaf):
    """Add *line_offset* to the line of every leaf under *nodes*.

    Raises _PositionUpdatingFinished once *last_leaf* has been shifted, so
    the caller can stop the traversal early.
    """
    for node in nodes:
        children = getattr(node, 'children', None)
        if children is not None:
            _update_positions(children, line_offset, last_leaf)
            continue
        # A leaf: shift it in place.
        node.line += line_offset
        if node is last_leaf:
            raise _PositionUpdatingFinished
248 | ||
249 | ||
class DiffParser:
    """
    An advanced form of parsing a file faster. Unfortunately comes with huge
    side effects. It changes the given module.
    """
    def __init__(self, pgen_grammar, tokenizer, module):
        self._pgen_grammar = pgen_grammar
        self._tokenizer = tokenizer
        self._module = module

    def _reset(self):
        """Reset per-update statistics and start a fresh node tree builder."""
        self._copy_count = 0
        self._parser_count = 0

        self._nodes_tree = _NodesTree(self._module)

    def update(self, old_lines, new_lines):
        '''
        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.

        Returns the new module node.
        '''
        LOG.debug('diff parser start')
        # Reset the used names cache so they get regenerated.
        self._module._used_names = None

        self._parser_lines_new = new_lines

        self._reset()

        line_length = len(new_lines)
        sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
        opcodes = sm.get_opcodes()
        LOG.debug('line_lengths old: %s; new: %s' % (len(old_lines), line_length))

        for operation, i1, i2, j1, j2 in opcodes:
            LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length and new_lines[-1] == '':
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i1 + 1, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                # 'delete' needs no action: the removed lines are simply not
                # copied/parsed into the new tree.
                assert operation == 'delete'

        # With this action all change will finally be applied and we have a
        # changed module.
        self._nodes_tree.close()

        if DEBUG_DIFF_PARSER:
            # If there is reasonable suspicion that the diff parser is not
            # behaving well, this should be enabled.
            try:
                code = ''.join(new_lines)
                assert self._module.get_code() == code
                _assert_valid_graph(self._module)
                without_diff_parser_module = Parser(
                    self._pgen_grammar,
                    error_recovery=True
                ).parse(self._tokenizer(new_lines))
                _assert_nodes_are_equal(self._module, without_diff_parser_module)
            except AssertionError:
                print(_get_debug_error_message(self._module, old_lines, new_lines))
                raise

        last_pos = self._module.end_pos[0]
        if last_pos != line_length:
            raise Exception(
                ('(%s != %s) ' % (last_pos, line_length))
                + _get_debug_error_message(self._module, old_lines, new_lines)
            )
        LOG.debug('diff parser end')
        return self._module

    def _enabled_debugging(self, old_lines, lines_new):
        """Log a warning when the rebuilt module's code diverges from input."""
        if self._module.get_code() != ''.join(lines_new):
            LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))

    def _copy_from_old_parser(self, line_offset, start_line_old, until_line_old, until_line_new):
        """Copy matching old-tree statements (or reparse) up to until_line_new."""
        last_until_line = -1
        while until_line_new > self._nodes_tree.parsed_until_line:
            parsed_until_line_old = self._nodes_tree.parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._nodes_tree.parsed_until_line + 1)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                if start_line_old == 1 \
                        and p_children[0].get_first_leaf().prefix.startswith(BOM_UTF8_STRING):
                    # If there's a BOM in the beginning, just reparse. It's too
                    # complicated to account for it otherwise.
                    copied_nodes = []
                else:
                    from_ = self._nodes_tree.parsed_until_line + 1
                    copied_nodes = self._nodes_tree.copy_nodes(
                        p_children[index:],
                        until_line_old,
                        line_offset
                    )
                # Match all the nodes that are in the wanted range.
                if copied_nodes:
                    self._copy_count += 1

                    to = self._nodes_tree.parsed_until_line

                    LOG.debug('copy old[%s:%s] new[%s:%s]',
                              copied_nodes[0].start_pos[0],
                              copied_nodes[-1].end_pos[0] - 1, from_, to)
                else:
                    # We have copied as much as possible (but definitely not too
                    # much). Therefore we just parse a bit more.
                    self._parse(self._nodes_tree.parsed_until_line + 1)
            # Since there are potential bugs that might loop here endlessly, we
            # just stop here.
            assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
            last_until_line = self._nodes_tree.parsed_until_line

    def _get_old_line_stmt(self, old_line):
        """Return the old-tree statement starting on *old_line*, or None."""
        leaf = self._module.get_leaf_for_position((old_line, 0), include_prefixes=True)

        if _ends_with_newline(leaf):
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent

            # Make sure that if only the `else:` line of an if statement is
            # copied that not the whole thing is going to be copied.
            if node.start_pos[0] >= old_line:
                return node
        # Must be on the same line. Otherwise we need to parse that bit.
        return None

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.
        """
        last_until_line = 0
        while until_line > self._nodes_tree.parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = node.children

            self._nodes_tree.add_parsed_nodes(nodes, self._keyword_token_indents)
            if self._replace_tos_indent is not None:
                self._nodes_tree.indents[-1] = self._replace_tos_indent

            LOG.debug(
                'parse_part from %s to %s (to %s in part parser)',
                nodes[0].get_start_pos_of_prefix()[0],
                self._nodes_tree.parsed_until_line,
                node.end_pos[0] - 1
            )
            # Since the tokenizer sometimes has bugs, we cannot be sure that
            # this loop terminates. Therefore assert that there's always a
            # change.
            assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
            last_until_line = self._nodes_tree.parsed_until_line

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended).
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        parsed_until_line = self._nodes_tree.parsed_until_line
        lines_after = self._parser_lines_new[parsed_until_line:]
        tokens = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=parsed_until_line
        )
        self._active_parser = Parser(
            self._pgen_grammar,
            error_recovery=True
        )
        return self._active_parser.parse(tokens=tokens)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
        """Tokenize *lines*, emitting an early ENDMARKER once a safe cut-off
        point at or after *until_line* is reached."""
        was_newline = False
        indents = self._nodes_tree.indents
        initial_indentation_count = len(indents)

        tokens = self._tokenizer(
            lines,
            start_pos=(line_offset + 1, 0),
            indents=indents,
            is_first_token=line_offset == 0,
        )
        stack = self._active_parser.stack
        self._replace_tos_indent = None
        self._keyword_token_indents = {}
        # print('start', line_offset + 1, indents)
        for token in tokens:
            # print(token, indents)
            typ = token.type
            if typ == DEDENT:
                if len(indents) < initial_indentation_count:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    while True:
                        typ, string, start_pos, prefix = token = next(tokens)
                        if typ in (DEDENT, ERROR_DEDENT):
                            if typ == ERROR_DEDENT:
                                # We want to force an error dedent in the next
                                # parser/pass. To make this possible we just
                                # increase the location by one.
                                self._replace_tos_indent = start_pos[1] + 1
                                pass
                        else:
                            break

                    if '\n' in prefix or '\r' in prefix:
                        # Keep only the prefix content up to (and including)
                        # the last newline; the rest belongs to the next pass.
                        prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
                    else:
                        assert start_pos[1] >= len(prefix), repr(prefix)
                        if start_pos[1] - len(prefix) == 0:
                            prefix = ''
                    yield PythonToken(
                        ENDMARKER, '',
                        start_pos,
                        prefix
                    )
                    break
            elif typ == NEWLINE and token.start_pos[0] >= until_line:
                was_newline = True
            elif was_newline:
                was_newline = False
                if len(indents) == initial_indentation_count:
                    # Check if the parser is actually in a valid suite state.
                    if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
                        yield PythonToken(ENDMARKER, '', token.start_pos, '')
                        break

            if typ == NAME and token.string in ('class', 'def'):
                # Remember the indentation stack at each class/def keyword so
                # suites can later be attached at the right indentation level.
                self._keyword_token_indents[token.start_pos] = list(indents)

            yield token
515 | ||
516 | ||
class _NodesTreeNode:
    """Builder wrapper around one tree node while the module is reassembled.

    Collects groups of children (copied or freshly parsed) and, on
    ``finish()``, writes them back into the wrapped ``tree_node``.
    """
    _ChildrenGroup = namedtuple(
        '_ChildrenGroup',
        'prefix children line_offset last_line_offset_leaf')

    def __init__(self, tree_node, parent=None, indentation=0):
        self.tree_node = tree_node
        self._children_groups = []
        self.parent = parent
        self._node_children = []
        self.indentation = indentation

    def finish(self):
        """Write all collected children groups back into the real tree node."""
        children = []
        for prefix, children_part, line_offset, last_line_offset_leaf in self._children_groups:
            first_leaf = _get_next_leaf_if_indentation(
                children_part[0].get_first_leaf()
            )

            first_leaf.prefix = prefix + first_leaf.prefix
            if line_offset != 0:
                try:
                    _update_positions(
                        children_part, line_offset, last_line_offset_leaf)
                except _PositionUpdatingFinished:
                    pass
            children += children_part
        self.tree_node.children = children
        # Reset the parents
        for node in children:
            node.parent = self.tree_node

        for node_child in self._node_children:
            node_child.finish()

    def add_child_node(self, child_node):
        """Register a nested _NodesTreeNode (e.g. for a suite)."""
        self._node_children.append(child_node)

    def add_tree_nodes(self, prefix, children, line_offset=0,
                       last_line_offset_leaf=None):
        """Queue a group of tree nodes to become children on ``finish()``."""
        if last_line_offset_leaf is None:
            last_line_offset_leaf = children[-1].get_last_leaf()
        group = self._ChildrenGroup(
            prefix, children, line_offset, last_line_offset_leaf
        )
        self._children_groups.append(group)

    def get_last_line(self, suffix):
        """Return the last line covered by this node's collected children."""
        line = 0
        if self._children_groups:
            children_group = self._children_groups[-1]
            last_leaf = _get_previous_leaf_if_indentation(
                children_group.last_line_offset_leaf
            )

            line = last_leaf.end_pos[0] + children_group.line_offset

            # Newlines end on the next line, which means that they would cover
            # the next line. That line is not fully parsed at this point.
            if _ends_with_newline(last_leaf, suffix):
                line -= 1
        line += len(split_lines(suffix)) - 1

        if suffix and not suffix.endswith('\n') and not suffix.endswith('\r'):
            # This is the end of a file (that doesn't end with a newline).
            line += 1

        if self._node_children:
            return max(line, self._node_children[-1].get_last_line(suffix))
        return line

    def __repr__(self):
        return '<%s: %s>' % (self.__class__.__name__, self.tree_node)
590 | ||
591 | ||
class _NodesTree:
    """Assembles the new module tree from copied old nodes and parsed parts.

    Maintains a working stack of _NodesTreeNode (one per open suite) plus the
    current indentation stack and pending prefix.
    """
    def __init__(self, module):
        self._base_node = _NodesTreeNode(module)
        self._working_stack = [self._base_node]
        self._module = module
        self._prefix_remainder = ''
        self.prefix = ''
        self.indents = [0]

    @property
    def parsed_until_line(self):
        # The last line that is fully represented in the tree so far.
        return self._working_stack[-1].get_last_line(self.prefix)

    def _update_insertion_node(self, indentation):
        """Pop suites deeper than *indentation*; return the insertion node."""
        for node in reversed(list(self._working_stack)):
            if node.indentation < indentation or node is self._working_stack[0]:
                return node
            self._working_stack.pop()

    def add_parsed_nodes(self, tree_nodes, keyword_token_indents):
        """Append freshly parsed *tree_nodes* at the right indentation level."""
        old_prefix = self.prefix
        tree_nodes = self._remove_endmarker(tree_nodes)
        if not tree_nodes:
            # Nothing but an endmarker was parsed; keep accumulating prefix.
            self.prefix = old_prefix + self.prefix
            return

        assert tree_nodes[0].type != 'newline'

        node = self._update_insertion_node(tree_nodes[0].start_pos[1])
        assert node.tree_node.type in ('suite', 'file_input')
        node.add_tree_nodes(old_prefix, tree_nodes)
        # tos = Top of stack
        self._update_parsed_node_tos(tree_nodes[-1], keyword_token_indents)

    def _update_parsed_node_tos(self, tree_node, keyword_token_indents):
        """Push a new working-stack node if *tree_node* ends in an open suite."""
        if tree_node.type == 'suite':
            def_leaf = tree_node.parent.children[0]
            new_tos = _NodesTreeNode(
                tree_node,
                indentation=keyword_token_indents[def_leaf.start_pos][-1],
            )
            new_tos.add_tree_nodes('', list(tree_node.children))

            self._working_stack[-1].add_child_node(new_tos)
            self._working_stack.append(new_tos)

            self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)
        elif _func_or_class_has_suite(tree_node):
            self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)

    def _remove_endmarker(self, tree_nodes):
        """
        Helps cleaning up the tree nodes that get inserted.
        """
        last_leaf = tree_nodes[-1].get_last_leaf()
        is_endmarker = last_leaf.type == 'endmarker'
        self._prefix_remainder = ''
        if is_endmarker:
            prefix = last_leaf.prefix
            separation = max(prefix.rfind('\n'), prefix.rfind('\r'))
            if separation > -1:
                # Remove the whitespace part of the prefix after a newline.
                # That is not relevant if parentheses were opened. Always parse
                # until the end of a line.
                last_leaf.prefix, self._prefix_remainder = \
                    last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:]

        self.prefix = ''

        if is_endmarker:
            self.prefix = last_leaf.prefix

            tree_nodes = tree_nodes[:-1]
        return tree_nodes

    def _get_matching_indent_nodes(self, tree_nodes, is_new_suite):
        # There might be a random dedent where we have to stop copying.
        # Invalid indents are ok, because the parser handled that
        # properly before. An invalid dedent can happen, because a few
        # lines above there was an invalid indent.
        node_iterator = iter(tree_nodes)
        if is_new_suite:
            # The first child of a new suite is its newline; always keep it.
            yield next(node_iterator)

        first_node = next(node_iterator)
        indent = _get_indentation(first_node)
        if not is_new_suite and indent not in self.indents:
            return
        yield first_node

        for n in node_iterator:
            if _get_indentation(n) != indent:
                return
            yield n

    def copy_nodes(self, tree_nodes, until_line, line_offset):
        """
        Copies tree nodes from the old parser tree.

        Returns the number of tree nodes that were copied.
        """
        if tree_nodes[0].type in ('error_leaf', 'error_node'):
            # Avoid copying errors in the beginning. Can lead to a lot of
            # issues.
            return []

        indentation = _get_indentation(tree_nodes[0])
        # Remember state so everything can be rolled back if nothing is copied.
        old_working_stack = list(self._working_stack)
        old_prefix = self.prefix
        old_indents = self.indents
        self.indents = [i for i in self.indents if i <= indentation]

        self._update_insertion_node(indentation)

        new_nodes, self._working_stack, self.prefix, added_indents = self._copy_nodes(
            list(self._working_stack),
            tree_nodes,
            until_line,
            line_offset,
            self.prefix,
        )
        if new_nodes:
            self.indents += added_indents
        else:
            # Nothing was copied: restore the previous state completely.
            self._working_stack = old_working_stack
            self.prefix = old_prefix
            self.indents = old_indents
        return new_nodes

    def _copy_nodes(self, working_stack, nodes, until_line, line_offset,
                    prefix='', is_nested=False):
        """Recursive worker for copy_nodes; returns
        (copied_nodes, working_stack, prefix, added_indents)."""
        new_nodes = []
        added_indents = []

        nodes = list(self._get_matching_indent_nodes(
            nodes,
            is_new_suite=is_nested,
        ))

        new_prefix = ''
        for node in nodes:
            if node.start_pos[0] > until_line:
                break

            if node.type == 'endmarker':
                break

            if node.type == 'error_leaf' and node.token_type in ('DEDENT', 'ERROR_DEDENT'):
                break
            # TODO this check might take a bit of time for large files. We
            # might want to change this to do more intelligent guessing or
            # binary search.
            if _get_last_line(node) > until_line:
                # We can split up functions and classes later.
                if _func_or_class_has_suite(node):
                    new_nodes.append(node)
                break
            try:
                c = node.children
            except AttributeError:
                pass
            else:
                # This case basically appears with error recovery of one line
                # suites like `def foo(): bar.-`. In this case we might not
                # include a newline in the statement and we need to take care
                # of that.
                n = node
                if n.type == 'decorated':
                    n = n.children[-1]
                if n.type in ('async_funcdef', 'async_stmt'):
                    n = n.children[-1]
                if n.type in ('classdef', 'funcdef'):
                    suite_node = n.children[-1]
                else:
                    suite_node = c[-1]

                if suite_node.type in ('error_leaf', 'error_node'):
                    break

            new_nodes.append(node)

        # Pop error nodes at the end from the list
        if new_nodes:
            while new_nodes:
                last_node = new_nodes[-1]
                if (last_node.type in ('error_leaf', 'error_node')
                        or _is_flow_node(new_nodes[-1])):
                    # Error leafs/nodes don't have a defined start/end. Error
                    # nodes might not end with a newline (e.g. if there's an
                    # open `(`). Therefore ignore all of them unless they are
                    # succeeded with valid parser state.
                    # If we copy flows at the end, they might be continued
                    # after the copy limit (in the new parser).
                    # In this while loop we try to remove until we find a newline.
                    new_prefix = ''
                    new_nodes.pop()
                    while new_nodes:
                        last_node = new_nodes[-1]
                        if last_node.get_last_leaf().type == 'newline':
                            break
                        new_nodes.pop()
                    continue
                if len(new_nodes) > 1 and new_nodes[-2].type == 'error_node':
                    # The problem here is that Parso error recovery sometimes
                    # influences nodes before this node.
                    # Since the new last node is an error node this will get
                    # cleaned up in the next while iteration.
                    new_nodes.pop()
                    continue
                break

        if not new_nodes:
            return [], working_stack, prefix, added_indents

        tos = working_stack[-1]
        last_node = new_nodes[-1]
        had_valid_suite_last = False
        # Pop incomplete suites from the list
        if _func_or_class_has_suite(last_node):
            suite = last_node
            while suite.type != 'suite':
                suite = suite.children[-1]

            indent = _get_suite_indentation(suite)
            added_indents.append(indent)

            suite_tos = _NodesTreeNode(suite, indentation=_get_indentation(last_node))
            # Don't need to pass line_offset here, it's already done by the
            # parent.
            suite_nodes, new_working_stack, new_prefix, ai = self._copy_nodes(
                working_stack + [suite_tos], suite.children, until_line, line_offset,
                is_nested=True,
            )
            added_indents += ai
            if len(suite_nodes) < 2:
                # A suite only with newline is not valid.
                new_nodes.pop()
                new_prefix = ''
            else:
                assert new_nodes
                tos.add_child_node(suite_tos)
                working_stack = new_working_stack
                had_valid_suite_last = True

        if new_nodes:
            if not _ends_with_newline(new_nodes[-1].get_last_leaf()) and not had_valid_suite_last:
                p = new_nodes[-1].get_next_leaf().prefix
                # We are not allowed to remove the newline at the end of the
                # line, otherwise it's going to be missing. This happens e.g.
                # if a bracket is around before that moves newlines to
                # prefixes.
                new_prefix = split_lines(p, keepends=True)[0]

            if had_valid_suite_last:
                last = new_nodes[-1]
                if last.type == 'decorated':
                    last = last.children[-1]
                if last.type in ('async_funcdef', 'async_stmt'):
                    last = last.children[-1]
                last_line_offset_leaf = last.children[-2].get_last_leaf()
                assert last_line_offset_leaf == ':'
            else:
                last_line_offset_leaf = new_nodes[-1].get_last_leaf()
            tos.add_tree_nodes(
                prefix, new_nodes, line_offset, last_line_offset_leaf,
            )
            prefix = new_prefix
            self._prefix_remainder = ''

        return new_nodes, working_stack, prefix, added_indents

    def close(self):
        """Finalize the tree: write children back and append an endmarker."""
        self._base_node.finish()

        # Add an endmarker.
        try:
            last_leaf = self._module.get_last_leaf()
        except IndexError:
            # Completely empty module.
            end_pos = [1, 0]
        else:
            last_leaf = _skip_dedent_error_leaves(last_leaf)
            end_pos = list(last_leaf.end_pos)
        lines = split_lines(self.prefix)
        assert len(lines) > 0
        if len(lines) == 1:
            if lines[0].startswith(BOM_UTF8_STRING) and end_pos == [1, 0]:
                # The BOM does not count as a column.
                end_pos[1] -= 1
            end_pos[1] += len(lines[0])
        else:
            end_pos[0] += len(lines) - 1
            end_pos[1] = len(lines[-1])

        endmarker = EndMarker('', tuple(end_pos), self.prefix + self._prefix_remainder)
        endmarker.parent = self._module
        self._module.children.append(endmarker)