]> crepu.dev Git - config.git/blame_incremental - djavu-asus/elpy/rpc-venv/lib/python3.11/site-packages/parso/python/diff.py
ActualizaciĆ³n de Readme
[config.git] / djavu-asus / elpy / rpc-venv / lib / python3.11 / site-packages / parso / python / diff.py
... / ...
CommitLineData
1"""
2The diff parser is trying to be a faster version of the normal parser by trying
3to reuse the nodes of a previous pass over the same file. This is also called
4incremental parsing in parser literature. The difference is mostly that with
5incremental parsing you get a range that needs to be reparsed. Here we
6calculate that range ourselves by using difflib. After that it's essentially
7incremental parsing.
8
9The biggest issue of this approach is that we reuse nodes in a mutable way. The
10intial design and idea is quite problematic for this parser, but it is also
11pretty fast. Measurements showed that just copying nodes in Python is simply
12quite a bit slower (especially for big files >3 kLOC). Therefore we did not
13want to get rid of the mutable nodes, since this is usually not an issue.
14
15This is by far the hardest software I ever wrote, exactly because the initial
16design is crappy. When you have to account for a lot of mutable state, it
17creates a ton of issues that you would otherwise not have. This file took
18probably 3-6 months to write, which is insane for a parser.
19
20There is a fuzzer in that helps test this whole thing. Please use it if you
21make changes here. If you run the fuzzer like::
22
23 test/fuzz_diff_parser.py random -n 100000
24
25you can be pretty sure that everything is still fine. I sometimes run the
26fuzzer up to 24h to make sure everything is still ok.
27"""
28import re
29import difflib
30from collections import namedtuple
31import logging
32
33from parso.utils import split_lines
34from parso.python.parser import Parser
35from parso.python.tree import EndMarker
36from parso.python.tokenize import PythonToken, BOM_UTF8_STRING
37from parso.python.token import PythonTokenTypes
38
39LOG = logging.getLogger(__name__)
40DEBUG_DIFF_PARSER = False
41
42_INDENTATION_TOKENS = 'INDENT', 'ERROR_DEDENT', 'DEDENT'
43
44NEWLINE = PythonTokenTypes.NEWLINE
45DEDENT = PythonTokenTypes.DEDENT
46NAME = PythonTokenTypes.NAME
47ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
48ENDMARKER = PythonTokenTypes.ENDMARKER
49
50
51def _is_indentation_error_leaf(node):
52 return node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS
53
54
55def _get_previous_leaf_if_indentation(leaf):
56 while leaf and _is_indentation_error_leaf(leaf):
57 leaf = leaf.get_previous_leaf()
58 return leaf
59
60
61def _get_next_leaf_if_indentation(leaf):
62 while leaf and _is_indentation_error_leaf(leaf):
63 leaf = leaf.get_next_leaf()
64 return leaf
65
66
67def _get_suite_indentation(tree_node):
68 return _get_indentation(tree_node.children[1])
69
70
71def _get_indentation(tree_node):
72 return tree_node.start_pos[1]
73
74
75def _assert_valid_graph(node):
76 """
77 Checks if the parent/children relationship is correct.
78
79 This is a check that only runs during debugging/testing.
80 """
81 try:
82 children = node.children
83 except AttributeError:
84 # Ignore INDENT is necessary, because indent/dedent tokens don't
85 # contain value/prefix and are just around, because of the tokenizer.
86 if node.type == 'error_leaf' and node.token_type in _INDENTATION_TOKENS:
87 assert not node.value
88 assert not node.prefix
89 return
90
91 # Calculate the content between two start positions.
92 previous_leaf = _get_previous_leaf_if_indentation(node.get_previous_leaf())
93 if previous_leaf is None:
94 content = node.prefix
95 previous_start_pos = 1, 0
96 else:
97 assert previous_leaf.end_pos <= node.start_pos, \
98 (previous_leaf, node)
99
100 content = previous_leaf.value + node.prefix
101 previous_start_pos = previous_leaf.start_pos
102
103 if '\n' in content or '\r' in content:
104 splitted = split_lines(content)
105 line = previous_start_pos[0] + len(splitted) - 1
106 actual = line, len(splitted[-1])
107 else:
108 actual = previous_start_pos[0], previous_start_pos[1] + len(content)
109 if content.startswith(BOM_UTF8_STRING) \
110 and node.get_start_pos_of_prefix() == (1, 0):
111 # Remove the byte order mark
112 actual = actual[0], actual[1] - 1
113
114 assert node.start_pos == actual, (node.start_pos, actual)
115 else:
116 for child in children:
117 assert child.parent == node, (node, child)
118 _assert_valid_graph(child)
119
120
121def _assert_nodes_are_equal(node1, node2):
122 try:
123 children1 = node1.children
124 except AttributeError:
125 assert not hasattr(node2, 'children'), (node1, node2)
126 assert node1.value == node2.value, (node1, node2)
127 assert node1.type == node2.type, (node1, node2)
128 assert node1.prefix == node2.prefix, (node1, node2)
129 assert node1.start_pos == node2.start_pos, (node1, node2)
130 return
131 else:
132 try:
133 children2 = node2.children
134 except AttributeError:
135 assert False, (node1, node2)
136 for n1, n2 in zip(children1, children2):
137 _assert_nodes_are_equal(n1, n2)
138 assert len(children1) == len(children2), '\n' + repr(children1) + '\n' + repr(children2)
139
140
141def _get_debug_error_message(module, old_lines, new_lines):
142 current_lines = split_lines(module.get_code(), keepends=True)
143 current_diff = difflib.unified_diff(new_lines, current_lines)
144 old_new_diff = difflib.unified_diff(old_lines, new_lines)
145 import parso
146 return (
147 "There's an issue with the diff parser. Please "
148 "report (parso v%s) - Old/New:\n%s\nActual Diff (May be empty):\n%s"
149 % (parso.__version__, ''.join(old_new_diff), ''.join(current_diff))
150 )
151
152
153def _get_last_line(node_or_leaf):
154 last_leaf = node_or_leaf.get_last_leaf()
155 if _ends_with_newline(last_leaf):
156 return last_leaf.start_pos[0]
157 else:
158 n = last_leaf.get_next_leaf()
159 if n.type == 'endmarker' and '\n' in n.prefix:
160 # This is a very special case and has to do with error recovery in
161 # Parso. The problem is basically that there's no newline leaf at
162 # the end sometimes (it's required in the grammar, but not needed
163 # actually before endmarker, CPython just adds a newline to make
164 # source code pass the parser, to account for that Parso error
165 # recovery allows small_stmt instead of simple_stmt).
166 return last_leaf.end_pos[0] + 1
167 return last_leaf.end_pos[0]
168
169
170def _skip_dedent_error_leaves(leaf):
171 while leaf is not None and leaf.type == 'error_leaf' and leaf.token_type == 'DEDENT':
172 leaf = leaf.get_previous_leaf()
173 return leaf
174
175
176def _ends_with_newline(leaf, suffix=''):
177 leaf = _skip_dedent_error_leaves(leaf)
178
179 if leaf.type == 'error_leaf':
180 typ = leaf.token_type.lower()
181 else:
182 typ = leaf.type
183
184 return typ == 'newline' or suffix.endswith('\n') or suffix.endswith('\r')
185
186
187def _flows_finished(pgen_grammar, stack):
188 """
189 if, while, for and try might not be finished, because another part might
190 still be parsed.
191 """
192 for stack_node in stack:
193 if stack_node.nonterminal in ('if_stmt', 'while_stmt', 'for_stmt', 'try_stmt'):
194 return False
195 return True
196
197
198def _func_or_class_has_suite(node):
199 if node.type == 'decorated':
200 node = node.children[-1]
201 if node.type in ('async_funcdef', 'async_stmt'):
202 node = node.children[-1]
203 return node.type in ('classdef', 'funcdef') and node.children[-1].type == 'suite'
204
205
206def _suite_or_file_input_is_valid(pgen_grammar, stack):
207 if not _flows_finished(pgen_grammar, stack):
208 return False
209
210 for stack_node in reversed(stack):
211 if stack_node.nonterminal == 'decorator':
212 # A decorator is only valid with the upcoming function.
213 return False
214
215 if stack_node.nonterminal == 'suite':
216 # If only newline is in the suite, the suite is not valid, yet.
217 return len(stack_node.nodes) > 1
218 # Not reaching a suite means that we're dealing with file_input levels
219 # where there's no need for a valid statement in it. It can also be empty.
220 return True
221
222
223def _is_flow_node(node):
224 if node.type == 'async_stmt':
225 node = node.children[1]
226 try:
227 value = node.children[0].value
228 except AttributeError:
229 return False
230 return value in ('if', 'for', 'while', 'try', 'with')
231
232
233class _PositionUpdatingFinished(Exception):
234 pass
235
236
237def _update_positions(nodes, line_offset, last_leaf):
238 for node in nodes:
239 try:
240 children = node.children
241 except AttributeError:
242 # Is a leaf
243 node.line += line_offset
244 if node is last_leaf:
245 raise _PositionUpdatingFinished
246 else:
247 _update_positions(children, line_offset, last_leaf)
248
249
250class DiffParser:
251 """
252 An advanced form of parsing a file faster. Unfortunately comes with huge
253 side effects. It changes the given module.
254 """
255 def __init__(self, pgen_grammar, tokenizer, module):
256 self._pgen_grammar = pgen_grammar
257 self._tokenizer = tokenizer
258 self._module = module
259
260 def _reset(self):
261 self._copy_count = 0
262 self._parser_count = 0
263
264 self._nodes_tree = _NodesTree(self._module)
265
266 def update(self, old_lines, new_lines):
267 '''
268 The algorithm works as follows:
269
270 Equal:
271 - Assure that the start is a newline, otherwise parse until we get
272 one.
273 - Copy from parsed_until_line + 1 to max(i2 + 1)
274 - Make sure that the indentation is correct (e.g. add DEDENT)
275 - Add old and change positions
276 Insert:
277 - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
278 much more.
279
280 Returns the new module node.
281 '''
282 LOG.debug('diff parser start')
283 # Reset the used names cache so they get regenerated.
284 self._module._used_names = None
285
286 self._parser_lines_new = new_lines
287
288 self._reset()
289
290 line_length = len(new_lines)
291 sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
292 opcodes = sm.get_opcodes()
293 LOG.debug('line_lengths old: %s; new: %s' % (len(old_lines), line_length))
294
295 for operation, i1, i2, j1, j2 in opcodes:
296 LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
297 operation, i1 + 1, i2, j1 + 1, j2)
298
299 if j2 == line_length and new_lines[-1] == '':
300 # The empty part after the last newline is not relevant.
301 j2 -= 1
302
303 if operation == 'equal':
304 line_offset = j1 - i1
305 self._copy_from_old_parser(line_offset, i1 + 1, i2, j2)
306 elif operation == 'replace':
307 self._parse(until_line=j2)
308 elif operation == 'insert':
309 self._parse(until_line=j2)
310 else:
311 assert operation == 'delete'
312
313 # With this action all change will finally be applied and we have a
314 # changed module.
315 self._nodes_tree.close()
316
317 if DEBUG_DIFF_PARSER:
318 # If there is reasonable suspicion that the diff parser is not
319 # behaving well, this should be enabled.
320 try:
321 code = ''.join(new_lines)
322 assert self._module.get_code() == code
323 _assert_valid_graph(self._module)
324 without_diff_parser_module = Parser(
325 self._pgen_grammar,
326 error_recovery=True
327 ).parse(self._tokenizer(new_lines))
328 _assert_nodes_are_equal(self._module, without_diff_parser_module)
329 except AssertionError:
330 print(_get_debug_error_message(self._module, old_lines, new_lines))
331 raise
332
333 last_pos = self._module.end_pos[0]
334 if last_pos != line_length:
335 raise Exception(
336 ('(%s != %s) ' % (last_pos, line_length))
337 + _get_debug_error_message(self._module, old_lines, new_lines)
338 )
339 LOG.debug('diff parser end')
340 return self._module
341
342 def _enabled_debugging(self, old_lines, lines_new):
343 if self._module.get_code() != ''.join(lines_new):
344 LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines), ''.join(lines_new))
345
346 def _copy_from_old_parser(self, line_offset, start_line_old, until_line_old, until_line_new):
347 last_until_line = -1
348 while until_line_new > self._nodes_tree.parsed_until_line:
349 parsed_until_line_old = self._nodes_tree.parsed_until_line - line_offset
350 line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
351 if line_stmt is None:
352 # Parse 1 line at least. We don't need more, because we just
353 # want to get into a state where the old parser has statements
354 # again that can be copied (e.g. not lines within parentheses).
355 self._parse(self._nodes_tree.parsed_until_line + 1)
356 else:
357 p_children = line_stmt.parent.children
358 index = p_children.index(line_stmt)
359
360 if start_line_old == 1 \
361 and p_children[0].get_first_leaf().prefix.startswith(BOM_UTF8_STRING):
362 # If there's a BOM in the beginning, just reparse. It's too
363 # complicated to account for it otherwise.
364 copied_nodes = []
365 else:
366 from_ = self._nodes_tree.parsed_until_line + 1
367 copied_nodes = self._nodes_tree.copy_nodes(
368 p_children[index:],
369 until_line_old,
370 line_offset
371 )
372 # Match all the nodes that are in the wanted range.
373 if copied_nodes:
374 self._copy_count += 1
375
376 to = self._nodes_tree.parsed_until_line
377
378 LOG.debug('copy old[%s:%s] new[%s:%s]',
379 copied_nodes[0].start_pos[0],
380 copied_nodes[-1].end_pos[0] - 1, from_, to)
381 else:
382 # We have copied as much as possible (but definitely not too
383 # much). Therefore we just parse a bit more.
384 self._parse(self._nodes_tree.parsed_until_line + 1)
385 # Since there are potential bugs that might loop here endlessly, we
386 # just stop here.
387 assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
388 last_until_line = self._nodes_tree.parsed_until_line
389
390 def _get_old_line_stmt(self, old_line):
391 leaf = self._module.get_leaf_for_position((old_line, 0), include_prefixes=True)
392
393 if _ends_with_newline(leaf):
394 leaf = leaf.get_next_leaf()
395 if leaf.get_start_pos_of_prefix()[0] == old_line:
396 node = leaf
397 while node.parent.type not in ('file_input', 'suite'):
398 node = node.parent
399
400 # Make sure that if only the `else:` line of an if statement is
401 # copied that not the whole thing is going to be copied.
402 if node.start_pos[0] >= old_line:
403 return node
404 # Must be on the same line. Otherwise we need to parse that bit.
405 return None
406
407 def _parse(self, until_line):
408 """
409 Parses at least until the given line, but might just parse more until a
410 valid state is reached.
411 """
412 last_until_line = 0
413 while until_line > self._nodes_tree.parsed_until_line:
414 node = self._try_parse_part(until_line)
415 nodes = node.children
416
417 self._nodes_tree.add_parsed_nodes(nodes, self._keyword_token_indents)
418 if self._replace_tos_indent is not None:
419 self._nodes_tree.indents[-1] = self._replace_tos_indent
420
421 LOG.debug(
422 'parse_part from %s to %s (to %s in part parser)',
423 nodes[0].get_start_pos_of_prefix()[0],
424 self._nodes_tree.parsed_until_line,
425 node.end_pos[0] - 1
426 )
427 # Since the tokenizer sometimes has bugs, we cannot be sure that
428 # this loop terminates. Therefore assert that there's always a
429 # change.
430 assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
431 last_until_line = self._nodes_tree.parsed_until_line
432
433 def _try_parse_part(self, until_line):
434 """
435 Sets up a normal parser that uses a spezialized tokenizer to only parse
436 until a certain position (or a bit longer if the statement hasn't
437 ended.
438 """
439 self._parser_count += 1
440 # TODO speed up, shouldn't copy the whole list all the time.
441 # memoryview?
442 parsed_until_line = self._nodes_tree.parsed_until_line
443 lines_after = self._parser_lines_new[parsed_until_line:]
444 tokens = self._diff_tokenize(
445 lines_after,
446 until_line,
447 line_offset=parsed_until_line
448 )
449 self._active_parser = Parser(
450 self._pgen_grammar,
451 error_recovery=True
452 )
453 return self._active_parser.parse(tokens=tokens)
454
455 def _diff_tokenize(self, lines, until_line, line_offset=0):
456 was_newline = False
457 indents = self._nodes_tree.indents
458 initial_indentation_count = len(indents)
459
460 tokens = self._tokenizer(
461 lines,
462 start_pos=(line_offset + 1, 0),
463 indents=indents,
464 is_first_token=line_offset == 0,
465 )
466 stack = self._active_parser.stack
467 self._replace_tos_indent = None
468 self._keyword_token_indents = {}
469 # print('start', line_offset + 1, indents)
470 for token in tokens:
471 # print(token, indents)
472 typ = token.type
473 if typ == DEDENT:
474 if len(indents) < initial_indentation_count:
475 # We are done here, only thing that can come now is an
476 # endmarker or another dedented code block.
477 while True:
478 typ, string, start_pos, prefix = token = next(tokens)
479 if typ in (DEDENT, ERROR_DEDENT):
480 if typ == ERROR_DEDENT:
481 # We want to force an error dedent in the next
482 # parser/pass. To make this possible we just
483 # increase the location by one.
484 self._replace_tos_indent = start_pos[1] + 1
485 pass
486 else:
487 break
488
489 if '\n' in prefix or '\r' in prefix:
490 prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
491 else:
492 assert start_pos[1] >= len(prefix), repr(prefix)
493 if start_pos[1] - len(prefix) == 0:
494 prefix = ''
495 yield PythonToken(
496 ENDMARKER, '',
497 start_pos,
498 prefix
499 )
500 break
501 elif typ == NEWLINE and token.start_pos[0] >= until_line:
502 was_newline = True
503 elif was_newline:
504 was_newline = False
505 if len(indents) == initial_indentation_count:
506 # Check if the parser is actually in a valid suite state.
507 if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
508 yield PythonToken(ENDMARKER, '', token.start_pos, '')
509 break
510
511 if typ == NAME and token.string in ('class', 'def'):
512 self._keyword_token_indents[token.start_pos] = list(indents)
513
514 yield token
515
516
517class _NodesTreeNode:
518 _ChildrenGroup = namedtuple(
519 '_ChildrenGroup',
520 'prefix children line_offset last_line_offset_leaf')
521
522 def __init__(self, tree_node, parent=None, indentation=0):
523 self.tree_node = tree_node
524 self._children_groups = []
525 self.parent = parent
526 self._node_children = []
527 self.indentation = indentation
528
529 def finish(self):
530 children = []
531 for prefix, children_part, line_offset, last_line_offset_leaf in self._children_groups:
532 first_leaf = _get_next_leaf_if_indentation(
533 children_part[0].get_first_leaf()
534 )
535
536 first_leaf.prefix = prefix + first_leaf.prefix
537 if line_offset != 0:
538 try:
539 _update_positions(
540 children_part, line_offset, last_line_offset_leaf)
541 except _PositionUpdatingFinished:
542 pass
543 children += children_part
544 self.tree_node.children = children
545 # Reset the parents
546 for node in children:
547 node.parent = self.tree_node
548
549 for node_child in self._node_children:
550 node_child.finish()
551
552 def add_child_node(self, child_node):
553 self._node_children.append(child_node)
554
555 def add_tree_nodes(self, prefix, children, line_offset=0,
556 last_line_offset_leaf=None):
557 if last_line_offset_leaf is None:
558 last_line_offset_leaf = children[-1].get_last_leaf()
559 group = self._ChildrenGroup(
560 prefix, children, line_offset, last_line_offset_leaf
561 )
562 self._children_groups.append(group)
563
564 def get_last_line(self, suffix):
565 line = 0
566 if self._children_groups:
567 children_group = self._children_groups[-1]
568 last_leaf = _get_previous_leaf_if_indentation(
569 children_group.last_line_offset_leaf
570 )
571
572 line = last_leaf.end_pos[0] + children_group.line_offset
573
574 # Newlines end on the next line, which means that they would cover
575 # the next line. That line is not fully parsed at this point.
576 if _ends_with_newline(last_leaf, suffix):
577 line -= 1
578 line += len(split_lines(suffix)) - 1
579
580 if suffix and not suffix.endswith('\n') and not suffix.endswith('\r'):
581 # This is the end of a file (that doesn't end with a newline).
582 line += 1
583
584 if self._node_children:
585 return max(line, self._node_children[-1].get_last_line(suffix))
586 return line
587
588 def __repr__(self):
589 return '<%s: %s>' % (self.__class__.__name__, self.tree_node)
590
591
592class _NodesTree:
593 def __init__(self, module):
594 self._base_node = _NodesTreeNode(module)
595 self._working_stack = [self._base_node]
596 self._module = module
597 self._prefix_remainder = ''
598 self.prefix = ''
599 self.indents = [0]
600
601 @property
602 def parsed_until_line(self):
603 return self._working_stack[-1].get_last_line(self.prefix)
604
605 def _update_insertion_node(self, indentation):
606 for node in reversed(list(self._working_stack)):
607 if node.indentation < indentation or node is self._working_stack[0]:
608 return node
609 self._working_stack.pop()
610
611 def add_parsed_nodes(self, tree_nodes, keyword_token_indents):
612 old_prefix = self.prefix
613 tree_nodes = self._remove_endmarker(tree_nodes)
614 if not tree_nodes:
615 self.prefix = old_prefix + self.prefix
616 return
617
618 assert tree_nodes[0].type != 'newline'
619
620 node = self._update_insertion_node(tree_nodes[0].start_pos[1])
621 assert node.tree_node.type in ('suite', 'file_input')
622 node.add_tree_nodes(old_prefix, tree_nodes)
623 # tos = Top of stack
624 self._update_parsed_node_tos(tree_nodes[-1], keyword_token_indents)
625
626 def _update_parsed_node_tos(self, tree_node, keyword_token_indents):
627 if tree_node.type == 'suite':
628 def_leaf = tree_node.parent.children[0]
629 new_tos = _NodesTreeNode(
630 tree_node,
631 indentation=keyword_token_indents[def_leaf.start_pos][-1],
632 )
633 new_tos.add_tree_nodes('', list(tree_node.children))
634
635 self._working_stack[-1].add_child_node(new_tos)
636 self._working_stack.append(new_tos)
637
638 self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)
639 elif _func_or_class_has_suite(tree_node):
640 self._update_parsed_node_tos(tree_node.children[-1], keyword_token_indents)
641
642 def _remove_endmarker(self, tree_nodes):
643 """
644 Helps cleaning up the tree nodes that get inserted.
645 """
646 last_leaf = tree_nodes[-1].get_last_leaf()
647 is_endmarker = last_leaf.type == 'endmarker'
648 self._prefix_remainder = ''
649 if is_endmarker:
650 prefix = last_leaf.prefix
651 separation = max(prefix.rfind('\n'), prefix.rfind('\r'))
652 if separation > -1:
653 # Remove the whitespace part of the prefix after a newline.
654 # That is not relevant if parentheses were opened. Always parse
655 # until the end of a line.
656 last_leaf.prefix, self._prefix_remainder = \
657 last_leaf.prefix[:separation + 1], last_leaf.prefix[separation + 1:]
658
659 self.prefix = ''
660
661 if is_endmarker:
662 self.prefix = last_leaf.prefix
663
664 tree_nodes = tree_nodes[:-1]
665 return tree_nodes
666
667 def _get_matching_indent_nodes(self, tree_nodes, is_new_suite):
668 # There might be a random dedent where we have to stop copying.
669 # Invalid indents are ok, because the parser handled that
670 # properly before. An invalid dedent can happen, because a few
671 # lines above there was an invalid indent.
672 node_iterator = iter(tree_nodes)
673 if is_new_suite:
674 yield next(node_iterator)
675
676 first_node = next(node_iterator)
677 indent = _get_indentation(first_node)
678 if not is_new_suite and indent not in self.indents:
679 return
680 yield first_node
681
682 for n in node_iterator:
683 if _get_indentation(n) != indent:
684 return
685 yield n
686
687 def copy_nodes(self, tree_nodes, until_line, line_offset):
688 """
689 Copies tree nodes from the old parser tree.
690
691 Returns the number of tree nodes that were copied.
692 """
693 if tree_nodes[0].type in ('error_leaf', 'error_node'):
694 # Avoid copying errors in the beginning. Can lead to a lot of
695 # issues.
696 return []
697
698 indentation = _get_indentation(tree_nodes[0])
699 old_working_stack = list(self._working_stack)
700 old_prefix = self.prefix
701 old_indents = self.indents
702 self.indents = [i for i in self.indents if i <= indentation]
703
704 self._update_insertion_node(indentation)
705
706 new_nodes, self._working_stack, self.prefix, added_indents = self._copy_nodes(
707 list(self._working_stack),
708 tree_nodes,
709 until_line,
710 line_offset,
711 self.prefix,
712 )
713 if new_nodes:
714 self.indents += added_indents
715 else:
716 self._working_stack = old_working_stack
717 self.prefix = old_prefix
718 self.indents = old_indents
719 return new_nodes
720
721 def _copy_nodes(self, working_stack, nodes, until_line, line_offset,
722 prefix='', is_nested=False):
723 new_nodes = []
724 added_indents = []
725
726 nodes = list(self._get_matching_indent_nodes(
727 nodes,
728 is_new_suite=is_nested,
729 ))
730
731 new_prefix = ''
732 for node in nodes:
733 if node.start_pos[0] > until_line:
734 break
735
736 if node.type == 'endmarker':
737 break
738
739 if node.type == 'error_leaf' and node.token_type in ('DEDENT', 'ERROR_DEDENT'):
740 break
741 # TODO this check might take a bit of time for large files. We
742 # might want to change this to do more intelligent guessing or
743 # binary search.
744 if _get_last_line(node) > until_line:
745 # We can split up functions and classes later.
746 if _func_or_class_has_suite(node):
747 new_nodes.append(node)
748 break
749 try:
750 c = node.children
751 except AttributeError:
752 pass
753 else:
754 # This case basically appears with error recovery of one line
755 # suites like `def foo(): bar.-`. In this case we might not
756 # include a newline in the statement and we need to take care
757 # of that.
758 n = node
759 if n.type == 'decorated':
760 n = n.children[-1]
761 if n.type in ('async_funcdef', 'async_stmt'):
762 n = n.children[-1]
763 if n.type in ('classdef', 'funcdef'):
764 suite_node = n.children[-1]
765 else:
766 suite_node = c[-1]
767
768 if suite_node.type in ('error_leaf', 'error_node'):
769 break
770
771 new_nodes.append(node)
772
773 # Pop error nodes at the end from the list
774 if new_nodes:
775 while new_nodes:
776 last_node = new_nodes[-1]
777 if (last_node.type in ('error_leaf', 'error_node')
778 or _is_flow_node(new_nodes[-1])):
779 # Error leafs/nodes don't have a defined start/end. Error
780 # nodes might not end with a newline (e.g. if there's an
781 # open `(`). Therefore ignore all of them unless they are
782 # succeeded with valid parser state.
783 # If we copy flows at the end, they might be continued
784 # after the copy limit (in the new parser).
785 # In this while loop we try to remove until we find a newline.
786 new_prefix = ''
787 new_nodes.pop()
788 while new_nodes:
789 last_node = new_nodes[-1]
790 if last_node.get_last_leaf().type == 'newline':
791 break
792 new_nodes.pop()
793 continue
794 if len(new_nodes) > 1 and new_nodes[-2].type == 'error_node':
795 # The problem here is that Parso error recovery sometimes
796 # influences nodes before this node.
797 # Since the new last node is an error node this will get
798 # cleaned up in the next while iteration.
799 new_nodes.pop()
800 continue
801 break
802
803 if not new_nodes:
804 return [], working_stack, prefix, added_indents
805
806 tos = working_stack[-1]
807 last_node = new_nodes[-1]
808 had_valid_suite_last = False
809 # Pop incomplete suites from the list
810 if _func_or_class_has_suite(last_node):
811 suite = last_node
812 while suite.type != 'suite':
813 suite = suite.children[-1]
814
815 indent = _get_suite_indentation(suite)
816 added_indents.append(indent)
817
818 suite_tos = _NodesTreeNode(suite, indentation=_get_indentation(last_node))
819 # Don't need to pass line_offset here, it's already done by the
820 # parent.
821 suite_nodes, new_working_stack, new_prefix, ai = self._copy_nodes(
822 working_stack + [suite_tos], suite.children, until_line, line_offset,
823 is_nested=True,
824 )
825 added_indents += ai
826 if len(suite_nodes) < 2:
827 # A suite only with newline is not valid.
828 new_nodes.pop()
829 new_prefix = ''
830 else:
831 assert new_nodes
832 tos.add_child_node(suite_tos)
833 working_stack = new_working_stack
834 had_valid_suite_last = True
835
836 if new_nodes:
837 if not _ends_with_newline(new_nodes[-1].get_last_leaf()) and not had_valid_suite_last:
838 p = new_nodes[-1].get_next_leaf().prefix
839 # We are not allowed to remove the newline at the end of the
840 # line, otherwise it's going to be missing. This happens e.g.
841 # if a bracket is around before that moves newlines to
842 # prefixes.
843 new_prefix = split_lines(p, keepends=True)[0]
844
845 if had_valid_suite_last:
846 last = new_nodes[-1]
847 if last.type == 'decorated':
848 last = last.children[-1]
849 if last.type in ('async_funcdef', 'async_stmt'):
850 last = last.children[-1]
851 last_line_offset_leaf = last.children[-2].get_last_leaf()
852 assert last_line_offset_leaf == ':'
853 else:
854 last_line_offset_leaf = new_nodes[-1].get_last_leaf()
855 tos.add_tree_nodes(
856 prefix, new_nodes, line_offset, last_line_offset_leaf,
857 )
858 prefix = new_prefix
859 self._prefix_remainder = ''
860
861 return new_nodes, working_stack, prefix, added_indents
862
863 def close(self):
864 self._base_node.finish()
865
866 # Add an endmarker.
867 try:
868 last_leaf = self._module.get_last_leaf()
869 except IndexError:
870 end_pos = [1, 0]
871 else:
872 last_leaf = _skip_dedent_error_leaves(last_leaf)
873 end_pos = list(last_leaf.end_pos)
874 lines = split_lines(self.prefix)
875 assert len(lines) > 0
876 if len(lines) == 1:
877 if lines[0].startswith(BOM_UTF8_STRING) and end_pos == [1, 0]:
878 end_pos[1] -= 1
879 end_pos[1] += len(lines[0])
880 else:
881 end_pos[0] += len(lines) - 1
882 end_pos[1] = len(lines[-1])
883
884 endmarker = EndMarker('', tuple(end_pos), self.prefix + self._prefix_remainder)
885 endmarker.parent = self._module
886 self._module.children.append(endmarker)