]>
Commit | Line | Data |
---|---|---|
1 | """ | |
2 | Parse Python code and perform AST validation. | |
3 | """ | |
4 | import ast | |
5 | import sys | |
6 | from typing import Final, Iterable, Iterator, List, Set, Tuple | |
7 | ||
8 | from black.mode import VERSION_TO_FEATURES, Feature, TargetVersion, supports_feature | |
9 | from black.nodes import syms | |
10 | from blib2to3 import pygram | |
11 | from blib2to3.pgen2 import driver | |
12 | from blib2to3.pgen2.grammar import Grammar | |
13 | from blib2to3.pgen2.parse import ParseError | |
14 | from blib2to3.pgen2.tokenize import TokenError | |
15 | from blib2to3.pytree import Leaf, Node | |
16 | ||
# Appended to parse errors when the source still parses under a Python 2
# grammar, so users learn why black rejects their file.
PY2_HINT: Final = "Python 2 support was removed in version 22.0."
18 | ||
19 | ||
class InvalidInput(ValueError):
    """Raised when input source code fails all parse attempts.

    Instances carry a "Cannot parse: <line>:<col>: ..." message pointing at
    the offending location reported by the last grammar tried.
    """
22 | ||
23 | ||
def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]:
    """Return the blib2to3 grammars to attempt for the given target versions.

    An empty ``target_versions`` means "unknown", in which case every grammar
    is tried in order.  Otherwise only the grammars that can represent all of
    the requested versions are returned.
    """
    if not target_versions:
        # No target_version specified, so try all grammars.
        return [
            # Python 3.7-3.9
            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords,
            # Python 3.0-3.6
            pygram.python_grammar_no_print_statement_no_exec_statement,
            # Python 3.10+
            pygram.python_grammar_soft_keywords,
        ]

    selected: List[Grammar] = []
    async_is_identifier = supports_feature(target_versions, Feature.ASYNC_IDENTIFIERS)
    # If we have to parse both ways, try to parse async as a keyword first.
    if not async_is_identifier and not supports_feature(
        target_versions, Feature.PATTERN_MATCHING
    ):
        # Python 3.7-3.9
        selected.append(
            pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords
        )
    if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS):
        # Python 3.0-3.6
        selected.append(pygram.python_grammar_no_print_statement_no_exec_statement)
    if any(Feature.PATTERN_MATCHING in VERSION_TO_FEATURES[v] for v in target_versions):
        # Python 3.10+
        selected.append(pygram.python_grammar_soft_keywords)

    # At least one of the branches above must have been taken, because every
    # Python version has exactly one of the two 'ASYNC_*' flags.
    return selected
55 | ||
56 | ||
def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node:
    """Given a string with source, return the lib2to3 Node.

    Each candidate grammar is tried in turn; the first successful parse wins.
    If every grammar fails, the error recorded for the newest grammar version
    is raised, with a Python 2 hint appended when the source still parses
    under a Python 2 grammar.
    """
    # The tokenizer needs a trailing newline to terminate the final statement.
    if not src_txt.endswith("\n"):
        src_txt += "\n"

    failures = {}
    for grammar in get_grammars(set(target_versions)):
        parser = driver.Driver(grammar)
        try:
            tree = parser.parse_string(src_txt, True)

        except ParseError as pe:
            lineno, column = pe.context[1]
            source_lines = src_txt.splitlines()
            try:
                bad_line = source_lines[lineno - 1]
            except IndexError:
                bad_line = "<line number missing in source>"
            failures[grammar.version] = InvalidInput(
                f"Cannot parse: {lineno}:{column}: {bad_line}"
            )

        except TokenError as te:
            # In edge cases these are raised; and typically don't have a
            # "faulty_line", so report the tokenizer's own message instead.
            lineno, column = te.args[1]
            failures[grammar.version] = InvalidInput(
                f"Cannot parse: {lineno}:{column}: {te.args[0]}"
            )

        else:
            break

    else:
        # Choose the latest version when raising the actual parsing error.
        assert len(failures) >= 1
        exc = failures[max(failures)]

        if matches_grammar(src_txt, pygram.python_grammar) or matches_grammar(
            src_txt, pygram.python_grammar_no_print_statement
        ):
            original_msg = exc.args[0]
            raise InvalidInput(f"{original_msg}\n{PY2_HINT}") from None

        raise exc from None

    # A bare expression parses to a Leaf; normalize to a file_input Node so
    # callers always receive a Node.
    if isinstance(tree, Leaf):
        tree = Node(syms.file_input, [tree])
    return tree
105 | ||
106 | ||
def matches_grammar(src_txt: str, grammar: Grammar) -> bool:
    """Return True if ``src_txt`` parses cleanly under ``grammar``."""
    parser = driver.Driver(grammar)
    try:
        parser.parse_string(src_txt, True)
    except (ParseError, TokenError, IndentationError):
        return False
    return True
115 | ||
116 | ||
def lib2to3_unparse(node: Node) -> str:
    """Given a lib2to3 node, return its string representation."""
    return str(node)
121 | ||
122 | ||
def parse_single_version(
    src: str, version: Tuple[int, int], *, type_comments: bool
) -> ast.AST:
    """Parse ``src`` with the builtin compiler, targeting one feature version.

    ``version`` is a ``(major, minor)`` pair passed straight through to
    ``ast.parse``'s ``feature_version``; ``type_comments`` toggles collection
    of ``# type:`` comments onto the returned tree.
    """
    return ast.parse(
        src, "<unknown>", feature_version=version, type_comments=type_comments
    )
130 | ||
131 | ||
def parse_ast(src: str) -> ast.AST:
    """Return the AST for ``src``, trying the newest feature versions first.

    Raises ``SyntaxError`` (carrying the first error seen) only when no
    supported version parses the source, with or without type comments.
    """
    # TODO: support Python 4+ ;)
    candidate_versions = [(3, minor) for minor in range(3, sys.version_info[1] + 1)]
    candidate_versions.sort(reverse=True)

    first_error = ""
    for candidate in candidate_versions:
        try:
            return parse_single_version(src, candidate, type_comments=True)
        except SyntaxError as exc:
            # Remember only the error from the newest version tried.
            if not first_error:
                first_error = str(exc)

    # Try to parse without type comments
    for candidate in candidate_versions:
        try:
            return parse_single_version(src, candidate, type_comments=False)
        except SyntaxError:
            continue

    raise SyntaxError(first_error)
152 | ||
153 | ||
def _normalize(lineend: str, value: str) -> str:
    """Strip per-line and surrounding whitespace from ``value``.

    Each line is stripped individually, the lines are re-joined with
    ``lineend``, and blank lines at either end of the whole string are
    dropped.
    """
    trimmed = (line.strip() for line in value.splitlines())
    joined = lineend.join(trimmed)
    # Remove any blank lines at the beginning and end of the whole string.
    return joined.strip()
162 | ||
163 | ||
def stringify_ast(node: ast.AST, depth: int = 0) -> Iterator[str]:
    """Simple visitor generating strings to compare ASTs by content.

    Yields one line per node/field in a fixed, indentation-encoded format;
    two ASTs are considered equivalent when their yielded sequences match.
    """

    if (
        isinstance(node, ast.Constant)
        and isinstance(node.value, str)
        and node.kind == "u"
    ):
        # It's a quirk of history that we strip the u prefix over here. We used to
        # rewrite the AST nodes for Python version compatibility and we never copied
        # over the kind
        node.kind = None

    # Open this node's record, indented by recursion depth.
    yield f"{' ' * depth}{node.__class__.__name__}("

    for field in sorted(node._fields):  # noqa: F402
        # TypeIgnore has only one field 'lineno' which breaks this comparison
        if isinstance(node, ast.TypeIgnore):
            break

        try:
            value: object = getattr(node, field)
        except AttributeError:
            # _fields can list attributes a concrete node doesn't carry.
            continue

        yield f"{' ' * (depth+1)}{field}="

        if isinstance(value, list):
            for item in value:
                # Ignore nested tuples within del statements, because we may insert
                # parentheses and they change the AST.
                if (
                    field == "targets"
                    and isinstance(node, ast.Delete)
                    and isinstance(item, ast.Tuple)
                ):
                    # Recurse into the tuple's elements directly, skipping the
                    # Tuple node itself.
                    for elt in item.elts:
                        yield from stringify_ast(elt, depth + 2)

                elif isinstance(item, ast.AST):
                    yield from stringify_ast(item, depth + 2)

        elif isinstance(value, ast.AST):
            yield from stringify_ast(value, depth + 2)

        else:
            # Leaf value: emit its repr, normalized where formatting is
            # allowed to change the source text.
            normalized: object
            if (
                isinstance(node, ast.Constant)
                and field == "value"
                and isinstance(value, str)
            ):
                # Constant strings may be indented across newlines, if they are
                # docstrings; fold spaces after newlines when comparing. Similarly,
                # trailing and leading space may be removed.
                normalized = _normalize("\n", value)
            elif field == "type_comment" and isinstance(value, str):
                # Trailing whitespace in type comments is removed.
                normalized = value.rstrip()
            else:
                normalized = value
            # Include the runtime type so e.g. 1 and 1.0 compare unequal.
            yield f"{' ' * (depth+2)}{normalized!r}, # {value.__class__.__name__}"

    yield f"{' ' * depth}) # /{node.__class__.__name__}"