# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.

9 | """ | |
10 | The ``Parser`` tries to convert the available Python code in an easy to read | |
11 | format, something like an abstract syntax tree. The classes who represent this | |
12 | tree, are sitting in the :mod:`parso.tree` module. | |
13 | ||
14 | The Python module ``tokenize`` is a very important part in the ``Parser``, | |
15 | because it splits the code into different words (tokens). Sometimes it looks a | |
16 | bit messy. Sorry for that! You might ask now: "Why didn't you use the ``ast`` | |
17 | module for this? Well, ``ast`` does a very good job understanding proper Python | |
18 | code, but fails to work as soon as there's a single line of broken code. | |
19 | ||
20 | There's one important optimization that needs to be known: Statements are not | |
21 | being parsed completely. ``Statement`` is just a representation of the tokens | |
22 | within the statement. This lowers memory usage and cpu time and reduces the | |
23 | complexity of the ``Parser`` (there's another parser sitting inside | |
24 | ``Statement``, which produces ``Array`` and ``Call``). | |
25 | """ | |
from typing import Dict, Type

from parso import tree
from parso.pgen2.generator import ReservedString


class ParserSyntaxError(Exception):
    """
    Contains error information about the parser tree.

    May be raised as an exception.
    """
    def __init__(self, message, error_leaf):
        self.message = message
        self.error_leaf = error_leaf


class InternalParseError(Exception):
    """
    Exception to signal the parser is stuck and error recovery didn't help.
    Basically this shouldn't happen. It's a sign that something is really
    wrong.
    """

    def __init__(self, msg, type_, value, start_pos):
        Exception.__init__(self, "%s: type=%r, value=%r, start_pos=%r" %
                           (msg, type_.name, value, start_pos))
        self.msg = msg
        self.type = type_
        self.value = value
        self.start_pos = start_pos


class Stack(list):
    def _allowed_transition_names_and_token_types(self):
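        # Walk the stack from the top, collecting every transition that
        # could be accepted next; stop once a DFA that is not in a final
        # state is reached, since nothing below it can be reduced yet.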
        def iterate():
            # An API just for Jedi.
            for stack_node in reversed(self):
                for transition in stack_node.dfa.transitions:
                    if isinstance(transition, ReservedString):
                        yield transition.value
                    else:
                        yield transition  # A token type

                if not stack_node.dfa.is_final:
                    break

        return list(iterate())


class StackNode:
    def __init__(self, dfa):
        self.dfa = dfa
        self.nodes = []

    @property
    def nonterminal(self):
        return self.dfa.from_rule

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__, self.dfa, self.nodes)


def _token_to_transition(grammar, type_, value):
    # Map from token to label
    if type_.value.contains_syntax:
        # Check for reserved words (keywords)
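        # E.g. (illustrative): for a NAME token with value ``if`` the lookup
        # hits ``reserved_syntax_strings`` and returns a ReservedString
        # transition, while a NAME token with value ``foo`` misses and falls
        # through to the plain token type below.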
        try:
            return grammar.reserved_syntax_strings[value]
        except KeyError:
            pass

    return type_


class BaseParser:
    """Parser engine.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See python/tokenize.py for how to get input tokens from a string.

    When a syntax error occurs, error_recovery() is called.
    """

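    # ``node_map``/``leaf_map`` let subclasses map nonterminal names and
    # token types to custom node/leaf classes; anything unmapped falls back
    # to ``default_node``/``default_leaf``.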
    node_map: Dict[str, Type[tree.BaseNode]] = {}
    default_node = tree.Node

    leaf_map: Dict[str, Type[tree.Leaf]] = {}
    default_leaf = tree.Leaf

    def __init__(self, pgen_grammar, start_nonterminal='file_input', error_recovery=False):
        self._pgen_grammar = pgen_grammar
        self._start_nonterminal = start_nonterminal
        self._error_recovery = error_recovery

    def parse(self, tokens):
        first_dfa = self._pgen_grammar.nonterminal_to_dfas[self._start_nonterminal][0]
        self.stack = Stack([StackNode(first_dfa)])

        for token in tokens:
            self._add_token(token)

        while True:
            tos = self.stack[-1]
            if not tos.dfa.is_final:
                # We never broke out -- EOF is too soon -- unfinished
                # statement. However, error recovery might have added the
                # token again; if the stack is empty, we're fine.
                raise InternalParseError(
                    "incomplete input", token.type, token.string, token.start_pos
                )

            if len(self.stack) > 1:
                self._pop()
            else:
                return self.convert_node(tos.nonterminal, tos.nodes)

    def error_recovery(self, token):
        if self._error_recovery:
            raise NotImplementedError("Error Recovery is not implemented")
        else:
            type_, value, start_pos, prefix = token
            error_leaf = tree.ErrorLeaf(type_, value, start_pos, prefix)
            raise ParserSyntaxError('SyntaxError: invalid syntax', error_leaf)
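
    # Subclasses that pass ``error_recovery=True`` are expected to override
    # this method; the base class only raises. A minimal sketch (the subclass
    # below is hypothetical; parso's real Python parser is more elaborate):
    #
    #     class RecoveringParser(BaseParser):
    #         def error_recovery(self, token):
    #             type_, value, start_pos, prefix = token
    #             # Swallow the offending token as an error leaf and keep
    #             # parsing.
    #             self.stack[-1].nodes.append(
    #                 tree.ErrorLeaf(type_, value, start_pos, prefix))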

    def convert_node(self, nonterminal, children):
        try:
            node = self.node_map[nonterminal](children)
        except KeyError:
            node = self.default_node(nonterminal, children)
        return node

    def convert_leaf(self, type_, value, prefix, start_pos):
        try:
            return self.leaf_map[type_](value, start_pos, prefix)
        except KeyError:
            return self.default_leaf(value, start_pos, prefix)

    def _add_token(self, token):
        """
        This is the only core function for parsing; basically everything
        happens here. The parser generator has prepared everything, and we
        only apply the necessary steps here.
        """
        grammar = self._pgen_grammar
        stack = self.stack
        type_, value, start_pos, prefix = token
        transition = _token_to_transition(grammar, type_, value)

        while True:
            try:
                plan = stack[-1].dfa.transitions[transition]
                break
            except KeyError:
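                # The current DFA has no transition for this token. If the
                # DFA is in a final state, reduce (pop) and retry one level
                # up; otherwise the token is invalid at this position.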
                if stack[-1].dfa.is_final:
                    self._pop()
                else:
                    self.error_recovery(token)
                    return
            except IndexError:
                raise InternalParseError("too much input", type_, value, start_pos)

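        # Apply the plan: advance the current DFA and push a StackNode for
        # every nonterminal that must be entered before the token itself can
        # be shifted as a leaf.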
        stack[-1].dfa = plan.next_dfa

        for push in plan.dfa_pushes:
            stack.append(StackNode(push))

        leaf = self.convert_leaf(type_, value, prefix, start_pos)
        stack[-1].nodes.append(leaf)

    def _pop(self):
        tos = self.stack.pop()
        # If there's exactly one child, return that child instead of
        # creating a new node. We still create expr_stmt and
        # file_input though, because a lot of Jedi depends on its
        # logic.
        if len(tos.nodes) == 1:
            new_node = tos.nodes[0]
        else:
            new_node = self.convert_node(tos.dfa.from_rule, tos.nodes)

        self.stack[-1].nodes.append(new_node)
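

# A minimal customization sketch (hypothetical subclass and node class;
# illustrates how ``node_map`` hooks into ``convert_node``: mapped classes
# are called with the children only, while ``default_node`` also receives
# the nonterminal name):
#
#     class SimpleStmt(tree.BaseNode):
#         type = 'simple_stmt'
#
#     class MyParser(BaseParser):
#         node_map = {'simple_stmt': SimpleStmt}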