1 | """ | |
2 | String transformers that can split and merge strings. | |
3 | """ | |
4 | import re | |
5 | from abc import ABC, abstractmethod | |
6 | from collections import defaultdict | |
7 | from dataclasses import dataclass | |
8 | from typing import ( | |
9 | Any, | |
10 | Callable, | |
11 | ClassVar, | |
12 | Collection, | |
13 | Dict, | |
14 | Final, | |
15 | Iterable, | |
16 | Iterator, | |
17 | List, | |
18 | Literal, | |
19 | Optional, | |
20 | Sequence, | |
21 | Set, | |
22 | Tuple, | |
23 | TypeVar, | |
24 | Union, | |
25 | ) | |
26 | ||
27 | from mypy_extensions import trait | |
28 | ||
29 | from black.comments import contains_pragma_comment | |
30 | from black.lines import Line, append_leaves | |
31 | from black.mode import Feature, Mode | |
32 | from black.nodes import ( | |
33 | CLOSING_BRACKETS, | |
34 | OPENING_BRACKETS, | |
35 | STANDALONE_COMMENT, | |
36 | is_empty_lpar, | |
37 | is_empty_par, | |
38 | is_empty_rpar, | |
39 | is_part_of_annotation, | |
40 | parent_type, | |
41 | replace_child, | |
42 | syms, | |
43 | ) | |
44 | from black.rusty import Err, Ok, Result | |
45 | from black.strings import ( | |
46 | assert_is_leaf_string, | |
47 | count_chars_in_width, | |
48 | get_string_prefix, | |
49 | has_triple_quotes, | |
50 | normalize_string_quotes, | |
51 | str_width, | |
52 | ) | |
53 | from blib2to3.pgen2 import token | |
54 | from blib2to3.pytree import Leaf, Node | |
55 | ||
56 | ||
57 | class CannotTransform(Exception): | |
58 | """Base class for errors raised by Transformers.""" | |
59 | ||
60 | ||
61 | # types | |
62 | T = TypeVar("T") | |
63 | LN = Union[Leaf, Node] | |
64 | Transformer = Callable[[Line, Collection[Feature], Mode], Iterator[Line]] | |
65 | Index = int | |
66 | NodeType = int | |
67 | ParserState = int | |
68 | StringID = int | |
69 | TResult = Result[T, CannotTransform] # (T)ransform Result | |
70 | TMatchResult = TResult[List[Index]] | |
71 | ||
72 | SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"]) # East Asian stops | |
73 | ||
74 | ||
75 | def TErr(err_msg: str) -> Err[CannotTransform]: | |
76 | """(T)ransform Err | |
77 | ||
78 | Convenience function used when working with the TResult type. | |
79 | """ | |
80 | cant_transform = CannotTransform(err_msg) | |
81 | return Err(cant_transform) | |
82 | ||
83 | ||
84 | def hug_power_op( | |
85 | line: Line, features: Collection[Feature], mode: Mode | |
86 | ) -> Iterator[Line]: | |
87 | """A transformer which normalizes spacing around power operators.""" | |
88 | ||
89 | # Performance optimization to avoid unnecessary Leaf clones and other ops. | |
90 | for leaf in line.leaves: | |
91 | if leaf.type == token.DOUBLESTAR: | |
92 | break | |
93 | else: | |
94 | raise CannotTransform("No doublestar token was found in the line.") | |
95 | ||
96 | def is_simple_lookup(index: int, step: Literal[1, -1]) -> bool: | |
97 | # Brackets and parentheses indicate calls, subscripts, etc. ... | |
98 | # basically stuff that doesn't count as "simple". Only a NAME lookup | |
99 | # or dotted lookup (eg. NAME.NAME) is OK. | |
100 | if step == -1: | |
101 | disallowed = {token.RPAR, token.RSQB} | |
102 | else: | |
103 | disallowed = {token.LPAR, token.LSQB} | |
104 | ||
105 | while 0 <= index < len(line.leaves): | |
106 | current = line.leaves[index] | |
107 | if current.type in disallowed: | |
108 | return False | |
109 | if current.type not in {token.NAME, token.DOT} or current.value == "for": | |
110 | # If the current token isn't disallowed, we'll assume this is simple as | |
111 | # only the disallowed tokens are semantically attached to this lookup | |
112 | # expression we're checking. Also, stop early if we hit the 'for' bit | |
113 | # of a comprehension. | |
114 | return True | |
115 | ||
116 | index += step | |
117 | ||
118 | return True | |
119 | ||
120 | def is_simple_operand(index: int, kind: Literal["base", "exponent"]) -> bool: | |
121 | # An operand is considered "simple" if's a NAME, a numeric CONSTANT, a simple | |
122 | # lookup (see above), with or without a preceding unary operator. | |
123 | start = line.leaves[index] | |
124 | if start.type in {token.NAME, token.NUMBER}: | |
125 | return is_simple_lookup(index, step=(1 if kind == "exponent" else -1)) | |
126 | ||
127 | if start.type in {token.PLUS, token.MINUS, token.TILDE}: | |
128 | if line.leaves[index + 1].type in {token.NAME, token.NUMBER}: | |
129 | # step is always one as bases with a preceding unary op will be checked | |
130 | # for simplicity starting from the next token (so it'll hit the check | |
131 | # above). | |
132 | return is_simple_lookup(index + 1, step=1) | |
133 | ||
134 | return False | |
135 | ||
136 | new_line = line.clone() | |
137 | should_hug = False | |
138 | for idx, leaf in enumerate(line.leaves): | |
139 | new_leaf = leaf.clone() | |
140 | if should_hug: | |
141 | new_leaf.prefix = "" | |
142 | should_hug = False | |
143 | ||
144 | should_hug = ( | |
145 | (0 < idx < len(line.leaves) - 1) | |
146 | and leaf.type == token.DOUBLESTAR | |
147 | and is_simple_operand(idx - 1, kind="base") | |
148 | and line.leaves[idx - 1].value != "lambda" | |
149 | and is_simple_operand(idx + 1, kind="exponent") | |
150 | ) | |
151 | if should_hug: | |
152 | new_leaf.prefix = "" | |
153 | ||
154 | # We have to be careful to make a new line properly: | |
155 | # - bracket related metadata must be maintained (handled by Line.append) | |
156 | # - comments need to copied over, updating the leaf IDs they're attached to | |
157 | new_line.append(new_leaf, preformatted=True) | |
158 | for comment_leaf in line.comments_after(leaf): | |
159 | new_line.append(comment_leaf, preformatted=True) | |
160 | ||
161 | yield new_line | |
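

# Illustrative behavior (sketch): `a ** b` is rewritten as `a**b` and
# `x ** -y` as `x**-y`, while `f(a) ** 2` keeps its spaces because a call
# is not a "simple" operand.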


class StringTransformer(ABC):
    """
    An implementation of the Transformer protocol that relies on its
    subclasses overriding the template methods `do_match(...)` and
    `do_transform(...)`.

    This Transformer works exclusively on strings (for example, by merging
    or splitting them).

    The following sections can be found among the docstrings of each concrete
    StringTransformer subclass.

    Requirements:
        Which requirements must be met of the given Line for this
        StringTransformer to be applied?

    Transformations:
        If the given Line meets all of the above requirements, which string
        transformations can you expect to be applied to it by this
        StringTransformer?

    Collaborations:
        What contractual agreements does this StringTransformer have with other
        StringTransformers? Such collaborations should be eliminated/minimized
        as much as possible.
    """

    __name__: Final = "StringTransformer"

    # Ideally this would be a dataclass, but unfortunately mypyc breaks when used with
    # `abc.ABC`.
    def __init__(self, line_length: int, normalize_strings: bool) -> None:
        self.line_length = line_length
        self.normalize_strings = normalize_strings

    @abstractmethod
    def do_match(self, line: Line) -> TMatchResult:
        """
        Returns:
            * Ok(string_indices) such that for each index, `line.leaves[index]`
              is our target string if a match was able to be made. For
              transformers that don't result in more lines (e.g. StringMerger,
              StringParenStripper), multiple matches and transforms are done at
              once to reduce the complexity.
                OR
            * Err(CannotTransform), if no match could be made.
        """

    @abstractmethod
    def do_transform(
        self, line: Line, string_indices: List[int]
    ) -> Iterator[TResult[Line]]:
        """
        Yields:
            * Ok(new_line) where new_line is the new transformed line.
                OR
            * Err(CannotTransform) if the transformation failed for some reason. The
              `do_match(...)` template method should usually be used to reject
              the form of the given Line, but in some cases it is difficult to
              know whether or not a Line meets the StringTransformer's
              requirements until the transformation is already midway.

        Side Effects:
            This method should NOT mutate @line directly, but it MAY mutate the
            Line's underlying Node structure. (WARNING: If the underlying Node
            structure IS altered, then this method should NOT be allowed to
            yield a CannotTransform after that point.)
        """

    def __call__(
        self, line: Line, _features: Collection[Feature], _mode: Mode
    ) -> Iterator[Line]:
        """
        StringTransformer instances have a call signature that mirrors that of
        the Transformer type.

        Raises:
            CannotTransform(...) if the concrete StringTransformer class is unable
            to transform @line.
        """
        # Optimization to avoid calling `self.do_match(...)` when the line does
        # not contain any string.
        if not any(leaf.type == token.STRING for leaf in line.leaves):
            raise CannotTransform("There are no strings in this line.")

        match_result = self.do_match(line)

        if isinstance(match_result, Err):
            cant_transform = match_result.err()
            raise CannotTransform(
                f"The string transformer {self.__class__.__name__} does not recognize"
                " this line as one that it can transform."
            ) from cant_transform

        string_indices = match_result.ok()

        for line_result in self.do_transform(line, string_indices):
            if isinstance(line_result, Err):
                cant_transform = line_result.err()
                raise CannotTransform(
                    "StringTransformer failed while attempting to transform string."
                ) from cant_transform
            line = line_result.ok()
            yield line
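

# Usage sketch (illustrative; `line`, `features`, and `mode` are assumed to
# come from the caller):
#
#     merger = StringMerger(line_length=88, normalize_strings=True)
#     try:
#         new_lines = list(merger(line, features, mode))
#     except CannotTransform:
#         new_lines = [line]  # transformer did not apply; keep the original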


@dataclass
class CustomSplit:
    """A custom (i.e. manual) string split.

    A single CustomSplit instance represents a single substring.

    Examples:
        Consider the following string:
        ```
        "Hi there friend."
        " This is a custom"
        f" string {split}."
        ```

        This string will correspond to the following three CustomSplit instances:
        ```
        CustomSplit(False, 16)
        CustomSplit(False, 17)
        CustomSplit(True, 16)
        ```
    """

    has_prefix: bool
    break_idx: int


@trait
class CustomSplitMapMixin:
    """
    This mixin class is used to map merged strings to a sequence of
    CustomSplits, which will then be used to re-split the strings iff none of
    the resultant substrings go over the configured max line length.
    """

    _Key: ClassVar = Tuple[StringID, str]
    _CUSTOM_SPLIT_MAP: ClassVar[Dict[_Key, Tuple[CustomSplit, ...]]] = defaultdict(
        tuple
    )

    @staticmethod
    def _get_key(string: str) -> "CustomSplitMapMixin._Key":
        """
        Returns:
            A unique identifier that is used internally to map @string to a
            group of custom splits.
        """
        return (id(string), string)

    def add_custom_splits(
        self, string: str, custom_splits: Iterable[CustomSplit]
    ) -> None:
        """Custom Split Map Setter Method

        Side Effects:
            Adds a mapping from @string to the custom splits @custom_splits.
        """
        key = self._get_key(string)
        self._CUSTOM_SPLIT_MAP[key] = tuple(custom_splits)

    def pop_custom_splits(self, string: str) -> List[CustomSplit]:
        """Custom Split Map Getter Method

        Returns:
            * A list of the custom splits that are mapped to @string, if any
              exist.
                OR
            * [], otherwise.

        Side Effects:
            Deletes the mapping between @string and its associated custom
            splits (which are returned to the caller).
        """
        key = self._get_key(string)

        custom_splits = self._CUSTOM_SPLIT_MAP[key]
        del self._CUSTOM_SPLIT_MAP[key]

        return list(custom_splits)

    def has_custom_splits(self, string: str) -> bool:
        """
        Returns:
            True iff @string is associated with a set of custom splits.
        """
        key = self._get_key(string)
        return key in self._CUSTOM_SPLIT_MAP
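

# A minimal round trip (sketch, hypothetical values): StringMerger registers
# splits after merging and StringSplitter later retrieves them, e.g.
#
#     mixin.add_custom_splits(s, [CustomSplit(False, 16)])
#     mixin.has_custom_splits(s)  # True
#     mixin.pop_custom_splits(s)  # [CustomSplit(False, 16)]; the mapping is deleted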


class StringMerger(StringTransformer, CustomSplitMapMixin):
    """StringTransformer that merges strings together.

    Requirements:
        (A) The line contains adjacent strings such that ALL of the validation checks
        listed in StringMerger._validate_msg(...)'s docstring pass.
            OR
        (B) The line contains a string which uses line continuation backslashes.

    Transformations:
        Depending on which of the two requirements above were met, either:

        (A) The string group associated with the target string is merged.
            OR
        (B) All line-continuation backslashes are removed from the target string.

    Collaborations:
        StringMerger provides custom split information to StringSplitter.
    """

    def do_match(self, line: Line) -> TMatchResult:
        LL = line.leaves

        is_valid_index = is_valid_index_factory(LL)

        string_indices = []
        idx = 0
        while is_valid_index(idx):
            leaf = LL[idx]
            if (
                leaf.type == token.STRING
                and is_valid_index(idx + 1)
                and LL[idx + 1].type == token.STRING
            ):
                if not is_part_of_annotation(leaf):
                    string_indices.append(idx)

                # Advance to the next non-STRING leaf.
                idx += 2
                while is_valid_index(idx) and LL[idx].type == token.STRING:
                    idx += 1

            elif leaf.type == token.STRING and "\\\n" in leaf.value:
                string_indices.append(idx)
                # Advance to the next non-STRING leaf.
                idx += 1
                while is_valid_index(idx) and LL[idx].type == token.STRING:
                    idx += 1

            else:
                idx += 1

        if string_indices:
            return Ok(string_indices)
        else:
            return TErr("This line has no strings that need merging.")

    def do_transform(
        self, line: Line, string_indices: List[int]
    ) -> Iterator[TResult[Line]]:
        new_line = line

        rblc_result = self._remove_backslash_line_continuation_chars(
            new_line, string_indices
        )
        if isinstance(rblc_result, Ok):
            new_line = rblc_result.ok()

        msg_result = self._merge_string_group(new_line, string_indices)
        if isinstance(msg_result, Ok):
            new_line = msg_result.ok()

        if isinstance(rblc_result, Err) and isinstance(msg_result, Err):
            msg_cant_transform = msg_result.err()
            rblc_cant_transform = rblc_result.err()
            cant_transform = CannotTransform(
                "StringMerger failed to merge any strings in this line."
            )

            # Chain the errors together using `__cause__`.
            msg_cant_transform.__cause__ = rblc_cant_transform
            cant_transform.__cause__ = msg_cant_transform

            yield Err(cant_transform)
        else:
            yield Ok(new_line)

    @staticmethod
    def _remove_backslash_line_continuation_chars(
        line: Line, string_indices: List[int]
    ) -> TResult[Line]:
        """
        Merge strings that were split across multiple lines using
        line-continuation backslashes.

        Returns:
            Ok(new_line), if @line contains backslash line-continuation
            characters.
                OR
            Err(CannotTransform), otherwise.
        """
        LL = line.leaves

        indices_to_transform = []
        for string_idx in string_indices:
            string_leaf = LL[string_idx]
            if (
                string_leaf.type == token.STRING
                and "\\\n" in string_leaf.value
                and not has_triple_quotes(string_leaf.value)
            ):
                indices_to_transform.append(string_idx)

        if not indices_to_transform:
            return TErr(
                "Found no string leaves that contain backslash line continuation"
                " characters."
            )

        new_line = line.clone()
        new_line.comments = line.comments.copy()
        append_leaves(new_line, line, LL)

        for string_idx in indices_to_transform:
            new_string_leaf = new_line.leaves[string_idx]
            new_string_leaf.value = new_string_leaf.value.replace("\\\n", "")

        return Ok(new_line)

    def _merge_string_group(
        self, line: Line, string_indices: List[int]
    ) -> TResult[Line]:
        """
        Merges string groups (i.e. sets of adjacent strings).

        Each index from `string_indices` designates one string group's first
        leaf in `line.leaves`.

        Returns:
            Ok(new_line), if ALL of the validation checks found in
            _validate_msg(...) pass.
                OR
            Err(CannotTransform), otherwise.
        """
        LL = line.leaves

        is_valid_index = is_valid_index_factory(LL)

        # A dict of {string_idx: tuple[num_of_strings, string_leaf]}.
        merged_string_idx_dict: Dict[int, Tuple[int, Leaf]] = {}
        for string_idx in string_indices:
            vresult = self._validate_msg(line, string_idx)
            if isinstance(vresult, Err):
                continue
            merged_string_idx_dict[string_idx] = self._merge_one_string_group(
                LL, string_idx, is_valid_index
            )

        if not merged_string_idx_dict:
            return TErr("No string group is merged")

        # Build the final line ('new_line') that this method will later return.
        new_line = line.clone()
        previous_merged_string_idx = -1
        previous_merged_num_of_strings = -1
        for i, leaf in enumerate(LL):
            if i in merged_string_idx_dict:
                previous_merged_string_idx = i
                previous_merged_num_of_strings, string_leaf = merged_string_idx_dict[i]
                new_line.append(string_leaf)

            if (
                previous_merged_string_idx
                <= i
                < previous_merged_string_idx + previous_merged_num_of_strings
            ):
                for comment_leaf in line.comments_after(LL[i]):
                    new_line.append(comment_leaf, preformatted=True)
                continue

            append_leaves(new_line, line, [leaf])

        return Ok(new_line)

    def _merge_one_string_group(
        self, LL: List[Leaf], string_idx: int, is_valid_index: Callable[[int], bool]
    ) -> Tuple[int, Leaf]:
        """
        Merges one string group where the first string in the group is
        `LL[string_idx]`.

        Returns:
            A tuple of `(num_of_strings, leaf)` where `num_of_strings` is the
            number of strings merged and `leaf` is the newly merged string
            to be replaced in the new line.
        """
        # If the string group is wrapped inside an Atom node, we must make sure
        # to later replace that Atom with our new (merged) string leaf.
        atom_node = LL[string_idx].parent

        # We will place BREAK_MARK in between every two substrings that we
        # merge. We will then later go through our final result and use the
        # various instances of BREAK_MARK we find to add the right values to
        # the custom split map.
        BREAK_MARK = "@@@@@ BLACK BREAKPOINT MARKER @@@@@"

        QUOTE = LL[string_idx].value[-1]

        def make_naked(string: str, string_prefix: str) -> str:
            """Strip @string (i.e. make it a "naked" string)

            Pre-conditions:
                * assert_is_leaf_string(@string)

            Returns:
                A string that is identical to @string except that
                @string_prefix has been stripped, the surrounding QUOTE
                characters have been removed, and any remaining QUOTE
                characters have been escaped.
            """
            assert_is_leaf_string(string)
            if "f" in string_prefix:
                string = _toggle_fexpr_quotes(string, QUOTE)
                # After quotes toggling, quotes in expressions won't be escaped
                # because quotes can't be reused in f-strings. So we can simply
                # let the escaping logic below run without knowing f-string
                # expressions.

            RE_EVEN_BACKSLASHES = r"(?:(?<!\\)(?:\\\\)*)"
            naked_string = string[len(string_prefix) + 1 : -1]
            naked_string = re.sub(
                "(" + RE_EVEN_BACKSLASHES + ")" + QUOTE, r"\1\\" + QUOTE, naked_string
            )
            return naked_string

        # Holds the CustomSplit objects that will later be added to the custom
        # split map.
        custom_splits = []

        # Temporary storage for the 'has_prefix' part of the CustomSplit objects.
        prefix_tracker = []

        # Sets the 'prefix' variable. This is the prefix that the final merged
        # string will have.
        next_str_idx = string_idx
        prefix = ""
        while (
            not prefix
            and is_valid_index(next_str_idx)
            and LL[next_str_idx].type == token.STRING
        ):
            prefix = get_string_prefix(LL[next_str_idx].value).lower()
            next_str_idx += 1

        # The next loop merges the string group. The final string will be
        # contained in 'S'.
        #
        # The following convenience variables are used:
        #
        #   S: string
        #   NS: naked string
        #   SS: next string
        #   NSS: naked next string
        S = ""
        NS = ""
        num_of_strings = 0
        next_str_idx = string_idx
        while is_valid_index(next_str_idx) and LL[next_str_idx].type == token.STRING:
            num_of_strings += 1

            SS = LL[next_str_idx].value
            next_prefix = get_string_prefix(SS).lower()

            # If this is an f-string group but this substring is not prefixed
            # with 'f'...
            if "f" in prefix and "f" not in next_prefix:
                # Then we must escape any braces contained in this substring.
                SS = re.sub(r"(\{|\})", r"\1\1", SS)

            NSS = make_naked(SS, next_prefix)

            has_prefix = bool(next_prefix)
            prefix_tracker.append(has_prefix)

            S = prefix + QUOTE + NS + NSS + BREAK_MARK + QUOTE
            NS = make_naked(S, prefix)

            next_str_idx += 1

        # Take note of the index of the non-STRING leaf.
        non_string_idx = next_str_idx

        S_leaf = Leaf(token.STRING, S)
        if self.normalize_strings:
            S_leaf.value = normalize_string_quotes(S_leaf.value)

        # Fill the 'custom_splits' list with the appropriate CustomSplit objects.
        temp_string = S_leaf.value[len(prefix) + 1 : -1]
        for has_prefix in prefix_tracker:
            mark_idx = temp_string.find(BREAK_MARK)
            assert (
                mark_idx >= 0
            ), "Logic error while filling the custom string breakpoint cache."

            temp_string = temp_string[mark_idx + len(BREAK_MARK) :]
            breakpoint_idx = mark_idx + (len(prefix) if has_prefix else 0) + 1
            custom_splits.append(CustomSplit(has_prefix, breakpoint_idx))

        string_leaf = Leaf(token.STRING, S_leaf.value.replace(BREAK_MARK, ""))

        if atom_node is not None:
            # If not all children of the atom node are merged (this can happen
            # when there is a standalone comment in the middle) ...
            if non_string_idx - string_idx < len(atom_node.children):
                # We need to replace the old STRING leaves with the new string leaf.
                first_child_idx = LL[string_idx].remove()
                for idx in range(string_idx + 1, non_string_idx):
                    LL[idx].remove()
                if first_child_idx is not None:
                    atom_node.insert_child(first_child_idx, string_leaf)
            else:
                # Else replace the atom node with the new string leaf.
                replace_child(atom_node, string_leaf)

        self.add_custom_splits(string_leaf.value, custom_splits)
        return num_of_strings, string_leaf

    @staticmethod
    def _validate_msg(line: Line, string_idx: int) -> TResult[None]:
        """Validate (M)erge (S)tring (G)roup

        Transform-time string validation logic for _merge_string_group(...).

        Returns:
            * Ok(None), if ALL validation checks (listed below) pass.
                OR
            * Err(CannotTransform), if any of the following are true:
                - The target string group contains a stand-alone comment.
                - The target string is not in a string group (i.e. it has no
                  adjacent strings).
                - The string group has more than one inline comment.
                - The string group has an inline comment that appears to be a pragma.
                - The set of all string prefixes in the string group is of
                  length greater than one and is not equal to {"", "f"}.
                - The string group consists of raw strings.
                - The string group contains stringified type annotations. We don't
                  want to process stringified type annotations since pyright doesn't
                  support them spanning multiple string values. (NOTE: mypy, pytype,
                  pyre do support them, so we can change if pyright also gains
                  support in the future. See
                  https://github.com/microsoft/pyright/issues/4359.)
        """
        # We first check for "inner" stand-alone comments (i.e. stand-alone
        # comments that have a string leaf before them AND after them).
        for inc in [1, -1]:
            i = string_idx
            found_sa_comment = False
            is_valid_index = is_valid_index_factory(line.leaves)
            while is_valid_index(i) and line.leaves[i].type in [
                token.STRING,
                STANDALONE_COMMENT,
            ]:
                if line.leaves[i].type == STANDALONE_COMMENT:
                    found_sa_comment = True
                elif found_sa_comment:
                    return TErr(
                        "StringMerger does NOT merge string groups which contain "
                        "stand-alone comments."
                    )

                i += inc

        num_of_inline_string_comments = 0
        set_of_prefixes = set()
        num_of_strings = 0
        for leaf in line.leaves[string_idx:]:
            if leaf.type != token.STRING:
                # If the string group is trailed by a comma, we count the
                # comments trailing the comma to be one of the string group's
                # comments.
                if leaf.type == token.COMMA and id(leaf) in line.comments:
                    num_of_inline_string_comments += 1
                break

            if has_triple_quotes(leaf.value):
                return TErr("StringMerger does NOT merge multiline strings.")

            num_of_strings += 1
            prefix = get_string_prefix(leaf.value).lower()
            if "r" in prefix:
                return TErr("StringMerger does NOT merge raw strings.")

            set_of_prefixes.add(prefix)

            if id(leaf) in line.comments:
                num_of_inline_string_comments += 1
                if contains_pragma_comment(line.comments[id(leaf)]):
                    return TErr("Cannot merge strings which have pragma comments.")

        if num_of_strings < 2:
            return TErr(
                f"Not enough strings to merge (num_of_strings={num_of_strings})."
            )

        if num_of_inline_string_comments > 1:
            return TErr(
                f"Too many inline string comments ({num_of_inline_string_comments})."
            )

        if len(set_of_prefixes) > 1 and set_of_prefixes != {"", "f"}:
            return TErr(f"Too many different prefixes ({set_of_prefixes}).")

        return Ok(None)
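

# Illustrative effect (sketch): given implicitly concatenated leaves such as
#
#     x = "Hello " "there " f"{name}!"
#
# StringMerger produces a single leaf, roughly f"Hello there {name}!", and
# records the original break points in the custom split map so StringSplitter
# can re-split along them later if the merged string is too long.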


class StringParenStripper(StringTransformer):
    """StringTransformer that strips surrounding parentheses from strings.

    Requirements:
        The line contains a string which is surrounded by parentheses and:
            - The target string is NOT the only argument to a function call.
            - The target string is NOT a "pointless" string.
            - If the target string contains a PERCENT, the brackets are not
              preceded or followed by an operator with higher precedence than
              PERCENT.

    Transformations:
        The parentheses mentioned in the 'Requirements' section are stripped.

    Collaborations:
        StringParenStripper has its own inherent usefulness, but it is also
        relied on to clean up the parentheses created by StringParenWrapper (in
        the event that they are no longer needed).
    """

    def do_match(self, line: Line) -> TMatchResult:
        LL = line.leaves

        is_valid_index = is_valid_index_factory(LL)

        string_indices = []

        idx = -1
        while True:
            idx += 1
            if idx >= len(LL):
                break
            leaf = LL[idx]

            # Should be a string...
            if leaf.type != token.STRING:
                continue

            # If this is a "pointless" string...
            if (
                leaf.parent
                and leaf.parent.parent
                and leaf.parent.parent.type == syms.simple_stmt
            ):
                continue

            # Should be preceded by a non-empty LPAR...
            if (
                not is_valid_index(idx - 1)
                or LL[idx - 1].type != token.LPAR
                or is_empty_lpar(LL[idx - 1])
            ):
                continue

            # That LPAR should NOT be preceded by a function name or a closing
            # bracket (which could be a function which returns a function or a
            # list/dictionary that contains a function)...
            if is_valid_index(idx - 2) and (
                LL[idx - 2].type == token.NAME or LL[idx - 2].type in CLOSING_BRACKETS
            ):
                continue

            string_idx = idx

            # Skip the string trailer, if one exists.
            string_parser = StringParser()
            next_idx = string_parser.parse(LL, string_idx)

            # if the leaves in the parsed string include a PERCENT, we need to
            # make sure the initial LPAR is NOT preceded by an operator with
            # higher or equal precedence to PERCENT
            if is_valid_index(idx - 2):
                # mypy can't quite follow unless we name this
                before_lpar = LL[idx - 2]
                if token.PERCENT in {leaf.type for leaf in LL[idx - 1 : next_idx]} and (
                    (
                        before_lpar.type
                        in {
                            token.STAR,
                            token.AT,
                            token.SLASH,
                            token.DOUBLESLASH,
                            token.PERCENT,
                            token.TILDE,
                            token.DOUBLESTAR,
                            token.AWAIT,
                            token.LSQB,
                            token.LPAR,
                        }
                    )
                    or (
                        # only unary PLUS/MINUS
                        before_lpar.parent
                        and before_lpar.parent.type == syms.factor
                        and (before_lpar.type in {token.PLUS, token.MINUS})
                    )
                ):
                    continue

            # Should be followed by a non-empty RPAR...
            if (
                is_valid_index(next_idx)
                and LL[next_idx].type == token.RPAR
                and not is_empty_rpar(LL[next_idx])
            ):
                # That RPAR should NOT be followed by anything with higher
                # precedence than PERCENT
                if is_valid_index(next_idx + 1) and LL[next_idx + 1].type in {
                    token.DOUBLESTAR,
                    token.LSQB,
                    token.LPAR,
                    token.DOT,
                }:
                    continue

                string_indices.append(string_idx)
                idx = string_idx
                while idx < len(LL) - 1 and LL[idx + 1].type == token.STRING:
                    idx += 1

        if string_indices:
            return Ok(string_indices)
        return TErr("This line has no strings wrapped in parens.")

    def do_transform(
        self, line: Line, string_indices: List[int]
    ) -> Iterator[TResult[Line]]:
        LL = line.leaves

        string_and_rpar_indices: List[int] = []
        for string_idx in string_indices:
            string_parser = StringParser()
            rpar_idx = string_parser.parse(LL, string_idx)

            should_transform = True
            for leaf in (LL[string_idx - 1], LL[rpar_idx]):
                if line.comments_after(leaf):
                    # Should not strip parentheses which have comments attached
                    # to them.
                    should_transform = False
                    break
            if should_transform:
                string_and_rpar_indices.extend((string_idx, rpar_idx))

        if string_and_rpar_indices:
            yield Ok(self._transform_to_new_line(line, string_and_rpar_indices))
        else:
            yield Err(
                CannotTransform("All string groups have comments attached to them.")
            )

    def _transform_to_new_line(
        self, line: Line, string_and_rpar_indices: List[int]
    ) -> Line:
        LL = line.leaves

        new_line = line.clone()
        new_line.comments = line.comments.copy()

        previous_idx = -1
        # We need to sort the indices, since string_idx and its matching
        # rpar_idx may not come in order, e.g. in
        # `("outer" % ("inner".join(items)))`, the "inner" string's
        # string_idx is smaller than "outer" string's rpar_idx.
        for idx in sorted(string_and_rpar_indices):
            leaf = LL[idx]
            lpar_or_rpar_idx = idx - 1 if leaf.type == token.STRING else idx
            append_leaves(new_line, line, LL[previous_idx + 1 : lpar_or_rpar_idx])
            if leaf.type == token.STRING:
                string_leaf = Leaf(token.STRING, LL[idx].value)
                LL[lpar_or_rpar_idx].remove()  # Remove lpar.
                replace_child(LL[idx], string_leaf)
                new_line.append(string_leaf)
            else:
                LL[lpar_or_rpar_idx].remove()  # This is a rpar.

            previous_idx = idx

        # Append the leaves after the last idx:
        append_leaves(new_line, line, LL[idx + 1 :])

        return new_line
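

# Illustrative effect (sketch): `foo = ("bar")` becomes `foo = "bar"`. The
# parentheses are kept when they matter, e.g. around a `%`-formatted string
# that is preceded or followed by a higher-precedence operator.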


class BaseStringSplitter(StringTransformer):
    """
    Abstract class for StringTransformers which transform a Line's strings by splitting
    them or placing them on their own lines where necessary to avoid going over
    the configured line length.

    Requirements:
        * The target string value is responsible for the line going over the
          line length limit. It follows that after all of black's other line
          split methods have been exhausted, this line (or one of the resulting
          lines after all line splits are performed) would still be over the
          line_length limit unless we split this string.
            AND

        * The target string is NOT a "pointless" string (i.e. a string that has
          no parent or siblings).
            AND

        * The target string is not followed by an inline comment that appears
          to be a pragma.
            AND

        * The target string is not a multiline (i.e. triple-quote) string.
    """

    STRING_OPERATORS: Final = [
        token.EQEQUAL,
        token.GREATER,
        token.GREATEREQUAL,
        token.LESS,
        token.LESSEQUAL,
        token.NOTEQUAL,
        token.PERCENT,
        token.PLUS,
        token.STAR,
    ]

    @abstractmethod
    def do_splitter_match(self, line: Line) -> TMatchResult:
        """
        BaseStringSplitter asks its clients to override this method instead of
        `StringTransformer.do_match(...)`.

        Follows the same protocol as `StringTransformer.do_match(...)`.

        Refer to `help(StringTransformer.do_match)` for more information.
        """

    def do_match(self, line: Line) -> TMatchResult:
        match_result = self.do_splitter_match(line)
        if isinstance(match_result, Err):
            return match_result

        string_indices = match_result.ok()
        assert len(string_indices) == 1, (
            f"{self.__class__.__name__} should only find one match at a time, found"
            f" {len(string_indices)}"
        )
        string_idx = string_indices[0]
        vresult = self._validate(line, string_idx)
        if isinstance(vresult, Err):
            return vresult

        return match_result

    def _validate(self, line: Line, string_idx: int) -> TResult[None]:
        """
        Checks that @line meets all of the requirements listed in this class's
        docstring. Refer to `help(BaseStringSplitter)` for a detailed
        description of those requirements.

        Returns:
            * Ok(None), if ALL of the requirements are met.
                OR
            * Err(CannotTransform), if ANY of the requirements are NOT met.
        """
        LL = line.leaves

        string_leaf = LL[string_idx]

        max_string_length = self._get_max_string_length(line, string_idx)
        if len(string_leaf.value) <= max_string_length:
            return TErr(
                "The string itself is not what is causing this line to be too long."
            )

        if not string_leaf.parent or [L.type for L in string_leaf.parent.children] == [
            token.STRING,
            token.NEWLINE,
        ]:
            return TErr(
                f"This string ({string_leaf.value}) appears to be pointless (i.e. has"
                " no parent)."
            )

        if id(line.leaves[string_idx]) in line.comments and contains_pragma_comment(
            line.comments[id(line.leaves[string_idx])]
        ):
            return TErr(
                "Line appears to end with an inline pragma comment. Splitting the line"
                " could modify the pragma's behavior."
            )

        if has_triple_quotes(string_leaf.value):
            return TErr("We cannot split multiline strings.")

        return Ok(None)

    def _get_max_string_length(self, line: Line, string_idx: int) -> int:
        """
        Calculates the max string length used when attempting to determine
        whether or not the target string is responsible for causing the line to
        go over the line length limit.

        WARNING: This method is tightly coupled to both StringSplitter and
        (especially) StringParenWrapper. There is probably a better way to
        accomplish what is being done here.

        Returns:
            max_string_length: such that `len(line.leaves[string_idx].value) >
            max_string_length` implies that the target string IS responsible
            for causing this line to exceed the line length limit.
        """
        LL = line.leaves

        is_valid_index = is_valid_index_factory(LL)

        # We use the shorthand "WMA4" in comments to abbreviate "We must
        # account for". When giving examples, we use STRING to mean some/any
        # valid string.
        #
        # Finally, we use the following convenience variables:
        #
        #   P:  The leaf that is before the target string leaf.
        #   N:  The leaf that is after the target string leaf.
        #   NN: The leaf that is after N.

        # WMA4 the whitespace at the beginning of the line.
        offset = line.depth * 4

        if is_valid_index(string_idx - 1):
            p_idx = string_idx - 1
            if (
                LL[string_idx - 1].type == token.LPAR
                and LL[string_idx - 1].value == ""
                and string_idx >= 2
            ):
                # If the previous leaf is an empty LPAR placeholder, we should skip it.
                p_idx -= 1

            P = LL[p_idx]
            if P.type in self.STRING_OPERATORS:
                # WMA4 a space and a string operator (e.g. `+ STRING` or `== STRING`).
                offset += len(str(P)) + 1

            if P.type == token.COMMA:
                # WMA4 a space, a comma, and a closing bracket [e.g. `), STRING`].
                offset += 3

            if P.type in [token.COLON, token.EQUAL, token.PLUSEQUAL, token.NAME]:
                # This conditional branch is meant to handle dictionary keys,
                # variable assignments, 'return STRING' statement lines, and
                # 'else STRING' ternary expression lines.

                # WMA4 a single space.
                offset += 1

                # WMA4 the lengths of any leaves that came before that space,
                # but after any closing bracket before that space.
                for leaf in reversed(LL[: p_idx + 1]):
                    offset += len(str(leaf))
                    if leaf.type in CLOSING_BRACKETS:
                        break

        if is_valid_index(string_idx + 1):
            N = LL[string_idx + 1]
            if N.type == token.RPAR and N.value == "" and len(LL) > string_idx + 2:
                # If the next leaf is an empty RPAR placeholder, we should skip it.
                N = LL[string_idx + 2]

            if N.type == token.COMMA:
                # WMA4 a single comma at the end of the string (e.g. `STRING,`).
                offset += 1

            if is_valid_index(string_idx + 2):
                NN = LL[string_idx + 2]

                if N.type == token.DOT and NN.type == token.NAME:
                    # This conditional branch is meant to handle method calls invoked
                    # off of a string literal up to and including the LPAR character.

                    # WMA4 the '.' character.
                    offset += 1

                    if (
                        is_valid_index(string_idx + 3)
                        and LL[string_idx + 3].type == token.LPAR
                    ):
                        # WMA4 the left parenthesis character.
                        offset += 1

                    # WMA4 the length of the method's name.
                    offset += len(NN.value)

        has_comments = False
        for comment_leaf in line.comments_after(LL[string_idx]):
            if not has_comments:
                has_comments = True
                # WMA4 two spaces before the '#' character.
                offset += 2

            # WMA4 the length of the inline comment.
            offset += len(comment_leaf.value)

        max_string_length = count_chars_in_width(str(line), self.line_length - offset)
        return max_string_length

    @staticmethod
    def _prefer_paren_wrap_match(LL: List[Leaf]) -> Optional[int]:
        """
        Returns:
            string_idx such that @LL[string_idx] is equal to our target (i.e.
            matched) string, if this line matches the "prefer paren wrap" statement
            requirements listed in the 'Requirements' section of the StringParenWrapper
            class's docstring.
                OR
            None, otherwise.
        """
        # The line must start with a string.
        if LL[0].type != token.STRING:
            return None

        matching_nodes = [
            syms.listmaker,
            syms.dictsetmaker,
            syms.testlist_gexp,
        ]
        # If the string is an immediate child of a list/set/tuple literal...
        if (
            parent_type(LL[0]) in matching_nodes
            or parent_type(LL[0].parent) in matching_nodes
        ):
            # And the string is surrounded by commas (or is the first/last child)...
            prev_sibling = LL[0].prev_sibling
            next_sibling = LL[0].next_sibling
            if (
                not prev_sibling
                and not next_sibling
                and parent_type(LL[0]) == syms.atom
            ):
                # If it's an atom string, we need to check the parent atom's siblings.
                parent = LL[0].parent
                assert parent is not None  # For type checkers.
                prev_sibling = parent.prev_sibling
                next_sibling = parent.next_sibling
            if (not prev_sibling or prev_sibling.type == token.COMMA) and (
                not next_sibling or next_sibling.type == token.COMMA
            ):
                return 0

        return None


def iter_fexpr_spans(s: str) -> Iterator[Tuple[int, int]]:
    """
    Yields spans corresponding to expressions in a given f-string.
    Spans are half-open ranges (left inclusive, right exclusive).
    Assumes the input string is a valid f-string, but will not crash if the input
    string is invalid.
    """
    stack: List[int] = []  # our curly paren stack
    i = 0
    while i < len(s):
        if s[i] == "{":
            # if we're in a string part of the f-string, ignore escaped curly braces
            if not stack and i + 1 < len(s) and s[i + 1] == "{":
                i += 2
                continue
            stack.append(i)
            i += 1
            continue

        if s[i] == "}":
            if not stack:
                i += 1
                continue
            j = stack.pop()
            # we've made it back out of the expression! yield the span
            if not stack:
                yield (j, i + 1)
            i += 1
            continue

        # if we're in an expression part of the f-string, fast forward through strings
        # note that backslashes are not legal in the expression portion of f-strings
        if stack:
            delim = None
            if s[i : i + 3] in ("'''", '"""'):
                delim = s[i : i + 3]
            elif s[i] in ("'", '"'):
                delim = s[i]
            if delim:
                i += len(delim)
                while i < len(s) and s[i : i + len(delim)] != delim:
                    i += 1
                i += len(delim)
                continue
        i += 1
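

# Quick illustration (not part of the public API): for the leaf value
# 'f"x={x} y={{z}}"', iter_fexpr_spans yields a single span (4, 7) covering
# "{x}"; the escaped "{{z}}" is skipped, so fstring_contains_expr (below)
# returns True because of "{x}" alone.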


def fstring_contains_expr(s: str) -> bool:
    return any(iter_fexpr_spans(s))


def _toggle_fexpr_quotes(fstring: str, old_quote: str) -> str:
    """
    Toggles quotes used in f-string expressions that are `old_quote`.

    f-string expressions can't contain backslashes, so we need to toggle the
    quotes if the f-string itself will end up using the same quote. We can
    simply toggle without escaping because quotes can't be reused in f-string
    expressions. They will fail to parse.

    NOTE: If PEP 701 is accepted, the above statement will no longer be true.
    Though if quotes can be reused, we can simply reuse them without updates or
    escaping, once Black figures out how to parse the new grammar.
    """
    new_quote = "'" if old_quote == '"' else '"'
    parts = []
    previous_index = 0
    for start, end in iter_fexpr_spans(fstring):
        parts.append(fstring[previous_index:start])
        parts.append(fstring[start:end].replace(old_quote, new_quote))
        previous_index = end
    parts.append(fstring[previous_index:])
    return "".join(parts)
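

# Illustrative example: toggling double quotes in the expression part of
# f"{d["k"]}" yields f"{d['k']}"; only the quotes inside the {...} spans
# flip, the outer quotes are left untouched.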


class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
    """
    StringTransformer that splits "atom" strings (i.e. strings which exist on
    lines by themselves).

    Requirements:
        * The line consists ONLY of a single string (possibly prefixed by a
          string operator [e.g. '+' or '==']), MAYBE a string trailer, and MAYBE
          a trailing comma.
            AND
        * All of the requirements listed in BaseStringSplitter's docstring.

    Transformations:
        The string mentioned in the 'Requirements' section is split into as
        many substrings as necessary to adhere to the configured line length.

        In the final set of substrings, no substring should be smaller than
        MIN_SUBSTR_SIZE characters.

        The string will ONLY be split on spaces (i.e. each new substring should
        start with a space). Note that the string will NOT be split on a space
        which is escaped with a backslash.

        If the string is an f-string, it will NOT be split in the middle of an
        f-expression (e.g. in f"FooBar: {foo() if x else bar()}", {foo() if x
        else bar()} is an f-expression).

        If the string that is being split has an associated set of custom split
        records and those custom splits will NOT result in any line going over
        the configured line length, those custom splits are used. Otherwise the
        string is split as late as possible (from left-to-right) while still
        adhering to the transformation rules listed above.

    Collaborations:
        StringSplitter relies on StringMerger to construct the appropriate
        CustomSplit objects and add them to the custom split map.
    """

    MIN_SUBSTR_SIZE: Final = 6
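
    # Rough before/after sketch (line length permitting): a lone long string
    #
    #     "This is a very long sentence that will not fit on one line."
    #
    # is re-emitted as implicitly concatenated substrings, one per line, each
    # new substring starting at a space, e.g.
    #
    #     "This is a very long sentence"
    #     " that will not fit on one line."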
1333 | ||
1334 | def do_splitter_match(self, line: Line) -> TMatchResult: | |
1335 | LL = line.leaves | |
1336 | ||
1337 | if self._prefer_paren_wrap_match(LL) is not None: | |
1338 | return TErr("Line needs to be wrapped in parens first.") | |
1339 | ||
1340 | is_valid_index = is_valid_index_factory(LL) | |
1341 | ||
1342 | idx = 0 | |
1343 | ||
1344 | # The first two leaves MAY be the 'not in' keywords... | |
1345 | if ( | |
1346 | is_valid_index(idx) | |
1347 | and is_valid_index(idx + 1) | |
1348 | and [LL[idx].type, LL[idx + 1].type] == [token.NAME, token.NAME] | |
1349 | and str(LL[idx]) + str(LL[idx + 1]) == "not in" | |
1350 | ): | |
1351 | idx += 2 | |
1352 | # Else the first leaf MAY be a string operator symbol or the 'in' keyword... | |
1353 | elif is_valid_index(idx) and ( | |
1354 | LL[idx].type in self.STRING_OPERATORS | |
1355 | or LL[idx].type == token.NAME | |
1356 | and str(LL[idx]) == "in" | |
1357 | ): | |
1358 | idx += 1 | |
1359 | ||
1360 | # The next/first leaf MAY be an empty LPAR... | |
1361 | if is_valid_index(idx) and is_empty_lpar(LL[idx]): | |
1362 | idx += 1 | |
1363 | ||
1364 | # The next/first leaf MUST be a string... | |
1365 | if not is_valid_index(idx) or LL[idx].type != token.STRING: | |
1366 | return TErr("Line does not start with a string.") | |
1367 | ||
1368 | string_idx = idx | |
1369 | ||
1370 | # Skip the string trailer, if one exists. | |
1371 | string_parser = StringParser() | |
1372 | idx = string_parser.parse(LL, string_idx) | |
1373 | ||
1374 | # That string MAY be followed by an empty RPAR... | |
1375 | if is_valid_index(idx) and is_empty_rpar(LL[idx]): | |
1376 | idx += 1 | |
1377 | ||
1378 | # That string / empty RPAR leaf MAY be followed by a comma... | |
1379 | if is_valid_index(idx) and LL[idx].type == token.COMMA: | |
1380 | idx += 1 | |
1381 | ||
1382 | # But no more leaves are allowed... | |
1383 | if is_valid_index(idx): | |
1384 | return TErr("This line does not end with a string.") | |
1385 | ||
1386 | return Ok([string_idx]) | |
1387 | ||
1388 | def do_transform( | |
1389 | self, line: Line, string_indices: List[int] | |
1390 | ) -> Iterator[TResult[Line]]: | |
1391 | LL = line.leaves | |
1392 | assert len(string_indices) == 1, ( | |
1393 | f"{self.__class__.__name__} should only find one match at a time, found" | |
1394 | f" {len(string_indices)}" | |
1395 | ) | |
1396 | string_idx = string_indices[0] | |
1397 | ||
1398 | QUOTE = LL[string_idx].value[-1] | |
1399 | ||
1400 | is_valid_index = is_valid_index_factory(LL) | |
1401 | insert_str_child = insert_str_child_factory(LL[string_idx]) | |
1402 | ||
1403 | prefix = get_string_prefix(LL[string_idx].value).lower() | |
1404 | ||
1405 | # We MAY choose to drop the 'f' prefix from substrings that don't | |
1406 | # contain any f-expressions, but ONLY if the original f-string | |
1407 | # contains at least one f-expression. Otherwise, we will alter the AST | |
1408 | # of the program. | |
1409 | drop_pointless_f_prefix = ("f" in prefix) and fstring_contains_expr( | |
1410 | LL[string_idx].value | |
1411 | ) | |
1412 | ||
1413 | first_string_line = True | |
1414 | ||
1415 | string_op_leaves = self._get_string_operator_leaves(LL) | |
1416 | string_op_leaves_length = ( | |
1417 | sum(len(str(prefix_leaf)) for prefix_leaf in string_op_leaves) + 1 | |
1418 | if string_op_leaves | |
1419 | else 0 | |
1420 | ) | |
1421 | ||
1422 | def maybe_append_string_operators(new_line: Line) -> None: | |
1423 | """ | |
1424 | Side Effects: | |
1425 | If @line starts with a string operator and this is the first | |
1426 | line we are constructing, this function appends the string | |
1427 | operator to @new_line and replaces the old string operator leaf | |
1428 | in the node structure. Otherwise this function does nothing. | |
1429 | """ | |
1430 | maybe_prefix_leaves = string_op_leaves if first_string_line else [] | |
1431 | for i, prefix_leaf in enumerate(maybe_prefix_leaves): | |
1432 | replace_child(LL[i], prefix_leaf) | |
1433 | new_line.append(prefix_leaf) | |
1434 | ||
1435 | ends_with_comma = ( | |
1436 | is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA | |
1437 | ) | |
1438 | ||
1439 | def max_last_string_column() -> int: | |
1440 | """ | |
1441 | Returns: | |
1442 | The max allowed width of the string value used for the last | |
1443 | line we will construct. Note that this value means the width | |
1444 | rather than the number of characters (e.g., many East Asian | |
1445 | characters expand to two columns). | |
1446 | """ | |
1447 | result = self.line_length | |
1448 | result -= line.depth * 4 | |
1449 | result -= 1 if ends_with_comma else 0 | |
1450 | result -= string_op_leaves_length | |
1451 | return result | |
1452 | ||
1453 | # --- Calculate Max Break Width (for string value) | |
1454 | # We start with the line length limit | |
1455 | max_break_width = self.line_length | |
1456 | # The last index of a string of length N is N-1. | |
1457 | max_break_width -= 1 | |
1458 | # Leading whitespace is not present in the string value (e.g. Leaf.value). | |
1459 | max_break_width -= line.depth * 4 | |
1460 | if max_break_width < 0: | |
1461 | yield TErr( | |
1462 | f"Unable to split {LL[string_idx].value} at such high of a line depth:" | |
1463 | f" {line.depth}" | |
1464 | ) | |
1465 | return | |
1466 | ||
1467 | # Check if StringMerger registered any custom splits. | |
1468 | custom_splits = self.pop_custom_splits(LL[string_idx].value) | |
1469 | # We use them ONLY if none of them would produce lines that exceed the | |
1470 | # line limit. | |
1471 | use_custom_breakpoints = bool( | |
1472 | custom_splits | |
1473 | and all(csplit.break_idx <= max_break_width for csplit in custom_splits) | |
1474 | ) | |
1475 | ||
1476 | # Temporary storage for the remaining chunk of the string line that | |
1477 | # can't fit onto the line currently being constructed. | |
1478 | rest_value = LL[string_idx].value | |
1479 | ||
1480 | def more_splits_should_be_made() -> bool: | |
1481 | """ | |
1482 | Returns: | |
1483 | True iff `rest_value` (the remaining string value from the last | |
1484 | split), should be split again. | |
1485 | """ | |
1486 | if use_custom_breakpoints: | |
1487 | return len(custom_splits) > 1 | |
1488 | else: | |
1489 | return str_width(rest_value) > max_last_string_column() | |
1490 | ||
1491 | string_line_results: List[Ok[Line]] = [] | |
1492 | while more_splits_should_be_made(): | |
1493 | if use_custom_breakpoints: | |
1494 | # Custom User Split (manual) | |
1495 | csplit = custom_splits.pop(0) | |
1496 | break_idx = csplit.break_idx | |
1497 | else: | |
1498 | # Algorithmic Split (automatic) | |
1499 | max_bidx = ( | |
1500 | count_chars_in_width(rest_value, max_break_width) | |
1501 | - string_op_leaves_length | |
1502 | ) | |
1503 | maybe_break_idx = self._get_break_idx(rest_value, max_bidx) | |
1504 | if maybe_break_idx is None: | |
1505 | # If we are unable to algorithmically determine a good split | |
1506 | # and this string has custom splits registered to it, we | |
1507 | # fall back to using them--which means we have to start | |
1508 | # over from the beginning. | |
1509 | if custom_splits: | |
1510 | rest_value = LL[string_idx].value | |
1511 | string_line_results = [] | |
1512 | first_string_line = True | |
1513 | use_custom_breakpoints = True | |
1514 | continue | |
1515 | ||
1516 | # Otherwise, we stop splitting here. | |
1517 | break | |
1518 | ||
1519 | break_idx = maybe_break_idx | |
1520 | ||
1521 | # --- Construct `next_value` | |
1522 | next_value = rest_value[:break_idx] + QUOTE | |
1523 | ||
1524 | # HACK: The following 'if' statement is a hack to fix the custom | |
1525 | # breakpoint index in the case of either: (a) substrings that were | |
1526 | # f-strings but will have the 'f' prefix removed OR (b) substrings | |
1527 | # that were not f-strings but will now become f-strings because of | |
1528 | # redundant use of the 'f' prefix (i.e. none of the substrings | |
1529 | # contain f-expressions but one or more of them had the 'f' prefix | |
1530 | # anyway; in which case, we will prepend 'f' to _all_ substrings). | |
1531 | # | |
1532 | # There is probably a better way to accomplish what is being done | |
1533 | # here... | |
1534 | # | |
1535 | # If this substring is an f-string, we _could_ remove the 'f' | |
1536 | # prefix, and the current custom split did NOT originally use a | |
1537 | # prefix... | |
1538 | if ( | |
1539 | use_custom_breakpoints | |
1540 | and not csplit.has_prefix | |
1541 | and ( | |
1542 | # `next_value == prefix + QUOTE` happens when the custom | |
1543 | # split is an empty string. | |
1544 | next_value == prefix + QUOTE | |
1545 | or next_value != self._normalize_f_string(next_value, prefix) | |
1546 | ) | |
1547 | ): | |
1548 | # Then `csplit.break_idx` will be off by one after removing | |
1549 | # the 'f' prefix. | |
1550 | break_idx += 1 | |
1551 | next_value = rest_value[:break_idx] + QUOTE | |
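| # For illustration (one of the cases handled above, hypothetical values): | |
| # if rest_value still carries a pointless 'f' prefix that is about to be | |
| # dropped, a custom break index that was recorded against the unprefixed | |
| # text lands one character too early in rest_value; hence the += 1 above. | |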
1552 | ||
1553 | if drop_pointless_f_prefix: | |
1554 | next_value = self._normalize_f_string(next_value, prefix) | |
1555 | ||
1556 | # --- Construct `next_leaf` | |
1557 | next_leaf = Leaf(token.STRING, next_value) | |
1558 | insert_str_child(next_leaf) | |
1559 | self._maybe_normalize_string_quotes(next_leaf) | |
1560 | ||
1561 | # --- Construct `next_line` | |
1562 | next_line = line.clone() | |
1563 | maybe_append_string_operators(next_line) | |
1564 | next_line.append(next_leaf) | |
1565 | string_line_results.append(Ok(next_line)) | |
1566 | ||
1567 | rest_value = prefix + QUOTE + rest_value[break_idx:] | |
1568 | first_string_line = False | |
1569 | ||
1570 | yield from string_line_results | |
1571 | ||
1572 | if drop_pointless_f_prefix: | |
1573 | rest_value = self._normalize_f_string(rest_value, prefix) | |
1574 | ||
1575 | rest_leaf = Leaf(token.STRING, rest_value) | |
1576 | insert_str_child(rest_leaf) | |
1577 | ||
1578 | # NOTE: No test case was found that exercises the following line, but it | |
1579 | # appears to be necessary: without it, the quotes of the last substring | |
1580 | # would not be normalized. | |
1581 | self._maybe_normalize_string_quotes(rest_leaf) | |
1582 | ||
1583 | last_line = line.clone() | |
1584 | maybe_append_string_operators(last_line) | |
1585 | ||
1586 | # If there are any leaves to the right of the target string... | |
1587 | if is_valid_index(string_idx + 1): | |
1588 | # We use `temp_value` here to determine how long the last line | |
1589 | # would be if we were to append all the leaves to the right of the | |
1590 | # target string to the last string line. | |
1591 | temp_value = rest_value | |
1592 | for leaf in LL[string_idx + 1 :]: | |
1593 | temp_value += str(leaf) | |
1594 | if leaf.type == token.LPAR: | |
1595 | break | |
1596 | ||
1597 | # Try to fit them all on the same line with the last substring... | |
1598 | if ( | |
1599 | str_width(temp_value) <= max_last_string_column() | |
1600 | or LL[string_idx + 1].type == token.COMMA | |
1601 | ): | |
1602 | last_line.append(rest_leaf) | |
1603 | append_leaves(last_line, line, LL[string_idx + 1 :]) | |
1604 | yield Ok(last_line) | |
1605 | # Otherwise, place the last substring on one line and everything | |
1606 | # else on a line below that... | |
1607 | else: | |
1608 | last_line.append(rest_leaf) | |
1609 | yield Ok(last_line) | |
1610 | ||
1611 | non_string_line = line.clone() | |
1612 | append_leaves(non_string_line, line, LL[string_idx + 1 :]) | |
1613 | yield Ok(non_string_line) | |
1614 | # Else the target string was the last leaf... | |
1615 | else: | |
1616 | last_line.append(rest_leaf) | |
1617 | last_line.comments = line.comments.copy() | |
1618 | yield Ok(last_line) | |
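| # A sketch of the overall effect (illustrative, not an exact trace): an | |
| # over-long string leaf such as "aaaa bbbb cccc" is emitted as several | |
| # adjacent string lines, roughly | |
| #     "aaaa " | |
| #     "bbbb " | |
| #     "cccc" | |
| # relying on implicit concatenation, with any trailing leaves handled by | |
| # the last-line logic above. | |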
1619 | ||
1620 | def _iter_nameescape_slices(self, string: str) -> Iterator[Tuple[Index, Index]]: | |
1621 | """ | |
1622 | Yields: | |
1623 | All ranges of @string which, if @string were to be split there, | |
1624 | would result in the splitting of an \\N{...} expression (which is NOT | |
1625 | allowed). | |
1626 | """ | |
1627 | # True - the previous backslash was unescaped | |
1628 | # False - the previous backslash was escaped *or* there was no backslash | |
1629 | previous_was_unescaped_backslash = False | |
1630 | it = iter(enumerate(string)) | |
1631 | for idx, c in it: | |
1632 | if c == "\\": | |
1633 | previous_was_unescaped_backslash = not previous_was_unescaped_backslash | |
1634 | continue | |
1635 | if not previous_was_unescaped_backslash or c != "N": | |
1636 | previous_was_unescaped_backslash = False | |
1637 | continue | |
1638 | previous_was_unescaped_backslash = False | |
1639 | ||
1640 | begin = idx - 1 # the position of the backslash before \N{...} | |
1641 | for idx, c in it: | |
1642 | if c == "}": | |
1643 | end = idx | |
1644 | break | |
1645 | else: | |
1646 | # malformed nameescape expression? | |
1647 | # should have been detected by AST parsing earlier... | |
1648 | raise RuntimeError(f"{self.__class__.__name__} LOGIC ERROR!") | |
1649 | yield begin, end | |
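| # For illustration: for a leaf value like '"\N{GREEK SMALL LETTER ALPHA}!"', | |
| # this yields one (begin, end) span covering the entire \N{...} escape, so | |
| # that no break index may fall inside it. | |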
1650 | ||
1651 | def _iter_fexpr_slices(self, string: str) -> Iterator[Tuple[Index, Index]]: | |
1652 | """ | |
1653 | Yields: | |
1654 | All ranges of @string which, if @string were to be split there, | |
1655 | would result in the splitting of an f-expression (which is NOT | |
1656 | allowed). | |
1657 | """ | |
1658 | if "f" not in get_string_prefix(string).lower(): | |
1659 | return | |
1660 | yield from iter_fexpr_spans(string) | |
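| # For illustration: for 'f"hi {name}!"' this yields the span of "{name}", | |
| # the f-expression that must never be split across substrings. | |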
1661 | ||
1662 | def _get_illegal_split_indices(self, string: str) -> Set[Index]: | |
1663 | illegal_indices: Set[Index] = set() | |
1664 | iterators = [ | |
1665 | self._iter_fexpr_slices(string), | |
1666 | self._iter_nameescape_slices(string), | |
1667 | ] | |
1668 | for it in iterators: | |
1669 | for begin, end in it: | |
1670 | illegal_indices.update(range(begin, end + 1)) | |
1671 | return illegal_indices | |
1672 | ||
1673 | def _get_break_idx(self, string: str, max_break_idx: int) -> Optional[int]: | |
1674 | """ | |
1675 | This method contains the algorithm that StringSplitter uses to | |
1676 | determine which character to split each string at. | |
1677 | ||
1678 | Args: | |
1679 | @string: The substring that we are attempting to split. | |
1680 | @max_break_idx: The ideal break index. We will return this value if it | |
1681 | meets all the necessary conditions. In the likely event that it | |
1682 | doesn't we will try to find the closest index BELOW @max_break_idx | |
1683 | that does. If that fails, we will expand our search by also | |
1684 | considering all valid indices ABOVE @max_break_idx. | |
1685 | ||
1686 | Pre-Conditions: | |
1687 | * assert_is_leaf_string(@string) | |
1688 | * 0 <= @max_break_idx < len(@string) | |
1689 | ||
1690 | Returns: | |
1691 | break_idx, if an index is able to be found that meets all of the | |
1692 | conditions listed in the 'Transformations' section of this class's | |
1693 | docstring. | |
1694 | OR | |
1695 | None, otherwise. | |
1696 | """ | |
1697 | is_valid_index = is_valid_index_factory(string) | |
1698 | ||
1699 | assert is_valid_index(max_break_idx) | |
1700 | assert_is_leaf_string(string) | |
1701 | ||
1702 | _illegal_split_indices = self._get_illegal_split_indices(string) | |
1703 | ||
1704 | def breaks_unsplittable_expression(i: Index) -> bool: | |
1705 | """ | |
1706 | Returns: | |
1707 | True iff returning @i would result in the splitting of an | |
1708 | unsplittable expression (which is NOT allowed). | |
1709 | """ | |
1710 | return i in _illegal_split_indices | |
1711 | ||
1712 | def passes_all_checks(i: Index) -> bool: | |
1713 | """ | |
1714 | Returns: | |
1715 | True iff ALL of the conditions listed in the 'Transformations' | |
1716 | section of this class's docstring would be met by returning @i. | |
1717 | """ | |
1718 | is_space = string[i] == " " | |
1719 | is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS | |
1720 | ||
1721 | is_not_escaped = True | |
1722 | j = i - 1 | |
1723 | while is_valid_index(j) and string[j] == "\\": | |
1724 | is_not_escaped = not is_not_escaped | |
1725 | j -= 1 | |
1726 | ||
1727 | is_big_enough = ( | |
1728 | len(string[i:]) >= self.MIN_SUBSTR_SIZE | |
1729 | and len(string[:i]) >= self.MIN_SUBSTR_SIZE | |
1730 | ) | |
1731 | return ( | |
1732 | (is_space or is_split_safe) | |
1733 | and is_not_escaped | |
1734 | and is_big_enough | |
1735 | and not breaks_unsplittable_expression(i) | |
1736 | ) | |
1737 | ||
1738 | # First, we check all indices BELOW @max_break_idx. | |
1739 | break_idx = max_break_idx | |
1740 | while is_valid_index(break_idx - 1) and not passes_all_checks(break_idx): | |
1741 | break_idx -= 1 | |
1742 | ||
1743 | if not passes_all_checks(break_idx): | |
1744 | # If that fails, we check all indices ABOVE @max_break_idx. | |
1745 | # | |
1746 | # If we are able to find a valid index here, the next line is going | |
1747 | # to be longer than the specified line length, but it's probably | |
1748 | # better than doing nothing at all. | |
1749 | break_idx = max_break_idx + 1 | |
1750 | while is_valid_index(break_idx + 1) and not passes_all_checks(break_idx): | |
1751 | break_idx += 1 | |
1752 | ||
1753 | if not is_valid_index(break_idx) or not passes_all_checks(break_idx): | |
1754 | return None | |
1755 | ||
1756 | return break_idx | |
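| # A hedged example (assuming MIN_SUBSTR_SIZE permits both halves): for a | |
| # leaf value like '"hello there world"' with max_break_idx near the middle, | |
| # the index of a nearby space is returned; if nothing at or below | |
| # max_break_idx passes, the upward search may return an index that | |
| # overshoots the line length rather than giving up entirely. | |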
1757 | ||
1758 | def _maybe_normalize_string_quotes(self, leaf: Leaf) -> None: | |
1759 | if self.normalize_strings: | |
1760 | leaf.value = normalize_string_quotes(leaf.value) | |
1761 | ||
1762 | def _normalize_f_string(self, string: str, prefix: str) -> str: | |
1763 | """ | |
1764 | Pre-Conditions: | |
1765 | * assert_is_leaf_string(@string) | |
1766 | ||
1767 | Returns: | |
1768 | * If @string is an f-string that contains no f-expressions, we | |
1769 | return a string identical to @string except that the 'f' prefix | |
1770 | has been stripped and all double braces (i.e. '{{' or '}}') have | |
1771 | been normalized (i.e. turned into '{' or '}'). | |
1772 | OR | |
1773 | * Otherwise, we return @string. | |
1774 | """ | |
1775 | assert_is_leaf_string(string) | |
1776 | ||
1777 | if "f" in prefix and not fstring_contains_expr(string): | |
1778 | new_prefix = prefix.replace("f", "") | |
1779 | ||
1780 | temp = string[len(prefix) :] | |
1781 | temp = re.sub(r"\{\{", "{", temp) | |
1782 | temp = re.sub(r"\}\}", "}", temp) | |
1783 | new_string = temp | |
1784 | ||
1785 | return f"{new_prefix}{new_string}" | |
1786 | else: | |
1787 | return string | |
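| # For illustration: _normalize_f_string('f"{{x}}"', "f") returns '"{x}"' | |
| # (prefix dropped, double braces unescaped), whereas 'f"{x}"' is returned | |
| # unchanged because it contains an f-expression. | |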
1788 | ||
1789 | def _get_string_operator_leaves(self, leaves: Iterable[Leaf]) -> List[Leaf]: | |
1790 | LL = list(leaves) | |
1791 | ||
1792 | string_op_leaves = [] | |
1793 | i = 0 | |
1794 | while LL[i].type in self.STRING_OPERATORS + [token.NAME]: | |
1795 | prefix_leaf = Leaf(LL[i].type, str(LL[i]).strip()) | |
1796 | string_op_leaves.append(prefix_leaf) | |
1797 | i += 1 | |
1798 | return string_op_leaves | |
1799 | ||
1800 | ||
1801 | class StringParenWrapper(BaseStringSplitter, CustomSplitMapMixin): | |
1802 | """ | |
1803 | StringTransformer that wraps strings in parens and then splits at the LPAR. | |
1804 | ||
1805 | Requirements: | |
1806 | All of the requirements listed in BaseStringSplitter's docstring in | |
1807 | addition to the requirements listed below: | |
1808 | ||
1809 | * The line is a return/yield statement, which returns/yields a string. | |
1810 | OR | |
1811 | * The line is part of a ternary expression (e.g. `x = y if cond else | |
1812 | z`) such that the line starts with `else <string>`, where <string> is | |
1813 | some string. | |
1814 | OR | |
1815 | * The line is an assert statement, which ends with a string. | |
1816 | OR | |
1817 | * The line is an assignment statement (e.g. `x = <string>` or `x += | |
1818 | <string>`) such that the variable is being assigned the value of some | |
1819 | string. | |
1820 | OR | |
1821 | * The line is a dictionary key assignment where some valid key is being | |
1822 | assigned the value of some string. | |
1823 | OR | |
1824 | * The line is a lambda expression and the value is a string. | |
1825 | OR | |
1826 | * The line starts with an "atom" string that prefers to be wrapped in | |
1827 | parens. Wrapping is preferred when the string is an immediate child of | |
1828 | a list/set/tuple literal AND is surrounded by commas (or is | |
1829 | the first/last child). | |
1830 | ||
1831 | Transformations: | |
1832 | The chosen string is wrapped in parentheses and then split at the LPAR. | |
1833 | ||
1834 | We then have one line which ends with an LPAR and another line that | |
1835 | starts with the chosen string. The latter line is then split again at | |
1836 | the RPAR. This results in the RPAR (and possibly a trailing comma) | |
1837 | being placed on its own line. | |
1838 | ||
1839 | NOTE: If any leaves exist to the right of the chosen string (except | |
1840 | for a trailing comma, which would be placed after the RPAR), those | |
1841 | leaves are placed inside the parentheses. In effect, the chosen | |
1842 | string is not necessarily being "wrapped" by parentheses. We can, | |
1843 | however, count on the LPAR being placed directly before the chosen | |
1844 | string. | |
1845 | ||
1846 | In other words, StringParenWrapper creates "atom" strings. These | |
1847 | can then be split again by StringSplitter, if necessary. | |
1848 | ||
1849 | Collaborations: | |
1850 | In the event that a string line split by StringParenWrapper is | |
1851 | changed such that it no longer needs to be given its own line, | |
1852 | StringParenWrapper relies on StringParenStripper to clean up the | |
1853 | parentheses it created. | |
1854 | ||
1855 | For "atom" strings that prefers to be wrapped in parens, it requires | |
1856 | StringSplitter to hold the split until the string is wrapped in parens. | |
1857 | """ | |
1858 | ||
1859 | def do_splitter_match(self, line: Line) -> TMatchResult: | |
1860 | LL = line.leaves | |
1861 | ||
1862 | if line.leaves[-1].type in OPENING_BRACKETS: | |
1863 | return TErr( | |
1864 | "Cannot wrap parens around a line that ends in an opening bracket." | |
1865 | ) | |
1866 | ||
1867 | string_idx = ( | |
1868 | self._return_match(LL) | |
1869 | or self._else_match(LL) | |
1870 | or self._assert_match(LL) | |
1871 | or self._assign_match(LL) | |
1872 | or self._dict_or_lambda_match(LL) | |
1873 | or self._prefer_paren_wrap_match(LL) | |
1874 | ) | |
1875 | ||
1876 | if string_idx is not None: | |
1877 | string_value = line.leaves[string_idx].value | |
1878 | # If the string has neither spaces nor East Asian stops... | |
1879 | if not any( | |
1880 | char == " " or char in SPLIT_SAFE_CHARS for char in string_value | |
1881 | ): | |
1882 | # And will still violate the line length limit when split... | |
1883 | max_string_width = self.line_length - ((line.depth + 1) * 4) | |
1884 | if str_width(string_value) > max_string_width: | |
1885 | # And has no associated custom splits... | |
1886 | if not self.has_custom_splits(string_value): | |
1887 | # Then we should NOT put this string on its own line. | |
1888 | return TErr( | |
1889 | "We do not wrap long strings in parentheses when the" | |
1890 | " resultant line would still be over the specified line" | |
1891 | " length and can't be split further by StringSplitter." | |
1892 | ) | |
1893 | return Ok([string_idx]) | |
1894 | ||
1895 | return TErr("This line does not contain any non-atomic strings.") | |
1896 | ||
1897 | @staticmethod | |
1898 | def _return_match(LL: List[Leaf]) -> Optional[int]: | |
1899 | """ | |
1900 | Returns: | |
1901 | string_idx such that @LL[string_idx] is equal to our target (i.e. | |
1902 | matched) string, if this line matches the return/yield statement | |
1903 | requirements listed in the 'Requirements' section of this class's | |
1904 | docstring. | |
1905 | OR | |
1906 | None, otherwise. | |
1907 | """ | |
1908 | # If this line is a part of a return/yield statement and the first leaf | |
1909 | # contains either the "return" or "yield" keywords... | |
1910 | if parent_type(LL[0]) in [syms.return_stmt, syms.yield_expr] and LL[ | |
1911 | 0 | |
1912 | ].value in ["return", "yield"]: | |
1913 | is_valid_index = is_valid_index_factory(LL) | |
1914 | ||
1915 | idx = 2 if is_valid_index(1) and is_empty_par(LL[1]) else 1 | |
1916 | # The next visible leaf MUST contain a string... | |
1917 | if is_valid_index(idx) and LL[idx].type == token.STRING: | |
1918 | return idx | |
1919 | ||
1920 | return None | |
1921 | ||
1922 | @staticmethod | |
1923 | def _else_match(LL: List[Leaf]) -> Optional[int]: | |
1924 | """ | |
1925 | Returns: | |
1926 | string_idx such that @LL[string_idx] is equal to our target (i.e. | |
1927 | matched) string, if this line matches the ternary expression | |
1928 | requirements listed in the 'Requirements' section of this class's | |
1929 | docstring. | |
1930 | OR | |
1931 | None, otherwise. | |
1932 | """ | |
1933 | # If this line is a part of a ternary expression and the first leaf | |
1934 | # contains the "else" keyword... | |
1935 | if ( | |
1936 | parent_type(LL[0]) == syms.test | |
1937 | and LL[0].type == token.NAME | |
1938 | and LL[0].value == "else" | |
1939 | ): | |
1940 | is_valid_index = is_valid_index_factory(LL) | |
1941 | ||
1942 | idx = 2 if is_valid_index(1) and is_empty_par(LL[1]) else 1 | |
1943 | # The next visible leaf MUST contain a string... | |
1944 | if is_valid_index(idx) and LL[idx].type == token.STRING: | |
1945 | return idx | |
1946 | ||
1947 | return None | |
1948 | ||
1949 | @staticmethod | |
1950 | def _assert_match(LL: List[Leaf]) -> Optional[int]: | |
1951 | """ | |
1952 | Returns: | |
1953 | string_idx such that @LL[string_idx] is equal to our target (i.e. | |
1954 | matched) string, if this line matches the assert statement | |
1955 | requirements listed in the 'Requirements' section of this class's | |
1956 | docstring. | |
1957 | OR | |
1958 | None, otherwise. | |
1959 | """ | |
1960 | # If this line is a part of an assert statement and the first leaf | |
1961 | # contains the "assert" keyword... | |
1962 | if parent_type(LL[0]) == syms.assert_stmt and LL[0].value == "assert": | |
1963 | is_valid_index = is_valid_index_factory(LL) | |
1964 | ||
1965 | for i, leaf in enumerate(LL): | |
1966 | # We MUST find a comma... | |
1967 | if leaf.type == token.COMMA: | |
1968 | idx = i + 2 if is_empty_par(LL[i + 1]) else i + 1 | |
1969 | ||
1970 | # That comma MUST be followed by a string... | |
1971 | if is_valid_index(idx) and LL[idx].type == token.STRING: | |
1972 | string_idx = idx | |
1973 | ||
1974 | # Skip the string trailer, if one exists. | |
1975 | string_parser = StringParser() | |
1976 | idx = string_parser.parse(LL, string_idx) | |
1977 | ||
1978 | # But no more leaves are allowed... | |
1979 | if not is_valid_index(idx): | |
1980 | return string_idx | |
1981 | ||
1982 | return None | |
1983 | ||
1984 | @staticmethod | |
1985 | def _assign_match(LL: List[Leaf]) -> Optional[int]: | |
1986 | """ | |
1987 | Returns: | |
1988 | string_idx such that @LL[string_idx] is equal to our target (i.e. | |
1989 | matched) string, if this line matches the assignment statement | |
1990 | requirements listed in the 'Requirements' section of this class's | |
1991 | docstring. | |
1992 | OR | |
1993 | None, otherwise. | |
1994 | """ | |
1995 | # If this line is a part of an expression statement or is a function | |
1996 | # argument AND the first leaf contains a variable name... | |
1997 | if ( | |
1998 | parent_type(LL[0]) in [syms.expr_stmt, syms.argument, syms.power] | |
1999 | and LL[0].type == token.NAME | |
2000 | ): | |
2001 | is_valid_index = is_valid_index_factory(LL) | |
2002 | ||
2003 | for i, leaf in enumerate(LL): | |
2004 | # We MUST find either an '=' or '+=' symbol... | |
2005 | if leaf.type in [token.EQUAL, token.PLUSEQUAL]: | |
2006 | idx = i + 2 if is_empty_par(LL[i + 1]) else i + 1 | |
2007 | ||
2008 | # That symbol MUST be followed by a string... | |
2009 | if is_valid_index(idx) and LL[idx].type == token.STRING: | |
2010 | string_idx = idx | |
2011 | ||
2012 | # Skip the string trailer, if one exists. | |
2013 | string_parser = StringParser() | |
2014 | idx = string_parser.parse(LL, string_idx) | |
2015 | ||
2016 | # The next leaf MAY be a comma iff this line is a part | |
2017 | # of a function argument... | |
2018 | if ( | |
2019 | parent_type(LL[0]) == syms.argument | |
2020 | and is_valid_index(idx) | |
2021 | and LL[idx].type == token.COMMA | |
2022 | ): | |
2023 | idx += 1 | |
2024 | ||
2025 | # But no more leaves are allowed... | |
2026 | if not is_valid_index(idx): | |
2027 | return string_idx | |
2028 | ||
2029 | return None | |
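| # For illustration: in a line whose leaves spell `x = "some long string"`, | |
| # this matcher returns 2, the index of the string leaf following the '='. | |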
2030 | ||
2031 | @staticmethod | |
2032 | def _dict_or_lambda_match(LL: List[Leaf]) -> Optional[int]: | |
2033 | """ | |
2034 | Returns: | |
2035 | string_idx such that @LL[string_idx] is equal to our target (i.e. | |
2036 | matched) string, if this line matches the dictionary key assignment | |
2037 | statement or lambda expression requirements listed in the | |
2038 | 'Requirements' section of this class's docstring. | |
2039 | OR | |
2040 | None, otherwise. | |
2041 | """ | |
2042 | # If this line is a part of a dictionary key assignment or lambda expression... | |
2043 | parent_types = [parent_type(LL[0]), parent_type(LL[0].parent)] | |
2044 | if syms.dictsetmaker in parent_types or syms.lambdef in parent_types: | |
2045 | is_valid_index = is_valid_index_factory(LL) | |
2046 | ||
2047 | for i, leaf in enumerate(LL): | |
2048 | # We MUST find a colon; it can be either the dict's or the lambda's colon... | |
2049 | if leaf.type == token.COLON and i < len(LL) - 1: | |
2050 | idx = i + 2 if is_empty_par(LL[i + 1]) else i + 1 | |
2051 | ||
2052 | # That colon MUST be followed by a string... | |
2053 | if is_valid_index(idx) and LL[idx].type == token.STRING: | |
2054 | string_idx = idx | |
2055 | ||
2056 | # Skip the string trailer, if one exists. | |
2057 | string_parser = StringParser() | |
2058 | idx = string_parser.parse(LL, string_idx) | |
2059 | ||
2060 | # That string MAY be followed by a comma... | |
2061 | if is_valid_index(idx) and LL[idx].type == token.COMMA: | |
2062 | idx += 1 | |
2063 | ||
2064 | # But no more leaves are allowed... | |
2065 | if not is_valid_index(idx): | |
2066 | return string_idx | |
2067 | ||
2068 | return None | |
2069 | ||
2070 | def do_transform( | |
2071 | self, line: Line, string_indices: List[int] | |
2072 | ) -> Iterator[TResult[Line]]: | |
2073 | LL = line.leaves | |
2074 | assert len(string_indices) == 1, ( | |
2075 | f"{self.__class__.__name__} should only find one match at a time, found" | |
2076 | f" {len(string_indices)}" | |
2077 | ) | |
2078 | string_idx = string_indices[0] | |
2079 | ||
2080 | is_valid_index = is_valid_index_factory(LL) | |
2081 | insert_str_child = insert_str_child_factory(LL[string_idx]) | |
2082 | ||
2083 | comma_idx = -1 | |
2084 | ends_with_comma = False | |
2085 | if LL[comma_idx].type == token.COMMA: | |
2086 | ends_with_comma = True | |
2087 | ||
2088 | leaves_to_steal_comments_from = [LL[string_idx]] | |
2089 | if ends_with_comma: | |
2090 | leaves_to_steal_comments_from.append(LL[comma_idx]) | |
2091 | ||
2092 | # --- First Line | |
2093 | first_line = line.clone() | |
2094 | left_leaves = LL[:string_idx] | |
2095 | ||
2096 | # We have to remember to account for (possibly invisible) LPAR and RPAR | |
2097 | # leaves that already wrapped the target string. If these leaves do | |
2098 | # exist, we will replace them with our own LPAR and RPAR leaves. | |
2099 | old_parens_exist = False | |
2100 | if left_leaves and left_leaves[-1].type == token.LPAR: | |
2101 | old_parens_exist = True | |
2102 | leaves_to_steal_comments_from.append(left_leaves[-1]) | |
2103 | left_leaves.pop() | |
2104 | ||
2105 | append_leaves(first_line, line, left_leaves) | |
2106 | ||
2107 | lpar_leaf = Leaf(token.LPAR, "(") | |
2108 | if old_parens_exist: | |
2109 | replace_child(LL[string_idx - 1], lpar_leaf) | |
2110 | else: | |
2111 | insert_str_child(lpar_leaf) | |
2112 | first_line.append(lpar_leaf) | |
2113 | ||
2114 | # We move inline comments that were originally to the right of the | |
2115 | # target string onto the first line. They will now be shown to the right of | |
2116 | # the LPAR. | |
2117 | for leaf in leaves_to_steal_comments_from: | |
2118 | for comment_leaf in line.comments_after(leaf): | |
2119 | first_line.append(comment_leaf, preformatted=True) | |
2120 | ||
2121 | yield Ok(first_line) | |
2122 | ||
2123 | # --- Middle (String) Line | |
2124 | # We only need to yield one (possibly too long) string line, since the | |
2125 | # `StringSplitter` will break it down further if necessary. | |
2126 | string_value = LL[string_idx].value | |
2127 | string_line = Line( | |
2128 | mode=line.mode, | |
2129 | depth=line.depth + 1, | |
2130 | inside_brackets=True, | |
2131 | should_split_rhs=line.should_split_rhs, | |
2132 | magic_trailing_comma=line.magic_trailing_comma, | |
2133 | ) | |
2134 | string_leaf = Leaf(token.STRING, string_value) | |
2135 | insert_str_child(string_leaf) | |
2136 | string_line.append(string_leaf) | |
2137 | ||
2138 | old_rpar_leaf = None | |
2139 | if is_valid_index(string_idx + 1): | |
2140 | right_leaves = LL[string_idx + 1 :] | |
2141 | if ends_with_comma: | |
2142 | right_leaves.pop() | |
2143 | ||
2144 | if old_parens_exist: | |
2145 | assert right_leaves and right_leaves[-1].type == token.RPAR, ( | |
2146 | "Apparently, old parentheses do NOT exist?!" | |
2147 | f" (left_leaves={left_leaves}, right_leaves={right_leaves})" | |
2148 | ) | |
2149 | old_rpar_leaf = right_leaves.pop() | |
2150 | elif right_leaves and right_leaves[-1].type == token.RPAR: | |
2151 | # Special case for lambda expressions as dict's value, e.g.: | |
2152 | # my_dict = { | |
2153 | # "key": lambda x: f"formatted: {x}, | |
2154 | # } | |
2155 | # After wrapping the dict's value with parentheses, the string is | |
2156 | # followed by a RPAR but its opening bracket is lambda's, not | |
2157 | # the string's: | |
2158 | # "key": (lambda x: f"formatted: {x}), | |
2159 | opening_bracket = right_leaves[-1].opening_bracket | |
2160 | if opening_bracket is not None and opening_bracket in left_leaves: | |
2161 | index = left_leaves.index(opening_bracket) | |
2162 | if ( | |
2163 | index > 0 | |
2164 | and index < len(left_leaves) - 1 | |
2165 | and left_leaves[index - 1].type == token.COLON | |
2166 | and left_leaves[index + 1].value == "lambda" | |
2167 | ): | |
2168 | right_leaves.pop() | |
2169 | ||
2170 | append_leaves(string_line, line, right_leaves) | |
2171 | ||
2172 | yield Ok(string_line) | |
2173 | ||
2174 | # --- Last Line | |
2175 | last_line = line.clone() | |
2176 | last_line.bracket_tracker = first_line.bracket_tracker | |
2177 | ||
2178 | new_rpar_leaf = Leaf(token.RPAR, ")") | |
2179 | if old_rpar_leaf is not None: | |
2180 | replace_child(old_rpar_leaf, new_rpar_leaf) | |
2181 | else: | |
2182 | insert_str_child(new_rpar_leaf) | |
2183 | last_line.append(new_rpar_leaf) | |
2184 | ||
2185 | # If the target string ended with a comma, we place this comma to the | |
2186 | # right of the RPAR on the last line. | |
2187 | if ends_with_comma: | |
2188 | comma_leaf = Leaf(token.COMMA, ",") | |
2189 | replace_child(LL[comma_idx], comma_leaf) | |
2190 | last_line.append(comma_leaf) | |
2191 | ||
2192 | yield Ok(last_line) | |
2193 | ||
2194 | ||
2195 | class StringParser: | |
2196 | """ | |
2197 | A state machine that aids in parsing a string's "trailer", which can be | |
2198 | either non-existent, an old-style formatting sequence (e.g. `% varX` or `% | |
2199 | (varX, varY)`), or a method-call / attribute access (e.g. `.format(varX, | |
2200 | varY)`). | |
2201 | ||
2202 | NOTE: A new StringParser object MUST be instantiated for each string | |
2203 | trailer we need to parse. | |
2204 | ||
2205 | Examples: | |
2206 | We shall assume that `line` equals the `Line` object that corresponds | |
2207 | to the following line of Python code: | |
2208 | ``` | |
2209 | x = "Some {}.".format("String") + some_other_string | |
2210 | ``` | |
2211 | ||
2212 | Furthermore, we will assume that `string_idx` is some index such that: | |
2213 | ``` | |
2214 | assert line.leaves[string_idx].value == "Some {}." | |
2215 | ``` | |
2216 | ||
2217 | The following code snippet then holds: | |
2218 | ``` | |
2219 | string_parser = StringParser() | |
2220 | idx = string_parser.parse(line.leaves, string_idx) | |
2221 | assert line.leaves[idx].type == token.PLUS | |
2222 | ``` | |
2223 | """ | |
2224 | ||
2225 | DEFAULT_TOKEN: Final = 20210605 | |
2226 | ||
2227 | # String Parser States | |
2228 | START: Final = 1 | |
2229 | DOT: Final = 2 | |
2230 | NAME: Final = 3 | |
2231 | PERCENT: Final = 4 | |
2232 | SINGLE_FMT_ARG: Final = 5 | |
2233 | LPAR: Final = 6 | |
2234 | RPAR: Final = 7 | |
2235 | DONE: Final = 8 | |
2236 | ||
2237 | # Lookup Table for Next State | |
2238 | _goto: Final[Dict[Tuple[ParserState, NodeType], ParserState]] = { | |
2239 | # A string trailer may start with '.' OR '%'. | |
2240 | (START, token.DOT): DOT, | |
2241 | (START, token.PERCENT): PERCENT, | |
2242 | (START, DEFAULT_TOKEN): DONE, | |
2243 | # A '.' MUST be followed by an attribute or method name. | |
2244 | (DOT, token.NAME): NAME, | |
2245 | # A method name MUST be followed by an '(', whereas an attribute name | |
2246 | # is the last symbol in the string trailer. | |
2247 | (NAME, token.LPAR): LPAR, | |
2248 | (NAME, DEFAULT_TOKEN): DONE, | |
2249 | # A '%' symbol can be followed by an '(' or a single argument (e.g. a | |
2250 | # string or variable name). | |
2251 | (PERCENT, token.LPAR): LPAR, | |
2252 | (PERCENT, DEFAULT_TOKEN): SINGLE_FMT_ARG, | |
2253 | # If a '%' symbol is followed by a single argument, that argument is | |
2254 | # the last leaf in the string trailer. | |
2255 | (SINGLE_FMT_ARG, DEFAULT_TOKEN): DONE, | |
2256 | # If present, a ')' symbol is the last symbol in a string trailer. | |
2257 | # (NOTE: LPARS and nested RPARS are not included in this lookup table, | |
2258 | # since they are treated as a special case by the parsing logic in this | |
2259 | # classes' implementation.) | |
2260 | (RPAR, DEFAULT_TOKEN): DONE, | |
2261 | } | |
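| # An illustrative trace (hypothetical trailer): parsing `.format(x)` walks | |
| # START -> DOT -> NAME -> LPAR, stays in LPAR until the matching ')' flips | |
| # the state to RPAR, after which DEFAULT_TOKEN sends the parser to DONE. | |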
2262 | ||
2263 | def __init__(self) -> None: | |
2264 | self._state = self.START | |
2265 | self._unmatched_lpars = 0 | |
2266 | ||
2267 | def parse(self, leaves: List[Leaf], string_idx: int) -> int: | |
2268 | """ | |
2269 | Pre-conditions: | |
2270 | * @leaves[@string_idx].type == token.STRING | |
2271 | ||
2272 | Returns: | |
2273 | The index directly after the last leaf which is a part of the string | |
2274 | trailer, if a "trailer" exists. | |
2275 | OR | |
2276 | @string_idx + 1, if no string "trailer" exists. | |
2277 | """ | |
2278 | assert leaves[string_idx].type == token.STRING | |
2279 | ||
2280 | idx = string_idx + 1 | |
2281 | while idx < len(leaves) and self._next_state(leaves[idx]): | |
2282 | idx += 1 | |
2283 | return idx | |
2284 | ||
2285 | def _next_state(self, leaf: Leaf) -> bool: | |
2286 | """ | |
2287 | Pre-conditions: | |
2288 | * On the first call to this function, @leaf MUST be the leaf that | |
2289 | was directly after the string leaf in question (e.g. if our target | |
2290 | string is `line.leaves[i]` then the first call to this method must | |
2291 | be passed `line.leaves[i + 1]`). | |
2292 | * On the next call to this function, the leaf parameter passed in | |
2293 | MUST be the leaf directly following @leaf. | |
2294 | ||
2295 | Returns: | |
2296 | True iff @leaf is a part of the string's trailer. | |
2297 | """ | |
2298 | # We ignore empty LPAR or RPAR leaves. | |
2299 | if is_empty_par(leaf): | |
2300 | return True | |
2301 | ||
2302 | next_token = leaf.type | |
2303 | if next_token == token.LPAR: | |
2304 | self._unmatched_lpars += 1 | |
2305 | ||
2306 | current_state = self._state | |
2307 | ||
2308 | # The LPAR parser state is a special case. We will return True until we | |
2309 | # find the matching RPAR token. | |
2310 | if current_state == self.LPAR: | |
2311 | if next_token == token.RPAR: | |
2312 | self._unmatched_lpars -= 1 | |
2313 | if self._unmatched_lpars == 0: | |
2314 | self._state = self.RPAR | |
2315 | # Otherwise, we use a lookup table to determine the next state. | |
2316 | else: | |
2317 | # If the lookup table matches the current state to the next | |
2318 | # token, we use the lookup table. | |
2319 | if (current_state, next_token) in self._goto: | |
2320 | self._state = self._goto[current_state, next_token] | |
2321 | else: | |
2322 | # Otherwise, we check if the current state was assigned a | |
2323 | # default. | |
2324 | if (current_state, self.DEFAULT_TOKEN) in self._goto: | |
2325 | self._state = self._goto[current_state, self.DEFAULT_TOKEN] | |
2326 | # If no default has been assigned, then this parser has a logic | |
2327 | # error. | |
2328 | else: | |
2329 | raise RuntimeError(f"{self.__class__.__name__} LOGIC ERROR!") | |
2330 | ||
2331 | if self._state == self.DONE: | |
2332 | return False | |
2333 | ||
2334 | return True | |
2335 | ||
2336 | ||
2337 | def insert_str_child_factory(string_leaf: Leaf) -> Callable[[LN], None]: | |
2338 | """ | |
2339 | Factory for a convenience function that is used to orphan @string_leaf | |
2340 | and then insert multiple new leaves into the same part of the node | |
2341 | structure that @string_leaf had originally occupied. | |
2342 | ||
2343 | Examples: | |
2344 | Let `string_leaf = Leaf(token.STRING, '"foo"')` and `N = | |
2345 | string_leaf.parent`. Assume the node `N` has the following | |
2346 | original structure: | |
2347 | ||
2348 | Node( | |
2349 | expr_stmt, [ | |
2350 | Leaf(NAME, 'x'), | |
2351 | Leaf(EQUAL, '='), | |
2352 | Leaf(STRING, '"foo"'), | |
2353 | ] | |
2354 | ) | |
2355 | ||
2356 | We then run the code snippet shown below. | |
2357 | ``` | |
2358 | insert_str_child = insert_str_child_factory(string_leaf) | |
2359 | ||
2360 | lpar = Leaf(token.LPAR, '(') | |
2361 | insert_str_child(lpar) | |
2362 | ||
2363 | bar = Leaf(token.STRING, '"bar"') | |
2364 | insert_str_child(bar) | |
2365 | ||
2366 | rpar = Leaf(token.RPAR, ')') | |
2367 | insert_str_child(rpar) | |
2368 | ``` | |
2369 | ||
2370 | After which point, it follows that `string_leaf.parent is None` and | |
2371 | the node `N` now has the following structure: | |
2372 | ||
2373 | Node( | |
2374 | expr_stmt, [ | |
2375 | Leaf(NAME, 'x'), | |
2376 | Leaf(EQUAL, '='), | |
2377 | Leaf(LPAR, '('), | |
2378 | Leaf(STRING, '"bar"'), | |
2379 | Leaf(RPAR, ')'), | |
2380 | ] | |
2381 | ) | |
2382 | """ | |
2383 | string_parent = string_leaf.parent | |
2384 | string_child_idx = string_leaf.remove() | |
2385 | ||
2386 | def insert_str_child(child: LN) -> None: | |
2387 | nonlocal string_child_idx | |
2388 | ||
2389 | assert string_parent is not None | |
2390 | assert string_child_idx is not None | |
2391 | ||
2392 | string_parent.insert_child(string_child_idx, child) | |
2393 | string_child_idx += 1 | |
2394 | ||
2395 | return insert_str_child | |
2396 | ||
2397 | ||
2398 | def is_valid_index_factory(seq: Sequence[Any]) -> Callable[[int], bool]: | |
2399 | """ | |
2400 | Examples: | |
2401 | ``` | |
2402 | my_list = [1, 2, 3] | |
2403 | ||
2404 | is_valid_index = is_valid_index_factory(my_list) | |
2405 | ||
2406 | assert is_valid_index(0) | |
2407 | assert is_valid_index(2) | |
2408 | ||
2409 | assert not is_valid_index(3) | |
2410 | assert not is_valid_index(-1) | |
2411 | ``` | |
2412 | """ | |
2413 | ||
2414 | def is_valid_index(idx: int) -> bool: | |
2415 | """ | |
2416 | Returns: | |
2417 | True iff @idx is non-negative AND seq[@idx] does NOT raise an | |
2418 | IndexError. | |
2419 | """ | |
2420 | return 0 <= idx < len(seq) | |
2421 | ||
2422 | return is_valid_index |