]>
Commit | Line | Data |
---|---|---|
53e6db90 DC |
1 | """ |
2 | This module implements Git's wildmatch pattern matching which itself is | |
3 | derived from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" | |
4 | files. | |
5 | """ | |
6 | ||
7 | import re | |
8 | import warnings | |
9 | from typing import ( | |
10 | AnyStr, | |
11 | Optional, | |
12 | Tuple) | |
13 | ||
14 | from .. import util | |
15 | from ..pattern import RegexPattern | |
16 | ||
# Encoding applied to byte-string patterns before they are parsed.
_BYTES_ENCODING = "latin1"
"""
The encoding to use when parsing a byte string pattern.
"""

# Name of the regex capture group that marks a directory separator.
_DIR_MARK = "ps_d"
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""
27 | ||
28 | ||
class GitWildMatchPatternError(ValueError):
    """
    Raised when a pattern cannot be interpreted as a valid Git wildmatch
    pattern. It derives from :class:`ValueError`, so handlers written for
    :class:`ValueError` also catch it.
    """
35 | ||
36 | ||
37 | class GitWildMatchPattern(RegexPattern): | |
38 | """ | |
39 | The :class:`GitWildMatchPattern` class represents a compiled Git | |
40 | wildmatch pattern. | |
41 | """ | |
42 | ||
43 | # Keep the dict-less class hierarchy. | |
44 | __slots__ = () | |
45 | ||
@classmethod
def pattern_to_regex(
    cls,
    pattern: AnyStr,
) -> Tuple[Optional[AnyStr], Optional[bool]]:
    """
    Translate a Git wildmatch pattern into an uncompiled regular
    expression.

    *pattern* (:class:`str` or :class:`bytes`) is the pattern to
    translate. A byte string is decoded for processing and the resulting
    regular expression is encoded back to :class:`bytes`.

    Returns a :class:`tuple` holding the uncompiled regular expression
    (:class:`str`, :class:`bytes`, or :data:`None`) and whether matched
    files should be included (:data:`True`), excluded (:data:`False`), or
    if the pattern is a null-operation (:data:`None`).

    Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
    :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the
    pattern is invalid.
    """
    if isinstance(pattern, str):
        as_bytes = False
    elif isinstance(pattern, bytes):
        as_bytes = True
        pattern = pattern.decode(_BYTES_ENCODING)
    else:
        raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

    # Keep the pattern as it was given (after decoding) for error messages.
    given_pattern = pattern

    # EDGE CASE: A trailing space escaped with a backslash is significant,
    # so in that case only the left side is stripped.
    if pattern.endswith('\\ '):
        pattern = pattern.lstrip()
    else:
        pattern = pattern.strip()

    # Null-operations (neither include nor exclude files):
    # - a blank pattern;
    # - a comment, i.e. a pattern starting with a hash ('#'); a literal
    #   leading hash has to be written as '\#';
    # - a single '/', which according to `git check-ignore` (v2.4.1) does
    #   not match any file.
    if not pattern or pattern.startswith('#') or pattern == '/':
        return None, None

    # A leading exclamation mark ('!') negates the pattern (exclude
    # instead of include). A literal leading exclamation mark has to be
    # written as '\!'.
    include = not pattern.startswith('!')
    if not include:
        pattern = pattern[1:]

    # Regex fragment capturing the slash that marks a directory.
    dir_mark = f'(?P<{_DIR_MARK}>/)'

    # Split into segments and collapse every run of consecutive
    # double-asterisk segments down to a single one: a '**' is dropped
    # whenever the segment right before it is also '**'.
    raw_segs = pattern.split('/')
    segs = [
        seg
        for pos, seg in enumerate(raw_segs)
        if not (pos > 0 and seg == '**' and raw_segs[pos - 1] == '**')
    ]

    if segs == ['**', '']:
        # EDGE CASE: The '**/' pattern should match everything except
        # individual files in the root directory. This cannot be expressed
        # through normalization, so a fixed regex is used instead.
        regex = f'^.+{dir_mark}.*$'

    else:
        if not segs[0]:
            # A pattern beginning with a slash ('/') only matches paths
            # directly on the root directory. Drop the empty first segment
            # to make the pattern relative to root.
            segs.pop(0)

        elif len(segs) == 1 or segs[1:] == ['']:
            # A single segment without a beginning slash, optionally with
            # a trailing slash (e.g. "dir/"), matches at any depth. This
            # is equivalent to "**/{pattern}".
            if segs[0] != '**':
                segs.insert(0, '**')

        # Otherwise the pattern has no beginning slash but contains at
        # least one prepended directory (e.g. "dir/{pattern}"); according
        # to `git check-ignore` (v2.4.1) it should not match
        # "**/dir/{pattern}", so it is left as is.

        if not segs:
            # Nothing is left after normalization, so the pattern must be
            # invalid.
            raise GitWildMatchPatternError(f"Invalid git pattern: {given_pattern!r}")

        if len(segs) > 1 and not segs[-1]:
            # A pattern ending with a slash ('/') matches all descendant
            # paths of a directory. This is equivalent to "{pattern}/**".
            segs[-1] = '**'

        # Build the regular expression from the normalized segments.
        pieces = ['^']
        need_slash = False
        last = len(segs) - 1
        for pos, seg in enumerate(segs):
            at_end = pos == last

            if seg == '**':
                if pos == 0 and at_end:
                    # Solely double-asterisks: match every path.
                    pieces.append(f'[^/]+(?:{dir_mark}.*)?')
                elif pos == 0:
                    # Leading double-asterisks: match any leading path
                    # segments.
                    pieces.append('(?:.+/)?')
                    need_slash = False
                elif at_end:
                    # Trailing double-asterisks: match any trailing path
                    # segments.
                    pieces.append(f'{dir_mark}.*')
                else:
                    # Inner double-asterisks: match multiple (or zero)
                    # inner path segments.
                    pieces.append('(?:/.+)?')
                    need_slash = True
                continue

            if need_slash:
                pieces.append('/')

            if seg == '*':
                # Match a single path segment.
                pieces.append('[^/]+')
            else:
                # Match the segment as a glob pattern.
                try:
                    pieces.append(cls._translate_segment_glob(seg))
                except ValueError as e:
                    raise GitWildMatchPatternError(f"Invalid git pattern: {given_pattern!r}") from e

            if at_end:
                # A pattern ending without a slash ('/') matches a file or
                # a directory (with paths underneath it). E.g., "foo"
                # matches "foo", "foo/bar", "foo/bar/baz", etc.
                pieces.append(f'(?:{dir_mark}.*)?')

            need_slash = True

        pieces.append('$')
        regex = ''.join(pieces)

    if as_bytes:
        regex = regex.encode(_BYTES_ENCODING)

    return regex, include
242 | ||
243 | @staticmethod | |
244 | def _translate_segment_glob(pattern: str) -> str: | |
245 | """ | |
246 | Translates the glob pattern to a regular expression. This is used in | |
247 | the constructor to translate a path segment glob pattern to its | |
248 | corresponding regular expression. | |
249 | ||
250 | *pattern* (:class:`str`) is the glob pattern. | |
251 | ||
252 | Returns the regular expression (:class:`str`). | |
253 | """ | |
254 | # NOTE: This is derived from `fnmatch.translate()` and is similar to | |
255 | # the POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set. | |
256 | ||
257 | escape = False | |
258 | regex = '' | |
259 | i, end = 0, len(pattern) | |
260 | while i < end: | |
261 | # Get next character. | |
262 | char = pattern[i] | |
263 | i += 1 | |
264 | ||
265 | if escape: | |
266 | # Escape the character. | |
267 | escape = False | |
268 | regex += re.escape(char) | |
269 | ||
270 | elif char == '\\': | |
271 | # Escape character, escape next character. | |
272 | escape = True | |
273 | ||
274 | elif char == '*': | |
275 | # Multi-character wildcard. Match any string (except slashes), | |
276 | # including an empty string. | |
277 | regex += '[^/]*' | |
278 | ||
279 | elif char == '?': | |
280 | # Single-character wildcard. Match any single character (except | |
281 | # a slash). | |
282 | regex += '[^/]' | |
283 | ||
284 | elif char == '[': | |
285 | # Bracket expression wildcard. Except for the beginning | |
286 | # exclamation mark, the whole bracket expression can be used | |
287 | # directly as regex but we have to find where the expression | |
288 | # ends. | |
289 | # - "[][!]" matches ']', '[' and '!'. | |
290 | # - "[]-]" matches ']' and '-'. | |
291 | # - "[!]a-]" matches any character except ']', 'a' and '-'. | |
292 | j = i | |
293 | ||
294 | # Pass bracket expression negation. | |
295 | if j < end and (pattern[j] == '!' or pattern[j] == '^'): | |
296 | j += 1 | |
297 | ||
298 | # Pass first closing bracket if it is at the beginning of the | |
299 | # expression. | |
300 | if j < end and pattern[j] == ']': | |
301 | j += 1 | |
302 | ||
303 | # Find closing bracket. Stop once we reach the end or find it. | |
304 | while j < end and pattern[j] != ']': | |
305 | j += 1 | |
306 | ||
307 | if j < end: | |
308 | # Found end of bracket expression. Increment j to be one past | |
309 | # the closing bracket: | |
310 | # | |
311 | # [...] | |
312 | # ^ ^ | |
313 | # i j | |
314 | # | |
315 | j += 1 | |
316 | expr = '[' | |
317 | ||
318 | if pattern[i] == '!': | |
319 | # Bracket expression needs to be negated. | |
320 | expr += '^' | |
321 | i += 1 | |
322 | elif pattern[i] == '^': | |
323 | # POSIX declares that the regex bracket expression negation | |
324 | # "[^...]" is undefined in a glob pattern. Python's | |
325 | # `fnmatch.translate()` escapes the caret ('^') as a | |
326 | # literal. Git supports the using a caret for negation. | |
327 | # Maintain consistency with Git because that is the expected | |
328 | # behavior. | |
329 | expr += '^' | |
330 | i += 1 | |
331 | ||
332 | # Build regex bracket expression. Escape slashes so they are | |
333 | # treated as literal slashes by regex as defined by POSIX. | |
334 | expr += pattern[i:j].replace('\\', '\\\\') | |
335 | ||
336 | # Add regex bracket expression to regex result. | |
337 | regex += expr | |
338 | ||
339 | # Set i to one past the closing bracket. | |
340 | i = j | |
341 | ||
342 | else: | |
343 | # Failed to find closing bracket, treat opening bracket as a | |
344 | # bracket literal instead of as an expression. | |
345 | regex += '\\[' | |
346 | ||
347 | else: | |
348 | # Regular character, escape it for regex. | |
349 | regex += re.escape(char) | |
350 | ||
351 | if escape: | |
352 | raise ValueError(f"Escape character found with no next character to escape: {pattern!r}") | |
353 | ||
354 | return regex | |
355 | ||
356 | @staticmethod | |
357 | def escape(s: AnyStr) -> AnyStr: | |
358 | """ | |
359 | Escape special characters in the given string. | |
360 | ||
361 | *s* (:class:`str` or :class:`bytes`) a filename or a string that you | |
362 | want to escape, usually before adding it to a ".gitignore". | |
363 | ||
364 | Returns the escaped string (:class:`str` or :class:`bytes`). | |
365 | """ | |
366 | if isinstance(s, str): | |
367 | return_type = str | |
368 | string = s | |
369 | elif isinstance(s, bytes): | |
370 | return_type = bytes | |
371 | string = s.decode(_BYTES_ENCODING) | |
372 | else: | |
373 | raise TypeError(f"s:{s!r} is not a unicode or byte string.") | |
374 | ||
375 | # Reference: https://git-scm.com/docs/gitignore#_pattern_format | |
376 | meta_characters = r"[]!*#?" | |
377 | ||
378 | out_string = "".join("\\" + x if x in meta_characters else x for x in string) | |
379 | ||
380 | if return_type is bytes: | |
381 | return out_string.encode(_BYTES_ENCODING) | |
382 | else: | |
383 | return out_string | |
384 | ||
# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern("gitwildmatch", GitWildMatchPattern)
386 | ||
387 | ||
class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`.
    This class only exists to maintain compatibility with v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Emit the deprecation warning, then initialize the pattern with the
        given arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Emit the deprecation warning.
        """
        message = (
            "GitIgnorePattern ('gitignore') is deprecated. Use "
            "GitWildMatchPattern ('gitwildmatch') instead."
        )
        # The stack level of 3 attributes the warning to the caller of the
        # method that called this helper.
        warnings.warn(message, DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Emit the deprecation warning, then convert the pattern with the
        parent implementation.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)
418 | ||
# Keep the "gitignore" name available for backward compatibility with
# v0.4 by registering `GitIgnorePattern` under it.
util.register_pattern("gitignore", GitIgnorePattern)