]>
Commit | Line | Data |
---|---|---|
53e6db90 DC |
1 | """Cache Management |
2 | """ | |
3 | ||
4 | import hashlib | |
5 | import json | |
6 | import logging | |
7 | import os | |
8 | from pathlib import Path | |
9 | from typing import Any, Dict, List, Optional, Set | |
10 | ||
11 | from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version | |
12 | from pip._vendor.packaging.utils import canonicalize_name | |
13 | ||
14 | from pip._internal.exceptions import InvalidWheelFilename | |
15 | from pip._internal.models.direct_url import DirectUrl | |
16 | from pip._internal.models.format_control import FormatControl | |
17 | from pip._internal.models.link import Link | |
18 | from pip._internal.models.wheel import Wheel | |
19 | from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds | |
20 | from pip._internal.utils.urls import path_to_url | |
21 | ||
logger = logging.getLogger(__name__)

# File name of the JSON metadata stored alongside cached wheels: it is looked
# up in the directory containing a cached link's file and (re)written in a
# cache entry directory to record the DirectUrl the entry was downloaded from.
ORIGIN_JSON_NAME = "origin.json"
25 | ||
26 | ||
27 | def _hash_dict(d: Dict[str, str]) -> str: | |
28 | """Return a stable sha224 of a dictionary.""" | |
29 | s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True) | |
30 | return hashlib.sha224(s.encode("ascii")).hexdigest() | |
31 | ||
32 | ||
class Cache:
    """Abstract base class that provides cache directories for data from links.

    :param cache_dir: The root of the cache. Must be an absolute path, or
        empty to disable caching.
    :param format_control: An object of FormatControl class to limit
        binaries being read from the cache.
    :param allowed_formats: which formats of files the cache should store.
        ('binary' and 'source' are the only allowed values)
    """

    def __init__(
        self, cache_dir: str, format_control: FormatControl, allowed_formats: Set[str]
    ) -> None:
        super().__init__()
        assert not cache_dir or os.path.isabs(cache_dir)
        # An empty cache_dir is normalized to None ("no cache").
        self.cache_dir = cache_dir or None
        self.format_control = format_control
        self.allowed_formats = allowed_formats

        # Only the two known format kinds may be requested.
        assert self.allowed_formats.issubset({"source", "binary"})

    def _get_cache_path_parts(self, link: Link) -> List[str]:
        """Get the parts of the path that must be os.path.joined with cache_dir."""

        # The cache key is built from the URL without its fragment: the
        # fragment may carry other items that we don't care about.
        key_parts = {"url": link.url_without_fragment}
        has_hash = link.hash_name is not None and link.hash is not None
        if has_hash:
            key_parts[link.hash_name] = link.hash
        if link.subdirectory_fragment:
            key_parts["subdirectory"] = link.subdirectory_fragment

        # Include interpreter name, major and minor version in cache key
        # to cope with ill-behaved sdists that build a different wheel
        # depending on the python version their setup.py is being run on,
        # and don't encode the difference in compatibility tags.
        # https://github.com/pypa/pip/issues/7296
        key_parts["interpreter_name"] = interpreter_name()
        key_parts["interpreter_version"] = interpreter_version()

        # sha224 gives a shorter digest than sha256; the difference in
        # strength does not matter for a cache key.
        digest = _hash_dict(key_parts)

        # Nest the directories to avoid a huge number of top level entries,
        # which some file systems handle poorly.
        return [digest[:2], digest[2:4], digest[4:6], digest[6:]]

    def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
        """Return (entry name, directory) pairs found in the cache dir for link.

        An empty list is returned when caching is not possible, when the
        format control excludes every format this cache stores, or when the
        directory for the link does not exist.
        """
        if not (self.cache_dir and canonical_package_name and link):
            return []

        formats = self.format_control.get_allowed_formats(canonical_package_name)
        if not self.allowed_formats.intersection(formats):
            return []

        path = self.get_path_for_link(link)
        if not os.path.isdir(path):
            return []
        return [(entry, path) for entry in os.listdir(path)]

    def get_path_for_link(self, link: Link) -> str:
        """Return a directory to store cached items in for link."""
        raise NotImplementedError()

    def get(
        self,
        link: Link,
        package_name: Optional[str],
        supported_tags: List[Tag],
    ) -> Link:
        """Returns a link to a cached item if it exists, otherwise returns the
        passed link.
        """
        raise NotImplementedError()
119 | ||
120 | ||
class SimpleWheelCache(Cache):
    """A cache of wheels for future installs."""

    def __init__(self, cache_dir: str, format_control: FormatControl) -> None:
        super().__init__(cache_dir, format_control, {"binary"})

    def get_path_for_link(self, link: Link) -> str:
        """Return a directory to store cached wheels for link

        Because there are M wheels for any one sdist, we provide a directory
        to cache them in, and then consult that directory when looking up
        cache hits.

        We only insert things into the cache if they have plausible version
        numbers, so that we don't contaminate the cache with things that were
        not unique. E.g. ./package might have dozens of installs done for it
        and build a version of 0.0...and if we built and cached a wheel, we'd
        end up using the same wheel even if the source has been edited.

        :param link: The link of the sdist for which this will cache wheels.
        """
        path_parts = self._get_cache_path_parts(link)
        assert self.cache_dir
        # Wheels live under the "wheels" directory of the root cache_dir.
        return os.path.join(self.cache_dir, "wheels", *path_parts)

    def get(
        self,
        link: Link,
        package_name: Optional[str],
        supported_tags: List[Tag],
    ) -> Link:
        """Return a link to the best matching cached wheel, or ``link`` itself.

        Cached files are skipped when their name is not a valid wheel
        filename, when they belong to a different distribution, or when they
        do not support any of ``supported_tags``. Among the remaining wheels
        the smallest (support index, name, directory) tuple wins.
        """
        if not package_name:
            return link

        expected_name = canonicalize_name(package_name)
        ranked = []
        for filename, directory in self._get_candidates(link, expected_name):
            try:
                wheel = Wheel(filename)
            except InvalidWheelFilename:
                continue

            if canonicalize_name(wheel.name) != expected_name:
                logger.debug(
                    "Ignoring cached wheel %s for %s as it "
                    "does not match the expected distribution name %s.",
                    filename,
                    link,
                    package_name,
                )
                continue

            if not wheel.supported(supported_tags):
                # Built for a different python/arch/etc
                continue

            rank = wheel.support_index_min(supported_tags)
            ranked.append((rank, filename, directory))

        if not ranked:
            return link

        _, best_name, best_dir = min(ranked)
        return Link(path_to_url(os.path.join(best_dir, best_name)))
189 | ||
190 | ||
class EphemWheelCache(SimpleWheelCache):
    """A SimpleWheelCache that creates its own temporary cache directory"""

    def __init__(self, format_control: FormatControl) -> None:
        # The temporary directory is kept on the instance and used as the
        # cache root.
        temp_dir = TempDirectory(
            kind=tempdir_kinds.EPHEM_WHEEL_CACHE,
            globally_managed=True,
        )
        self._temp_dir = temp_dir

        super().__init__(temp_dir.path, format_control)
201 | ||
202 | ||
class CacheEntry:
    """A cached item together with where it was found and where it came from.

    :param link: Link to the cached file.
    :param persistent: Whether the entry was found in the persistent cache
        (as opposed to the ephemeral one).

    ``origin`` is the DirectUrl parsed from the ``origin.json`` file located
    in the same directory as the cached file, or None when that file is
    missing or cannot be read/parsed.
    """

    def __init__(
        self,
        link: Link,
        persistent: bool,
    ):
        self.link = link
        self.persistent = persistent
        self.origin: Optional[DirectUrl] = None
        origin_direct_url_path = Path(self.link.file_path).parent / ORIGIN_JSON_NAME
        if origin_direct_url_path.exists():
            try:
                # The origin file is written as UTF-8, so read it back the
                # same way instead of relying on the locale default encoding.
                self.origin = DirectUrl.from_json(
                    origin_direct_url_path.read_text(encoding="utf-8")
                )
            except Exception as e:
                # Best effort: a corrupt or unreadable origin file must not
                # prevent the cached file itself from being used.
                logger.warning(
                    "Ignoring invalid cache entry origin file %s for %s (%s)",
                    origin_direct_url_path,
                    link,
                    e,
                )
215 | ||
216 | ||
class WheelCache(Cache):
    """Wraps EphemWheelCache and SimpleWheelCache into a single Cache

    This Cache allows for graceful degradation, using the ephem wheel cache
    when a certain link is not found in the simple wheel cache first.
    """

    def __init__(
        self, cache_dir: str, format_control: Optional[FormatControl] = None
    ) -> None:
        if format_control is None:
            format_control = FormatControl()
        super().__init__(cache_dir, format_control, {"binary"})
        self._wheel_cache = SimpleWheelCache(cache_dir, format_control)
        self._ephem_cache = EphemWheelCache(format_control)

    def get_path_for_link(self, link: Link) -> str:
        """Return the persistent cache directory for link."""
        return self._wheel_cache.get_path_for_link(link)

    def get_ephem_path_for_link(self, link: Link) -> str:
        """Return the ephemeral cache directory for link."""
        return self._ephem_cache.get_path_for_link(link)

    def get(
        self,
        link: Link,
        package_name: Optional[str],
        supported_tags: List[Tag],
    ) -> Link:
        """Return a link to a cached item if it exists, otherwise return the
        passed link.
        """
        cache_entry = self.get_cache_entry(link, package_name, supported_tags)
        if cache_entry is None:
            return link
        return cache_entry.link

    def get_cache_entry(
        self,
        link: Link,
        package_name: Optional[str],
        supported_tags: List[Tag],
    ) -> Optional[CacheEntry]:
        """Returns a CacheEntry with a link to a cached item if it exists or
        None. The cache entry indicates if the item was found in the persistent
        or ephemeral cache.
        """
        retval = self._wheel_cache.get(
            link=link,
            package_name=package_name,
            supported_tags=supported_tags,
        )
        # The underlying caches hand back the very same link object on a
        # miss, so an identity check distinguishes hit from miss.
        if retval is not link:
            return CacheEntry(retval, persistent=True)

        retval = self._ephem_cache.get(
            link=link,
            package_name=package_name,
            supported_tags=supported_tags,
        )
        if retval is not link:
            return CacheEntry(retval, persistent=False)

        return None

    @staticmethod
    def record_download_origin(cache_dir: str, download_info: DirectUrl) -> None:
        """Write download_info as the origin file of a cache entry directory.

        If an origin file already exists, it is compared against
        download_info and a warning is logged when the URLs differ. An
        existing origin file that cannot be read or parsed is reported with a
        warning and then overwritten.

        :param cache_dir: Directory of the cache entry.
        :param download_info: DirectUrl describing where the entry came from.
        """
        origin_path = Path(cache_dir) / ORIGIN_JSON_NAME
        if origin_path.is_file():
            try:
                # Read with the same encoding the file is written with below.
                origin = DirectUrl.from_json(origin_path.read_text(encoding="utf-8"))
            except Exception as e:
                # Best effort: a corrupt origin file should be replaced, not
                # abort the recording of the new origin.
                logger.warning(
                    "Could not read origin file %s in cache entry (%s). "
                    "Will attempt to overwrite it.",
                    origin_path,
                    e,
                )
            else:
                # TODO: use DirectUrl.equivalent when
                # https://github.com/pypa/pip/pull/10564 is merged.
                if origin.url != download_info.url:
                    logger.warning(
                        "Origin URL %s in cache entry %s does not match download URL %s. "
                        "This is likely a pip bug or a cache corruption issue.",
                        origin.url,
                        cache_dir,
                        download_info.url,
                    )
        origin_path.write_text(download_info.to_json(), encoding="utf-8")