gabriel / muse public
pack_store.py python
391 lines 13.2 KB
Raw
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago
1 """MPack local object store — pack file read/write for the Muse VCS.
2
3 Layout
4 ------
5 Pack files live under ``.muse/objects/pack/sha256/``, keeping the algorithm
6 canonical in the path, mirroring the loose object store convention::
7
8 .muse/objects/sha256/<prefix>/<remainder> ← loose objects
9 .muse/objects/pack/sha256/<64hex>.mpack ← MPack data file
10 .muse/objects/pack/sha256/<64hex>.idx ← MPack seek index
11
12 Pack file format (``.mpack``)
13 ------------------------------
14 ::
15
16 [4 bytes] magic: b"MUSE"
17 [1 byte] version: 1
18 [8 bytes] object_count: uint64 little-endian
19 --- object records (object_count entries) ---
20 [71 bytes] object_id: b"sha256:" + 64 lowercase hex chars
21 [8 bytes] length: uint64 little-endian
22 [N bytes] content: raw bytes
23 --- footer ---
24 [32 bytes] integrity: SHA-256 of every byte above
25
26 Index file format (``.idx``)
27 -----------------------------
28 ::
29
30 [4 bytes] magic: b"MUSI"
31 [1 byte] version: 1
32 [8 bytes] entry_count: uint64 little-endian
33 --- entries, sorted by object_id (enables binary search) ---
34 [71 bytes] object_id
35 [8 bytes] content_offset: uint64 little-endian
36 [8 bytes] content_length: uint64 little-endian
37 --- footer ---
38 [32 bytes] integrity: SHA-256 of every byte above
39
40 ``content_offset`` is the byte offset to the first content byte in the
41 ``.mpack`` file (past the object_id and length fields for that entry).
42 A read is: ``seek(content_offset); read(content_length)`` — O(1), no decode.
43
44 Pack ID
45 -------
46 ``pack_id = "sha256:" + sha256(<entire .mpack file bytes>).hexdigest()``
47
48 The pack is content-addressed like every other Muse object. Writing the same
49 objects twice produces the same pack file and the same pack_id — idempotent.
50 """
51
52 from __future__ import annotations
53
54 import bisect
55 import hashlib
56 import os
57 import pathlib
58 import struct
59 import tempfile
60 from typing import TYPE_CHECKING
61
62 from muse.core.paths import packs_dir
63 from muse.core.types import DEFAULT_HASH_ALGO, split_id
64 from muse.core.validation import validate_object_id
65
66 if TYPE_CHECKING:
67 pass
68
69 # ---------------------------------------------------------------------------
70 # Constants
71 # ---------------------------------------------------------------------------
72
# File signatures: the first four bytes of a pack file and of an index file.
_PACK_MAGIC = b"MUSE"
_IDX_MAGIC = b"MUSI"
# Format version byte written right after the magic in both file types.
_VERSION = 1
# An object id is the "sha256:" prefix followed by 64 hex characters (71 bytes).
_OID_BYTES = len("sha256:") + 64
# Both file types end with a raw SHA-256 digest.
_FOOTER_BYTES = 32
# Headers are magic + version (1 byte) + count (uint64, little-endian) = 13 bytes.
_PACK_HEADER_BYTES = len(_PACK_MAGIC) + struct.calcsize("<BQ")
_IDX_HEADER_BYTES = len(_IDX_MAGIC) + struct.calcsize("<BQ")
# One index entry: object_id + content_offset (uint64) + content_length (uint64).
_IDX_ENTRY_BYTES = _OID_BYTES + struct.calcsize("<QQ")
81
82
83 # ---------------------------------------------------------------------------
84 # Internal helpers
85 # ---------------------------------------------------------------------------
86
87
def _sha256_file(path: pathlib.Path) -> bytes:
    """Return the raw (binary) SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks, so the whole file is never held in
    memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read signals end-of-file and terminates the loop.
        while block := stream.read(65536):
            digest.update(block)
    return digest.digest()
94
95
def _atomic_write(dest: pathlib.Path, data: bytes) -> None:
    """Write *data* to *dest* atomically via a temp file + rename.

    The temp file is created in ``dest.parent`` (which is created if missing)
    so that the final ``os.replace`` happens within a single directory. The
    data is flushed and fsync'ed before the rename.

    Args:
        dest: Final path of the file.
        data: Bytes to write.

    Raises:
        Whatever the underlying write/fsync/replace raises. The temp file is
        removed before the exception propagates.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_str = tempfile.mkstemp(dir=dest.parent, prefix=".pack-tmp-")
    tmp = pathlib.Path(tmp_str)
    try:
        with os.fdopen(fd, "wb") as fh:
            fh.write(data)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp, dest)
    except BaseException:
        # BaseException (not just Exception) so that KeyboardInterrupt and
        # SystemExit during the write do not leave a stray temp file behind.
        tmp.unlink(missing_ok=True)
        raise
110
111
def _pack_path(repo_root: pathlib.Path, hex_id: str) -> pathlib.Path:
    """Return the path of the ``<hex_id>.mpack`` file inside ``packs_dir(repo_root)``."""
    return packs_dir(repo_root).joinpath(f"{hex_id}.mpack")
114
115
def _idx_path(repo_root: pathlib.Path, hex_id: str) -> pathlib.Path:
    """Return the path of the ``<hex_id>.idx`` file inside ``packs_dir(repo_root)``."""
    return packs_dir(repo_root).joinpath(f"{hex_id}.idx")
118
119
120 # ---------------------------------------------------------------------------
121 # Build
122 # ---------------------------------------------------------------------------
123
124
def _build_pack(objects: list[tuple[str, bytes]]) -> bytes:
    """Encode *objects* as an MPack binary blob. Returns raw bytes.

    Layout: magic, version byte, uint64 object count, then for every object
    its id, a uint64 content length and the raw content, followed by a
    SHA-256 footer computed over every byte that precedes it.

    Args:
        objects: ``(object_id, content)`` pairs, written in the given order.

    Raises:
        ValueError: If an object id does not encode to exactly
            ``_OID_BYTES`` bytes. The fixed-width record layout depends on it.
    """
    body = bytearray(_PACK_MAGIC)
    body += struct.pack("<BQ", _VERSION, len(objects))

    for oid, content in objects:
        oid_bytes = oid.encode()
        # Explicit raise rather than ``assert``: asserts vanish under ``-O``
        # and a wrong-width id would silently corrupt every following record.
        if len(oid_bytes) != _OID_BYTES:
            raise ValueError(f"bad oid length: {oid!r}")
        body += oid_bytes
        body += struct.pack("<Q", len(content))
        body += content

    # Footer covers everything written so far; it is not hashed itself.
    body += hashlib.sha256(body).digest()
    return bytes(body)
146
147
def _build_idx(objects: list[tuple[str, bytes]], pack_bytes: bytes) -> bytes:
    """Build the seek index for *objects*, with entries sorted by object id.

    Offsets are derived from the pack layout alone (header, then per object:
    id + uint64 length + content), walking *objects* in pack order.
    *pack_bytes* is accepted for interface symmetry but is not read here.
    """
    # Size of the fixed part of each record that precedes its content.
    record_prefix = _OID_BYTES + 8

    located: dict[str, tuple[int, int]] = {}
    position = _PACK_HEADER_BYTES
    for oid, content in objects:
        content_start = position + record_prefix
        content_size = len(content)
        located[oid] = (content_start, content_size)
        position = content_start + content_size

    body = bytearray(_IDX_MAGIC)
    body += struct.pack("<BQ", _VERSION, len(located))

    # Entries are emitted in sorted id order so readers can binary-search.
    for oid in sorted(located):
        content_start, content_size = located[oid]
        body += oid.encode()
        body += struct.pack("<QQ", content_start, content_size)

    # The footer digest covers every byte written before it.
    body += hashlib.sha256(body).digest()
    return bytes(body)
179
180
181 # ---------------------------------------------------------------------------
182 # Public API
183 # ---------------------------------------------------------------------------
184
185
def _parse_pack_bytes(pack_bytes: bytes) -> list[tuple[str, bytes]]:
    """Parse raw pack bytes (as produced by _build_pack) into (object_id, content) pairs.

    Verifies the SHA-256 footer before parsing, then validates the header and
    every record against the size of the hashed body so that a blob with a
    valid footer but inconsistent counts/lengths is rejected instead of
    yielding truncated content.

    Args:
        pack_bytes: Complete pack blob, footer included.

    Returns:
        The ``(object_id, content)`` pairs in pack order; ``[]`` for empty input.

    Raises:
        ValueError: Wrong magic, unsupported version, or a header/record that
            extends past the end of the pack body.
        OSError: The SHA-256 footer does not match the body.
    """
    if not pack_bytes:
        return []
    if pack_bytes[:4] != _PACK_MAGIC:
        raise ValueError(f"Bad pack magic: {pack_bytes[:4]!r}")

    body = pack_bytes[:-_FOOTER_BYTES]
    stored = pack_bytes[-_FOOTER_BYTES:]
    if hashlib.sha256(body).digest() != stored:
        raise OSError("Pack bytes failed SHA-256 integrity check")

    body_len = len(body)
    if body_len < _PACK_HEADER_BYTES:
        raise ValueError("Pack bytes truncated: incomplete header")

    version, count = struct.unpack_from("<BQ", body, 4)
    if version != _VERSION:
        raise ValueError(f"Unsupported pack version: {version}")

    # Fixed-size part of each record: object id followed by a uint64 length.
    record_prefix = _OID_BYTES + 8
    cursor = _PACK_HEADER_BYTES
    objects: list[tuple[str, bytes]] = []
    for _ in range(count):
        # Bounds are checked against the body, never the footer, so content
        # can not silently bleed into (or be cut short by) the digest bytes.
        if body_len - cursor < record_prefix:
            raise ValueError("Pack bytes truncated: incomplete object record")
        oid = body[cursor: cursor + _OID_BYTES].decode()
        cursor += _OID_BYTES
        length = struct.unpack_from("<Q", body, cursor)[0]
        cursor += 8
        if length > body_len - cursor:
            raise ValueError(
                f"Pack bytes truncated: object {oid} declares {length} bytes"
            )
        objects.append((oid, body[cursor: cursor + length]))
        cursor += length
    return objects
211
212
def write_pack(repo_root: pathlib.Path, objects: list[tuple[str, bytes]]) -> str | None:
    """Persist *objects* as one MPack data file plus its seek index.

    The pack is named after the SHA-256 of its complete encoded bytes. When
    both the ``.mpack`` and the ``.idx`` for that name already exist, nothing
    is written and the existing pack's id is returned. Otherwise the data
    file is written first and the index second.

    Args:
        repo_root: Root of the Muse repository.
        objects: List of ``(object_id, content)`` pairs to pack.

    Returns:
        ``"sha256:<64hex>"`` pack_id, or ``None`` when *objects* is empty
        (in which case no files are written).
    """
    if not objects:
        return None

    encoded = _build_pack(objects)
    digest_hex = hashlib.sha256(encoded).hexdigest()
    data_file = _pack_path(repo_root, digest_hex)
    index_file = _idx_path(repo_root, digest_hex)

    already_stored = data_file.exists() and index_file.exists()
    if not already_stored:
        index_blob = _build_idx(objects, encoded)
        _atomic_write(data_file, encoded)
        _atomic_write(index_file, index_blob)

    return f"{DEFAULT_HASH_ALGO}:{digest_hex}"
246
247
248 # ---------------------------------------------------------------------------
249 # Index lookup helpers
250 # ---------------------------------------------------------------------------
251
252
def _load_idx(idx_path: pathlib.Path) -> list[tuple[str, int, int]]:
    """Parse an index file into a sorted list of (object_id, offset, length).

    The SHA-256 footer and the magic are verified first; the header and entry
    count are then checked against the size of the hashed body before any
    entry is decoded.

    Args:
        idx_path: Path of the ``.idx`` file to read.

    Returns:
        Entries in file order. Index files are written sorted by object id;
        the order is not re-checked here.

    Raises:
        OSError: The file cannot be read, fails its footer check, has the
            wrong magic, or is structurally malformed. Every failure mode is
            reported as ``OSError`` so callers that skip unreadable indexes
            with ``except OSError`` also skip malformed ones.
    """
    data = idx_path.read_bytes()

    # Verify footer integrity.
    body = data[:-_FOOTER_BYTES]
    stored_digest = data[-_FOOTER_BYTES:]
    if hashlib.sha256(body).digest() != stored_digest:
        raise OSError(
            f"MPack index {idx_path.name} failed integrity check — store may be corrupt."
        )

    if not data.startswith(_IDX_MAGIC):
        raise OSError(f"MPack index {idx_path.name} has wrong magic bytes.")

    if len(body) < _IDX_HEADER_BYTES:
        raise OSError(f"MPack index {idx_path.name} has a truncated header.")

    entry_count = struct.unpack_from("<Q", body, 5)[0]
    # Reject a count that would read past the body (and into the footer).
    if entry_count > (len(body) - _IDX_HEADER_BYTES) // _IDX_ENTRY_BYTES:
        raise OSError(
            f"MPack index {idx_path.name} declares {entry_count} entries "
            "but is too short to hold them."
        )

    entries: list[tuple[str, int, int]] = []
    cursor = _IDX_HEADER_BYTES
    for _ in range(entry_count):
        try:
            oid = body[cursor: cursor + _OID_BYTES].decode()
        except UnicodeDecodeError as err:
            raise OSError(
                f"MPack index {idx_path.name} contains an undecodable object id."
            ) from err
        cursor += _OID_BYTES
        offset, length = struct.unpack_from("<QQ", body, cursor)
        cursor += 16
        entries.append((oid, offset, length))

    return entries  # already sorted (written sorted)
280
281
282 def _binary_search(entries: list[tuple[str, int, int]], oid: str) -> tuple[int, int] | None:
283 """Binary-search sorted *entries* for *oid*. Returns (offset, length) or None."""
284 keys = [e[0] for e in entries]
285 i = bisect.bisect_left(keys, oid)
286 if i < len(entries) and entries[i][0] == oid:
287 return entries[i][1], entries[i][2]
288 return None
289
290
def _all_idx_paths(repo_root: pathlib.Path) -> list[pathlib.Path]:
    """List every ``*.idx`` file in the pack directory, sorted by path.

    Returns ``[]`` when the pack directory does not exist.
    """
    pack_dir = packs_dir(repo_root)
    return sorted(pack_dir.glob("*.idx")) if pack_dir.exists() else []
297
298
299 # ---------------------------------------------------------------------------
300 # Read / has / list / verify
301 # ---------------------------------------------------------------------------
302
303
def has_object_in_packs(repo_root: pathlib.Path, object_id: str) -> bool:
    """Report whether any local pack index lists *object_id*.

    Indexes that raise ``OSError`` while loading are skipped rather than
    treated as an error. Only indexes are consulted; no pack data is read.
    """
    for index_file in _all_idx_paths(repo_root):
        try:
            index_entries = _load_idx(index_file)
        except OSError:
            # Unreadable index: ignore it and keep looking.
            continue
        hit = _binary_search(index_entries, object_id)
        if hit is not None:
            return True
    return False
314
315
def read_object_from_packs(repo_root: pathlib.Path, object_id: str) -> bytes | None:
    """Fetch *object_id* from the first local pack whose index lists it.

    Indexes are visited in sorted path order; an index that raises
    ``OSError`` while loading is skipped. On a hit, the content is read from
    the matching ``.mpack`` with a single seek + read and its hash is
    recomputed before it is returned.

    Returns:
        The raw content bytes, or ``None`` when no local pack lists the object.

    Raises:
        OSError: The bytes read from the pack do not hash to *object_id*,
            or the pack file itself cannot be opened/read.
    """
    for index_file in _all_idx_paths(repo_root):
        try:
            index_entries = _load_idx(index_file)
        except OSError:
            continue

        hit = _binary_search(index_entries, object_id)
        if hit is None:
            continue

        start, size = hit
        # The index and its pack share the same stem (the pack's hex id).
        data_file = _pack_path(repo_root, index_file.stem)
        with data_file.open("rb") as stream:
            stream.seek(start)
            payload = stream.read(size)

        # Imported lazily, only once a candidate has actually been read.
        # object_id is expected to be the muse-format hash (hash_blob).
        from muse.core.ids import hash_blob
        if hash_blob(payload) != object_id:
            raise OSError(
                f"Object {object_id} failed integrity check reading from pack "
                f"{data_file.name} — store may be corrupt."
            )
        return payload

    return None
349
350
def list_packs(repo_root: pathlib.Path) -> list[str]:
    """Return one pack_id per ``.idx`` file found in the local pack store.

    Each id is ``DEFAULT_HASH_ALGO`` + ``":"`` + the index file's stem, in the
    order produced by ``_all_idx_paths``.
    """
    return [
        f"{DEFAULT_HASH_ALGO}:{index_file.stem}"
        for index_file in _all_idx_paths(repo_root)
    ]
357
358
def verify_pack(repo_root: pathlib.Path, pack_id: str) -> bool:
    """Check the magic and SHA-256 footer of *pack_id*'s ``.mpack`` and ``.idx``.

    The data file is checked first, then the index. Each file is read in
    full, its leading magic is compared, and the digest of everything before
    the last 32 bytes is compared with those last 32 bytes.

    Args:
        repo_root: Root of the Muse repository.
        pack_id: ``sha256:<64hex>`` pack identifier.

    Returns:
        ``True`` when both files pass; failures are reported by raising.

    Raises:
        OSError: If either file is missing, has a wrong magic, or fails its
            SHA-256 footer check.
    """
    _, hex_id = split_id(pack_id)
    targets = (
        (_pack_path(repo_root, hex_id), _PACK_MAGIC),
        (_idx_path(repo_root, hex_id), _IDX_MAGIC),
    )

    for target, expected_magic in targets:
        if not target.exists():
            raise OSError(f"MPack file not found: {target}")

        blob = target.read_bytes()
        if blob[: len(expected_magic)] != expected_magic:
            raise OSError(f"{target.name} has wrong magic bytes.")

        # Everything before the trailing digest must hash to that digest.
        recorded = blob[-_FOOTER_BYTES:]
        computed = hashlib.sha256(blob[:-_FOOTER_BYTES]).digest()
        if computed != recorded:
            raise OSError(
                f"{target.name} failed SHA-256 integrity check — store may be corrupt."
            )

    return True
File History 1 commit
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago