pack_store.py
python
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b
fix: try fetch/presign before fetch/mpack to avoid Cloudfla…
Sonnet 4.6
patch
7 days ago
| 1 | """MPack local object store — pack file read/write for the Muse VCS. |
| 2 | |
| 3 | Layout |
| 4 | ------ |
| 5 | Pack files live under ``.muse/objects/pack/sha256/``, keeping the algorithm |
| 6 | canonical in the path, mirroring the loose object store convention:: |
| 7 | |
| 8 | .muse/objects/sha256/<prefix>/<remainder> ← loose objects |
| 9 | .muse/objects/pack/sha256/<64hex>.mpack ← MPack data file |
| 10 | .muse/objects/pack/sha256/<64hex>.idx ← MPack seek index |
| 11 | |
| 12 | Pack file format (``.mpack``) |
| 13 | ------------------------------ |
| 14 | :: |
| 15 | |
| 16 | [4 bytes] magic: b"MUSE" |
| 17 | [1 byte] version: 1 |
| 18 | [8 bytes] object_count: uint64 little-endian |
| 19 | --- object records (object_count entries) --- |
| 20 | [71 bytes] object_id: b"sha256:" + 64 lowercase hex chars |
| 21 | [8 bytes] length: uint64 little-endian |
| 22 | [N bytes] content: raw bytes |
| 23 | --- footer --- |
| 24 | [32 bytes] integrity: SHA-256 of every byte above |
| 25 | |
| 26 | Index file format (``.idx``) |
| 27 | ----------------------------- |
| 28 | :: |
| 29 | |
| 30 | [4 bytes] magic: b"MUSI" |
| 31 | [1 byte] version: 1 |
| 32 | [8 bytes] entry_count: uint64 little-endian |
| 33 | --- entries, sorted by object_id (enables binary search) --- |
| 34 | [71 bytes] object_id |
| 35 | [8 bytes] content_offset: uint64 little-endian |
| 36 | [8 bytes] content_length: uint64 little-endian |
| 37 | --- footer --- |
| 38 | [32 bytes] integrity: SHA-256 of every byte above |
| 39 | |
| 40 | ``content_offset`` is the byte offset to the first content byte in the |
| 41 | ``.mpack`` file (past the object_id and length fields for that entry). |
| 42 | A read is: ``seek(content_offset); read(content_length)`` — O(1), no decode. |
| 43 | |
| 44 | Pack ID |
| 45 | ------- |
| 46 | ``pack_id = "sha256:" + sha256(<entire .mpack file bytes>).hexdigest()`` |
| 47 | |
| 48 | The pack is content-addressed like every other Muse object. Writing the same |
| 49 | objects twice produces the same pack file and the same pack_id — idempotent. |
| 50 | """ |
| 51 | |
| 52 | from __future__ import annotations |
| 53 | |
| 54 | import bisect |
| 55 | import hashlib |
| 56 | import os |
| 57 | import pathlib |
| 58 | import struct |
| 59 | import tempfile |
| 60 | from typing import TYPE_CHECKING |
| 61 | |
| 62 | from muse.core.paths import packs_dir |
| 63 | from muse.core.types import DEFAULT_HASH_ALGO, split_id |
| 64 | from muse.core.validation import validate_object_id |
| 65 | |
| 66 | if TYPE_CHECKING: |
| 67 | pass |
| 68 | |
| 69 | # --------------------------------------------------------------------------- |
| 70 | # Constants |
| 71 | # --------------------------------------------------------------------------- |
| 72 | |
# File-format markers.
_PACK_MAGIC = b"MUSE"  # leading four bytes of every .mpack file
_IDX_MAGIC = b"MUSI"  # leading four bytes of every .idx file
_VERSION = 1  # format version byte stored right after the magic

# Fixed field widths, all in bytes.
_OID_BYTES = len("sha256:") + 64  # algorithm prefix + 64 hex chars (= 71)
_FOOTER_BYTES = 32  # raw SHA-256 digest
_PACK_HEADER_BYTES = len(_PACK_MAGIC) + 1 + 8  # magic + version + object_count
_IDX_HEADER_BYTES = len(_IDX_MAGIC) + 1 + 8  # magic + version + entry_count
_IDX_ENTRY_BYTES = _OID_BYTES + 2 * 8  # object_id + content_offset + content_length
| 81 | |
| 82 | |
| 83 | # --------------------------------------------------------------------------- |
| 84 | # Internal helpers |
| 85 | # --------------------------------------------------------------------------- |
| 86 | |
| 87 | |
def _sha256_file(path: pathlib.Path) -> bytes:
    """Return the raw SHA-256 digest of the file at *path*.

    The file is read in 64 KiB pieces so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(65536)
            if not piece:
                break
            digest.update(piece)
    return digest.digest()
| 94 | |
| 95 | |
def _atomic_write(dest: pathlib.Path, data: bytes) -> None:
    """Publish *data* at *dest* via a sibling temp file and ``os.replace``.

    The parent directory is created if needed. The bytes are written to a
    ``.pack-tmp-*`` file in that same directory, flushed and fsynced, and only
    then moved over *dest*, so *dest* is never observed half-written.

    Args:
        dest: Final path of the file.
        data: Complete file content.

    Raises:
        Exception: Any error from writing or renaming is re-raised after the
            temp file has been removed.
    """
    parent = dest.parent
    parent.mkdir(parents=True, exist_ok=True)
    handle, scratch_name = tempfile.mkstemp(dir=parent, prefix=".pack-tmp-")
    scratch = pathlib.Path(scratch_name)
    try:
        with os.fdopen(handle, "wb") as out:
            out.write(data)
            out.flush()
            os.fsync(out.fileno())
        os.replace(scratch, dest)
    except Exception:
        # Do not leave a stray temp file behind; the error still propagates.
        scratch.unlink(missing_ok=True)
        raise
| 110 | |
| 111 | |
def _pack_path(repo_root: pathlib.Path, hex_id: str) -> pathlib.Path:
    """Return the ``.mpack`` data-file path for the pack named *hex_id*."""
    directory = packs_dir(repo_root)
    return directory / f"{hex_id}.mpack"
| 114 | |
| 115 | |
def _idx_path(repo_root: pathlib.Path, hex_id: str) -> pathlib.Path:
    """Return the ``.idx`` seek-index path for the pack named *hex_id*."""
    directory = packs_dir(repo_root)
    return directory / f"{hex_id}.idx"
| 118 | |
| 119 | |
| 120 | # --------------------------------------------------------------------------- |
| 121 | # Build |
| 122 | # --------------------------------------------------------------------------- |
| 123 | |
| 124 | |
def _build_pack(objects: list[tuple[str, bytes]]) -> bytes:
    """Encode *objects* as an MPack binary blob and return the raw bytes.

    The blob is: magic, a version byte and a little-endian uint64 object
    count, then for every object its id, a little-endian uint64 content
    length and the content itself, and finally a SHA-256 footer computed over
    every byte that precedes it.

    Args:
        objects: ``(object_id, content)`` pairs, written in the given order.

    Returns:
        The complete pack, footer included.

    Raises:
        ValueError: If an object id does not encode to exactly ``_OID_BYTES``
            bytes. This is a real check rather than an ``assert`` so it still
            runs under ``python -O``; a wrong-width id would shift every
            following record.
    """
    chunks: list[bytes] = [
        _PACK_MAGIC,
        struct.pack("<BQ", _VERSION, len(objects)),
    ]
    for oid, content in objects:
        oid_bytes = oid.encode()
        if len(oid_bytes) != _OID_BYTES:
            raise ValueError(f"bad oid length: {oid!r}")
        chunks.append(oid_bytes)
        chunks.append(struct.pack("<Q", len(content)))
        chunks.append(content)

    # Hash chunk by chunk so the body is not joined just to be digested.
    hasher = hashlib.sha256()
    for chunk in chunks:
        hasher.update(chunk)
    chunks.append(hasher.digest())  # footer; not part of its own digest
    return b"".join(chunks)
| 146 | |
| 147 | |
def _build_idx(objects: list[tuple[str, bytes]], pack_bytes: bytes) -> bytes:
    """Build the sorted seek index for a pack holding *objects*.

    Content offsets are derived from the pack layout alone: a fixed header,
    then per object an id field, an 8-byte length field and the content.
    *pack_bytes* is accepted but never read.

    Args:
        objects: ``(object_id, content)`` pairs in the order they were packed.
        pack_bytes: The encoded pack (unused).

    Returns:
        The complete index: header, entries sorted by object id, and a
        SHA-256 footer over everything before it.
    """
    record_prefix = _OID_BYTES + 8  # object_id + length field precede content
    position = _PACK_HEADER_BYTES
    spans: dict[str, tuple[int, int]] = {}
    for oid, content in objects:
        start = position + record_prefix
        size = len(content)
        spans[oid] = (start, size)  # a repeated id keeps its last location
        position = start + size

    chunks: list[bytes] = [
        _IDX_MAGIC,
        struct.pack("<BQ", _VERSION, len(spans)),
    ]
    # Keys are unique, so ordering the items orders purely by object id.
    for oid, (start, size) in sorted(spans.items()):
        chunks.append(oid.encode())
        chunks.append(struct.pack("<QQ", start, size))

    body = b"".join(chunks)
    return body + hashlib.sha256(body).digest()
| 179 | |
| 180 | |
| 181 | # --------------------------------------------------------------------------- |
| 182 | # Public API |
| 183 | # --------------------------------------------------------------------------- |
| 184 | |
| 185 | |
def _parse_pack_bytes(pack_bytes: bytes) -> list[tuple[str, bytes]]:
    """Parse raw pack bytes (as produced by ``_build_pack``) into objects.

    The SHA-256 footer is verified first. Records are then read only from the
    bytes the footer covers, and every record is bounds-checked. A pack whose
    count or length fields claim more data than is present is rejected
    instead of yielding content that runs into the footer.

    Args:
        pack_bytes: Complete pack, footer included. May be empty.

    Returns:
        ``(object_id, content)`` pairs in pack order; ``[]`` for empty input.

    Raises:
        ValueError: If the magic is wrong, or the header or any record is
            truncated.
        OSError: If the footer does not match the SHA-256 of the body.
    """
    if not pack_bytes:
        return []
    if pack_bytes[:4] != _PACK_MAGIC:
        raise ValueError(f"Bad pack magic: {pack_bytes[:4]!r}")

    body = pack_bytes[:-_FOOTER_BYTES]
    stored = pack_bytes[-_FOOTER_BYTES:]
    if hashlib.sha256(body).digest() != stored:
        raise OSError("Pack bytes failed SHA-256 integrity check")

    end = len(body)
    if end < _PACK_HEADER_BYTES:
        raise ValueError("Pack bytes truncated: incomplete header")

    count = struct.unpack_from("<Q", body, 5)[0]  # skips magic(4) + version(1)
    cursor = _PACK_HEADER_BYTES
    objects: list[tuple[str, bytes]] = []
    for _ in range(count):
        length_at = cursor + _OID_BYTES
        content_start = length_at + 8
        if content_start > end:
            raise ValueError("Pack bytes truncated: incomplete object record")
        oid = body[cursor:length_at].decode()
        length = struct.unpack_from("<Q", body, length_at)[0]
        content_end = content_start + length
        if content_end > end:
            raise ValueError(
                f"Pack bytes truncated: object {oid} claims {length} bytes "
                f"but only {end - content_start} remain"
            )
        objects.append((oid, body[content_start:content_end]))
        cursor = content_end
    return objects
| 211 | |
| 212 | |
def write_pack(repo_root: pathlib.Path, objects: list[tuple[str, bytes]]) -> str | None:
    """Store *objects* as one MPack data file plus its seek index.

    The pack is named after the SHA-256 of its own bytes. When both the data
    file and the index for that name already exist, nothing is written and
    the same id is returned, so repeated calls are harmless.

    Args:
        repo_root: Root of the Muse repository.
        objects: ``(object_id, content)`` pairs to pack.

    Returns:
        The pack id (``"sha256:<64hex>"``), or ``None`` when *objects* is
        empty, in which case no files are touched.
    """
    if not objects:
        return None

    blob = _build_pack(objects)
    digest_hex = hashlib.sha256(blob).hexdigest()
    pack_id = f"{DEFAULT_HASH_ALGO}:{digest_hex}"

    data_file = _pack_path(repo_root, digest_hex)
    index_file = _idx_path(repo_root, digest_hex)

    already_stored = data_file.exists() and index_file.exists()
    if not already_stored:
        index_blob = _build_idx(objects, blob)
        # Data first, index second: readers discover packs through *.idx.
        _atomic_write(data_file, blob)
        _atomic_write(index_file, index_blob)
    return pack_id
| 246 | |
| 247 | |
| 248 | # --------------------------------------------------------------------------- |
| 249 | # Index lookup helpers |
| 250 | # --------------------------------------------------------------------------- |
| 251 | |
| 252 | |
def _load_idx(idx_path: pathlib.Path) -> list[tuple[str, int, int]]:
    """Parse an index file into a list of ``(object_id, offset, length)``.

    Entries are returned in file order, which is sorted by object id for
    indexes written by this module. Every kind of corruption is reported as
    ``OSError`` so that callers which skip bad indexes with ``except OSError``
    also skip structurally broken ones.

    Args:
        idx_path: Path of the ``.idx`` file to read.

    Returns:
        One ``(object_id, content_offset, content_length)`` tuple per entry.

    Raises:
        OSError: If the file cannot be read, fails its SHA-256 footer check,
            has the wrong magic, is too short for its header or declared
            entry count, or holds an object id that is not valid text.
    """
    data = idx_path.read_bytes()

    # Verify footer integrity.
    body = data[:-_FOOTER_BYTES]
    stored_digest = data[-_FOOTER_BYTES:]
    if hashlib.sha256(body).digest() != stored_digest:
        raise OSError(
            f"MPack index {idx_path.name} failed integrity check — store may be corrupt."
        )

    if not data.startswith(_IDX_MAGIC):
        raise OSError(f"MPack index {idx_path.name} has wrong magic bytes.")

    if len(body) < _IDX_HEADER_BYTES:
        raise OSError(f"MPack index {idx_path.name} is truncated (incomplete header).")

    entry_count = struct.unpack_from("<Q", body, 5)[0]  # after magic(4) + version(1)
    # A valid footer does not prove the count is honest; check it against the
    # bytes actually present before reading any entry.
    if entry_count * _IDX_ENTRY_BYTES > len(body) - _IDX_HEADER_BYTES:
        raise OSError(
            f"MPack index {idx_path.name} is truncated "
            f"(declares {entry_count} entries)."
        )

    entries: list[tuple[str, int, int]] = []
    cursor = _IDX_HEADER_BYTES
    try:
        for _ in range(entry_count):
            oid = body[cursor: cursor + _OID_BYTES].decode()
            offset, length = struct.unpack_from("<QQ", body, cursor + _OID_BYTES)
            entries.append((oid, offset, length))
            cursor += _IDX_ENTRY_BYTES
    except UnicodeDecodeError as err:
        raise OSError(
            f"MPack index {idx_path.name} contains an undecodable object id."
        ) from err

    return entries  # already sorted (written sorted)
| 280 | |
| 281 | |
def _binary_search(entries: list[tuple[str, int, int]], oid: str) -> tuple[int, int] | None:
    """Binary-search sorted *entries* for *oid*.

    Args:
        entries: ``(object_id, offset, length)`` tuples sorted by object id.
        oid: Object id to look up.

    Returns:
        ``(offset, length)`` of the matching entry, or ``None`` if absent.
    """
    # Bisect on the entries themselves instead of copying all keys into a new
    # list first, which would make every lookup O(n). The 1-tuple ``(oid,)``
    # compares less than any ``(oid, offset, length)`` with the same id and
    # otherwise orders by id, so it lands on the first entry whose id >= oid.
    i = bisect.bisect_left(entries, (oid,))
    if i < len(entries) and entries[i][0] == oid:
        return entries[i][1], entries[i][2]
    return None
| 289 | |
| 290 | |
def _all_idx_paths(repo_root: pathlib.Path) -> list[pathlib.Path]:
    """Return every ``*.idx`` file in the pack directory, sorted.

    Yields an empty list when the pack directory does not exist.
    """
    pack_root = packs_dir(repo_root)
    if pack_root.exists():
        return sorted(pack_root.glob("*.idx"))
    return []
| 297 | |
| 298 | |
| 299 | # --------------------------------------------------------------------------- |
| 300 | # Read / has / list / verify |
| 301 | # --------------------------------------------------------------------------- |
| 302 | |
| 303 | |
def has_object_in_packs(repo_root: pathlib.Path, object_id: str) -> bool:
    """Report whether any local pack index lists *object_id*.

    Indexes that fail to load with ``OSError`` are skipped rather than
    treated as fatal.
    """
    for candidate in _all_idx_paths(repo_root):
        try:
            index = _load_idx(candidate)
        except OSError:
            pass  # unreadable or corrupt index: try the next one
        else:
            if _binary_search(index, object_id) is not None:
                return True
    return False
| 314 | |
| 315 | |
def read_object_from_packs(repo_root: pathlib.Path, object_id: str) -> bytes | None:
    """Read *object_id* from the first local pack whose index lists it.

    Indexes are tried in sorted path order; ones that fail to load with
    ``OSError`` are skipped. On a hit the content is fetched with a single
    seek + read and re-hashed with ``hash_blob`` before being returned.

    Args:
        repo_root: Root of the Muse repository.
        object_id: Id of the object to read.

    Returns:
        The raw content bytes, or ``None`` if no loadable index lists the
        object.

    Raises:
        OSError: If the bytes read do not hash back to *object_id*, or the
            pack data file cannot be opened.
    """
    for index_file in _all_idx_paths(repo_root):
        try:
            index = _load_idx(index_file)
        except OSError:
            continue

        location = _binary_search(index, object_id)
        if location is not None:
            start, size = location
            # The index and data file share the same stem (the pack's hex id).
            data_file = _pack_path(repo_root, index_file.stem)
            with data_file.open("rb") as stream:
                stream.seek(start)
                payload = stream.read(size)

            # Integrity check — object_id is the muse-format hash (hash_blob).
            from muse.core.ids import hash_blob

            if hash_blob(payload) != object_id:
                raise OSError(
                    f"Object {object_id} failed integrity check reading from pack "
                    f"{data_file.name} — store may be corrupt."
                )
            return payload
    return None
| 349 | |
| 350 | |
def list_packs(repo_root: pathlib.Path) -> list[str]:
    """Return one pack id per ``.idx`` file in the local store.

    Each id is the default hash-algorithm prefix followed by the index file's
    stem, in the sorted order of the index paths.
    """
    return [
        f"{DEFAULT_HASH_ALGO}:{index_file.stem}"
        for index_file in _all_idx_paths(repo_root)
    ]
| 357 | |
| 358 | |
def verify_pack(repo_root: pathlib.Path, pack_id: str) -> bool:
    """Check the magic and SHA-256 footer of a pack's data and index files.

    The ``.mpack`` file is checked first, then the ``.idx`` file. For each,
    the trailing digest must equal the SHA-256 of all bytes before it.

    Args:
        repo_root: Root of the Muse repository.
        pack_id: ``sha256:<64hex>`` pack identifier.

    Returns:
        ``True`` when both files pass; failures are reported by raising.

    Raises:
        OSError: If either file is missing, has a wrong magic, or fails its
            SHA-256 footer check.
    """
    _, hex_id = split_id(pack_id)
    targets = (
        (_pack_path(repo_root, hex_id), _PACK_MAGIC),
        (_idx_path(repo_root, hex_id), _IDX_MAGIC),
    )

    for file_path, expected_magic in targets:
        if not file_path.exists():
            raise OSError(f"MPack file not found: {file_path}")

        raw = file_path.read_bytes()
        if not raw.startswith(expected_magic):
            raise OSError(f"{file_path.name} has wrong magic bytes.")

        footer = raw[-_FOOTER_BYTES:]
        recomputed = hashlib.sha256(raw[:-_FOOTER_BYTES]).digest()
        if recomputed != footer:
            raise OSError(
                f"{file_path.name} failed SHA-256 integrity check — store may be corrupt."
            )

    return True
File History
1 commit
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b
fix: try fetch/presign before fetch/mpack to avoid Cloudfla…
Sonnet 4.6
patch
7 days ago