muse/core/pack_store.py · gabriel/muse

pack_store.py python

391 lines 13.2 KB

sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago

1	"""MPack local object store — pack file read/write for the Muse VCS.
2
3	Layout
4	------
5	Pack files live under ``.muse/objects/pack/sha256/``, keeping the algorithm
6	canonical in the path, mirroring the loose object store convention::
7
8	.muse/objects/sha256/<prefix>/<remainder> ← loose objects
9	.muse/objects/pack/sha256/<64hex>.mpack ← MPack data file
10	.muse/objects/pack/sha256/<64hex>.idx ← MPack seek index
11
12	Pack file format (``.mpack``)
13	------------------------------
14	::
15
16	[4 bytes] magic: b"MUSE"
17	[1 byte] version: 1
18	[8 bytes] object_count: uint64 little-endian
19	--- object records (object_count entries) ---
20	[71 bytes] object_id: b"sha256:" + 64 lowercase hex chars
21	[8 bytes] length: uint64 little-endian
22	[N bytes] content: raw bytes
23	--- footer ---
24	[32 bytes] integrity: SHA-256 of every byte above
25
26	Index file format (``.idx``)
27	-----------------------------
28	::
29
30	[4 bytes] magic: b"MUSI"
31	[1 byte] version: 1
32	[8 bytes] entry_count: uint64 little-endian
33	--- entries, sorted by object_id (enables binary search) ---
34	[71 bytes] object_id
35	[8 bytes] content_offset: uint64 little-endian
36	[8 bytes] content_length: uint64 little-endian
37	--- footer ---
38	[32 bytes] integrity: SHA-256 of every byte above
39
40	``content_offset`` is the byte offset to the first content byte in the
41	``.mpack`` file (past the object_id and length fields for that entry).
42	A read is: ``seek(content_offset); read(content_length)`` — O(1), no decode.
43
44	Pack ID
45	-------
46	``pack_id = "sha256:" + sha256(<entire .mpack file bytes>).hexdigest()``
47
48	The pack is content-addressed like every other Muse object. Writing the same objects
49	in the same order twice produces the same pack file and the same pack_id — idempotent.
50	"""
51
52	from __future__ import annotations
53
54	import bisect
55	import hashlib
56	import os
57	import pathlib
58	import struct
59	import tempfile
60	from typing import TYPE_CHECKING
61
62	from muse.core.paths import packs_dir
63	from muse.core.types import DEFAULT_HASH_ALGO, split_id
64	from muse.core.validation import validate_object_id
65
66	if TYPE_CHECKING:
67	pass
68
69	# ---------------------------------------------------------------------------
70	# Constants
71	# ---------------------------------------------------------------------------
72
# Magic prefixes identifying the two on-disk file kinds.
_PACK_MAGIC = b"MUSE"
_IDX_MAGIC = b"MUSI"
# Format version byte written after the magic in both file kinds.
_VERSION = 1
# An object id is the ASCII text "sha256:" followed by 64 hex characters.
_OID_BYTES = len("sha256:") + 64
# Both file kinds end with a raw SHA-256 digest.
_FOOTER_BYTES = 32
# Headers are packed little-endian with no padding: magic(4) + version(1) + count(8).
_PACK_HEADER_BYTES = struct.calcsize("<4sBQ")
_IDX_HEADER_BYTES = struct.calcsize("<4sBQ")
# One index entry: object_id + content_offset(uint64) + content_length(uint64).
_IDX_ENTRY_BYTES = _OID_BYTES + struct.calcsize("<QQ")
81
82
83	# ---------------------------------------------------------------------------
84	# Internal helpers
85	# ---------------------------------------------------------------------------
86
87
def _sha256_file(path: pathlib.Path) -> bytes:
    """Return the raw 32-byte SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks so it is never held in memory whole.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while block_bytes := stream.read(65536):
            hasher.update(block_bytes)
    return hasher.digest()
94
95
def _atomic_write(dest: pathlib.Path, data: bytes) -> None:
    """Write *data* to *dest* atomically via a temp file + rename.

    The parent directory is created if needed. The data is written to a
    ``.pack-tmp-*`` file in that same directory, flushed and fsynced, and
    then moved over *dest* with ``os.replace``.

    Args:
        dest: Final path of the file.
        data: Complete file contents.

    Raises:
        OSError: If creating, writing, syncing or renaming the file fails.
            The temp file is removed before the error propagates.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_str = tempfile.mkstemp(dir=dest.parent, prefix=".pack-tmp-")
    tmp = pathlib.Path(tmp_str)
    try:
        with os.fdopen(fd, "wb") as fh:
            fh.write(data)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp, dest)
    except BaseException:
        # BaseException, not Exception: KeyboardInterrupt / SystemExit during
        # the write must not leave a stray temp file in the pack directory.
        tmp.unlink(missing_ok=True)
        raise
110
111
def _pack_path(repo_root: pathlib.Path, hex_id: str) -> pathlib.Path:
    """Return the path of the ``<hex_id>.mpack`` data file in the pack directory."""
    return packs_dir(repo_root).joinpath(f"{hex_id}.mpack")
114
115
def _idx_path(repo_root: pathlib.Path, hex_id: str) -> pathlib.Path:
    """Return the path of the ``<hex_id>.idx`` seek index in the pack directory."""
    return packs_dir(repo_root).joinpath(f"{hex_id}.idx")
118
119
120	# ---------------------------------------------------------------------------
121	# Build
122	# ---------------------------------------------------------------------------
123
124
def _build_pack(objects: list[tuple[str, bytes]]) -> bytes:
    """Encode *objects* as an MPack binary blob and return the raw bytes.

    Layout: magic, version byte, uint64 object count, then one record per
    object (object id, uint64 content length, content) in the order given,
    followed by a SHA-256 digest of every preceding byte.

    Args:
        objects: ``(object_id, content)`` pairs. Each id must encode to
            exactly ``_OID_BYTES`` bytes.

    Raises:
        ValueError: If an object id does not have the expected encoded length.
    """
    digest = hashlib.sha256()
    parts: list[bytes] = []

    def _emit(chunk: bytes) -> None:
        # Hash each chunk as it is queued so the footer covers the whole body.
        digest.update(chunk)
        parts.append(chunk)

    _emit(_PACK_MAGIC)
    _emit(struct.pack("<BQ", _VERSION, len(objects)))

    for oid, content in objects:
        oid_bytes = oid.encode()
        # An explicit raise rather than ``assert``: asserts are stripped under
        # ``python -O`` and a wrong-sized id would corrupt the fixed-width layout.
        if len(oid_bytes) != _OID_BYTES:
            raise ValueError(f"bad oid length: {oid!r}")
        _emit(oid_bytes)
        _emit(struct.pack("<Q", len(content)))
        _emit(content)

    parts.append(digest.digest())  # footer — NOT fed back into the digest
    return b"".join(parts)
146
147
def _build_idx(objects: list[tuple[str, bytes]], pack_bytes: bytes) -> bytes:
    """Build the sorted seek index for a pack holding *objects*.

    Offsets are derived from the pack layout (header, then id + length +
    content per object, in the order given). ``pack_bytes`` itself is not read.
    When an id occurs more than once, the last occurrence is the one indexed.

    Returns:
        Index bytes: magic, version, uint64 entry count, entries sorted by
        object id, and a SHA-256 footer over everything before it.
    """
    # Walk the pack layout to find where each object's content starts.
    locations: dict[str, tuple[int, int]] = {}
    position = _PACK_HEADER_BYTES
    for oid, content in objects:
        size = len(content)
        content_start = position + _OID_BYTES + 8  # past object_id + length field
        locations[oid] = (content_start, size)
        position = content_start + size

    header = _IDX_MAGIC + struct.pack("<BQ", _VERSION, len(locations))
    # Entries are emitted in object-id order so readers can binary search.
    records = [
        oid.encode() + struct.pack("<QQ", *locations[oid])
        for oid in sorted(locations)
    ]
    body = header + b"".join(records)
    return body + hashlib.sha256(body).digest()
179
180
181	# ---------------------------------------------------------------------------
182	# Public API
183	# ---------------------------------------------------------------------------
184
185
def _parse_pack_bytes(pack_bytes: bytes) -> list[tuple[str, bytes]]:
    """Parse raw pack bytes (as produced by _build_pack) into (object_id, content) pairs.

    The SHA-256 footer is verified before any record is decoded. Every record
    is bounds-checked against the body, so a pack whose header or length
    fields overstate the data fails cleanly instead of reading into the footer.

    Args:
        pack_bytes: Complete contents of an ``.mpack`` blob.

    Returns:
        ``(object_id, content)`` pairs in pack order; ``[]`` for empty input.

    Raises:
        ValueError: If the magic is wrong, the header is incomplete, or a
            record extends past the end of the pack body.
        OSError: If the SHA-256 footer does not match the body.
    """
    if not pack_bytes:
        return []
    if pack_bytes[:4] != _PACK_MAGIC:
        raise ValueError(f"Bad pack magic: {pack_bytes[:4]!r}")
    body = pack_bytes[:-_FOOTER_BYTES]
    stored = pack_bytes[-_FOOTER_BYTES:]
    if hashlib.sha256(body).digest() != stored:
        raise OSError("Pack bytes failed SHA-256 integrity check")

    body_end = len(body)
    if body_end < _PACK_HEADER_BYTES:
        raise ValueError("Pack bytes truncated: incomplete header")
    count = struct.unpack_from("<Q", pack_bytes, 5)[0]  # skips magic(4) + version(1)

    cursor = _PACK_HEADER_BYTES
    objects: list[tuple[str, bytes]] = []
    for index in range(count):
        content_start = cursor + _OID_BYTES + 8
        if content_start > body_end:
            raise ValueError(
                f"Pack bytes truncated: record {index} of {count} is incomplete"
            )
        oid = pack_bytes[cursor:cursor + _OID_BYTES].decode()
        length = struct.unpack_from("<Q", pack_bytes, cursor + _OID_BYTES)[0]
        content_end = content_start + length
        if content_end > body_end:
            raise ValueError(
                f"Pack bytes truncated: content of record {index} runs past the pack body"
            )
        objects.append((oid, pack_bytes[content_start:content_end]))
        cursor = content_end
    return objects
211
212
def write_pack(repo_root: pathlib.Path, objects: list[tuple[str, bytes]]) -> str | None:
    """Persist *objects* as an MPack data file plus its seek index.

    The pack is named after the SHA-256 of its encoded bytes. If both the
    ``.mpack`` and ``.idx`` files for that name already exist, nothing is
    rewritten. Otherwise the data file is written first, then the index.

    Args:
        repo_root: Root of the Muse repository.
        objects: List of ``(object_id, content)`` pairs to pack.

    Returns:
        The pack id ``"<DEFAULT_HASH_ALGO>:<64hex>"``, or ``None`` when
        *objects* is empty (no files are written).
    """
    if not objects:
        return None

    encoded = _build_pack(objects)
    digest_hex = hashlib.sha256(encoded).hexdigest()
    data_file = _pack_path(repo_root, digest_hex)
    index_file = _idx_path(repo_root, digest_hex)

    already_stored = data_file.exists() and index_file.exists()
    if not already_stored:
        index_bytes = _build_idx(objects, encoded)
        _atomic_write(data_file, encoded)
        _atomic_write(index_file, index_bytes)

    return f"{DEFAULT_HASH_ALGO}:{digest_hex}"
246
247
248	# ---------------------------------------------------------------------------
249	# Index lookup helpers
250	# ---------------------------------------------------------------------------
251
252
def _load_idx(idx_path: pathlib.Path) -> list[tuple[str, int, int]]:
    """Parse an index file into a list of ``(object_id, offset, length)`` tuples.

    The SHA-256 footer and the magic are checked first. The header and the
    declared entry count are then bounds-checked against the body, so every
    structural problem surfaces as ``OSError``.

    Args:
        idx_path: Path to a ``.idx`` file.

    Returns:
        Entries in file order (the writer emits them sorted by object id).

    Raises:
        OSError: If the file cannot be read, fails its integrity check, has
            the wrong magic, is truncated, declares more entries than it
            holds, or contains an undecodable object id.
    """
    data = idx_path.read_bytes()

    # Verify footer integrity.
    body = data[:-_FOOTER_BYTES]
    stored_digest = data[-_FOOTER_BYTES:]
    if hashlib.sha256(body).digest() != stored_digest:
        raise OSError(
            f"MPack index {idx_path.name} failed integrity check — store may be corrupt."
        )

    if not data.startswith(_IDX_MAGIC):
        raise OSError(f"MPack index {idx_path.name} has wrong magic bytes.")

    if len(body) < _IDX_HEADER_BYTES:
        raise OSError(f"MPack index {idx_path.name} has a truncated header.")

    entry_count = struct.unpack_from("<Q", body, 5)[0]  # skips magic(4) + version(1)
    if _IDX_HEADER_BYTES + entry_count * _IDX_ENTRY_BYTES > len(body):
        raise OSError(
            f"MPack index {idx_path.name} declares {entry_count} entries "
            "but is too short to hold them."
        )

    entries: list[tuple[str, int, int]] = []
    cursor = _IDX_HEADER_BYTES
    try:
        for _ in range(entry_count):
            oid = body[cursor:cursor + _OID_BYTES].decode()
            offset, length = struct.unpack_from("<QQ", body, cursor + _OID_BYTES)
            entries.append((oid, offset, length))
            cursor += _IDX_ENTRY_BYTES
    except UnicodeDecodeError as exc:
        # Callers skip unreadable indexes by catching OSError; keep that contract.
        raise OSError(
            f"MPack index {idx_path.name} contains an undecodable object id."
        ) from exc

    return entries  # already sorted (written sorted)
280
281
def _binary_search(entries: list[tuple[str, int, int]], oid: str) -> tuple[int, int] | None:
    """Binary-search *entries* (sorted by object id) for *oid*.

    Args:
        entries: ``(object_id, offset, length)`` tuples sorted by object id.
        oid: Object id to look up.

    Returns:
        ``(offset, length)`` of the matching entry, or ``None`` if absent.
    """
    # Bisect the entry tuples directly instead of building a key list on every
    # call, which cost O(n) per lookup. Offsets and lengths are unsigned, so a
    # probe of (oid, -1, -1) sorts immediately before any real entry for oid.
    i = bisect.bisect_left(entries, (oid, -1, -1))
    if i < len(entries) and entries[i][0] == oid:
        return entries[i][1], entries[i][2]
    return None
289
290
def _all_idx_paths(repo_root: pathlib.Path) -> list[pathlib.Path]:
    """List every ``*.idx`` file in the pack directory, sorted by path.

    Returns ``[]`` when the pack directory does not exist.
    """
    pack_directory = packs_dir(repo_root)
    return sorted(pack_directory.glob("*.idx")) if pack_directory.exists() else []
297
298
299	# ---------------------------------------------------------------------------
300	# Read / has / list / verify
301	# ---------------------------------------------------------------------------
302
303
def has_object_in_packs(repo_root: pathlib.Path, object_id: str) -> bool:
    """Report whether any local pack index lists *object_id*.

    Indexes that fail to load with ``OSError`` are skipped. Only the index
    is consulted; the ``.mpack`` data file is not opened.
    """
    for index_file in _all_idx_paths(repo_root):
        try:
            index_entries = _load_idx(index_file)
        except OSError:
            continue
        else:
            if _binary_search(index_entries, object_id) is not None:
                return True
    return False
314
315
def read_object_from_packs(repo_root: pathlib.Path, object_id: str) -> bytes | None:
    """Fetch the content of *object_id* from the first pack whose index lists it.

    Indexes are visited in sorted path order, and any that fail to load with
    ``OSError`` are skipped. On a hit, the content is read from the matching
    ``.mpack`` file by seeking to the recorded offset, then re-hashed with
    ``hash_blob`` and compared against *object_id*.

    Returns:
        The raw content bytes, or ``None`` if no local pack index lists the
        object.

    Raises:
        OSError: If the re-hashed content does not match *object_id*, or if
            the ``.mpack`` file cannot be opened or read.
    """
    for index_file in _all_idx_paths(repo_root):
        try:
            index_entries = _load_idx(index_file)
        except OSError:
            continue

        location = _binary_search(index_entries, object_id)
        if location is not None:
            offset, size = location
            # The index and data files share the same stem (the pack's hex id).
            data_file = _pack_path(repo_root, index_file.stem)
            with data_file.open("rb") as stream:
                stream.seek(offset)
                payload = stream.read(size)

            # Integrity check — object_id is the muse-format hash (hash_blob).
            # Imported at point of use (presumably to avoid an import cycle —
            # TODO confirm).
            from muse.core.ids import hash_blob
            if hash_blob(payload) != object_id:
                raise OSError(
                    f"Object {object_id} failed integrity check reading from pack "
                    f"{data_file.name} — store may be corrupt."
                )
            return payload
    return None
349
350
def list_packs(repo_root: pathlib.Path) -> list[str]:
    """Return one pack id per ``.idx`` file in the local store.

    Each id is ``"<DEFAULT_HASH_ALGO>:<index file stem>"``, in sorted path order.
    """
    return [
        f"{DEFAULT_HASH_ALGO}:{index_file.stem}"
        for index_file in _all_idx_paths(repo_root)
    ]
357
358
def verify_pack(repo_root: pathlib.Path, pack_id: str) -> bool:
    """Check the magic and SHA-256 footer of a pack's ``.mpack`` and ``.idx`` files.

    The data file is checked first, then the index. Each file is read in
    full, its leading magic compared, and its trailing 32-byte digest
    compared against a fresh SHA-256 of the preceding bytes.

    Args:
        repo_root: Root of the Muse repository.
        pack_id: ``sha256:<64hex>`` pack identifier.

    Returns:
        ``True`` when both files pass. Failures never return ``False``.

    Raises:
        OSError: If either file is missing, has a wrong magic, or fails its
            SHA-256 footer check.
    """
    _, hex_id = split_id(pack_id)
    targets = (
        (_pack_path(repo_root, hex_id), _PACK_MAGIC),
        (_idx_path(repo_root, hex_id), _IDX_MAGIC),
    )

    for target, expected_magic in targets:
        if not target.exists():
            raise OSError(f"MPack file not found: {target}")

        raw = target.read_bytes()
        if not raw.startswith(expected_magic):
            raise OSError(f"{target.name} has wrong magic bytes.")

        footer = raw[-_FOOTER_BYTES:]
        if hashlib.sha256(raw[:-_FOOTER_BYTES]).digest() != footer:
            raise OSError(
                f"{target.name} failed SHA-256 integrity check — store may be corrupt."
            )

    return True

File History 1 commit

sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago

function _sha256_file

function _atomic_write

function _pack_path

function _idx_path

function _build_pack

function _emit

function _build_idx

function _emit

function _parse_pack_bytes

function write_pack

function _load_idx

function _binary_search

function _all_idx_paths

function has_object_in_packs

function read_object_from_packs

function list_packs

function verify_pack

Pathmuse/core/pack_store.py

Lines391

Size13.2 KB

LangPython

Refsha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b

Object ID

sha256:33bab059ed713e7a5be24787c547f4f4a9206ce20e873901d0b808ae170755d1…

Last commit

sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b

fix: try fetch/presign before fetch/mpack to avoi…

7 days ago

Quick links

Blame History