snapshot.py
python
sha256:ecca645d94c5f39c88f4bc1283447ba0f4635ef3cbb11d0cd9b3759cba289d00
fix: compute_snapshot_id uses typed-object formula and wire…
Sonnet 4.6
minor
⚠ breaking
13 days ago
| 1 | """Snapshot and commit ID hashing — MuseHub-side implementation. |
| 2 | |
| 3 | This module provides the canonical server-side ID computation functions used |
| 4 | by MuseHub services and test fixtures. It intentionally mirrors the hashing |
| 5 | logic in ``muse.core.snapshot`` (the Muse CLI) so that IDs generated on the |
| 6 | server can be cross-verified against IDs sent by the CLI. |
| 7 | |
| 8 | CONTRACT: The separator constant ``_SEP`` and the hash construction algorithm |
| 9 | MUST remain identical to ``muse.core.snapshot._SEP`` and |
| 10 | ``muse.core.snapshot.compute_snapshot_id``. Both sides return IDs in |
| 11 | ``sha256:<hex>`` form. Any change to either side must be applied to both |
| 12 | simultaneously. A mismatch is a silent data-integrity bug. |
| 13 | """ |
| 14 | |
| 15 | from __future__ import annotations |
| 16 | |
| 17 | import json as _json |
| 18 | from typing import TYPE_CHECKING |
| 19 | |
| 20 | import hashlib as _hashlib |
| 21 | |
| 22 | from muse.core.ids import hash_commit |
| 23 | from muse.core.types import split_id |
| 24 | from musehub.types.json_types import StrDict |
| 25 | |
| 26 | if TYPE_CHECKING: |
| 27 | from musehub.db.musehub_repo_models import MusehubCommit, MusehubSnapshot |
| 28 | |
| 29 | # Must match muse.core.snapshot._SEP exactly. |
| 30 | _SEP = "\x00" |
| 31 | |
| 32 | |
def compute_snapshot_id(manifest: StrDict, directories: list[str] | None = None) -> str:
    """Compute the content-addressed ``sha256:<hex>`` ID of a snapshot.

    Every manifest entry contributes one ``<path><_SEP><object id>`` record,
    where the object ID is the second element returned by ``split_id`` (the
    ID with its algorithm prefix such as ``sha256:`` removed).  The file
    records are sorted as whole strings.

    When *directories* is non-empty, one ``dir<_SEP><path>`` record per
    directory is appended after the file records, in sorted directory order.
    The ``"dir"`` prefix keeps directory records in a namespace distinct from
    file entries.

    All records are joined with ``_SEP`` (a null byte, which avoids the
    collisions possible with the earlier ``|``/``:`` separators), encoded,
    and hashed behind a ``snapshot <byte length>\\0`` header.

    This construction must stay identical to
    ``muse.core.snapshot.compute_snapshot_id`` so that server- and CLI-side
    IDs can be cross-verified.
    """
    file_records = [f"{path}{_SEP}{split_id(oid)[1]}" for path, oid in manifest.items()]
    file_records.sort()

    # Directory records always follow the (already sorted) file records.
    dir_records = [f"dir{_SEP}{d}" for d in sorted(directories)] if directories else []

    payload = _SEP.join(file_records + dir_records).encode()
    prefix = f"snapshot {len(payload)}\x00".encode()

    digest = _hashlib.sha256()
    digest.update(prefix)
    digest.update(payload)
    return "sha256:" + digest.hexdigest()
| 52 | |
| 53 | |
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
    author: str = "",
    signer_public_key: str = "",
) -> str:
    """Compute the content-addressed ``sha256:<hex>`` ID of a commit.

    Thin wrapper that forwards every argument, by keyword, to
    ``muse.core.ids.hash_commit``; the result must match
    ``muse.core.snapshot.compute_commit_id`` exactly.

    Field order: parents, snapshot_id, message, committed_at, author,
    signer_public_key.  Parent IDs and snapshot_id have their algorithm
    prefix stripped before hashing.
    """
    hash_inputs = {
        "parent_ids": parent_ids,
        "snapshot_id": snapshot_id,
        "message": message,
        "committed_at_iso": committed_at_iso,
        "author": author,
        "signer_public_key": signer_public_key,
    }
    return hash_commit(**hash_inputs)
| 76 | |
| 77 | |
def snapshot_to_bytes(snapshot_id: str, manifest: StrDict, directories: list[str] | None = None) -> bytes:
    """Encode a snapshot as a canonical muse binary object.

    The result is ``snapshot <size>\\0<json>``, where ``<size>`` is the byte
    length of the JSON payload.  This is the same layout written by
    ``muse.core.snapshots.write_snapshot`` on the CLI side, and it is the
    object that must be stored in S3 so that ``muse pull`` can fetch and
    verify it.

    The JSON payload holds, in this key order: ``schema_version`` (always 1),
    ``snapshot_id``, a copy of ``manifest``, the sorted ``directories`` (an
    empty list when none are given), ``created_at`` (the current UTC time in
    ISO-8601 form, taken at call time) and an empty ``note``.
    """
    from datetime import datetime, timezone

    record = {
        "schema_version": 1,
        "snapshot_id": snapshot_id,
        "manifest": dict(manifest),
        "directories": sorted(directories or []),
        "created_at": datetime.now(timezone.utc).isoformat(),
        "note": "",
    }
    payload = _json.dumps(record).encode()
    header = f"snapshot {len(payload)}\x00".encode()
    return b"".join((header, payload))
| 96 | |
| 97 | |
def commit_to_bytes(commit: "MusehubCommit") -> bytes:
    """Serialize a MusehubCommit to the canonical muse binary format.

    Produces ``commit <size>\\0<json>``, where ``<size>`` is the byte length
    of the JSON payload.  This is identical to the format written by
    ``muse.core.commits.write_commit`` on the CLI side, and it is the object
    that must be stored in S3 so that ``muse pull`` can fetch and verify it.

    The first two entries of ``commit.parent_ids`` become
    ``parent_commit_id`` and ``parent2_commit_id``; either is ``None`` when
    absent.  A ``parent_ids`` value of ``None`` is treated like an empty list.
    Optional string attributes that are ``None`` or empty are written as
    ``""``, ``reviewed_by`` as a list and ``test_runs`` as ``0`` when unset.
    Fields that are not read from *commit* (``metadata``,
    ``structured_delta``, ``sem_ver_bump``, ``breaking_changes``, ``labels``,
    ``status``, ``notes``, ``score``) are written with fixed placeholder
    values.
    """
    # Treat a missing (None) or empty parent list the same way: no parents.
    # Normalising once keeps the length check below from failing on None.
    parents = commit.parent_ids or []
    parent_commit_id = parents[0] if parents else None
    parent2_commit_id = parents[1] if len(parents) > 1 else None
    d = {
        "commit_id": commit.commit_id,
        "branch": commit.branch,
        "snapshot_id": commit.snapshot_id or "",
        "message": commit.message,
        "committed_at": commit.timestamp.isoformat(),
        "parent_commit_id": parent_commit_id,
        "parent2_commit_id": parent2_commit_id,
        "author": commit.author or "",
        "metadata": {},
        "structured_delta": None,
        "sem_ver_bump": "none",
        "breaking_changes": [],
        "agent_id": commit.agent_id or "",
        "model_id": commit.model_id or "",
        "toolchain_id": commit.toolchain_id or "",
        "prompt_hash": commit.prompt_hash or "",
        "signature": commit.signature or "",
        "signer_public_key": commit.signer_public_key or "",
        "signer_key_id": commit.signer_key_id or "",
        "reviewed_by": list(commit.reviewed_by or []),
        "test_runs": commit.test_runs or 0,
        "labels": [],
        "status": "",
        "notes": [],
        "score": None,
    }
    json_bytes = _json.dumps(d).encode()
    # The size in the header is the encoded byte length, not the str length.
    return f"commit {len(json_bytes)}\x00".encode() + json_bytes
File History
3 commits
sha256:ecca645d94c5f39c88f4bc1283447ba0f4635ef3cbb11d0cd9b3759cba289d00
fix: compute_snapshot_id uses typed-object formula and wire…
Sonnet 4.6
minor
⚠
13 days ago
sha256:450998d182617fa93b737cbbdb3fe956c61566051739acec8c63ec5e7b4587f8
feat(phase3): write snapshot objects to S3 at all 3 write s…
Sonnet 4.6
patch
14 days ago
sha256:e597c0b97ade9c3c52ac4735ceb437ee69d1b6f0db61b8d7caa6467c5866566d
feat(phase2): write commit objects to S3 at all 5 write sit…
Sonnet 4.6
patch
14 days ago