gabriel / musehub public
snapshot.py python
135 lines 5.4 KB
Raw
sha256:ecca645d94c5f39c88f4bc1283447ba0f4635ef3cbb11d0cd9b3759cba289d00 fix: compute_snapshot_id uses typed-object formula and wire… Sonnet 4.6 minor ⚠ breaking 13 days ago
1 """Snapshot and commit ID hashing — MuseHub-side implementation.
2
3 This module provides the canonical server-side ID computation functions used
4 by MuseHub services and test fixtures. It intentionally mirrors the hashing
5 logic in ``muse.core.snapshot`` (the Muse CLI) so that IDs generated on the
6 server can be cross-verified against IDs sent by the CLI.
7
8 CONTRACT: The separator constant ``_SEP`` and the hash construction algorithm
9 MUST remain identical to ``muse.core.snapshot._SEP`` and
10 ``muse.core.snapshot.compute_snapshot_id``. Both sides return IDs in
11 ``sha256:<hex>`` form. Any change to either side must be applied to both
12 simultaneously. A mismatch is a silent data-integrity bug.
13 """
14
15 from __future__ import annotations
16
17 import json as _json
18 from typing import TYPE_CHECKING
19
20 import hashlib as _hashlib
21
22 from muse.core.ids import hash_commit
23 from muse.core.types import split_id
24 from musehub.types.json_types import StrDict
25
26 if TYPE_CHECKING:
27 from musehub.db.musehub_repo_models import MusehubCommit, MusehubSnapshot
28
29 # Must match muse.core.snapshot._SEP exactly.
30 _SEP = "\x00"
31
32
def compute_snapshot_id(manifest: StrDict, directories: list[str] | None = None) -> str:
    """Compute the content-addressed ``sha256:<hex>`` ID of a snapshot.

    The hashed payload is built as follows:

    1. Every ``(path, object_id)`` pair in *manifest* becomes
       ``path + _SEP + split_id(object_id)[1]``; these entries are sorted.
       Only the second element returned by ``split_id`` is used, so the
       algorithm prefix (``sha256:``) of each object ID is not hashed.
    2. If *directories* is non-empty, one ``"dir" + _SEP + path`` entry per
       directory (in sorted order) is appended after the file entries. The
       ``"dir"`` prefix keeps directories in a namespace distinct from files.
    3. All entries are joined with ``_SEP`` and prefixed with the header
       ``snapshot <payload-byte-length>\\0``.

    The null-byte separator prevents collisions via filenames or object IDs
    that contain printable separators such as ``|`` or ``:``.

    CONTRACT: this construction must stay identical to
    ``muse.core.snapshot.compute_snapshot_id`` on the CLI side.

    Args:
        manifest: Mapping of file path to object ID.
        directories: Optional directory paths; ``None`` or an empty list
            contributes nothing to the hash.

    Returns:
        The snapshot ID in ``sha256:<hex>`` form.
    """
    file_entries = [
        f"{path}{_SEP}{split_id(object_id)[1]}"
        for path, object_id in manifest.items()
    ]
    file_entries.sort()

    # Directory entries always follow the (already sorted) file entries; they
    # are sorted among themselves but never interleaved with files.
    dir_entries = (
        [f"dir{_SEP}{dir_path}" for dir_path in sorted(directories)]
        if directories
        else []
    )

    payload = _SEP.join(file_entries + dir_entries).encode()

    # Feeding header and payload separately hashes the same byte stream as
    # hashing their concatenation.
    digest = _hashlib.sha256()
    digest.update(f"snapshot {len(payload)}\x00".encode())
    digest.update(payload)
    return "sha256:" + digest.hexdigest()
52
53
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
    author: str = "",
    signer_public_key: str = "",
) -> str:
    """Compute the content-addressed ``sha256:<hex>`` ID of a commit.

    This is a thin server-side entry point: every argument is forwarded
    unchanged, by keyword, to ``muse.core.ids.hash_commit``, which performs
    the actual hashing.

    CONTRACT: the result must match ``muse.core.snapshot.compute_commit_id``
    exactly. The documented field order is: parents, snapshot_id, message,
    committed_at, author, signer_public_key.

    NOTE(review): stripping of the algorithm prefix from parent IDs and
    snapshot_id is expected to happen inside ``hash_commit``; it is not done
    here -- confirm against ``muse.core.ids``.

    Args:
        parent_ids: IDs of the parent commits.
        snapshot_id: ID of the snapshot this commit points at.
        message: Commit message.
        committed_at_iso: Commit timestamp as an ISO-8601 string.
        author: Author identity; defaults to the empty string.
        signer_public_key: Signer public key; defaults to the empty string.

    Returns:
        Whatever ``hash_commit`` returns for these fields (documented as a
        ``sha256:<hex>`` ID).
    """
    commit_fields = {
        "parent_ids": parent_ids,
        "snapshot_id": snapshot_id,
        "message": message,
        "committed_at_iso": committed_at_iso,
        "author": author,
        "signer_public_key": signer_public_key,
    }
    return hash_commit(**commit_fields)
76
77
def snapshot_to_bytes(snapshot_id: str, manifest: StrDict, directories: list[str] | None = None) -> bytes:
    """Serialize a snapshot into the muse binary object format.

    The result is ``snapshot <size>\\0<json>``, where ``<size>`` is the byte
    length of the UTF-8 encoded JSON document. Per the module contract this
    is the same layout ``muse.core.snapshots.write_snapshot`` writes on the
    CLI side, and it is the object stored in S3 for ``muse pull`` to fetch
    and verify.

    The JSON document holds, in this order: ``schema_version`` (always 1),
    ``snapshot_id``, ``manifest`` (a plain-dict copy), ``directories``
    (sorted; empty list when none are given), ``created_at`` and ``note``
    (always empty).

    Note that ``created_at`` is the current UTC time at the moment of the
    call, so two calls with identical arguments produce different bytes.

    Args:
        snapshot_id: ID recorded verbatim in the document.
        manifest: Mapping of file path to object ID.
        directories: Optional directory paths; ``None`` or empty yields ``[]``.

    Returns:
        The header followed by the JSON body, as bytes.
    """
    from datetime import datetime, timezone

    record = {
        "schema_version": 1,
        "snapshot_id": snapshot_id,
        "manifest": dict(manifest),
        "directories": sorted(directories or ()),
        "created_at": datetime.now(timezone.utc).isoformat(),
        "note": "",
    }
    body = _json.dumps(record).encode()
    # The size in the header counts encoded bytes, not characters.
    return (b"snapshot %d\x00" % len(body)) + body
96
97
def commit_to_bytes(commit: "MusehubCommit") -> bytes:
    """Serialize a MusehubCommit into the muse binary object format.

    The result is ``commit <size>\\0<json>``, where ``<size>`` is the byte
    length of the UTF-8 encoded JSON document. Per the module contract this
    is the same layout ``muse.core.commits.write_commit`` writes on the CLI
    side, and it is the object stored in S3 for ``muse pull`` to fetch and
    verify.

    Only the first two entries of ``commit.parent_ids`` are serialized (as
    ``parent_commit_id`` and ``parent2_commit_id``); a missing parent is
    written as JSON ``null``. A ``parent_ids`` value of ``None`` is treated
    the same as an empty list.

    Optional string attributes that are ``None`` (or otherwise falsy) are
    written as ``""``, ``reviewed_by`` as ``[]`` and ``test_runs`` as ``0``.
    Several fields are not read from *commit* at all and are always emitted
    with fixed placeholder values: ``metadata``, ``structured_delta``,
    ``sem_ver_bump``, ``breaking_changes``, ``labels``, ``status``,
    ``notes`` and ``score``.

    Args:
        commit: The commit row to serialize. ``commit.timestamp`` must
            provide ``isoformat()``.

    Returns:
        The header followed by the JSON body, as bytes.
    """
    # Normalize once so that both parent lookups tolerate ``None``; calling
    # ``len()`` directly on a ``None`` parent list would raise TypeError.
    parents = list(commit.parent_ids or [])
    parent_commit_id = parents[0] if parents else None
    parent2_commit_id = parents[1] if len(parents) > 1 else None

    d = {
        "commit_id": commit.commit_id,
        "branch": commit.branch,
        "snapshot_id": commit.snapshot_id or "",
        "message": commit.message,
        "committed_at": commit.timestamp.isoformat(),
        "parent_commit_id": parent_commit_id,
        "parent2_commit_id": parent2_commit_id,
        "author": commit.author or "",
        "metadata": {},
        "structured_delta": None,
        "sem_ver_bump": "none",
        "breaking_changes": [],
        "agent_id": commit.agent_id or "",
        "model_id": commit.model_id or "",
        "toolchain_id": commit.toolchain_id or "",
        "prompt_hash": commit.prompt_hash or "",
        "signature": commit.signature or "",
        "signer_public_key": commit.signer_public_key or "",
        "signer_key_id": commit.signer_key_id or "",
        "reviewed_by": list(commit.reviewed_by or []),
        "test_runs": commit.test_runs or 0,
        "labels": [],
        "status": "",
        "notes": [],
        "score": None,
    }
    json_bytes = _json.dumps(d).encode()
    # The size in the header counts encoded bytes, not characters.
    return f"commit {len(json_bytes)}\x00".encode() + json_bytes
File History 3 commits
sha256:ecca645d94c5f39c88f4bc1283447ba0f4635ef3cbb11d0cd9b3759cba289d00 fix: compute_snapshot_id uses typed-object formula and wire… Sonnet 4.6 minor 13 days ago
sha256:450998d182617fa93b737cbbdb3fe956c61566051739acec8c63ec5e7b4587f8 feat(phase3): write snapshot objects to S3 at all 3 write s… Sonnet 4.6 patch 14 days ago
sha256:e597c0b97ade9c3c52ac4735ceb437ee69d1b6f0db61b8d7caa6467c5866566d feat(phase2): write commit objects to S3 at all 5 write sit… Sonnet 4.6 patch 14 days ago