gabriel / muse public
plugin.py python
556 lines 20.7 KB
Raw
sha256:f1f585ee9ca4e1ada936668c1b14f42f961a1fa78a2c033b643595f9c1bf9ac7 fixes for proposal flow Human patch 5 days ago
"""Mist domain plugin — content-addressed, signed, agent-native artifact hosting.

A *mist* is a single versioned artifact stored under a content-derived filename.
The filename **is** the identity: the first 12 characters of the base-58 encoding
of its SHA-256 digest, optionally suffixed with a human-readable extension.

Design goals
------------
- **Content-addressed** — same bytes always produce the same mist ID. Because
  the ID is a 12-character prefix of the SHA-256 digest, a collision between
  different contents is astronomically unlikely rather than strictly impossible.
- **Domain-agnostic** — MIDI files, Solidity ABIs, JSON Schemas, prose, code,
  images, and arbitrary binary blobs are all first-class citizens.
- **Signed** — every mist carries an MSign Ed25519 author signature; AI-produced
  mists also embed ``agent_id`` + ``model_id`` for provenance.
- **VCS-native** — because MistPlugin satisfies ``MuseDomainPlugin``, all 14
  ``muse`` CLI commands (status, diff, merge, log, …) work on mist repos without
  any core engine changes.

Phase 1 scope
-------------
Pure domain layer only. No CLI sub-commands, no MuseHub API routes, no UI.
Those land in Phases 2–4.

See ``docs/mists.md`` for the full architecture document.
"""
26
27 import hashlib
28 import os
29 import pathlib
30 import stat as _stat
31
32 from muse._version import __version__
33 from muse.core.diff_algorithms import snapshot_diff
34 from muse.core.schema import (
35 DimensionSpec,
36 DomainSchema,
37 SetSchema,
38 )
39 from muse.core.stat_cache import load_cache
40 from muse.core.types import Manifest
41
42 type _ArtifactInfo = dict[str, str]
43 from muse.domain import (
44 DriftReport,
45 LiveState,
46 MergeResult,
47 SnapshotManifest,
48 StateDelta,
49 StateSnapshot,
50 )
51
52
# ---------------------------------------------------------------------------
# Module-level constants
# ---------------------------------------------------------------------------

# Domain identifier: used when resolving ignore patterns and stamped on every
# SnapshotManifest / DomainSchema this plugin produces.
_DOMAIN_NAME = "mist"

# Valid visibility values for a mist artifact. Exported so that musehub and
# other consumers can validate against the same authoritative set.
MIST_VISIBILITIES: frozenset[str] = frozenset({"public", "secret"})

# Bitcoin base-58 alphabet — omits visually ambiguous characters: 0, O, I, l.
# Same bytes always produce the same base-58 string, making mist IDs
# deterministic and URL-safe.
_BASE58_ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"

# Length of the mist ID prefix taken from the full base-58 encoding.
# 12 base-58 characters allow at most 58**12 ≈ 1.4 × 10^21 distinct IDs, so an
# accidental collision is negligible for any realistic corpus.
_MIST_ID_LENGTH = 12

# File extension (lower-case, with leading dot) → (artifact type, language/subtype).
# Consulted only after magic-byte and JSON-key inspection — the extension is
# the last resort.
_EXT_MAP: dict[str, tuple[str, str]] = {
    # Code
    ".py": ("code", "python"),
    ".js": ("code", "javascript"),
    ".ts": ("code", "typescript"),
    ".tsx": ("code", "typescript"),
    ".jsx": ("code", "javascript"),
    ".rs": ("code", "rust"),
    ".go": ("code", "go"),
    ".java": ("code", "java"),
    ".c": ("code", "c"),
    ".cpp": ("code", "cpp"),
    ".h": ("code", "c"),
    ".hpp": ("code", "cpp"),
    ".rb": ("code", "ruby"),
    ".swift": ("code", "swift"),
    ".kt": ("code", "kotlin"),
    ".sol": ("code", "solidity"),
    ".vy": ("code", "vyper"),
    # Prose — deliberately filed under the "code" artifact type (plain-text artifacts)
    ".md": ("code", "markdown"),
    ".txt": ("code", "text"),
    ".rst": ("code", "restructuredtext"),
    # Data
    ".csv": ("data", "csv"),
    ".toml": ("data", "toml"),
    ".yaml": ("data", "yaml"),
    ".yml": ("data", "yaml"),
    ".xml": ("data", "xml"),
    # MIDI — also detected by magic bytes (MThd)
    ".mid": ("midi", "midi"),
    ".midi": ("midi", "midi"),
}
108
109 # ---------------------------------------------------------------------------
110 # Pure functions — the domain's intelligence layer
111 # ---------------------------------------------------------------------------
112
def compute_mist_id(content: bytes) -> str:
    """Return the content-derived mist ID for *content*.

    The bytes are hashed with SHA-256, the 32-byte digest is rendered in
    base-58 using :data:`_BASE58_ALPHABET`, and the leading
    :data:`_MIST_ID_LENGTH` characters of that rendering become the ID.
    Identical input bytes therefore always map to the identical ID.

    Args:
        content: Raw bytes of the artifact.

    Returns:
        The first ``_MIST_ID_LENGTH`` characters of the digest's base-58
        encoding.

    Examples:
        >>> compute_mist_id(b"hello") == compute_mist_id(b"hello")
        True
        >>> compute_mist_id(b"") != compute_mist_id(b"x")
        True
    """
    digest = hashlib.sha256(content).digest()

    # Treat the digest as one big-endian integer and peel off base-58 digits,
    # least-significant first; the list is flipped afterwards.
    value = int.from_bytes(digest, "big")
    digits: list[str] = []
    while value > 0:
        value, index = divmod(value, 58)
        digits.append(_BASE58_ALPHABET[index])
    digits.reverse()

    # Leading zero bytes vanish in the integer conversion; base-58 convention
    # represents each of them with the alphabet's first symbol ('1').
    zero_run = len(digest) - len(digest.lstrip(b"\x00"))
    encoded = _BASE58_ALPHABET[0] * zero_run + "".join(digits)

    return encoded[:_MIST_ID_LENGTH]
147
def detect_artifact_type(filename: str, content: bytes) -> _ArtifactInfo:
    """Infer the artifact type and language from filename and raw content.

    Detection order (most-to-least reliable):
      1. Magic bytes — MIDI ``MThd`` header; catches mis-named files.
      2. JSON key inspection (only for names ending in ``.json`` / ``.abi``,
         compared case-insensitively) — Ethereum ABI arrays and JSON Schema
         documents carrying a ``$schema`` key.
      3. Extension map — falls back to ``("unknown", "binary")`` when the
         extension is not listed in :data:`_EXT_MAP`.

    An array is treated as an ABI when its first element is an object with a
    ``"type"`` key and either a ``"name"`` key or one of the entry types that
    the ABI JSON format defines without a name (``constructor``,
    ``fallback``, ``receive``).

    Content that is not valid JSON is never an error: detection simply moves
    on to the extension map.

    Args:
        filename: Bare filename (no path separators), e.g. ``"contract.abi.json"``.
        content: Raw bytes of the artifact.

    Returns:
        A dict with keys ``"artifact_type"`` and ``"language"``, e.g.
        ``{"artifact_type": "midi", "language": "midi"}`` or
        ``{"artifact_type": "code", "language": "python"}``.

    Examples:
        >>> detect_artifact_type("track.mid", b"MThd\\x00\\x00\\x00\\x06")
        {'artifact_type': 'midi', 'language': 'midi'}
        >>> detect_artifact_type("schema.json", b'{"$schema":"http://..."}')
        {'artifact_type': 'json_schema', 'language': 'json'}
    """
    # 1. Magic bytes — MIDI
    if content[:4] == b"MThd":
        return {"artifact_type": "midi", "language": "midi"}

    # 2. JSON key inspection. The gate is case-insensitive so it agrees with
    #    the lower-cased extension lookup in step 3.
    if filename.lower().endswith((".json", ".abi")):
        import json as _json

        try:
            parsed = _json.loads(content.decode("utf-8", errors="replace"))
        except (ValueError, RecursionError):
            # Malformed or pathologically nested JSON: best-effort detection,
            # so fall through to the extension map instead of failing.
            parsed = None

        if isinstance(parsed, dict):
            if "$schema" in parsed:
                return {"artifact_type": "json_schema", "language": "json"}
        elif isinstance(parsed, list) and parsed and isinstance(parsed[0], dict):
            first_entry = parsed[0]
            # Constructor / fallback / receive entries carry no "name" in the
            # ABI JSON format, and a constructor is often the first entry.
            # A tuple (not a set) keeps the membership test safe even when
            # the "type" value is an unhashable JSON value.
            unnamed_entry_types = ("constructor", "fallback", "receive")
            if "type" in first_entry and (
                "name" in first_entry
                or first_entry["type"] in unnamed_entry_types
            ):
                return {"artifact_type": "abi", "language": "json"}

    # 3. Extension map
    ext = pathlib.PurePosixPath(filename).suffix.lower()
    if ext in _EXT_MAP:
        artifact_type, language = _EXT_MAP[ext]
        return {"artifact_type": artifact_type, "language": language}

    return {"artifact_type": "unknown", "language": "binary"}
198
def validate_mist_filename(filename: str) -> None:
    """Validate a proposed mist filename for safety and correctness.

    This is the security gate for user-supplied filenames. It rejects any
    name that could be used to escape the mist store or inject control
    sequences into terminals.

    Enforced rules
    --------------
    - Not empty, and not the current-directory reference ``"."``.
    - Length ≤ 255 characters.
    - No null bytes (``\\x00``).
    - No directory traversal sequences (``..``).
    - No path separators (``/`` or ``\\``).
    - No ANSI escape sequences (``\\x1b[``).
    - No control characters: C0 (``\\x01``–``\\x1f``), DEL (``\\x7f``) and
      C1 (``\\x80``–``\\x9f``, which includes the one-character CSI ``\\x9b``).

    Rules are checked in the order listed; the first violated rule determines
    the error message.

    Args:
        filename: The filename to validate (must be a bare name, no path).

    Raises:
        ValueError: Describing exactly which rule was violated.

    Examples:
        >>> validate_mist_filename("aB3xQ9fWmK2r.py")  # valid — no error
        >>> validate_mist_filename("../traversal")
        Traceback (most recent call last):
            ...
        ValueError: Mist filename must not contain path traversal sequences: '../traversal'
    """
    # An empty name (or ".") joined onto the store path resolves to the store
    # directory itself, so neither can ever name an artifact.
    if not filename:
        raise ValueError("Mist filename must not be empty")
    if filename == ".":
        raise ValueError(
            f"Mist filename must not be a directory reference: {filename!r}"
        )
    if len(filename) > 255:
        raise ValueError(
            f"Mist filename exceeds 255-character limit: {len(filename)} chars"
        )
    if "\x00" in filename:
        raise ValueError(f"Mist filename must not contain null bytes: {filename!r}")
    if ".." in filename:
        raise ValueError(
            f"Mist filename must not contain path traversal sequences: {filename!r}"
        )
    if "/" in filename or "\\" in filename:
        raise ValueError(
            f"Mist filename must not contain path separators: {filename!r}"
        )
    # Checked before the generic control-character rule so that an escape
    # sequence gets the more specific message.
    if "\x1b[" in filename:
        raise ValueError(
            f"Mist filename must not contain ANSI escape sequences: {filename!r}"
        )
    # 0x00 is reported by the null-byte rule above; 0x7F–0x9F covers DEL plus
    # the C1 block, whose U+009B is an 8-bit equivalent of "ESC [".
    if any(0x01 <= ord(ch) <= 0x1F or 0x7F <= ord(ch) <= 0x9F for ch in filename):
        raise ValueError(
            f"Mist filename must not contain control characters: {filename!r}"
        )
252
def extract_mist_symbol_anchors(filename: str, content: bytes) -> list[str]:
    """Return the symbol addresses found inside a mist artifact.

    The work is handed to ``parse_symbols`` from
    ``muse.plugins.code.ast_parser``; every address it yields is kept except
    those containing ``"::import::"`` (import pseudo-symbols). The operation
    is best-effort: if the parser cannot be imported or raises for any
    reason, an empty list is returned. Artifacts without anchors are still
    valid mists.

    Args:
        filename: Bare filename (no path separators), e.g. ``"utils.py"``.
        content: Raw bytes of the artifact.

    Returns:
        A list of symbol address strings in ``"filename::SymbolName"`` format,
        e.g. ``["utils.py::compute_checksum", "utils.py::BaseHandler"]``;
        empty when nothing could be extracted.

    Examples:
        >>> anchors = extract_mist_symbol_anchors("add.py", b"def add(a, b): return a + b")
        >>> "add.py::add" in anchors
        True
    """
    try:
        from muse.plugins.code.ast_parser import parse_symbols

        symbol_tree = parse_symbols(content, filename)
        anchors: list[str] = []
        # Iterating the tree yields its keys, i.e. the full symbol addresses.
        for address in symbol_tree:
            if "::import::" in address:
                # Import pseudo-symbol — not a real anchor point.
                continue
            anchors.append(address)
        return anchors
    except Exception:
        # Deliberate catch-all: anchor extraction must never fail the caller.
        return []
283
284 # ---------------------------------------------------------------------------
285 # MistPlugin — MuseDomainPlugin implementation
286 # ---------------------------------------------------------------------------
287
class MistPlugin:
    """Domain plugin for mist repositories.

    Implements the ``muse`` domain-plugin operations by duck-typing — the
    class declares no base class and simply provides ``snapshot``, ``diff``,
    ``merge``, ``drift``, ``apply`` and ``schema``.

    Mist-specific behaviour:

    - Every tracked file is a *mist*: a single content-addressed artifact.
    - The snapshot manifest maps workspace-relative paths to per-file digests.
    - Merges are decided per path — a path is taken, dropped, or conflicted
      as a whole; file contents are never merged.
    """

    # ------------------------------------------------------------------
    # MuseDomainPlugin — required core protocol
    # ------------------------------------------------------------------

    def snapshot(self, live_state: LiveState) -> StateSnapshot:
        """Capture the current mist store as a manifest.

        When ``live_state`` is a ``pathlib.Path`` the directory tree is
        walked (symlinked directories are not followed). Dot-directories,
        dot-files, non-regular files, files that cannot be ``lstat``-ed and
        paths matched by the ignore patterns are skipped. Each remaining
        file is recorded under its workspace-relative POSIX path with the
        value supplied by the stat cache.

        Any other ``live_state`` is assumed to already be a manifest and is
        returned unchanged.

        Args:
            live_state: Either a ``pathlib.Path`` pointing to the mist store
                directory, or a ``SnapshotManifest`` dict for in-memory use.

        Returns:
            A ``SnapshotManifest`` for the directory, or ``live_state``
            itself when it was not a path.
        """
        if not isinstance(live_state, pathlib.Path):
            # SnapshotManifest dict path — used by merge / diff in memory.
            return live_state

        from muse.core.ignore import is_ignored, load_ignore_config, resolve_patterns

        workdir = live_state
        ignore_patterns = resolve_patterns(load_ignore_config(workdir), _DOMAIN_NAME)
        stat_cache = load_cache(workdir)
        manifest: Manifest = {}

        root = str(workdir)
        # Characters to drop from an absolute path to make it relative:
        # the root itself plus the separator that follows it.
        strip_len = len(root) + 1
        convert_separators = os.sep != "/"

        for current_dir, subdirs, names in os.walk(root, followlinks=False):
            # Assigning in place prunes hidden directories from the walk and
            # fixes a deterministic traversal order.
            subdirs[:] = [d for d in sorted(subdirs) if not d.startswith(".")]

            for name in sorted(names):
                if name.startswith("."):
                    continue

                full_path = os.path.join(current_dir, name)
                try:
                    info = os.lstat(full_path)
                except OSError:
                    # Vanished or unreadable between listing and stat.
                    continue
                if not _stat.S_ISREG(info.st_mode):
                    # lstat does not follow links, so symlinks land here too.
                    continue

                rel_path = full_path[strip_len:]
                if convert_separators:
                    rel_path = rel_path.replace(os.sep, "/")
                if is_ignored(rel_path, ignore_patterns):
                    continue

                # NOTE(review): get_cached presumably returns the file's
                # content digest, re-hashing only when the stat triple
                # changed — confirm against muse.core.stat_cache.
                manifest[rel_path] = stat_cache.get_cached(
                    rel_path, full_path, info.st_mtime, info.st_size, info.st_ino
                )

        # Drop cache entries for files no longer tracked, then persist.
        stat_cache.prune(set(manifest))
        stat_cache.save()
        return SnapshotManifest(files=manifest, domain=_DOMAIN_NAME, directories=[])

    def diff(
        self,
        base: StateSnapshot,
        target: StateSnapshot,
        *,
        repo_root: pathlib.Path | None = None,
    ) -> StateDelta:
        """Compute the delta between two mist snapshots.

        The comparison is delegated to ``snapshot_diff`` together with this
        plugin's schema. ``repo_root`` is accepted for interface
        compatibility and is not used.

        Args:
            base: Snapshot of the earlier state (e.g. HEAD).
            target: Snapshot of the later state (e.g. working tree).
            repo_root: Unused.

        Returns:
            Whatever ``snapshot_diff`` produces for the two snapshots.
        """
        domain_schema = self.schema()
        return snapshot_diff(domain_schema, base, target)

    def merge(
        self,
        base: StateSnapshot,
        left: StateSnapshot,
        right: StateSnapshot,
        *,
        repo_root: pathlib.Path | None = None,
    ) -> MergeResult:
        """Three-way merge of two mist snapshots against a common ancestor.

        Every path present in any of the three ``files`` mappings is
        resolved independently, in sorted order:

        - Both sides hold the same value (or both lack the path) → that
          shared outcome is used.
        - Only one side differs from the ancestor → that side's outcome is
          used.
        - Both sides differ from the ancestor and from each other → the
          path is reported as a conflict, and the first non-empty value of
          left, right, base is stored as a placeholder.

        ``repo_root`` is accepted for interface compatibility and is not
        used.

        Args:
            base: Common ancestor snapshot.
            left: Snapshot from the current branch (ours).
            right: Snapshot from the incoming branch (theirs).
            repo_root: Unused.

        Returns:
            A ``MergeResult`` with the ``merged`` snapshot and the list of
            conflicting paths.
        """
        base_files, left_files, right_files = base["files"], left["files"], right["files"]

        result: Manifest = dict(base_files)
        conflicted: list[str] = []

        every_path = set(base_files) | set(left_files) | set(right_files)
        for path in sorted(every_path):
            ancestor = base_files.get(path)
            ours = left_files.get(path)
            theirs = right_files.get(path)

            # Pick the winning value; ``None`` means "path is absent".
            if ours == theirs:
                winner = ours
            elif ancestor == ours:
                # Left is untouched, so right carries the change.
                winner = theirs
            elif ancestor == theirs:
                # Right is untouched, so left carries the change.
                winner = ours
            else:
                conflicted.append(path)
                result[path] = ours or theirs or ancestor or ""
                continue

            if winner is None:
                result.pop(path, None)
            else:
                result[path] = winner

        return MergeResult(
            merged=SnapshotManifest(files=result, domain=_DOMAIN_NAME, directories=[]),
            conflicts=conflicted,
        )

    def drift(self, committed: StateSnapshot, live: LiveState) -> DriftReport:
        """Report how far the live mist store is from a committed snapshot.

        Snapshots ``live``, diffs that snapshot against ``committed``, and
        wraps the outcome in a ``DriftReport``.

        Args:
            committed: The last committed snapshot.
            live: Current live state (path or snapshot manifest).

        Returns:
            A ``DriftReport`` whose ``has_drift`` is true when the delta
            contains at least one op, with the delta's ``summary`` and the
            delta itself attached.
        """
        working = self.snapshot(live)
        changes = self.diff(committed, working)
        return DriftReport(
            has_drift=len(changes["ops"]) > 0,
            summary=changes["summary"],
            delta=changes,
        )

    def apply(self, delta: StateDelta, live_state: LiveState) -> LiveState:
        """Apply a delta to the mist store — a no-op for this domain.

        The method performs no work of its own and hands back
        ``live_state`` untouched.

        Args:
            delta: The typed operation list to apply (ignored).
            live_state: Current live state.

        Returns:
            The unchanged live state.
        """
        return live_state

    # ------------------------------------------------------------------
    # Domain schema — required
    # ------------------------------------------------------------------

    def schema(self) -> DomainSchema:
        """Declare the structural shape of the mist domain.

        The domain is modelled as a set of artifacts with ``"by_content"``
        identity and exposes two dimensions, both flagged for independent
        merge:

        ``artifacts``
            The set of mist files.

        ``metadata``
            A set of annotation elements (tags, descriptions, provenance
            fields).

        Returns:
            A ``DomainSchema`` with ``merge_mode="three_way"`` and
            ``schema_version`` set to the package ``__version__``.
        """
        top_level_set = SetSchema(
            kind="set",
            element_type="artifact",
            identity="by_content",
        )
        artifacts_dimension = DimensionSpec(
            name="artifacts",
            description=(
                "The set of mist artifacts in this store. "
                "Identity is by content — the mist ID is the hash prefix."
            ),
            schema=SetSchema(
                kind="set",
                element_type="artifact",
                identity="by_content",
            ),
            independent_merge=True,
        )
        metadata_dimension = DimensionSpec(
            name="metadata",
            description=(
                "Annotation metadata for mists: tags, descriptions, "
                "provenance fields (agent_id, model_id, signature)."
            ),
            schema=SetSchema(
                kind="set",
                element_type="annotation",
                identity="by_content",
            ),
            independent_merge=True,
        )
        domain_description = (
            "Mist domain — content-addressed, signed, agent-native artifact hosting. "
            "A mist is a single versioned artifact (code, MIDI, ABI, prose, or any "
            "binary blob) identified by the first 12 characters of its SHA-256 "
            "base-58 digest. Same bytes = same mist ID, always."
        )
        return DomainSchema(
            domain=_DOMAIN_NAME,
            description=domain_description,
            top_level=top_level_set,
            dimensions=[artifacts_dimension, metadata_dimension],
            merge_mode="three_way",
            schema_version=__version__,
        )
File History 1 commit
sha256:f1f585ee9ca4e1ada936668c1b14f42f961a1fa78a2c033b643595f9c1bf9ac7 fixes for proposal flow Human patch 5 days ago