gabriel / muse public
archive.py python
648 lines 22.2 KB
Raw
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago
1 """``muse archive`` — export a snapshot as a portable archive.
2
3 Creates a ``tar.gz`` or ``zip`` archive from any historical snapshot —
4 HEAD by default. The archive contains only the tracked files (the contents
5 of the snapshot at that point in time), making it the canonical way to
6 distribute a specific version without exposing ``.muse/`` internals.
7
8 Commit reference
9 ----------------
10 ``--ref`` accepts any reference understood by ``resolve_commit_ref``:
11
12 - Omitted or ``HEAD`` — the most recent commit on the current branch.
13 - A branch name — the tip commit of that branch.
14 - ``HEAD~N`` — *N* first-parent steps back from HEAD.
15 - A full or abbreviated commit SHA.
16
17 Formats
18 -------
19 - ``tar.gz`` (default) — gzip-compressed POSIX tar.
20 - ``zip`` — Deflate-compressed ZIP.
21
22 Security model
23 --------------
24 - Every archive entry name is validated by ``_safe_arcname`` before being
25 written. Entries with ``..`` path segments, absolute paths, or null bytes
26 are skipped and a warning is logged — this prevents both zip-slip and
27 tar-slip path-traversal attacks regardless of what is stored in a snapshot.
28 - ``--prefix`` is validated up-front for ``..`` segments so users get a clear
29 error before any I/O begins.
30 - ``--output`` paths that would write outside the current directory are
31 permitted (agents often write to ``/tmp/`` or explicit destinations), but
32 the directory must already exist.
33 - All user-supplied strings are sanitized via ``sanitize_display()`` before
34 appearing in human-readable terminal output.
35 - All error messages go to **stderr**; **stdout** carries only data.
36
37 Agent UX
38 --------
39 Pass ``--json`` for a machine-readable result. Pass ``--list`` to preview
40 what would be archived without writing anything to disk — useful for agents
41 that need to reason about snapshot contents before committing to a file.
42
43 Usage::
44
45 muse archive # HEAD → <sha12>.tar.gz
46 muse archive --ref feat/audio # branch tip
47 muse archive --ref a1b2c3d4 # commit SHA prefix
48 muse archive --format zip # zip instead of tar.gz
49 muse archive --output release-v1.0.zip # custom output path
50 muse archive --prefix myproject/ # directory prefix inside archive
51 muse archive --list # preview without writing
52 muse archive --list --json # agent-readable manifest
53 muse archive --json # machine-readable result
54
55 JSON schema — normal output (``--json``)::
56
57 {
58 "path": "<output file path>",
59 "format": "tar.gz" | "zip",
60 "file_count": <int>,
61 "bytes": <int>,
62 "commit_id": "<sha256:…>",
63 "snapshot_id": "<sha256:…>",
64 "message": "<commit message>",
65 "branch": "<branch name>",
66 "author": "<author>",
67 "agent_id": "<agent id or empty>",
68 "model_id": "<model id or empty>",
69 "committed_at": "<ISO-8601>",
70 "ref": "<ref used or null>",
71 "prefix": "<directory prefix or empty>"
72 }
73
74 JSON schema — list mode (``--list --json``)::
75
76 {
77 "commit_id": "<sha256:…>",
78 "snapshot_id": "<sha256:…>",
79 "message": "<commit message>",
80 "branch": "<branch name>",
81 "author": "<author>",
82 "committed_at": "<ISO-8601>",
83 "ref": "<ref used or null>",
84 "prefix": "<directory prefix or empty>",
85 "file_count": <int>,
86 "entries": [
87 {"path": "<archive path>", "object_id": "<sha256:…>"},
88 ...
89 ]
90 }
91
92 Exit codes
93 ----------
94 - 0 — success
95 - 1 — bad arguments (bad format, bad prefix, missing commit, output dir missing)
96 - 2 — not inside a Muse repository
97 - 3 — internal error (snapshot data missing; entries whose objects are missing are skipped, not fatal)
98 """
99
100 import argparse
101 import json
102 import logging
103 import pathlib
104 import sys
105 import tarfile
106 import zipfile
107 from typing import TypedDict
108
109 from muse.core.types import split_id
110 from muse.core.envelope import EnvelopeJson, make_envelope
111 from muse.core.errors import ExitCode
112 from muse.core.object_store import object_path, read_object
113 from muse.core.repo import require_repo
114 from muse.core.timing import start_timer
115 from muse.core.refs import (
116 get_head_commit_id,
117 read_current_branch,
118 )
119 from muse.core.commits import (
120 read_commit,
121 resolve_commit_ref,
122 )
123 from muse.core.snapshots import read_snapshot
124 from muse.core.validation import sanitize_display
125 from muse.core.types import Manifest
126
127 logger = logging.getLogger(__name__)
128
129 _FORMAT_CHOICES = {"tar.gz", "zip"}
130
131 # ---------------------------------------------------------------------------
132 # Typed JSON schemas
133 # ---------------------------------------------------------------------------
134
class _ArchiveJson(EnvelopeJson):
    """Payload printed by ``muse archive --json`` when an archive is written.

    Extends the shared ``EnvelopeJson`` envelope with details about the
    archive file and the commit it was built from.  Each field is described
    next to its declaration below.
    """

    # --- The archive that was produced ---
    # Absolute or relative path to the archive file that was written.
    path: str
    # ``"tar.gz"`` or ``"zip"``.
    format: str
    # Number of files successfully written into the archive.
    file_count: int
    # Size of the archive file on disk, in bytes.
    bytes: int

    # --- The commit that was archived ---
    # Full ``sha256:…`` commit ID.
    commit_id: str
    # Full ``sha256:…`` snapshot ID — the content-addressed tree at that commit.
    snapshot_id: str
    # Commit message.
    message: str
    # Branch that was current when the archive was created.
    branch: str
    # Author field from the commit record.
    author: str
    # Agent identity string (empty for human commits).
    agent_id: str
    # Model identifier (empty for human commits).
    model_id: str
    # ISO-8601 commit timestamp.
    committed_at: str

    # --- Echo of the caller's options ---
    # The ``--ref`` value passed by the caller, or ``null`` for HEAD.
    ref: str | None
    # The ``--prefix`` value used (empty string if none).
    prefix: str
184
185 class _ListEntryJson(TypedDict):
186 """One file entry in the ``--list --json`` output."""
187
188 path: str
189 object_id: str
190
class _ListJson(EnvelopeJson):
    """Payload printed by ``muse archive --list --json`` (preview mode).

    Extends the shared ``EnvelopeJson`` envelope with commit metadata and the
    list of entries that *would* be archived; nothing is written to disk in
    this mode.  Each field is described next to its declaration below.
    """

    # --- The commit being previewed ---
    # Full ``sha256:…`` commit ID.
    commit_id: str
    # Full ``sha256:…`` snapshot ID.
    snapshot_id: str
    # Commit message.
    message: str
    # Current branch name.
    branch: str
    # Author field from the commit record.
    author: str
    # ISO-8601 commit timestamp.
    committed_at: str

    # --- Echo of the caller's options ---
    # ``--ref`` value passed by the caller, or ``null`` for HEAD.
    ref: str | None
    # ``--prefix`` value used (empty string if none).
    prefix: str

    # --- What would be archived ---
    # Total number of entries that would be written.
    file_count: int
    # One ``{"path": <archive path>, "object_id": <sha256:…>}`` dict per file.
    entries: list[_ListEntryJson]
229
230 # ---------------------------------------------------------------------------
231 # Path safety
232 # ---------------------------------------------------------------------------
233
234 def _safe_arcname(prefix: str, rel_path: str) -> str | None:
235 """Build a safe archive entry name, guarding against zip-slip and tar-slip.
236
237 Validates both the caller-supplied *prefix* and the per-file *rel_path*
238 from the snapshot manifest. Returns the combined archive path string on
239 success, or ``None`` if either component is unsafe — the caller must skip
240 ``None`` entries and log a warning.
241
242 Safety rules enforced
243 ---------------------
244 - *rel_path* must be non-empty and must not normalise to ``"."``.
245 - *rel_path* must not be an absolute path.
246 - Neither *prefix* nor *rel_path* may contain ``..`` path components.
247 - Null bytes in either argument are rejected (they confuse archive readers
248 and some OS path APIs).
249
250 Args:
251 prefix: Directory prefix to prepend inside the archive (may be empty).
252 rel_path: Relative file path from the snapshot manifest.
253
254 Returns:
255 The safe archive entry name, or ``None`` if the entry should be skipped.
256 """
257 if not rel_path or "\x00" in rel_path or "\x00" in prefix:
258 return None
259
260 clean_prefix = prefix.rstrip("/").strip()
261 if clean_prefix and ".." in clean_prefix.split("/"):
262 return None
263
264 resolved = pathlib.PurePosixPath(rel_path)
265 if resolved.is_absolute() or ".." in resolved.parts:
266 return None
267
268 safe_rel = str(resolved)
269 # PurePosixPath("") normalises to "." — reject it.
270 if not safe_rel or safe_rel == ".":
271 return None
272
273 return f"{clean_prefix}/{safe_rel}" if clean_prefix else safe_rel
274
275 # ---------------------------------------------------------------------------
276 # Manifest helpers
277 # ---------------------------------------------------------------------------
278
def _build_entries(
    root: pathlib.Path,
    manifest: Manifest,
    prefix: str,
) -> tuple[list[tuple[str, str, pathlib.Path]], list[str]]:
    """Turn a snapshot manifest into write-ready archive entries.

    Manifest items are visited in sorted order.  Each path is passed through
    ``_safe_arcname``; paths it rejects, and paths whose object file does not
    exist under *root*, are recorded as skipped instead of being returned.

    Args:
        root: Repository root.
        manifest: Snapshot manifest mapping relative path → object ID.
        prefix: Directory prefix to prepend inside the archive.

    Returns:
        ``(entries, skipped)`` — *entries* is a list of
        ``(arcname, object_id, obj_path)`` triples ready to write; *skipped*
        is a list of display-safe descriptions of the paths left out, suitable
        for logging or warnings.
    """
    ready: list[tuple[str, str, pathlib.Path]] = []
    rejected: list[str] = []

    for manifest_path, oid in sorted(manifest.items()):
        name_in_archive = _safe_arcname(prefix, manifest_path)
        if name_in_archive is None:
            rejected.append(f"unsafe path: {sanitize_display(manifest_path)}")
            continue

        stored_at = object_path(root, oid)
        if stored_at.exists():
            ready.append((name_in_archive, oid, stored_at))
        else:
            rejected.append(
                f"missing object {oid} for {sanitize_display(manifest_path)}"
            )

    return ready, rejected
316
317 # ---------------------------------------------------------------------------
318 # Archive builders
319 # ---------------------------------------------------------------------------
320
321 def _build_tar(
322 entries: list[tuple[str, str, pathlib.Path]],
323 output_path: pathlib.Path,
324 root: pathlib.Path | None = None,
325 ) -> int:
326 """Write a ``tar.gz`` archive from pre-validated *entries*.
327
328 Each entry is a ``(arcname, object_id, obj_path)`` triple produced by
329 ``_build_entries`` — every path has already been validated for safety.
330
331 Args:
332 entries: Validated ``(arcname, object_id, obj_path)`` triples.
333 output_path: Destination file path for the archive.
334 root: Repository root (used to resolve object content).
335
336 Returns:
337 Number of files written into the archive.
338 """
339 import io
340 count = 0
341 with tarfile.open(output_path, "w:gz") as tar:
342 for arcname, object_id, obj_path in entries:
343 if root is not None:
344 content = read_object(root, object_id)
345 if content is None:
346 continue
347 info = tarfile.TarInfo(name=arcname)
348 info.size = len(content)
349 tar.addfile(info, io.BytesIO(content))
350 else:
351 tar.add(str(obj_path), arcname=arcname, recursive=False)
352 count += 1
353 return count
354
355 def _build_zip(
356 entries: list[tuple[str, str, pathlib.Path]],
357 output_path: pathlib.Path,
358 root: pathlib.Path | None = None,
359 ) -> int:
360 """Write a ``zip`` archive from pre-validated *entries*.
361
362 Each entry is a ``(arcname, object_id, obj_path)`` triple produced by
363 ``_build_entries`` — every path has already been validated for safety.
364
365 Args:
366 entries: Validated ``(arcname, object_id, obj_path)`` triples.
367 output_path: Destination file path for the archive.
368 root: Repository root (used to resolve object content).
369
370 Returns:
371 Number of files written into the archive.
372 """
373 count = 0
374 with zipfile.ZipFile(output_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
375 for arcname, object_id, obj_path in entries:
376 if root is not None:
377 content = read_object(root, object_id)
378 if content is None:
379 continue
380 zf.writestr(arcname, content)
381 else:
382 zf.write(str(obj_path), arcname=arcname)
383 count += 1
384 return count
385
386 # ---------------------------------------------------------------------------
387 # Registration
388 # ---------------------------------------------------------------------------
389
def register(subparsers: "argparse._SubParsersAction[argparse.ArgumentParser]") -> None:
    """Attach the ``archive`` subcommand and its flags to *subparsers*.

    The module docstring is used as the subcommand description and is shown
    verbatim (``RawDescriptionHelpFormatter``).  ``run`` is installed as the
    parser's ``func`` default.

    Flags
    -----
    --ref REF
        Branch, tag, or commit SHA to archive (default: HEAD).
    --format / -f {tar.gz,zip}
        Archive format, stored as ``fmt``.  Default is ``tar.gz``.
    --output / -o PATH
        Output file path.  Default: ``<sha12>.<format>``.  The destination
        directory must already exist.
    --prefix DIR
        Directory prefix prepended to every entry inside the archive
        (e.g. ``myproject/``).
    --list
        Preview mode, stored as ``list_mode`` — show what would be archived
        without writing a file.  Compatible with ``--ref``, ``--prefix``,
        and ``--json``.
    --json / -j
        Stored as ``json_out`` — emit machine-readable JSON to stdout instead
        of human text.
    """
    archive_parser = subparsers.add_parser(
        "archive",
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        help="Export any historical snapshot as a portable archive.",
    )
    add_flag = archive_parser.add_argument

    # What to archive.
    add_flag(
        "--ref",
        help="Branch, tag, or commit SHA to archive (default: HEAD).",
        default=None,
    )

    # How and where to write it.
    add_flag(
        "--format", "-f",
        dest="fmt",
        choices=sorted(_FORMAT_CHOICES),
        default="tar.gz",
        help="Archive format: tar.gz or zip (default: tar.gz).",
    )
    add_flag(
        "--output", "-o",
        help="Output file path (default: <sha12>.<format>). The destination directory must already exist.",
        default=None,
    )
    add_flag(
        "--prefix",
        help="Directory prefix inside the archive (e.g. myproject/).",
        default="",
    )

    # Output-mode switches.
    add_flag(
        "--list",
        dest="list_mode",
        action="store_true",
        default=False,
        help="Preview what would be archived without writing a file. Compatible with --ref, --prefix, and --json.",
    )
    add_flag(
        "--json", "-j",
        dest="json_out",
        action="store_true",
        help="Emit machine-readable JSON to stdout instead of human text.",
    )

    archive_parser.set_defaults(func=run)
460
461 # ---------------------------------------------------------------------------
462 # Command implementation
463 # ---------------------------------------------------------------------------
464
def run(args: argparse.Namespace) -> None:
    """Export any historical snapshot as a portable archive.

    Resolves the commit ref, loads the snapshot manifest, validates every path
    for traversal safety, then writes a ``tar.gz`` or ``zip`` archive containing
    only tracked files (no ``.muse/`` internals).  Use ``--list`` to preview
    entries without writing anything to disk.

    Manifest entries with unsafe paths or missing objects are skipped and
    reported through ``logger.warning``; they do not abort the command.

    Agent quickstart
    ----------------
    ::

        muse archive --json                        # HEAD → tar.gz
        muse archive --ref feat/audio --json       # branch tip
        muse archive --format zip --output out.zip --json
        muse archive --list --json                 # preview without writing

    JSON fields
    -----------
    path            Output file path written.
    format          ``"tar.gz"`` or ``"zip"``.
    file_count      Number of files in the archive.
    bytes           Archive size on disk in bytes.
    commit_id       Full ``sha256:…`` commit ID archived.
    snapshot_id     Full ``sha256:…`` snapshot ID.
    message         Commit message.
    branch          Branch name at archive time.
    author          Author field from the commit record.
    agent_id        Agent identity (empty for human commits).
    model_id        Model identifier (empty for human commits).
    committed_at    ISO-8601 commit timestamp.
    ref             ``--ref`` value passed, or ``null`` for HEAD.
    prefix          ``--prefix`` value used (empty string if none).

    With ``--list``, ``path``/``format``/``bytes``/``agent_id``/``model_id``
    are absent and an ``entries`` list is added — each entry: ``path``
    (archive path), ``object_id`` (sha256).

    Args:
        args: Parsed arguments carrying ``ref``, ``fmt``, ``output``,
            ``prefix``, ``list_mode`` and ``json_out`` (see ``register``).

    Raises:
        SystemExit: With ``ExitCode.USER_ERROR`` for a bad prefix, an
            unresolvable ref, a branch with no commits, or a missing output
            directory; with ``ExitCode.INTERNAL_ERROR`` when the commit's
            snapshot cannot be read.
    """
    elapsed = start_timer()
    ref: str | None = args.ref
    fmt: str = args.fmt
    output: str | None = args.output
    prefix: str = args.prefix
    list_mode: bool = args.list_mode
    json_out: bool = args.json_out

    # Validate prefix up-front so the user gets a clear error before any I/O.
    clean_prefix = prefix.rstrip("/").strip()
    if clean_prefix and ".." in clean_prefix.split("/"):
        print(
            f"❌ --prefix must not contain '..' segments: {sanitize_display(prefix)}",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)
    # An absolute prefix would make every archive entry name absolute, which
    # the security model forbids; reject it here rather than per entry.
    if clean_prefix.startswith("/"):
        print(
            f"❌ --prefix must be a relative path: {sanitize_display(prefix)}",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    root = require_repo()
    branch = read_current_branch(root)

    # Resolve the commit reference.
    if ref is None:
        commit_id = get_head_commit_id(root, branch)
        if not commit_id:
            print("❌ No commits yet on this branch.", file=sys.stderr)
            raise SystemExit(ExitCode.USER_ERROR)
        commit = read_commit(root, commit_id)
    else:
        # Try as a branch name first (e.g. "main", "feat/audio"), then fall
        # through to resolve_commit_ref for SHA prefixes and HEAD~N syntax.
        commit = None
        try:
            branch_tip_id = get_head_commit_id(root, ref)
            if branch_tip_id:
                commit = read_commit(root, branch_tip_id)
        except Exception:
            # Best-effort lookup: a ref that is not a valid branch name must
            # not abort resolution, but keep the cause visible for debugging.
            logger.debug(
                "Ref %r could not be read as a branch tip; "
                "falling back to commit-ref resolution",
                ref,
                exc_info=True,
            )
        if commit is None:
            commit = resolve_commit_ref(root, branch, ref)

    if commit is None:
        print(
            f"❌ Ref {sanitize_display(ref or 'HEAD')!r} not found.",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    snapshot = read_snapshot(root, commit.snapshot_id)
    if snapshot is None:
        print(
            f"❌ Snapshot {commit.snapshot_id} not found.",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.INTERNAL_ERROR)

    # Build and validate the entry list (shared between list and write modes).
    entries, skipped = _build_entries(root, snapshot.manifest, clean_prefix)

    for desc in skipped:
        logger.warning("⚠️ Skipping %s", desc)

    # --- List mode: preview without writing ---
    if list_mode:
        list_entries: list[_ListEntryJson] = [
            _ListEntryJson(path=arcname, object_id=object_id)
            for arcname, object_id, _ in entries
        ]
        if json_out:
            print(json.dumps(_ListJson(
                **make_envelope(elapsed),
                commit_id=commit.commit_id,
                snapshot_id=commit.snapshot_id,
                message=commit.message,
                branch=branch,
                author=commit.author,
                committed_at=commit.committed_at.isoformat(),
                ref=ref,
                prefix=clean_prefix,
                file_count=len(entries),
                entries=list_entries,
            )))
            return

        print(
            f"ℹ️ Snapshot {commit.commit_id} {sanitize_display(commit.message)}\n"
            f" {len(entries)} file(s) would be archived:"
        )
        for entry in list_entries:
            # Archive paths combine the user-supplied prefix with snapshot
            # paths, so sanitize them for terminal display (JSON output above
            # carries the raw value).
            print(f" {sanitize_display(entry['path'])}")
        if skipped:
            print(f"\n ⚠️ {len(skipped)} entry/entries skipped (unsafe or missing).")
        return

    # --- Write mode: build the archive ---
    # Use bare hex for the default filename — colons are invalid on Windows.
    _, _commit_hex = split_id(commit.commit_id)
    out_name = output or f"{_commit_hex}.{fmt}"
    out_path = pathlib.Path(out_name)

    # Validate that the destination directory exists before doing any work.
    if out_path.parent != pathlib.Path(".") and not out_path.parent.exists():
        print(
            f"❌ Output directory does not exist: {sanitize_display(str(out_path.parent))}",
            file=sys.stderr,
        )
        raise SystemExit(ExitCode.USER_ERROR)

    if fmt == "tar.gz":
        count = _build_tar(entries, out_path, root=root)
    else:
        count = _build_zip(entries, out_path, root=root)

    archive_bytes = out_path.stat().st_size if out_path.exists() else 0

    if json_out:
        print(json.dumps(_ArchiveJson(
            **make_envelope(elapsed),
            path=str(out_path),
            format=fmt,
            file_count=count,
            bytes=archive_bytes,
            commit_id=commit.commit_id,
            snapshot_id=commit.snapshot_id,
            message=commit.message,
            branch=branch,
            author=commit.author,
            agent_id=commit.agent_id,
            model_id=commit.model_id,
            committed_at=commit.committed_at.isoformat(),
            ref=ref,
            prefix=clean_prefix,
        )))
        return

    size_kb = archive_bytes / 1024
    # The output path may come straight from --output, so sanitize it for
    # display just like the error message above does.
    print(
        f"✅ Archive: {sanitize_display(str(out_path))} ({count} file(s), {size_kb:.1f} KiB)\n"
        f" Commit: {commit.commit_id} {sanitize_display(commit.message)}"
    )
File History 1 commit
sha256:2eaa5d95f9d9383498e76947410a26e5a3ba23d182f339910c424cf88fad412b fix: try fetch/presign before fetch/mpack to avoid Cloudfla… Sonnet 4.6 patch 7 days ago