gabriel / muse public
callgraph_cache.py python
164 lines 5.8 KB
Raw
sha256:e6465e8a9b7fa8e6223ed4a3576e96c568c913ae2caeb9c31f15e7a81b250b40 docs: add | jq convention to --json section of agent-guide Sonnet 4.6 1 day ago
1 """Persistent call-graph cache — eliminates re-parsing on every CLI invocation.
2
3 Architecture
4 ------------
5 ``build_forward_graph`` currently re-reads every Python blob from the object
6 store and re-parses every AST on each invocation: ~15 s for a 778-file repo.
7 The result is fully determined by the file content — the same bytes always
8 produce the same caller→callee mapping.
9
10 ``CallGraphCache`` exploits this by persisting a **per-file subgraph** keyed
11 by the SHA-256 of the file bytes (``object_id`` from the manifest):
12
13 key: SHA-256 hex digest of raw Python file bytes (``object_id``)
14 value: ``dict[caller_address, frozenset[callee_bare_name]]``
15 — the portion of the ``ForwardGraph`` contributed by this file
16
17 Storing the per-file subgraph (not just a flattened set of names) preserves
18 full per-address granularity so the warm path produces an identical
19 ``ForwardGraph`` to the cold path.
20
21 Storage
22 -------
23 ``.muse/cache/callgraph.json``::
24
25 {
26 "version": 2,
27 "entries": {
28 "<sha256-hex>": {
29 "file.py::caller_fn": ["callee_a", "callee_b"],
30 "file.py::leaf_fn": []
31 }
32 }
33 }
34
35 Inner lists are sorted for deterministic output. On load they are converted
36 back to frozensets.
37
38 Writes are atomic (Pattern A): ``mkstemp`` gives each writer a unique temp
39 file so two concurrent saves cannot interleave bytes; ``os.replace`` is the
40 atomic rename.
41
42 Pruning
43 -------
44 Use :meth:`CallGraphCache.prune` from ``muse gc`` to remove entries whose
45 object IDs are no longer reachable in the object store.
46
47 Typical lifecycle inside ``build_forward_graph``::
48
49 cache = load_callgraph_cache(root)
50 for file_path, obj_id in manifest.items():
51 subgraph = cache.get(obj_id)
52 if subgraph is not None:
53 graph.update(subgraph) # warm: no parse, no ast walk
54 else:
55 subgraph = _parse_file_subgraph(root, file_path, obj_id)
56 cache.put(obj_id, subgraph)
57 graph.update(subgraph)
58 cache.save()
59 """
60
61 import pathlib
62
63 from muse.core.cache_base import MsgpackCache, _RawCacheMap
64
65 _CACHE_VERSION = 2
66 _CACHE_FILENAME = "callgraph.json"
67
68 # Type alias: per-file portion of the forward call graph
69 _Subgraph = dict[str, frozenset[str]]
70
71 class CallGraphCache(MsgpackCache):
72 """Persistent JSON cache mapping object_id → per-file forward subgraph.
73
74 Subgraph type: ``{caller_address: frozenset[callee_bare_name]}``.
75
76 Inherits load/save/get/put/prune/size/empty from :class:`MsgpackCache`
77 (Pattern A — mkstemp + replace).
78
79 Typical lifecycle inside ``build_forward_graph``::
80
81 cache = CallGraphCache.load(muse_dir)
82 for file_path, object_id in manifest.items():
83 subgraph = cache.get(object_id)
84 if subgraph is None:
85 subgraph = _parse_file(root, file_path, object_id)
86 cache.put(object_id, subgraph)
87 graph.update(subgraph)
88 cache.save()
89
90 Attributes
91 ----------
92 _cache_dir : pathlib.Path | None
93 Absolute path to ``.muse/cache/``. ``None`` for in-memory-only
94 instances (``empty()``). ``save()`` is a no-op when ``None``.
95 _dirty : bool
96 Set to ``True`` by ``put()`` and ``prune()`` when entries change.
97 Reset to ``False`` by a successful ``save()``.
98 """
99
100 _CACHE_FILENAME = "callgraph.json"
101 _CACHE_VERSION = 2
102 _TEMP_PREFIX = ".callgraph_"
103
104 @classmethod
105 def _deserialize_entries(cls, raw: _RawCacheMap) -> _RawCacheMap:
106 """Validate and convert raw JSON entries to typed subgraph entries.
107
108 Each value must be a dict of address → list[str]; lists are converted
109 to frozensets. Invalid entries are skipped.
110 """
111 entries: dict[str, _Subgraph] = {}
112 for obj_id, subgraph_raw in raw.items():
113 if not isinstance(obj_id, str):
114 continue
115 if not isinstance(subgraph_raw, dict):
116 continue
117 subgraph: _Subgraph = {}
118 valid = True
119 for addr, callee_list in subgraph_raw.items():
120 if not isinstance(addr, str):
121 valid = False
122 break
123 if not isinstance(callee_list, list):
124 valid = False
125 break
126 if not all(isinstance(n, str) for n in callee_list):
127 valid = False
128 break
129 subgraph[addr] = frozenset(callee_list)
130 if valid:
131 entries[obj_id] = subgraph
132 return entries
133
134 def _serialize_entries(self) -> _RawCacheMap:
135 """Convert frozenset callees to sorted lists for JSON serialisation."""
136 return {
137 obj_id: {addr: sorted(callees) for addr, callees in subgraph.items()}
138 for obj_id, subgraph in self._entries.items()
139 }
140
141 # Type-narrowing overrides
142
143 def get(self, object_id: str) -> _Subgraph | None:
144 """Return the cached per-file subgraph for *object_id*, or ``None`` on miss."""
145 return self._entries.get(object_id) # type: ignore[return-value]
146
147 def put(self, object_id: str, subgraph: _Subgraph) -> None:
148 """Store *subgraph* under *object_id* and mark the cache dirty."""
149 super().put(object_id, subgraph)
150
151 def prune(self, live_ids: set[str]) -> None:
152 """Remove entries whose object IDs are not in *live_ids*.
153
154 Call this from ``muse gc`` after identifying all reachable object IDs.
155 """
156 super().prune(live_ids)
157
158 def load_callgraph_cache(root: pathlib.Path) -> CallGraphCache:
159 """Convenience loader: return a ``CallGraphCache`` for a repository root.
160
161 Returns ``CallGraphCache.empty()`` when *root* has no ``.muse`` directory
162 so callers never need to guard against a missing repo.
163 """
164 return CallGraphCache.from_root(root) # type: ignore[return-value]
File History 1 commit
sha256:e6465e8a9b7fa8e6223ed4a3576e96c568c913ae2caeb9c31f15e7a81b250b40 docs: add | jq convention to --json section of agent-guide Sonnet 4.6 1 day ago