musehub/services/musehub_intel_providers.py · gabriel/musehub

1

"""Domain-agnostic intelligence provider registry.

2

3

Every intelligence result written to ``musehub_intel_results`` is produced by

4

an ``IntelProvider``. Providers are keyed by job_type string and registered in

5

``_PROVIDER_REGISTRY``. The worker imports the registry and dispatches jobs

6

without knowing anything about the underlying domain.

7

8

Adding a new domain means:

9

1. Implement a class that satisfies the ``IntelProvider`` Protocol.

10

2. Add it to ``_PROVIDER_REGISTRY`` under the appropriate ``"intel.<domain>"`` key.

11

3. Update ``job_types_for_push`` to enqueue it for repos in that domain.

12

13

No other files need changes.

Provider contract

-----------------

- ``compute`` receives an async session, repo_id, head commit_id, and the raw

18

job payload. It returns a list of ``(intel_type, data_dict)`` tuples.

19

- Each tuple is written to ``musehub_intel_results`` via upsert:

20

``(repo_id, intel_type)`` is the unique key; every push overwrites the

21

previous result for that type.

22

- ``schema_version`` defaults to 1; bump it inside the provider when the

23

``data_dict`` schema changes in a backwards-incompatible way so readers can

24

detect stale entries.

"""

import asyncio

import json

import logging

from datetime import datetime, timedelta, timezone

31

from typing import Protocol, runtime_checkable

32

33

import sqlalchemy as sa

34

from sqlalchemy import select

35

from sqlalchemy.dialects.postgresql import insert as pg_insert

36

from sqlalchemy.ext.asyncio import AsyncSession

37

38

from muse.core.types import blob_id

39

from muse.plugins.code.ast_parser import parse_symbols

40

from muse.plugins.code._query import language_of

41

from musehub.core.genesis import compute_intel_result_id

42

from musehub.db.musehub_intel_models import (

43

MusehubHashOccurrenceEntry,

44

MusehubIntelApiSurface,

45

MusehubIntelBlastRisk,

46

MusehubIntelBreakageIssue,

47

MusehubIntelBreakageMeta,

48

MusehubIntelClones,

49

MusehubIntelCodemapMeta,

50

MusehubIntelCodemapModule,

51

MusehubIntelCoupling,

52

MusehubIntelDead,

53

MusehubIntelEntangle,

54

MusehubIntelLanguages,

55

MusehubIntelRefactorEvent,

MusehubIntelResult,

MusehubIntelStable,

MusehubIntelType,

MusehubIntelVelocity,

60

MusehubSymbolHistoryEntry,

61

MusehubSymbolIntel,

62

)

63

from musehub.db.musehub_repo_models import MusehubCommit, MusehubCommitRef, MusehubMist, MusehubRepo, MusehubSnapshot

64

from musehub.storage.backends import StorageBackend, get_backend

65

from musehub.types.json_types import IntDict, JSONObject, StrDict

66

67

logger = logging.getLogger(__name__)

68

69

# ---------------------------------------------------------------------------

70

# Types

71

# ---------------------------------------------------------------------------

72

73

IntelResults = list[tuple[str, dict]] # [(intel_type, data_dict), ...]

74

75

# Named type aliases — avoids boundary_dict / dict_of_dict audit violations.

76

ModuleStatsMap = dict[str, IntDict] # module_name → {added, removed, modified, ...}

77

SymbolInfoMap = dict[str, JSONObject] # address → {body_hash, signature_id, file, kind}

78

GraphMap = dict[str, list[str]] # file_path → [imported_file_path, ...]

79

LangKindsMap = dict[str, IntDict] # language → {kind → count}

80

81

@runtime_checkable

82

class IntelProvider(Protocol):

83

"""Protocol satisfied by every domain intelligence provider."""

async def compute(

self,

session: AsyncSession,

repo_id: str,

ref: str,

payload: JSONObject,

) -> IntelResults:

"""Compute intelligence results for a repo at the given ref.

93

94

Returns a list of (intel_type, data_dict) tuples. Each will be

95

written to ``musehub_intel_results`` with an upsert on

96

``(repo_id, intel_type)``.

"""

...

# ---------------------------------------------------------------------------

101

# Persistence helper

102

# ---------------------------------------------------------------------------

103

104

async def persist_intel_results(

105

session: AsyncSession,

106

repo_id: str,

107

ref: str,

108

results: IntelResults,

109

) -> None:

110

"""Upsert a batch of intel results into musehub_intel_results.

111

112

Each (intel_type, data_dict) tuple becomes one row. The unique

113

constraint on (repo_id, intel_type) means this is always an overwrite

114

of the previous result for that type.

115

"""

116

now = datetime.now(tz=timezone.utc)

117

for intel_type, data in results:

118

domain = intel_type.split(".")[0]

119

result_id = compute_intel_result_id(repo_id, intel_type, ref)

120

stmt = (

121

pg_insert(MusehubIntelResult)

.values(

result_id=result_id,

repo_id=repo_id,

intel_type=intel_type,

126

domain=domain,

127

ref=ref,

128

data_json=json.dumps(data),

schema_version=1,

computed_at=now,

)

.on_conflict_do_update(

133

constraint="uq_musehub_intel_results_repo_type",

134

set_={

135

"result_id": result_id,

136

"ref": ref,

137

"data_json": json.dumps(data),

"computed_at": now,

},

)

)

await session.execute(stmt)

143

144

# ---------------------------------------------------------------------------

145

# Structural provider — works for ALL domains

146

# ---------------------------------------------------------------------------

147

148

class StructuralProvider:

149

"""Compute cross-domain structural intelligence from DB-stored commit history.

150

151

No subprocess, no domain-specific logic — reads directly from

152

``musehub_commits``. Produces results meaningful for any Muse domain.

153

154

Intel types produced:

155

``structural.velocity`` — weekly commit counts (12 weeks, newest first)

156

``structural.contributors`` — author → commit count over all time

"""

async def compute(

self,

session: AsyncSession,

repo_id: str,

ref: str,

payload: JSONObject,

) -> IntelResults:

commits_q = await session.execute(

167

select(MusehubCommit.author, MusehubCommit.timestamp)

168

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

169

.where(MusehubCommitRef.repo_id == repo_id)

170

.order_by(MusehubCommit.timestamp.desc())

171

)

172

rows = commits_q.all()

173

174

now = datetime.now(tz=timezone.utc)

175

weekly_counts: list[int] = [0] * 12

176

177

contributor_counts: dict[str, int] = {}

178

for author, ts in rows:

179

if author:

180

contributor_counts[author] = contributor_counts.get(author, 0) + 1

181

if ts:

182

ts_aware = ts if ts.tzinfo else ts.replace(tzinfo=timezone.utc)

183

age_days = (now - ts_aware).days

184

week_idx = age_days // 7

185

if 0 <= week_idx < 12:

186

weekly_counts[week_idx] += 1

return [

(

"structural.velocity",

191

{

192

"weekly": weekly_counts,

193

"total_commits": len(rows),

},

),

(

"structural.contributors",

198

{

199

"authors": contributor_counts,

200

"total_authors": len(contributor_counts),

},

),

]

# ---------------------------------------------------------------------------

206

# Code provider — wraps the symbol indexer

207

# ---------------------------------------------------------------------------

208

209

class CodeProvider:

210

"""Compute code-domain intelligence using the symbol indexer.

211

212

Delegates to ``build_symbol_index`` from the existing indexer module,

213

which reads the structured_delta column from stored commits and outputs five result

214

types into ``musehub_intel_results``.

215

216

Intel types produced:

217

``code.symbol_history`` — address → op list (provenance)

218

``code.hash_occurrence`` — content_id → address list (clone detection)

219

``code.intel_snapshot`` — full IntelSnapshot (dashboards, health)

220

``code.intel_summary`` — condensed for the repo home page

221

``code.per_symbol_intel``— per-symbol {churn, blast, gravity, …}

"""

async def compute(

self,

session: AsyncSession,

repo_id: str,

ref: str,

payload: JSONObject,

) -> IntelResults:

from musehub.services.musehub_symbol_indexer import build_symbol_index

232

return await build_symbol_index(session, repo_id, ref)

233

234

# ---------------------------------------------------------------------------

235

# Subprocess helper and repo-root resolution for code intel providers

236

# ---------------------------------------------------------------------------

237

238

async def _run_muse(repo_root: str, *args: str) -> JSONObject | None:

239

"""Run ``muse -C <repo_root> <args> --json`` and return parsed JSON.

240

241

Returns ``None`` on non-zero exit or JSON parse failure so callers can

242

safely return ``[]`` without crashing the worker.

243

"""

244

try:

245

proc = await asyncio.create_subprocess_exec(

246

"muse", "-C", repo_root, *args, "--json",

247

stdout=asyncio.subprocess.PIPE,

248

stderr=asyncio.subprocess.PIPE,

249

)

250

stdout, _ = await proc.communicate()

251

if proc.returncode != 0:

252

logger.warning("muse %s exited %s", " ".join(args), proc.returncode)

253

return None

254

return json.loads(stdout)

255

except Exception:

256

logger.exception("_run_muse failed: args=%s", args)

return None

# ---------------------------------------------------------------------------

261

# MIDI provider — stub for future implementation

262

# ---------------------------------------------------------------------------

263

264

class MidiProvider:

265

"""Compute MIDI-domain intelligence (stub — returns empty until implemented).

266

267

When the MIDI domain intelligence commands are available via ``muse`` CLI

268

subprocess calls or direct Muse library APIs, implement this provider to

269

produce ``midi.*`` intel types such as:

270

``midi.tags`` — tag frequency, commit coverage

271

``midi.harmony`` — chord progression statistics

272

``midi.tempo_map`` — tempo change history

"""

async def compute(

self,

session: AsyncSession,

repo_id: str,

ref: str,

payload: JSONObject,

) -> IntelResults:

return []

# ---------------------------------------------------------------------------

285

# Mist provider — symbol anchor extraction for mist repos

286

# ---------------------------------------------------------------------------

287

288

class MistProvider:

289

"""Compute mist-domain intelligence for a mist-domain repo.

290

291

Two result types are produced per push:

292

293

``mist.anchor_index``

294

Snapshot-level symbol anchor index built by

295

``build_mist_anchor_index``. Works for VCS-only mist repos (bare

296

push) and for API-created mists alike. Writes normalized rows to

297

``musehub_symbol_history_entries`` and ``musehub_symbol_intel`` and

298

returns ``{anchor_count, filename_count}``. Returns ``[]`` when the

299

commit has no snapshot, the snapshot is empty, or no artifact yields

any anchors.

``mist.anchors``

Per-mist anchor blob — only emitted when a ``MusehubMist`` row is

304

linked to this repo (i.e. the mist was created via ``POST /api/mists``).

305

Re-extracts anchors from the stored content and refreshes

306

``MusehubMist.symbol_anchors`` in the DB. Callers that only push via

307

the wire protocol without an API-created mist will not see this type.

308

309

Never raises — binary artifacts with no parseable symbols yield an empty

310

anchor list; missing snapshots return ``[]`` immediately.

"""

async def compute(

self,

session: AsyncSession,

repo_id: str,

ref: str,

payload: JSONObject,

) -> IntelResults:

from muse.plugins.mist.plugin import extract_mist_symbol_anchors

321

from sqlalchemy import select as _select

322

from musehub.services.musehub_mist_indexer import build_mist_anchor_index

323

324

# Snapshot-level index — always run, works for VCS-only and API-created mists.

325

results: list[tuple[str, dict]] = []

326

results.extend(await build_mist_anchor_index(session, repo_id, ref))

327

328

# Per-mist anchor blob — only when a MusehubMist row is linked to this repo.

329

result = await session.execute(

330

_select(MusehubMist).where(MusehubMist.repo_id == repo_id)

331

)

332

mist = result.scalar_one_or_none()

333

if mist is not None:

334

raw = mist.content.encode("utf-8")

335

anchors = extract_mist_symbol_anchors(mist.filename, raw)

336

mist.symbol_anchors = anchors

results.append((

"mist.anchors",

{

"mist_id": mist.mist_id,

341

"filename": mist.filename,

342

"artifact_type": mist.artifact_type,

343

"symbol_anchors": anchors,

344

"anchor_count": len(anchors),

},

))

return results

# ---------------------------------------------------------------------------

351

# Profile snapshot provider — cross-repo user-level aggregates

352

# ---------------------------------------------------------------------------

353

354

class ProfileSnapshotProvider:

355

"""Compute and persist the full profile snapshot for a repo's owner.

356

357

Unlike repo-scoped providers this provider reads ``payload["handle"]`` to

358

identify the target user, computes all expensive cross-repo aggregates,

359

and upserts a single row in ``musehub_profile_snapshots`` keyed by handle.

360

361

Returns ``[]`` — results are written directly to the snapshots table,

362

not through ``persist_intel_results``.

363

364

Data computed:

365

stats — {repo_count, commit_count, agent_count, avg_health}

366

repos — list of repo card dicts (all owned repos)

367

heatmap — 52-week daily heatmap with streaks

368

agent_fleet — agents that committed to owned repos

369

badges — provenance badge dicts

370

footprint — top-N touched symbols across repos

371

activity_canvas — 6-domain 52×7 activity grid (active domains only)

"""

async def compute(

self,

session: AsyncSession,

repo_id: str,

ref: str,

payload: JSONObject,

) -> IntelResults:

handle = str(payload.get("handle", ""))

382

if not handle:

383

logger.warning("profile.snapshot: missing handle in payload, skipping")

return []

try:

await _compute_and_persist_profile_snapshot(session, handle)

except Exception:

import traceback

logger.error(

"profile.snapshot: failed for handle=%s\n%s",

392

handle, traceback.format_exc(),

)

return []

async def _compute_and_persist_profile_snapshot(

397

session: AsyncSession,

398

handle: str,

399

) -> None:

400

"""Compute all expensive profile data for *handle* and upsert the snapshot row."""

401

import json as _json

402

403

# Lazy import to avoid circular dependency at module level.

404

from musehub.api.routes.musehub.ui_user_profile import (

405

_resolve_identity,

406

_compute_provenance_stats,

_fetch_repos,

_build_heatmap,

_fetch_agent_fleet,

_compute_provenance_badges,

411

_fetch_symbol_footprint,

412

)

413

from musehub.services.musehub_profile import build_activity_canvas

414

415

identity = await _resolve_identity(session, handle)

416

if identity is None:

417

logger.info("profile.snapshot: no identity found for handle=%s, skipping", handle)

418

return

419

420

stats = await _compute_provenance_stats(session, identity)

421

repos = await _fetch_repos(session, identity)

422

heatmap = await _build_heatmap(session, identity)

423

agent_fleet = await _fetch_agent_fleet(session, identity)

424

badges = await _compute_provenance_badges(session, identity, stats)

425

footprint = await _fetch_symbol_footprint(session, identity)

426

427

activity_canvas_raw = await build_activity_canvas(session, handle)

428

activity_canvas = [

429

{

430

"domain": dom.domain,

431

"weeks": [dom.grid[i: i + 7] for i in range(0, len(dom.grid), 7)],

"peak": dom.peak,

"total": dom.total,

}

for dom in activity_canvas_raw

]

snapshot_data = {

"stats": stats,

"repos": repos,

"heatmap": heatmap,

"agent_fleet": agent_fleet,

443

"badges": badges,

444

"footprint": footprint,

445

"activity_canvas": activity_canvas,

446

}

447

448

from musehub.db.musehub_identity_models import MusehubProfileSnapshot

449

now = datetime.now(tz=timezone.utc)

450

stmt = (

451

pg_insert(MusehubProfileSnapshot)

452

.values(

453

handle=handle,

454

data_json=_json.dumps(snapshot_data, default=str),

computed_at=now,

is_stale=False,

)

.on_conflict_do_update(

459

index_elements=["handle"],

460

set_={

461

"data_json": _json.dumps(snapshot_data, default=str),

"computed_at": now,

"is_stale": False,

},

)

)

await session.execute(stmt)

468

logger.info("profile.snapshot: persisted snapshot for handle=%s", handle)

469

470

# ---------------------------------------------------------------------------

471

# Shared commit-walk helper

472

# ---------------------------------------------------------------------------

473

474

async def _load_commit_walk(

475

session: AsyncSession,

repo_id: str,

ref: str,

max_walk: int,

) -> list[str]:

"""Return commit IDs in BFS order reachable from ref, up to max_walk.

481

482

Fetches all commits for the repo in one query, builds a parent map, and

483

delegates the walk to muse.core.graph.walk_dag. Only commits present in

484

the DB for this repo are included in the result.

485

"""

486

import sqlalchemy as sa

487

from muse.core.graph import walk_dag

488

489

commits_result = await session.execute(

490

sa.select(

491

MusehubCommit.commit_id,

492

MusehubCommit.parent_ids,

493

)

494

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

495

.where(MusehubCommitRef.repo_id == repo_id)

496

)

497

commit_parents: dict[str, list[str]] = {

498

row.commit_id: (row.parent_ids or []) for row in commits_result

499

}

500

if not commit_parents:

return []

return [

cid for cid in walk_dag(

505

ref,

506

lambda cid: [p for p in commit_parents.get(cid, []) if p],

507

max_nodes=max_walk,

508

)

509

if cid in commit_parents

]

# ---------------------------------------------------------------------------

514

# Code intel providers — one per normalized intel table

515

# ---------------------------------------------------------------------------

516

517

class CouplingProvider:

518

"""Persist co-changing file pairs by mining musehub_symbol_history_entries.

519

520

Mirrors ``muse code coupling`` exactly — same BFS commit walk, same

521

mass-commit exclusion, same minimum co-change threshold.

522

523

Unlike EntangleProvider (symbol-level), this works at the file level.

524

For each history entry, the file is derived as ``address.split("::")[0]``.

525

Bare-path entries (no "::" in address) are treated as filenames directly —

526

they are valid signals at the file level, unlike at the symbol level where

527

they must be filtered out.

Algorithm

---------

1. Fetch all commits for the repo; build a commit_id → parent_ids map.

532

2. BFS-walk from HEAD ref, cap at _MAX_WALK commits.

533

3. Bulk-fetch all history entries for the repo.

534

4. Per commit: derive the distinct file set from each address.

535

5. Skip commits where the distinct file count exceeds _MAX_FILES_PER_COMMIT

536

(mass scaffolding / import commits produce O(N²) noise).

537

6. For every unordered file pair (a < b) in a qualifying commit,

538

accumulate pair_co_changes[(a, b)] += 1.

539

7. Filter: co_changes >= _MIN_CO_CHANGES; sort DESC; truncate to _MAX_PAIRS.

540

8. DELETE stale rows for the repo; upsert the fresh set.

Constants

---------

_MAX_WALK = 10_000 BFS depth cap

545

_MAX_FILES_PER_COMMIT = 200 mass-commit guard (tighter than symbol-level 500)

546

_MAX_PAIRS = 200 stored leaderboard size

547

_MIN_CO_CHANGES = 2 noise floor

"""

_MAX_WALK = 10_000

_MAX_FILES_PER_COMMIT = 200

_MAX_PAIRS = 200

_MIN_CO_CHANGES = 2

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

557

) -> IntelResults:

558

import sqlalchemy as sa

559

from collections import defaultdict

560

561

# ── Step 1: fetch all commits, BFS walk from HEAD ─────────────────────

562

walk_order = await _load_commit_walk(session, repo_id, ref, self._MAX_WALK)

if not walk_order:

return []

walk_set = set(walk_order)

567

568

# ── Step 2: bulk-fetch history entries, derive files ──────────────────

569

history_result = await session.execute(

570

sa.select(

571

MusehubSymbolHistoryEntry.commit_id,

572

MusehubSymbolHistoryEntry.address,

573

).where(MusehubSymbolHistoryEntry.repo_id == repo_id)

574

)

575

576

commit_files: dict[str, set[str]] = defaultdict(set)

577

for cid, addr in history_result:

578

if cid not in walk_set:

579

continue

580

file = addr.split("::")[0] if "::" in addr else addr

581

if file:

582

commit_files[cid].add(file)

583

584

# ── Step 3: accumulate co-change counts ───────────────────────────────

585

pair_co_changes: dict[tuple[str, str], int] = defaultdict(int)

586

for cid, files in commit_files.items():

587

if len(files) > self._MAX_FILES_PER_COMMIT:

588

continue

589

sorted_files = sorted(files)

590

for i in range(len(sorted_files)):

591

for j in range(i + 1, len(sorted_files)):

592

pair_co_changes[(sorted_files[i], sorted_files[j])] += 1

593

594

if not pair_co_changes:

595

return []

596

597

# ── Step 4: filter, sort, truncate ────────────────────────────────────

598

pairs = [

599

{"file_a": a, "file_b": b, "co_changes": co}

600

for (a, b), co in pair_co_changes.items()

601

if co >= self._MIN_CO_CHANGES

602

]

603

pairs.sort(key=lambda p: -p["co_changes"])

604

truncated = len(pairs) > self._MAX_PAIRS

605

pairs = pairs[: self._MAX_PAIRS]

606

607

# ── Step 5: delete stale rows, upsert fresh set ───────────────────────

608

await session.execute(

609

sa.delete(MusehubIntelCoupling).where(

610

MusehubIntelCoupling.repo_id == repo_id

)

)

for p in pairs:

await session.execute(

615

pg_insert(MusehubIntelCoupling)

.values(

repo_id=repo_id,

file_a=p["file_a"],

file_b=p["file_b"],

co_changes=p["co_changes"],

621

ref=ref,

622

)

623

.on_conflict_do_update(

624

index_elements=["repo_id", "file_a", "file_b"],

625

set_={"co_changes": p["co_changes"], "ref": ref},

)

)

return [("intel.code.coupling", {

630

"count": len(pairs),

631

"commits_analysed": len(walk_order),

632

"truncated": truncated,

633

})]

634

635

class EntangleProvider:

636

"""Persist symbol co-change pairs by mining musehub_symbol_history_entries.

637

638

Mirrors ``muse code entangle`` exactly — same Jaccard-based co-change rate,

639

same mass-commit exclusion, same import-symbol filter.

Algorithm

---------

1. BFS-walk commits from HEAD (cap 10 000), same as StableProvider.

644

2. Bulk-fetch all history entries for this repo; group by commit_id.

645

3. Skip commits touching > MAX_SYMBOLS_PER_COMMIT distinct symbols.

646

4. Exclude import pseudo-symbols (``::import::`` in address).

647

5. For each qualifying commit, accumulate:

648

- symbol_commits[addr] → set of commit_ids where addr was touched

649

- pair_co_changes[(a,b)] → count of commits where BOTH appeared

650

(pair key is always canonical: a < b lexicographically)

651

6. Filter: co_changes >= 2, file_a != file_b.

652

7. co_change_rate = co_changes / |commits_a ∪ commits_b| (Jaccard index).

653

8. Sort by (rate DESC, co_changes DESC); store top MAX_PAIRS.

654

9. DELETE stale rows then upsert fresh set.

"""

_MAX_WALK = 10_000

_MAX_SYMBOLS_PER_COMMIT = 500

_MAX_PAIRS = 500

_MIN_CO_CHANGES = 2

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

664

) -> IntelResults:

665

import sqlalchemy as sa

666

from collections import defaultdict

667

668

# ── Step 1: fetch all commits, BFS walk from HEAD ─────────────────────

669

walk_order = await _load_commit_walk(session, repo_id, ref, self._MAX_WALK)

if not walk_order:

return []

walk_set = set(walk_order)

674

675

# ── Step 2: bulk-fetch all history entries for this repo ──────────────

676

history_result = await session.execute(

677

sa.select(

678

MusehubSymbolHistoryEntry.commit_id,

679

MusehubSymbolHistoryEntry.address,

680

).where(MusehubSymbolHistoryEntry.repo_id == repo_id)

681

)

682

683

commit_symbols: dict[str, set[str]] = defaultdict(set)

684

symbol_commits: dict[str, set[str]] = defaultdict(set)

685

for cid, addr in history_result:

686

if cid not in walk_set:

687

continue

688

if "::" not in addr or "::import::" in addr:

689

continue

690

commit_symbols[cid].add(addr)

691

symbol_commits[addr].add(cid)

692

693

# ── Step 3: accumulate co-change counts ───────────────────────────────

694

pair_co_changes: dict[tuple[str, str], int] = defaultdict(int)

695

for cid, symbols in commit_symbols.items():

696

if len(symbols) > self._MAX_SYMBOLS_PER_COMMIT:

697

continue

698

syms = sorted(symbols)

699

for i in range(len(syms)):

700

for j in range(i + 1, len(syms)):

701

pair_co_changes[(syms[i], syms[j])] += 1

702

703

if not pair_co_changes:

704

return []

705

706

# ── Step 4: compute rates, filter, sort ───────────────────────────────

707

pairs: list[dict] = []

708

for (a, b), co_changes in pair_co_changes.items():

709

if co_changes < self._MIN_CO_CHANGES:

710

continue

711

file_a = a.split("::")[0]

712

file_b = b.split("::")[0]

713

if file_a == file_b:

714

continue

715

count_a = len(symbol_commits[a])

716

count_b = len(symbol_commits[b])

717

union_count = count_a + count_b - co_changes

718

if union_count == 0:

719

continue

720

commits_both_active = union_count

721

rate = round(co_changes / union_count, 6)

pairs.append({

"symbol_a": a,

"symbol_b": b,

"file_a": file_a,

"file_b": file_b,

"co_changes": co_changes,

728

"commits_both_active": commits_both_active,

729

"co_change_rate": rate,

730

"same_file": False,

731

"structurally_linked": False,

732

"a_in_test": "test" in file_a.lower(),

733

"b_in_test": "test" in file_b.lower(),

734

})

735

736

pairs.sort(key=lambda p: (-p["co_change_rate"], -p["co_changes"]))

737

truncated = len(pairs) > self._MAX_PAIRS

738

pairs = pairs[: self._MAX_PAIRS]

739

740

# ── Step 5: delete stale rows, upsert fresh set ───────────────────────

741

await session.execute(

742

sa.delete(MusehubIntelEntangle).where(

743

MusehubIntelEntangle.repo_id == repo_id

)

)

for p in pairs:

await session.execute(

748

pg_insert(MusehubIntelEntangle)

749

.values(

750

repo_id=repo_id,

751

symbol_a=p["symbol_a"],

752

symbol_b=p["symbol_b"],

753

co_change_rate=p["co_change_rate"],

754

co_changes=p["co_changes"],

755

commits_both_active=p["commits_both_active"],

756

file_a=p["file_a"],

757

file_b=p["file_b"],

758

same_file=p["same_file"],

759

a_in_test=p["a_in_test"],

760

b_in_test=p["b_in_test"],

761

structurally_linked=p["structurally_linked"],

762

ref=ref,

763

)

764

.on_conflict_do_update(

765

index_elements=["repo_id", "symbol_a", "symbol_b"],

766

set_={

767

"co_change_rate": p["co_change_rate"],

768

"co_changes": p["co_changes"],

769

"commits_both_active": p["commits_both_active"],

770

"file_a": p["file_a"],

771

"file_b": p["file_b"],

772

"same_file": p["same_file"],

773

"a_in_test": p["a_in_test"],

774

"b_in_test": p["b_in_test"],

775

"structurally_linked": p["structurally_linked"],

"ref": ref,

},

)

)

return [("intel.code.entangle", {

782

"count": len(pairs),

783

"commits_analysed": len(walk_order),

784

"truncated": truncated,

785

})]

786

787

_DEAD_TRACKED_KINDS = frozenset(

788

{"function", "async_function", "method", "async_method", "class"}

789

)

790

791

_DEAD_REASONS: dict[str, str] = {

792

"high": "Added once, never modified. Zero blast radius in full history.",

793

"medium": "Modified in past but zero blast radius for ≥ 30 days.",

794

"low": "Zero blast radius. Recently active — verify before deleting.",

}

class DeadProvider:

"""Persist dead-code candidates derived from musehub_symbol_intel."""

799

800

async def compute(

801

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

802

) -> IntelResults:

803

import sqlalchemy as sa

804

805

rows_result = await session.execute(

806

sa.select(MusehubSymbolIntel).where(

807

MusehubSymbolIntel.repo_id == repo_id,

808

MusehubSymbolIntel.symbol_kind.in_(_DEAD_TRACKED_KINDS),

809

MusehubSymbolIntel.blast == 0,

810

)

811

)

812

rows = rows_result.scalars().all()

if not rows:

return []

count = 0

for row in rows:

if row.churn == 1:

confidence = "high"

elif row.churn_30d == 0:

821

confidence = "medium"

else:

confidence = "low"

stmt = (

pg_insert(MusehubIntelDead)

.values(

repo_id=repo_id,

address=row.address,

kind=row.symbol_kind or "unknown",

831

confidence=confidence,

832

reason=_DEAD_REASONS[confidence],

ref=ref,

dismissed=False,

)

.on_conflict_do_update(

837

index_elements=["repo_id", "address"],

838

set_={

839

"kind": row.symbol_kind or "unknown",

840

"confidence": confidence,

841

"reason": _DEAD_REASONS[confidence],

842

"ref": ref,

843

"dismissed": sa.func.coalesce(

844

MusehubIntelDead.__table__.c.dismissed,

False,

),

},

)

)

await session.execute(stmt)

851

count += 1

852

853

return [("intel.code.dead", {"count": count})]

854

855

_BLAST_RISK_TRACKED_KINDS = frozenset(

856

{"function", "async_function", "method", "async_method", "class"}

857

)

858

859

# Weights must sum to 100.

860

_BLAST_RISK_WEIGHTS = {

"impact": 40,

"churn": 25,

"test_gap": 20,

"coupling": 15,

}

def _compute_risk_score(

*,

impact_score: float,

churn_score: float,

test_gap_score: float,

872

coupling_score: float,

873

) -> int:

874

"""Return composite risk score 0–100 from four normalized sub-scores.

875

876

Each sub-score must be in [0.0, 1.0]. The result is clamped to [0, 100]

877

so overflow inputs (e.g. blast=9999) never produce an out-of-range value.

878

879

Weights: impact=40, churn=25, test_gap=20, coupling=15.

880

"""

881

raw = (

882

impact_score * _BLAST_RISK_WEIGHTS["impact"] +

883

churn_score * _BLAST_RISK_WEIGHTS["churn"] +

884

test_gap_score * _BLAST_RISK_WEIGHTS["test_gap"] +

885

coupling_score * _BLAST_RISK_WEIGHTS["coupling"]

886

)

887

return min(100, max(0, round(raw)))

888

889

def _risk_tier(risk_score: int) -> str:

890

"""Map a composite risk score to a named tier.

891

892

critical → score >= 75

high → score >= 50

medium → score >= 25

low → score < 25

"""

if risk_score >= 75:

return "critical"

if risk_score >= 50:

return "high"

if risk_score >= 25:

return "medium"

return "low"

class BlastRiskProvider:

906

"""Persist composite pre-release risk scores derived from ``musehub_symbol_intel``.

907

908

Replaces the former ``muse code blast-risk`` subprocess approach with a pure

909

SQL derivation so the provider works in every environment without a local

910

muse repository on disk.

Data source

-----------

``musehub_symbol_intel`` columns:

915

- ``blast`` → normalized to ``impact_score`` (cap at 50)

916

- ``churn_30d`` → normalized to ``churn_score`` (cap at 20)

917

- ``blast_cross`` → normalized to ``coupling_score``(cap at 10)

918

- test_gap_score is fixed at 1.0 until symbol-level coverage data lands

919

920

Only tracked kinds (function / async_function / method / async_method / class)

921

with ``blast > 0`` are candidates — zero-blast symbols have no downstream impact.

Output

------

Upserts into ``musehub_intel_blast_risk`` (repo_id, address) and returns

926

``[("intel.code.blast_risk", {"count": N})]``.

"""

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

931

) -> IntelResults:

932

"""Derive and persist blast-risk scores for all tracked symbols in ``repo_id``.

Parameters

----------

session: Async SQLAlchemy session.

937

repo_id: Target repository ID.

938

ref: Head commit ref that triggered this run (stored per-row).

939

payload: Raw job payload (unused — kept for protocol compatibility).

Returns

-------

A one-element list ``[("intel.code.blast_risk", {"count": N})]`` where N

944

is the number of symbols upserted, or ``[]`` if no candidates exist.

Side effects

------------

Upserts rows in ``musehub_intel_blast_risk``. Never touches

949

``musehub_symbol_intel`` blast/churn columns.

950

"""

951

import sqlalchemy as sa

952

953

rows_result = await session.execute(

954

sa.select(MusehubSymbolIntel).where(

955

MusehubSymbolIntel.repo_id == repo_id,

956

MusehubSymbolIntel.symbol_kind.in_(_BLAST_RISK_TRACKED_KINDS),

957

MusehubSymbolIntel.blast > 0,

958

).execution_options(populate_existing=True)

959

)

960

rows = rows_result.scalars().all()

if not rows:

return []

count = 0

for row in rows:

impact_score = min(row.blast / 50.0, 1.0)

967

churn_score = min((row.churn_30d or 0) / 20.0, 1.0)

968

test_gap_score = 1.0 # no coverage data yet — worst case

969

coupling_score = min((row.blast_cross or 0) / 10.0, 1.0)

970

971

risk_score = _compute_risk_score(

972

impact_score=impact_score,

973

churn_score=churn_score,

974

test_gap_score=test_gap_score,

975

coupling_score=coupling_score,

976

)

977

risk = _risk_tier(risk_score)

978

979

stmt = (

980

pg_insert(MusehubIntelBlastRisk)

.values(

repo_id=repo_id,

address=row.address,

kind=row.symbol_kind or "unknown",

985

risk=risk,

986

risk_score=risk_score,

987

impact_score=impact_score,

988

churn_score=churn_score,

989

test_gap_score=test_gap_score,

990

coupling_score=coupling_score,

991

ref=ref,

992

)

993

.on_conflict_do_update(

994

index_elements=["repo_id", "address"],

995

set_={

996

"kind": row.symbol_kind or "unknown",

997

"risk": risk,

998

"risk_score": risk_score,

999

"impact_score": impact_score,

1000

"churn_score": churn_score,

1001

"test_gap_score": test_gap_score,

1002

"coupling_score": coupling_score,

"ref": ref,

},

)

)

await session.execute(stmt)

1008

count += 1

1009

1010

return [("intel.code.blast_risk", {"count": count})]

1011

1012

def _days_stable_from_dt(last_changed: datetime | None) -> int:

1013

"""Return whole days elapsed since ``last_changed`` as of now (UTC).

Parameters

----------

last_changed:

UTC-aware or naive datetime of the symbol's last modification.

1019

Naive datetimes are treated as UTC. ``None`` returns 0.

Returns

-------

Non-negative integer — days elapsed, clamped to zero (future timestamps

1024

return 0 rather than a negative value).

Examples

--------

>>> from datetime import datetime, timezone, timedelta

1029

>>> _days_stable_from_dt(datetime.now(timezone.utc) - timedelta(days=90))

1030

90

1031

>>> _days_stable_from_dt(None)

1032

0

1033

"""

1034

if last_changed is None:

1035

return 0

1036

now = datetime.now(timezone.utc)

1037

if last_changed.tzinfo is None:

1038

last_changed = last_changed.replace(tzinfo=timezone.utc)

1039

delta = now - last_changed

1040

return max(0, delta.days)

1041

1042

class StableProvider:

1043

"""Persist stability records derived directly from ``musehub_symbol_intel``.

1044

1045

A symbol is *stable* when it has not changed in the last 30 or 90 days

1046

(``churn_30d == 0`` and ``churn_90d == 0``) and has a known ``last_changed``

1047

timestamp. ``days_stable`` is calendar days elapsed since ``last_changed``.

1048

``since_start`` is True when the symbol's lifetime ``churn`` is zero.

"""

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

1053

) -> IntelResults:

1054

import sqlalchemy as sa

1055

1056

sym_result = await session.execute(

1057

sa.select(MusehubSymbolIntel).where(

1058

MusehubSymbolIntel.repo_id == repo_id,

1059

MusehubSymbolIntel.churn_30d == 0,

1060

MusehubSymbolIntel.churn_90d == 0,

1061

MusehubSymbolIntel.last_changed.is_not(None),

1062

)

1063

)

1064

qualifying = sym_result.scalars().all()

if not qualifying:

return []

count = 0

for sym in qualifying:

1070

days_stable = _days_stable_from_dt(sym.last_changed)

1071

since_start = sym.churn == 0

1072

stmt = (

1073

pg_insert(MusehubIntelStable)

.values(

repo_id=repo_id,

address=sym.address,

days_stable=days_stable,

1078

since_start=since_start,

1079

last_changed_commit=sym.last_commit_id,

1080

symbol_kind=sym.symbol_kind,

1081

ref=ref,

1082

)

1083

.on_conflict_do_update(

1084

index_elements=["repo_id", "address"],

1085

set_={

1086

"days_stable": days_stable,

1087

"since_start": since_start,

1088

"last_changed_commit": sym.last_commit_id,

1089

"symbol_kind": sym.symbol_kind,

"ref": ref,

},

)

)

await session.execute(stmt)

1095

count += 1

1096

1097

return [("intel.code.stable", {"count": count, "truncated": False})]

1098

1099

class VelocityProvider:

1100

"""Persist module growth velocity by mining musehub_symbol_history_entries.

1101

1102

Mirrors ``muse code velocity`` exactly — same BFS commit walk, same

1103

two-window (current / prior) structure, same per-module metrics.

Algorithm

---------

1. Fetch all commits for the repo; build a commit_id → parent_ids map.

1108

2. BFS-walk from HEAD ref, cap at _MAX_WALK commits.

1109

3. Split walk_order into current window (0.._WINDOW-1) and prior window

1110

(_WINDOW..2*_WINDOW-1).

1111

4. Bulk-fetch all history entries for the repo.

1112

5. Per-commit per-module: accumulate added / removed / modified counts.

1113

Module is derived as the directory of the file component of each address.

1114

6. active_commits = distinct commits in window that touched the module.

1115

stagnant_commits = commits in window where module's commit-level net == 0.

1116

7. acceleration = current_net - prior_net.

1117

8. Sort by active_commits DESC; truncate to _TOP; DELETE stale; upsert fresh.

Constants

---------

_MAX_WALK = 10_000 BFS depth cap

1122

_WINDOW = 20 commits per analysis window (current + prior each)

1123

_TOP = 20 stored leaderboard size

"""

_MAX_WALK = 10_000

_WINDOW = 20

_TOP = 20

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

1132

) -> IntelResults:

1133

import sqlalchemy as sa

1134

from collections import defaultdict

1135

1136

# ── Step 1: fetch all commits, BFS walk from HEAD ─────────────────────

1137

walk_order = await _load_commit_walk(session, repo_id, ref, self._MAX_WALK)

if not walk_order:

return []

current_window = set(walk_order[: self._WINDOW])

1142

prior_window = set(walk_order[self._WINDOW : self._WINDOW * 2])

1143

combined_window = current_window | prior_window

1144

1145

# ── Step 2: bulk-fetch history entries ────────────────────────────────

1146

history_result = await session.execute(

1147

sa.select(

1148

MusehubSymbolHistoryEntry.commit_id,

1149

MusehubSymbolHistoryEntry.address,

1150

MusehubSymbolHistoryEntry.op,

1151

).where(MusehubSymbolHistoryEntry.repo_id == repo_id)

1152

)

1153

1154

def _module(addr: str) -> str:

1155

file = addr.split("::")[0] if "::" in addr else addr

1156

if "/" in file:

1157

return f"{file.rsplit('/', 1)[0]}/"

1158

return f"{file}/"

1159

1160

# per_commit_module[(cid, module)] = {added, removed, modified}

1161

per_commit_module: dict[tuple[str, str], dict[str, int]] = defaultdict(

1162

lambda: {"added": 0, "removed": 0, "modified": 0}

1163

)

1164

for cid, addr, op in history_result:

1165

if cid not in combined_window or not addr:

1166

continue

1167

mod = _module(addr)

1168

bucket = per_commit_module[(cid, mod)]

if op == "add":

bucket["added"] += 1

elif op == "delete":

bucket["removed"] += 1

1173

else: # modify, rename

1174

bucket["modified"] += 1

1175

1176

# ── Step 3: aggregate per-module per-window ───────────────────────────

1177

def _blank() -> IntDict:

1178

return {"added": 0, "removed": 0, "modified": 0, "net": 0,

1179

"active_commits": 0, "stagnant_commits": 0}

1180

1181

current_stats: ModuleStatsMap = defaultdict(_blank)

1182

prior_stats: ModuleStatsMap = defaultdict(_blank)

1183

1184

for (cid, mod), counts in per_commit_module.items():

1185

commit_net = counts["added"] - counts["removed"]

1186

target = current_stats[mod] if cid in current_window else prior_stats[mod]

1187

target["added"] += counts["added"]

1188

target["removed"] += counts["removed"]

1189

target["modified"] += counts["modified"]

1190

target["net"] += commit_net

1191

target["active_commits"] += 1

1192

if commit_net == 0:

1193

target["stagnant_commits"] += 1

1194

1195

all_modules = set(current_stats) | set(prior_stats)

if not all_modules:

return []

# ── Step 4: build final records ───────────────────────────────────────

1200

records = []

1201

for mod in all_modules:

1202

cur = current_stats[mod]

1203

pri = prior_stats[mod]

1204

records.append({

1205

"module": mod,

1206

"added": cur["added"],

1207

"removed": cur["removed"],

1208

"modified": cur["modified"],

1209

"net": cur["net"],

1210

"active_commits": cur["active_commits"],

1211

"stagnant_commits": cur["stagnant_commits"],

1212

"prior_added": pri["added"],

1213

"prior_net": pri["net"],

1214

"prior_modified": pri["modified"],

1215

"prior_active_commits": pri["active_commits"],

1216

"acceleration": float(cur["net"] - pri["net"]),

1217

"window_size": self._WINDOW,

1218

"commits_analysed": len(walk_order),

1219

})

1220

1221

records.sort(key=lambda r: -r["active_commits"])

1222

truncated = len(records) > self._TOP

1223

records = records[: self._TOP]

1224

1225

# ── Step 5: delete stale rows, upsert fresh set ───────────────────────

1226

await session.execute(

1227

sa.delete(MusehubIntelVelocity).where(

1228

MusehubIntelVelocity.repo_id == repo_id

)

)

for r in records:

await session.execute(

1233

pg_insert(MusehubIntelVelocity)

.values(

repo_id=repo_id,

module=r["module"],

added=r["added"],

removed=r["removed"],

1239

net=r["net"],

1240

modified=r["modified"],

1241

active_commits=r["active_commits"],

1242

prior_added=r["prior_added"],

1243

prior_net=r["prior_net"],

1244

prior_modified=r["prior_modified"],

1245

prior_active_commits=r["prior_active_commits"],

1246

acceleration=r["acceleration"],

1247

stagnant_commits=r["stagnant_commits"],

1248

window_size=r["window_size"],

1249

commits_analysed=r["commits_analysed"],

1250

ref=ref,

1251

)

1252

.on_conflict_do_update(

1253

index_elements=["repo_id", "module"],

1254

set_={

1255

"added": r["added"],

1256

"removed": r["removed"],

1257

"net": r["net"],

1258

"modified": r["modified"],

1259

"active_commits": r["active_commits"],

1260

"prior_added": r["prior_added"],

1261

"prior_net": r["prior_net"],

1262

"prior_modified": r["prior_modified"],

1263

"prior_active_commits": r["prior_active_commits"],

1264

"acceleration": r["acceleration"],

1265

"stagnant_commits": r["stagnant_commits"],

1266

"window_size": r["window_size"],

1267

"commits_analysed": r["commits_analysed"],

"ref": ref,

},

)

)

return [("intel.code.velocity", {

1274

"count": len(records),

1275

"commits_analysed": len(walk_order),

1276

"truncated": truncated,

1277

})]

1278

1279

_EXT_TO_LANGUAGE: dict[str, str] = {

1280

".py": "Python", ".js": "JavaScript", ".ts": "TypeScript",

1281

".tsx": "TypeScript", ".jsx": "JavaScript", ".rs": "Rust",

1282

".go": "Go", ".rb": "Ruby", ".java": "Java", ".kt": "Kotlin",

1283

".swift": "Swift", ".cpp": "C++", ".c": "C", ".cs": "C#",

1284

".php": "PHP", ".md": "Markdown", ".toml": "TOML", ".yaml": "YAML",

1285

".yml": "YAML", ".json": "JSON", ".sh": "Shell", ".sql": "SQL",

1286

}

1287

1288

def _lang_from_address(address: str) -> str:

1289

"""Infer language from the file portion of a symbol address."""

1290

file_part = address.split("::")[0] if "::" in address else address

1291

ext = f".{file_part.rsplit('.', 1)[-1]}" if "." in file_part else ""

1292

return _EXT_TO_LANGUAGE.get(ext.lower(), "Unknown")

1293

1294

class ClonesProvider:

1295

"""Persist exact duplicate-code clusters derived from musehub_hash_occurrence_entries.

1296

1297

Reads ``musehub_hash_occurrence_entries`` — populated by ``CodeProvider``

1298

on every push — and groups addresses by ``content_id``. Any content_id

1299

with 2+ distinct addresses is an exact-clone cluster. No subprocess or

1300

on-disk repo access required: all data comes from the DB.

1301

1302

Near-clone detection (``signature_id`` grouping) requires ``signature_id``

1303

to be stored at push time, which is not yet implemented. All clusters

1304

produced here are tier ``"exact"``.

"""

_MIN_CLUSTER = 2

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

1311

) -> IntelResults:

1312

import sqlalchemy as sa

1313

1314

# Fetch all (content_id, address, symbol_kind) for this repo where

1315

# content_id appears on 2+ distinct addresses (exact clone).

1316

subq = (

1317

sa.select(MusehubHashOccurrenceEntry.content_id)

1318

.where(MusehubHashOccurrenceEntry.repo_id == repo_id)

1319

.group_by(MusehubHashOccurrenceEntry.content_id)

1320

.having(sa.func.count() >= self._MIN_CLUSTER)

1321

.scalar_subquery()

1322

)

1323

rows_result = await session.execute(

1324

sa.select(

1325

MusehubHashOccurrenceEntry.content_id,

1326

MusehubHashOccurrenceEntry.address,

1327

MusehubSymbolIntel.symbol_kind,

)

.outerjoin(

MusehubSymbolIntel,

sa.and_(

MusehubSymbolIntel.repo_id == MusehubHashOccurrenceEntry.repo_id,

1333

MusehubSymbolIntel.address == MusehubHashOccurrenceEntry.address,

),

)

.where(

MusehubHashOccurrenceEntry.repo_id == repo_id,

1338

MusehubHashOccurrenceEntry.content_id.in_(subq),

1339

)

1340

)

1341

rows = rows_result.all()

if not rows:

return []

# Group into clusters keyed by content_id.

1346

clusters: dict[str, list[dict]] = {}

1347

for content_id, address, symbol_kind in rows:

1348

if content_id not in clusters:

1349

clusters[content_id] = []

1350

clusters[content_id].append({

1351

"address": address,

1352

"kind": symbol_kind or "unknown",

1353

"language": _lang_from_address(address),

1354

"body_hash": content_id,

1355

"signature_id": content_id,

1356

"content_id": content_id,

})

count = 0

for content_id, members in clusters.items():

1361

# Stable cluster hash: sha256 of the sorted content_id string.

1362

cluster_hash = blob_id(content_id.encode())

1363

members_json = json.dumps(members)

1364

stmt = (

1365

pg_insert(MusehubIntelClones)

1366

.values(

1367

repo_id=repo_id,

1368

cluster_hash=cluster_hash,

1369

tier="exact",

1370

member_count=len(members),

1371

members_json=members_json,

1372

ref=ref,

1373

)

1374

.on_conflict_do_update(

1375

index_elements=["repo_id", "cluster_hash"],

1376

set_={

1377

"tier": "exact",

1378

"member_count": len(members),

1379

"members_json": members_json,

"ref": ref,

},

)

)

await session.execute(stmt)

1385

count += 1

1386

1387

return [("intel.code.clones", {"count": count})]

1388

1389

class TypeProvider:

1390

"""Persist per-symbol type-annotation health derived from stored object content.

1391

1392

Reads the HEAD snapshot manifest from ``musehub_snapshots.manifest_blob``

1393

(msgpack dict of path → object_id), fetches each Python file's bytes via

1394

the storage backend, runs ``muse.core.type_analysis.extract_annotations``

1395

(pure AST — no subprocess, no on-disk repo checkout), then batch-upserts

1396

into ``musehub_intel_type`` in chunks of 1,000 rows.

1397

1398

This follows the same DB-only pattern as every other intel provider:

1399

all data flows from objects stored at push time; nothing requires a

1400

working muse repository on disk.

"""

_CHUNK = 1_000

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

1407

) -> IntelResults:

1408

import msgpack

1409

from muse.core.type_analysis import extract_annotations

1410

from musehub.storage.backends import get_backend

1411

1412

# ── 1. Resolve owner/slug for the storage backend ────────────────────

1413

owner: str | None = payload.get("owner") # type: ignore[assignment]

1414

slug: str | None = payload.get("slug") # type: ignore[assignment]

1415

if not (owner and slug):

1416

row = await session.execute(

1417

select(MusehubRepo.owner, MusehubRepo.slug)

1418

.where(MusehubRepo.repo_id == repo_id)

.limit(1)

)

result = row.first()

if result is None:

return []

owner, slug = result

# ── 2. Fetch HEAD commit → snapshot manifest ─────────────────────────

1427

commit_row = await session.execute(

1428

select(MusehubCommit.snapshot_id)

1429

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

1430

.where(MusehubCommitRef.repo_id == repo_id)

1431

.order_by(MusehubCommit.timestamp.desc())

1432

.limit(1)

1433

)

1434

commit_result = commit_row.first()

1435

if commit_result is None or commit_result[0] is None:

1436

return []

1437

snapshot_id: str = commit_result[0]

1438

1439

snap_row = await session.execute(

1440

select(MusehubSnapshot.manifest_blob)

1441

.where(MusehubSnapshot.snapshot_id == snapshot_id)

1442

.limit(1)

1443

)

1444

snap_result = snap_row.first()

1445

if snap_result is None or snap_result[0] is None:

1446

return []

1447

1448

manifest: dict[str, str] = msgpack.unpackb(bytes(snap_result[0]), raw=False)

1449

1450

# ── 3. Extract type annotations from each Python file ─────────────────

1451

from musehub.storage.backends import read_object_bytes as _read_obj_bytes

1452

from musehub.types.compression import decompress_if_needed as _decompress

1453

from musehub.db.musehub_repo_models import MusehubObject as _MusehubObject

1454

py_entries = [(path, oid) for path, oid in manifest.items() if path.endswith(".py")]

if not py_entries:

return []

rows: list[dict] = []

1459

for path, object_id in py_entries:

1460

obj_row = await session.get(_MusehubObject, object_id)

1461

if obj_row is None:

1462

continue

1463

raw = await _read_obj_bytes(obj_row, session=session)

1464

if not raw:

1465

continue

1466

src = _decompress(raw)

1467

try:

1468

ann_map = extract_annotations(src, path)

1469

except SyntaxError:

1470

continue

1471

for address, ann in ann_map.items():

rows.append({

"repo_id": repo_id,

"address": address,

"kind": ann.get("kind", "unknown"),

1476

"return_is_any": ann.get("return_is_any", False),

1477

"params_total": ann.get("params_total", 0),

1478

"params_annotated": ann.get("params_annotated", 0),

1479

"params_with_any": ann.get("params_with_any", 0),

1480

"type_score": ann.get("type_score", 0.0),

1481

"return_annotation": (ann.get("return_annotation") or "")[:256] or None,

"ref": ref,

})

if not rows:

return []

# ── 4. Batch-upsert in chunks of 1,000 ───────────────────────────────

1489

for i in range(0, len(rows), self._CHUNK):

1490

chunk = rows[i : i + self._CHUNK]

1491

stmt = (

1492

pg_insert(MusehubIntelType)

1493

.values(chunk)

1494

.on_conflict_do_update(

1495

index_elements=["repo_id", "address"],

1496

set_={

1497

"kind": sa.literal_column("excluded.kind"),

1498

"return_is_any": sa.literal_column("excluded.return_is_any"),

1499

"params_total": sa.literal_column("excluded.params_total"),

1500

"params_annotated": sa.literal_column("excluded.params_annotated"),

1501

"params_with_any": sa.literal_column("excluded.params_with_any"),

1502

"type_score": sa.literal_column("excluded.type_score"),

1503

"return_annotation": sa.literal_column("excluded.return_annotation"),

1504

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(stmt)

1509

1510

return [("intel.code.type", {"count": len(rows)})]

1511

1512

class ApiSurfaceProvider:

1513

"""Persist public API surface entries by parsing stored snapshot objects.

1514

1515

Reads the HEAD snapshot manifest from ``musehub_snapshots.manifest_blob``

1516

(msgpack dict of path → object_id), fetches each source file's bytes via

1517

the storage backend, runs ``parse_symbols`` (pure AST — no subprocess, no

1518

on-disk repo checkout), filters to public symbols using the same

1519

``_is_public`` predicate as ``muse code api-surface``, then batch-upserts

1520

into ``musehub_intel_api_surface`` in chunks of 1,000 rows.

1521

1522

This follows the same DB-only pattern as ``TypeProvider``:

1523

all data flows from objects stored at push time; nothing requires a

1524

working muse repository on disk.

Parameters

----------

session:

Active async SQLAlchemy session provided by the push worker.

1530

repo_id:

1531

Opaque repo identifier from ``musehub_repos.repo_id``.

1532

ref:

1533

Branch or commit ref string at push time (e.g. ``"dev"``).

1534

payload:

1535

Raw push-event JSON — used to resolve owner/slug for the storage

1536

backend when not already present on the session.

Returns

-------

List of ``(event_name, metadata)`` tuples consumed by the telemetry layer.

1541

Returns ``[("intel.code.api_surface", {"count": N})]`` on success,

1542

``[]`` when the snapshot manifest cannot be resolved or no public symbols

are found.

"""

_CHUNK = 1_000

_PUBLIC_KINDS: frozenset[str] = frozenset({

1549

"function", "async_function", "class", "method", "async_method",

1550

})

1551

1552

def _is_public(self, name: str, kind: str) -> bool:

1553

return kind in self._PUBLIC_KINDS and not name.split(".")[-1].startswith("_")

1554

1555

async def compute(

1556

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

) -> IntelResults:

import msgpack

# ── 1. Resolve owner/slug for the storage backend ────────────────────

1561

owner: str | None = payload.get("owner") # type: ignore[assignment]

1562

slug: str | None = payload.get("slug") # type: ignore[assignment]

1563

if not (owner and slug):

1564

row = await session.execute(

1565

select(MusehubRepo.owner, MusehubRepo.slug)

1566

.where(MusehubRepo.repo_id == repo_id)

.limit(1)

)

result = row.first()

if result is None:

return []

owner, slug = result

# ── 2. Fetch HEAD commit → snapshot manifest ─────────────────────────

1575

commit_row = await session.execute(

1576

select(MusehubCommit.snapshot_id)

1577

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

1578

.where(MusehubCommitRef.repo_id == repo_id)

1579

.order_by(MusehubCommit.timestamp.desc())

1580

.limit(1)

1581

)

1582

commit_result = commit_row.first()

1583

if commit_result is None or commit_result[0] is None:

1584

return []

1585

snapshot_id: str = commit_result[0]

1586

1587

snap_row = await session.execute(

1588

select(MusehubSnapshot.manifest_blob)

1589

.where(MusehubSnapshot.snapshot_id == snapshot_id)

1590

.limit(1)

1591

)

1592

snap_result = snap_row.first()

1593

if snap_result is None or snap_result[0] is None:

1594

return []

1595

1596

manifest: dict[str, str] = msgpack.unpackb(bytes(snap_result[0]), raw=False)

1597

1598

# ── 3. Parse public symbols from each source file ─────────────────────

1599

from musehub.storage.backends import read_object_bytes as _read_obj_bytes

1600

from musehub.types.compression import decompress_if_needed as _decompress

1601

from musehub.db.musehub_repo_models import MusehubObject as _MusehubObject

1602

rows: list[dict] = []

1603

1604

for file_path, object_id in manifest.items():

1605

obj_row = await session.get(_MusehubObject, object_id)

1606

if obj_row is None:

1607

continue

1608

raw = await _read_obj_bytes(obj_row, session=session)

1609

if not raw:

1610

continue

1611

src = _decompress(raw)

1612

try:

1613

tree = parse_symbols(src, file_path)

1614

except Exception:

1615

continue

1616

for address, rec in tree.items():

1617

if not self._is_public(rec["name"], rec["kind"]):

continue

rows.append({

"repo_id": repo_id,

"address": address,

"kind": rec["kind"],

"signature_id": rec.get("signature_id"),

1624

"visibility": "public",

"ref": ref,

})

if not rows:

return []

# ── 4. Batch-upsert in chunks of 1,000 ───────────────────────────────

1632

for i in range(0, len(rows), self._CHUNK):

1633

chunk = rows[i : i + self._CHUNK]

1634

stmt = (

1635

pg_insert(MusehubIntelApiSurface)

1636

.values(chunk)

1637

.on_conflict_do_update(

1638

index_elements=["repo_id", "address"],

1639

set_={

1640

"kind": sa.literal_column("excluded.kind"),

1641

"signature_id": sa.literal_column("excluded.signature_id"),

1642

"visibility": sa.literal_column("excluded.visibility"),

1643

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(stmt)

1648

1649

return [("intel.code.api_surface", {"count": len(rows)})]

1650

1651

class LanguagesProvider:

1652

"""Persist language composition derived from stored snapshot objects.

1653

1654

Reads the HEAD snapshot manifest from ``musehub_snapshots.manifest_blob``

1655

(msgpack dict of path → object_id), calls ``language_of()`` (pure extension

1656

map — no subprocess, no I/O) to group files by language, fetches each

1657

file's bytes from the storage backend, and runs ``parse_symbols()`` (pure

1658

AST — no subprocess) to count symbols and kind breakdowns per language.

1659

Aggregated results are batch-upserted into ``musehub_intel_languages`` in

1660

a single SQL statement (language count is typically < 20 rows per repo).

1661

1662

This follows the same DB-only pattern as ``TypeProvider`` and

1663

``ApiSurfaceProvider``: all data flows from objects stored at push time;

1664

nothing requires a working muse repository on disk and no subprocess is

ever spawned.

Parameters

----------

session : AsyncSession

1670

Active async SQLAlchemy session provided by the push worker.

1671

repo_id : str

1672

Opaque repo identifier from ``musehub_repos.repo_id``.

1673

ref : str

1674

Branch or commit ref string at push time (e.g. ``"dev"``).

1675

payload : JSONObject

1676

Raw push-event JSON — used to resolve owner/slug for the storage

1677

backend when not already present in the session.

Returns

-------

IntelResults

``[("intel.code.languages", {"count": N})]`` on success where N is

1683

the number of distinct languages found. ``[]`` when the snapshot

1684

manifest cannot be resolved or the repo has no tracked files.

Notes

-----

Import pseudo-symbols (``kind == "import"``) are excluded from counts to

1689

match the default behaviour of ``muse code languages``. ``pct`` is

1690

computed as ``symbol_count / total_symbols * 100``; it is ``0.0`` for

1691

languages with no parseable symbols (assets, config, docs). All languages

1692

present in the manifest are stored — not just those with symbols — so the

1693

GUI can display the full file-type composition of the repo.

"""

_CHUNK = 1_000

_IMPORT_KINDS: frozenset[str] = frozenset({"import"})

1698

1699

async def compute(

1700

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

) -> IntelResults:

import msgpack

# ── 1. Resolve owner/slug for the storage backend ────────────────────

1705

owner: str | None = payload.get("owner") # type: ignore[assignment]

1706

slug: str | None = payload.get("slug") # type: ignore[assignment]

1707

if not (owner and slug):

1708

row = await session.execute(

1709

select(MusehubRepo.owner, MusehubRepo.slug)

1710

.where(MusehubRepo.repo_id == repo_id)

.limit(1)

)

result = row.first()

if result is None:

return []

owner, slug = result

# ── 2. Fetch HEAD commit → snapshot manifest ─────────────────────────

1719

commit_row = await session.execute(

1720

select(MusehubCommit.snapshot_id)

1721

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

1722

.where(MusehubCommitRef.repo_id == repo_id)

1723

.order_by(MusehubCommit.timestamp.desc())

1724

.limit(1)

1725

)

1726

commit_result = commit_row.first()

1727

if commit_result is None or commit_result[0] is None:

1728

return []

1729

snapshot_id: str = commit_result[0]

1730

1731

snap_row = await session.execute(

1732

select(MusehubSnapshot.manifest_blob)

1733

.where(MusehubSnapshot.snapshot_id == snapshot_id)

1734

.limit(1)

1735

)

1736

snap_result = snap_row.first()

1737

if snap_result is None or snap_result[0] is None:

1738

return []

1739

1740

manifest: dict[str, str] = msgpack.unpackb(bytes(snap_result[0]), raw=False)

if not manifest:

return []

# ── 3. Aggregate file counts by language (pure dict lookup, zero I/O) ─

1745

lang_files: dict[str, int] = {}

1746

for file_path in manifest:

1747

lang = language_of(file_path)

1748

lang_files[lang] = lang_files.get(lang, 0) + 1

1749

1750

# ── 4. Fetch bytes + parse symbols → kind counts per language ─────────

1751

backend = get_backend()

1752

lang_symbols: dict[str, int] = {}

1753

lang_kinds: LangKindsMap = {}

1754

1755

for file_path, object_id in manifest.items():

1756

src = await backend.get(object_id)

if not src:

continue

try:

tree = parse_symbols(src, file_path)

1761

except Exception:

1762

continue

1763

lang = language_of(file_path)

1764

kinds = lang_kinds.setdefault(lang, {})

1765

for rec in tree.values():

1766

kind: str = rec["kind"]

1767

if kind in self._IMPORT_KINDS:

1768

continue

1769

lang_symbols[lang] = lang_symbols.get(lang, 0) + 1

1770

kinds[kind] = kinds.get(kind, 0) + 1

1771

1772

# ── 5. Compute pct over total symbols across all languages ────────────

1773

total_symbols = sum(lang_symbols.values())

1774

1775

# ── 6. Build upsert rows — every language in the manifest ────────────

rows = [

{

"repo_id": repo_id,

"language": lang,

"file_count": file_count,

1781

"symbol_count": lang_symbols.get(lang, 0),

1782

"pct": (lang_symbols.get(lang, 0) / total_symbols * 100)

1783

if total_symbols > 0 else 0.0,

1784

"kinds_json": lang_kinds.get(lang) or None,

1785

"ref": ref,

1786

}

1787

for lang, file_count in lang_files.items()

1788

]

1789

1790

# ── 7. Batch-upsert in chunks of 1,000 ───────────────────────────────

1791

for i in range(0, len(rows), self._CHUNK):

1792

chunk = rows[i : i + self._CHUNK]

1793

stmt = (

1794

pg_insert(MusehubIntelLanguages)

1795

.values(chunk)

1796

.on_conflict_do_update(

1797

index_elements=["repo_id", "language"],

1798

set_={

1799

"file_count": sa.literal_column("excluded.file_count"),

1800

"symbol_count": sa.literal_column("excluded.symbol_count"),

1801

"pct": sa.literal_column("excluded.pct"),

1802

"kinds_json": sa.literal_column("excluded.kinds_json"),

1803

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(stmt)

1808

1809

return [("intel.code.languages", {"count": len(rows)})]

1810

1811

class DetectRefactorProvider:

1812

"""Persist refactoring events derived from snapshot diffs at push time.

Algorithm

---------

At each push the HEAD commit's snapshot is diffed against the parent

1817

commit's snapshot using pure-Python symbol parsing — no subprocess, no

1818

on-disk muse repository required.

1819

1820

For every symbol that appears in both HEAD and parent we compare:

1821

1822

* ``body_hash`` — hash of the function/class body bytes. A change

1823

means the *implementation* changed (kind="implementation").

1824

* ``signature_id`` — hash of the signature (name + parameter types +

1825

return type). A change means the *signature* changed

1826

(kind="signature"). Checked only when ``body_hash``

1827

is unchanged to avoid double-counting.

1828

1829

For symbols that are present in parent but absent in HEAD we check whether

1830

an identical ``body_hash`` exists at a *different address* in HEAD:

1831

1832

* Address differs, file differs → kind="move" (same body, different file)

1833

* Address differs, same file → kind="rename" (same body, different name)

1834

1835

Symbols that vanish with no body-hash match are silently dropped (deleted

1836

code produces no event).

1837

1838

Each event is upserted on ``event_id`` (stable hash of

1839

``repo_id:commit_id:address:kind``) so re-runs are idempotent.

Data flow

---------

push → job → this provider

1844

→ HEAD commit row (musehub_commits)

1845

→ HEAD snapshot manifest (musehub_snapshots.manifest_blob, msgpack)

1846

→ parent snapshot manifest (same table)

1847

→ backend.get(object_id) for each file in both manifests

1848

→ parse_symbols(src, path) → body_hash / signature_id per symbol

1849

→ diff → upsert into musehub_intel_refactor_events

"""

_CHUNK = 500

# ── helpers ──────────────────────────────────────────────────────────────

1855

1856

@staticmethod

1857

def _is_tracked(kind: str) -> bool:

1858

"""True for symbol kinds that undergo meaningful refactoring."""

1859

return kind in {

1860

"function", "async_function",

1861

"method", "async_method",

"class",

}

@staticmethod

async def _manifest_symbols(

1867

backend: StorageBackend,

1868

manifest: StrDict,

1869

) -> SymbolInfoMap:

1870

"""Parse all tracked symbols from a snapshot manifest.

1871

1872

Returns ``{address: {"body_hash": ..., "signature_id": ...,

1873

"file": ..., "kind": ...}}`` for every tracked symbol kind.

1874

1875

Errors in individual files are silently skipped so a single

1876

unparseable file cannot abort the whole push job.

1877

"""

1878

out: SymbolInfoMap = {}

1879

for file_path, object_id in manifest.items():

1880

src = await backend.get(object_id)

if not src:

continue

try:

tree = parse_symbols(src, file_path)

1885

except Exception:

1886

continue

1887

for address, rec in tree.items():

1888

if not DetectRefactorProvider._is_tracked(rec.get("kind", "")):

1889

continue

1890

out[address] = {

1891

"body_hash": rec.get("body_hash", ""),

1892

"signature_id": rec.get("signature_id", ""),

1893

"file": file_path,

1894

"kind": rec.get("kind", ""),

}

return out

# ── main entry point ─────────────────────────────────────────────────────

1899

1900

async def compute(

1901

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

1902

) -> IntelResults:

1903

"""Diff HEAD vs parent snapshot and persist detected refactoring events.

1904

1905

Returns a single ``("intel.code.detect_refactor", {"count": N})``

1906

tuple where *N* is the number of events upserted for this push.

1907

Returns ``[]`` when there is no parent commit to diff against (initial

1908

push) or when the snapshot manifests cannot be loaded.

1909

"""

1910

import msgpack

1911

import sqlalchemy as sa

1912

1913

owner: str | None = payload.get("owner") # type: ignore[assignment]

1914

slug: str | None = payload.get("slug") # type: ignore[assignment]

1915

if not owner or not slug:

1916

repo_result = await session.execute(

1917

sa.select(MusehubRepo.owner, MusehubRepo.slug)

1918

.where(MusehubRepo.repo_id == repo_id)

1919

)

1920

row = repo_result.first()

1921

if row is None:

1922

return []

1923

owner, slug = row.owner, row.slug

1924

1925

# ── 1. Fetch HEAD commit + its parent ─────────────────────────────────

1926

head_result = await session.execute(

1927

sa.select(

1928

MusehubCommit.commit_id,

1929

MusehubCommit.parent_ids,

1930

MusehubCommit.snapshot_id,

1931

MusehubCommit.message,

1932

MusehubCommit.timestamp,

1933

)

1934

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

1935

.where(MusehubCommitRef.repo_id == repo_id)

1936

.where(MusehubCommit.commit_id == ref)

1937

.limit(1)

1938

)

1939

head_row = head_result.first()

1940

if head_row is None:

1941

# Fallback: most-recent commit for this repo

1942

head_result2 = await session.execute(

1943

sa.select(

1944

MusehubCommit.commit_id,

1945

MusehubCommit.parent_ids,

1946

MusehubCommit.snapshot_id,

1947

MusehubCommit.message,

1948

MusehubCommit.timestamp,

1949

)

1950

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

1951

.where(MusehubCommitRef.repo_id == repo_id)

1952

.order_by(MusehubCommit.timestamp.desc())

1953

.limit(1)

1954

)

1955

head_row = head_result2.first()

1956

if head_row is None or not head_row.snapshot_id:

1957

return []

1958

1959

head_commit_id = head_row.commit_id

1960

head_message = head_row.message or ""

1961

head_committed = head_row.timestamp

1962

parent_ids: list[str] = head_row.parent_ids or []

1963

head_snap_id = head_row.snapshot_id

1964

1965

if not parent_ids:

1966

return [] # initial commit — nothing to diff against

1967

1968

parent_commit_id = parent_ids[0]

1969

1970

# ── 2. Fetch parent snapshot_id ───────────────────────────────────────

1971

parent_result = await session.execute(

1972

sa.select(MusehubCommit.snapshot_id)

1973

.where(MusehubCommit.commit_id == parent_commit_id)

1974

.limit(1)

1975

)

1976

parent_row = parent_result.first()

1977

if parent_row is None or not parent_row.snapshot_id:

1978

return []

1979

1980

parent_snap_id = parent_row.snapshot_id

1981

1982

# ── 3. Load both snapshot manifests ───────────────────────────────────

1983

snap_result = await session.execute(

1984

sa.select(

1985

MusehubSnapshot.snapshot_id,

1986

MusehubSnapshot.manifest_blob,

1987

).where(

1988

MusehubSnapshot.snapshot_id.in_([head_snap_id, parent_snap_id])

1989

)

1990

)

1991

snap_rows = {row.snapshot_id: row.manifest_blob for row in snap_result}

1992

1993

head_blob = snap_rows.get(head_snap_id)

1994

parent_blob = snap_rows.get(parent_snap_id)

1995

if not head_blob or not parent_blob:

1996

return []

1997

1998

head_manifest: StrDict = msgpack.unpackb(bytes(head_blob), raw=False)

1999

parent_manifest: StrDict = msgpack.unpackb(bytes(parent_blob), raw=False)

2000

2001

# ── 4. Parse symbols from both snapshots ──────────────────────────────

2002

backend = get_backend()

2003

head_syms = await self._manifest_symbols(backend, head_manifest)

2004

parent_syms = await self._manifest_symbols(backend, parent_manifest)

2005

2006

# ── 5. Build reverse index: body_hash → address in HEAD ───────────────

2007

head_body_to_addr: dict[str, str] = {

2008

v["body_hash"]: addr

2009

for addr, v in head_syms.items()

if v["body_hash"]

}

# ── 6. Diff ────────────────────────────────────────────────────────────

2014

events: list[dict] = []

2015

2016

# 6a. Symbols present in HEAD — check for impl/sig changes

2017

for addr, hdata in head_syms.items():

2018

if addr not in parent_syms:

2019

continue # new symbol — not a refactoring event

2020

pdata = parent_syms[addr]

2021

if hdata["body_hash"] != pdata["body_hash"]:

2022

events.append({

2023

"kind": "implementation",

2024

"address": addr,

2025

"detail": f"body rewritten ({hdata['kind']})",

2026

})

2027

elif hdata["signature_id"] != pdata["signature_id"]:

events.append({

"kind": "signature",

"address": addr,

"detail": f"signature changed ({hdata['kind']})",

2032

})

2033

2034

# 6b. Symbols in parent but absent in HEAD — check for move/rename

2035

for addr, pdata in parent_syms.items():

2036

if addr in head_syms:

2037

continue # still present — handled above

2038

matched_addr = head_body_to_addr.get(pdata["body_hash"])

2039

if matched_addr is None:

2040

continue # deleted — not a refactoring event

2041

p_file = pdata["file"]

2042

m_file = head_syms[matched_addr]["file"]

if m_file != p_file:

kind = "move"

else:

kind = "rename"

events.append({

"kind": kind,

"address": addr,

"detail": matched_addr, # destination address

})

if not events:

return [("intel.code.detect_refactor", {"count": 0})]

2055

2056

# ── 7. Upsert events in chunks ─────────────────────────────────────────

2057

for i in range(0, len(events), self._CHUNK):

2058

chunk = events[i : i + self._CHUNK]

2059

values: list[dict] = []

2060

for ev in chunk:

2061

event_id = blob_id(

2062

f"{repo_id}:{head_commit_id}:{ev['address']}:{ev['kind']}".encode()

2063

)

2064

values.append({

2065

"event_id": event_id,

2066

"repo_id": repo_id,

2067

"kind": ev["kind"],

2068

"address": ev["address"],

2069

"detail": ev.get("detail"),

2070

"commit_id": head_commit_id,

2071

"commit_message": head_message,

2072

"committed_at": head_committed,

2073

})

2074

stmt = (

2075

pg_insert(MusehubIntelRefactorEvent)

2076

.values(values)

2077

.on_conflict_do_update(

2078

index_elements=["event_id"],

2079

set_={

2080

"kind": sa.literal_column("excluded.kind"),

2081

"address": sa.literal_column("excluded.address"),

2082

"detail": sa.literal_column("excluded.detail"),

2083

"commit_id": sa.literal_column("excluded.commit_id"),

2084

"commit_message": sa.literal_column("excluded.commit_message"),

2085

"committed_at": sa.literal_column("excluded.committed_at"),

},

)

)

await session.execute(stmt)

2090

2091

return [("intel.code.detect_refactor", {"count": len(events)})]

2092

2093

_GRAVITY_TRACKED_KINDS = frozenset(

2094

{"function", "async_function", "method", "async_method", "class"}

2095

)

2096

2097

# ---------------------------------------------------------------------------

2098

# BreakageProvider — stale-import detection

2099

# ---------------------------------------------------------------------------

2100

2101

class BreakageProvider:

2102

"""Detect stale imports in the HEAD snapshot at push time.

Algorithm

---------

A stale import is an import statement whose symbol cannot be found anywhere

2107

in the repo — neither as a non-import symbol in any tracked file, nor as a

2108

resolvable module path.

2109

2110

Pass 1 — build ``known_symbol_names``:

2111

Walk every file in the manifest via ``parse_symbols()``. Collect the

2112

``name`` of every symbol whose ``kind`` is NOT ``"import"``.

2113

2114

Pass 2 — find unresolved imports:

2115

For every import record (``kind == "import"``) in every file:

2116

2117

* Extract ``(module_dotted, symbol_name)`` from ``qualified_name``

2118

(format: ``"import::<dotted.module>::<symbol_name>"``).

2119

* Try to resolve the dotted module path to a tracked file (same logic

2120

as ``CodemapProvider._resolve_import``).

2121

* If the module resolves → not stale (the target file exists; whether

2122

the symbol is exported cannot be checked cheaply).

2123

* If the module does NOT resolve AND ``symbol_name`` is not in

2124

``known_symbol_names`` → stale import.

2125

2126

Each issue is upserted on a stable ``issue_id`` =

2127

``blob_id(repo_id:file_path:symbol_name:stale_import)`` so re-runs on the

2128

same snapshot are idempotent.

2129

2130

A ``musehub_intel_breakage_meta`` row is upserted after every run with

2131

aggregate counts used by the stat chips.

2132

2133

No subprocess is ever spawned.

"""

_CHUNK = 1_000

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

2140

) -> IntelResults:

2141

"""Detect stale imports and persist issues + meta for the HEAD snapshot."""

2142

import msgpack

2143

2144

# ── 1. Resolve owner/slug ─────────────────────────────────────────────

2145

owner: str | None = payload.get("owner") # type: ignore[assignment]

2146

slug: str | None = payload.get("slug") # type: ignore[assignment]

2147

if not (owner and slug):

2148

row = await session.execute(

2149

select(MusehubRepo.owner, MusehubRepo.slug)

2150

.where(MusehubRepo.repo_id == repo_id)

.limit(1)

)

result = row.first()

if result is None:

return []

owner, slug = result

# ── 2. HEAD commit → snapshot manifest ───────────────────────────────

2159

commit_row = await session.execute(

2160

select(MusehubCommit.snapshot_id)

2161

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

2162

.where(MusehubCommitRef.repo_id == repo_id)

2163

.order_by(MusehubCommit.timestamp.desc())

2164

.limit(1)

2165

)

2166

commit_result = commit_row.first()

2167

if commit_result is None or commit_result[0] is None:

2168

return [("intel.code.breakage", {"count": 0})]

2169

snapshot_id: str = commit_result[0]

2170

2171

snap_row = await session.execute(

2172

select(MusehubSnapshot.manifest_blob)

2173

.where(MusehubSnapshot.snapshot_id == snapshot_id)

2174

.limit(1)

2175

)

2176

snap_result = snap_row.first()

2177

if snap_result is None or snap_result[0] is None:

2178

return [("intel.code.breakage", {"count": 0})]

2179

2180

manifest: dict[str, str] = msgpack.unpackb(bytes(snap_result[0]), raw=False)

2181

if not manifest:

2182

return [("intel.code.breakage", {"count": 0})]

2183

2184

# ── 3. Pass 1 — build known symbol names ─────────────────────────────

2185

backend = get_backend()

2186

manifest_paths: set[str] = set(manifest.keys())

2187

known_symbol_names: set[str] = set()

2188

# file_path → list of (symbol_name, import_rec) for pass 2

2189

import_records: list[tuple[str, str, dict]] = [] # (file_path, address, rec)

2190

2191

for file_path, object_id in manifest.items():

2192

src = await backend.get(object_id)

if not src:

continue

try:

tree = parse_symbols(src, file_path)

2197

except Exception:

2198

continue

2199

for address, rec in tree.items():

2200

kind = rec.get("kind", "")

2201

if kind == "import":

2202

import_records.append((file_path, address, rec))

2203

else:

2204

name = rec.get("name", "")

2205

if name:

2206

known_symbol_names.add(name)

2207

2208

# ── 4. Pass 2 — detect stale imports ─────────────────────────────────

2209

# Use a dict keyed on issue_id to deduplicate: the same symbol may be

2210

# imported from multiple modules in the same file, producing the same

2211

# stable issue_id. Last write wins; all duplicates are semantically

2212

# identical so order doesn't matter.

2213

issues_map: dict[str, dict] = {}

2214

for file_path, _address, rec in import_records:

2215

qn: str = rec.get("qualified_name", "")

2216

parts = qn.split("::")

2217

if len(parts) < 3:

2218

continue

2219

module_dotted = parts[1]

2220

symbol_name = parts[2]

if not symbol_name:

continue

# If the module resolves to a tracked file → not stale

2225

resolved = CodemapProvider._resolve_import(qn, manifest_paths)

if resolved:

continue

# If the symbol name exists anywhere → not stale

2230

if symbol_name in known_symbol_names:

2231

continue

2232

2233

iid = blob_id(f"{repo_id}:{file_path}:{symbol_name}:stale_import".encode())

issues_map[iid] = {

"issue_id": iid,

"repo_id": repo_id,

"file_path": file_path,

2238

"issue_type": "stale_import",

2239

"description": (

2240

f"imports '{symbol_name}' but no symbol or module with that "

2241

f"name exists in the HEAD snapshot"

2242

),

2243

"severity": "warning",

"ref": ref,

}

issues = list(issues_map.values())

2248

2249

# ── 5. Upsert issues in chunks ────────────────────────────────────────

2250

for i in range(0, len(issues), self._CHUNK):

2251

chunk = issues[i : i + self._CHUNK]

2252

stmt = (

2253

pg_insert(MusehubIntelBreakageIssue)

2254

.values(chunk)

2255

.on_conflict_do_update(

2256

index_elements=["issue_id"],

2257

set_={

2258

"file_path": sa.literal_column("excluded.file_path"),

2259

"issue_type": sa.literal_column("excluded.issue_type"),

2260

"description": sa.literal_column("excluded.description"),

2261

"severity": sa.literal_column("excluded.severity"),

2262

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(stmt)

2267

2268

# ── 6. Upsert meta row ────────────────────────────────────────────────

2269

warning_count = sum(1 for issue in issues if issue["severity"] == "warning")

2270

error_count = sum(1 for issue in issues if issue["severity"] == "error")

2271

file_count = len({issue["file_path"] for issue in issues})

2272

2273

meta_stmt = (

2274

pg_insert(MusehubIntelBreakageMeta)

2275

.values(

2276

repo_id = repo_id,

2277

total_issues = len(issues),

2278

warning_count= warning_count,

2279

error_count = error_count,

2280

file_count = file_count,

2281

ref = ref,

2282

)

2283

.on_conflict_do_update(

2284

index_elements=["repo_id"],

2285

set_={

2286

"total_issues": sa.literal_column("excluded.total_issues"),

2287

"warning_count": sa.literal_column("excluded.warning_count"),

2288

"error_count": sa.literal_column("excluded.error_count"),

2289

"file_count": sa.literal_column("excluded.file_count"),

2290

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(meta_stmt)

2295

2296

return [("intel.code.breakage", {"count": len(issues)})]

2297

2298

# ---------------------------------------------------------------------------

2299

# CodemapProvider — structural dependency topology (fan_in, fan_out, cycles)

2300

# ---------------------------------------------------------------------------

2301

2302

class CodemapProvider:

2303

"""Persist structural dependency topology derived from stored snapshot objects.

2304

2305

Reads the HEAD snapshot manifest, calls ``parse_symbols()`` per file to

2306

extract import relationships, resolves dotted module names back to tracked

2307

file paths, computes ``fan_in`` / ``fan_out`` per module, runs Tarjan's SCC

2308

for cycle detection, and batch-upserts results into

2309

``musehub_intel_codemap_modules`` and ``musehub_intel_codemap_meta``.

2310

2311

No subprocess is ever spawned. All data flows from objects stored at push

2312

time via ``get_backend().get(object_id)``.

Parameters

----------

session : AsyncSession

2317

Active async SQLAlchemy session provided by the push worker.

2318

repo_id : str

2319

Opaque repo identifier from ``musehub_repos.repo_id``.

2320

ref : str

2321

Branch or commit ref string at push time (e.g. ``"dev"``).

2322

payload : JSONObject

2323

Raw push-event JSON — used to resolve owner/slug for the storage

2324

backend when not already present in the session.

Returns

-------

IntelResults

``[("intel.code.codemap", {"modules": N, "edges": E, "cycles": C})]``

2330

on success. ``[]`` when the snapshot manifest cannot be resolved.

Notes

-----

Import record ``qualified_name`` format from ``parse_symbols()`` is

2335

``"import::<dotted.module.path>::<symbol_name>"``. The middle segment is

2336

used to resolve the import target to a tracked file path by converting

2337

``dotted.module.path`` → ``dotted/module/path.py`` (or ``__init__.py``).

2338

2339

Fan-in is computed in a post-processing pass after the full manifest is

2340

walked: for each resolved import edge ``A → B``, ``fan_in[B] += 1``.

2341

Stdlib and third-party imports that cannot be resolved to a tracked file

2342

path are silently skipped — they inflate ``fan_out`` only when the

2343

resolved file is actually in the manifest.

2344

2345

Tarjan's algorithm runs in O(V + E) on the resolved import graph; for a

2346

typical repo of 300 files and 1,800 edges this completes in < 10 ms.

"""

_CHUNK = 1_000

# ------------------------------------------------------------------

2352

# Public entry point

2353

# ------------------------------------------------------------------

2354

2355

async def compute(

2356

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

) -> IntelResults:

import msgpack

# ── 1. Resolve owner/slug ─────────────────────────────────────────────

2361

owner: str | None = payload.get("owner") # type: ignore[assignment]

2362

slug: str | None = payload.get("slug") # type: ignore[assignment]

2363

if not (owner and slug):

2364

row = await session.execute(

2365

select(MusehubRepo.owner, MusehubRepo.slug)

2366

.where(MusehubRepo.repo_id == repo_id)

.limit(1)

)

result = row.first()

if result is None:

return []

owner, slug = result

# ── 2. HEAD commit → snapshot manifest ───────────────────────────────

2375

commit_row = await session.execute(

2376

select(MusehubCommit.snapshot_id)

2377

.join(MusehubCommitRef, MusehubCommitRef.commit_id == MusehubCommit.commit_id)

2378

.where(MusehubCommitRef.repo_id == repo_id)

2379

.order_by(MusehubCommit.timestamp.desc())

2380

.limit(1)

2381

)

2382

commit_result = commit_row.first()

2383

if commit_result is None or commit_result[0] is None:

2384

return []

2385

snapshot_id: str = commit_result[0]

2386

2387

snap_row = await session.execute(

2388

select(MusehubSnapshot.manifest_blob)

2389

.where(MusehubSnapshot.snapshot_id == snapshot_id)

2390

.limit(1)

2391

)

2392

snap_result = snap_row.first()

2393

if snap_result is None or snap_result[0] is None:

2394

return []

2395

2396

manifest: dict[str, str] = msgpack.unpackb(bytes(snap_result[0]), raw=False)

if not manifest:

return []

# ── 3. Build a set of tracked paths for import resolution ─────────────

2401

manifest_paths: set[str] = set(manifest.keys())

2402

2403

# ── 4. Walk manifest: fetch bytes, parse symbols, accumulate edges ────

2404

backend = get_backend()

2405

sym_counts: IntDict = {} # file_path → non-import symbol count

2406

fan_out: IntDict = {} # file_path → resolved import edges

2407

fan_in: IntDict = {} # file_path → times imported by others

2408

edges: list[tuple[str, str]] = [] # (importer, importee) resolved pairs

2409

2410

_non_import_kinds = frozenset(

2411

{"function", "async_function", "method", "async_method", "class"}

2412

)

2413

2414

for file_path, object_id in manifest.items():

2415

src = await backend.get(object_id)

2416

if not src:

2417

fan_out[file_path] = 0

2418

sym_counts[file_path] = 0

2419

continue

2420

try:

2421

tree = parse_symbols(src, file_path)

2422

except Exception:

2423

fan_out[file_path] = 0

2424

sym_counts[file_path] = 0

2425

continue

2426

2427

sym_counts[file_path] = sum(

2428

1 for rec in tree.values()

2429

if rec.get("kind") in _non_import_kinds

2430

)

2431

2432

resolved_targets: set[str] = set()

2433

for rec in tree.values():

2434

if rec.get("kind") != "import":

2435

continue

2436

qn: str = rec.get("qualified_name", "")

2437

target = self._resolve_import(qn, manifest_paths)

2438

if target and target != file_path:

2439

resolved_targets.add(target)

2440

2441

fan_out[file_path] = len(resolved_targets)

2442

for target in resolved_targets:

2443

edges.append((file_path, target))

2444

fan_in[target] = fan_in.get(target, 0) + 1

2445

2446

# ── 5. Detect import cycles via Tarjan's SCC ──────────────────────────

2447

graph: dict[str, list[str]] = {f: [] for f in manifest_paths}

2448

for importer, importee in edges:

2449

graph[importer].append(importee)

2450

2451

cycles = self._tarjan_cycles(graph)

2452

cycle_paths = [[*cycle, cycle[0]] for cycle in cycles] # closed paths

2453

2454

# ── 6. Build upsert rows ──────────────────────────────────────────────

rows = [

{

"repo_id": repo_id,

"file_path": fp,

"symbol_count": sym_counts.get(fp, 0),

2460

"fan_in": fan_in.get(fp, 0),

2461

"fan_out": fan_out.get(fp, 0),

2462

"language": language_of(fp),

2463

"ref": ref,

2464

}

2465

for fp in manifest_paths

2466

]

2467

2468

# ── 7. Batch-upsert module rows ───────────────────────────────────────

2469

for i in range(0, len(rows), self._CHUNK):

2470

chunk = rows[i : i + self._CHUNK]

2471

stmt = (

2472

pg_insert(MusehubIntelCodemapModule)

2473

.values(chunk)

2474

.on_conflict_do_update(

2475

index_elements=["repo_id", "file_path"],

2476

set_={

2477

"symbol_count": sa.literal_column("excluded.symbol_count"),

2478

"fan_in": sa.literal_column("excluded.fan_in"),

2479

"fan_out": sa.literal_column("excluded.fan_out"),

2480

"language": sa.literal_column("excluded.language"),

2481

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(stmt)

2486

2487

# ── 8. Upsert meta row ────────────────────────────────────────────────

2488

meta_stmt = (

2489

pg_insert(MusehubIntelCodemapMeta)

2490

.values(

2491

repo_id=repo_id,

2492

total_modules=len(rows),

2493

total_edges=len(edges),

2494

cycle_count=len(cycles),

2495

cycles_json=cycle_paths or None,

2496

ref=ref,

2497

)

2498

.on_conflict_do_update(

2499

index_elements=["repo_id"],

2500

set_={

2501

"total_modules": sa.literal_column("excluded.total_modules"),

2502

"total_edges": sa.literal_column("excluded.total_edges"),

2503

"cycle_count": sa.literal_column("excluded.cycle_count"),

2504

"cycles_json": sa.literal_column("excluded.cycles_json"),

2505

"ref": sa.literal_column("excluded.ref"),

},

)

)

await session.execute(meta_stmt)

2510

2511

return [("intel.code.codemap", {"modules": len(rows), "edges": len(edges), "cycles": len(cycles)})]

2512

2513

# ------------------------------------------------------------------

2514

# Import resolution helpers

2515

# ------------------------------------------------------------------

2516

2517

@staticmethod

2518

def _resolve_import(qualified_name: str, manifest_paths: set[str]) -> str | None:

2519

"""Resolve a ``parse_symbols`` import record to a tracked file path.

2520

2521

The ``qualified_name`` format is ``"import::<dotted.module>::<symbol>"``.

2522

This method extracts the dotted module segment and tries both

2523

``dotted/module.py`` and ``dotted/module/__init__.py`` against the

tracked path set.

Parameters

----------

qualified_name : str

Full qualified name from the import record.

2530

manifest_paths : set[str]

2531

Set of all tracked file paths from the snapshot manifest.

Returns

-------

str | None

Resolved repo-relative path, or ``None`` if not in manifest.

2537

"""

2538

parts = qualified_name.split("::")

if len(parts) < 2:

return None

dotted = parts[1]

if not dotted:

return None

as_path = dotted.replace(".", "/")

2545

candidate_py = f"{as_path}.py"

2546

candidate_init = f"{as_path}/__init__.py"

2547

if candidate_py in manifest_paths:

2548

return candidate_py

2549

if candidate_init in manifest_paths:

2550

return candidate_init

2551

return None

2552

2553

# ------------------------------------------------------------------

2554

# Tarjan's SCC — O(V + E) cycle detection

2555

# ------------------------------------------------------------------

2556

2557

@staticmethod

2558

def _tarjan_cycles(graph: GraphMap) -> list[list[str]]:

2559

"""Return all strongly-connected components of size ≥ 2 (import cycles).

2560

2561

Uses Tarjan's iterative SCC algorithm to avoid recursion limit issues

2562

on large graphs. Runs in O(V + E) time.

Parameters

----------

graph : dict[str, list[str]]

2567

Adjacency list: ``{file_path: [imported_file_path, ...]}``.

Returns

-------

list[list[str]]

Each inner list is one SCC (cycle) with ≥ 2 nodes. Self-loops

2573

(a file that imports itself) are excluded.

2574

"""

2575

index_counter = [0]

2576

stack: list[str] = []

2577

lowlinks: dict[str, int] = {}

2578

index: dict[str, int] = {}

2579

on_stack: dict[str, bool] = {}

2580

sccs: list[list[str]] = []

2581

2582

def strongconnect(v: str) -> None:

2583

index[v] = index_counter[0]

2584

lowlinks[v] = index_counter[0]

2585

index_counter[0] += 1

stack.append(v)

on_stack[v] = True

for w in graph.get(v, []):

2590

if w not in index:

2591

strongconnect(w)

2592

lowlinks[v] = min(lowlinks[v], lowlinks[w])

2593

elif on_stack.get(w, False):

2594

lowlinks[v] = min(lowlinks[v], index[w])

2595

2596

if lowlinks[v] == index[v]:

scc: list[str] = []

while True:

w = stack.pop()

on_stack[w] = False

scc.append(w)

if w == v:

break

if len(scc) >= 2:

sccs.append(scc)

for v in graph:

if v not in index:

strongconnect(v)

return sccs

class GravityProvider:

2614

"""Derive gravity scores from existing blast columns in musehub_symbol_intel.

2615

2616

Formula mirrors muse/muse/cli/commands/gravity.py exactly:

2617

2618

total = count of tracked-kind symbols for this repo

2619

denom = max(1, total - 1)

2620

gravity_pct = round(blast / denom * 100, 1)

2621

2622

Column mapping:

2623

gravity_direct_dependents ← blast_direct

2624

gravity_transitive_dependents ← blast (blast_direct + blast_cross)

2625

gravity_pct ← derived formula above

2626

gravity_max_depth — not derivable; left NULL

2627

gravity_depth_distribution — not derivable; left NULL

2628

2629

No muse CLI call — pure SQL UPDATE on existing rows.

"""

async def compute(

self, session: AsyncSession, repo_id: str, ref: str, payload: JSONObject

2634

) -> IntelResults:

2635

import sqlalchemy as sa

2636

from musehub.services.musehub_symbol_indexer import backfill_symbol_kinds

2637

2638

# One-time backfill: populate symbol_kind for rows written before kind

2639

# extraction was added to the symbol indexer.

2640

await backfill_symbol_kinds(session, repo_id)

2641

2642

# Count tracked-kind production symbols — the denominator.

2643

total_result = await session.execute(

2644

sa.select(sa.func.count())

2645

.select_from(MusehubSymbolIntel)

2646

.where(

2647

MusehubSymbolIntel.repo_id == repo_id,

2648

MusehubSymbolIntel.symbol_kind.in_(_GRAVITY_TRACKED_KINDS),

2649

)

2650

)

2651

total: int = total_result.scalar_one()

if total == 0:

return []

denom = max(1, total - 1)

2656

2657

# Fetch all tracked rows to update.

2658

rows_result = await session.execute(

2659

sa.select(MusehubSymbolIntel)

2660

.where(

2661

MusehubSymbolIntel.repo_id == repo_id,

2662

MusehubSymbolIntel.symbol_kind.in_(_GRAVITY_TRACKED_KINDS),

2663

)

2664

)

2665

rows = rows_result.scalars().all()

2666

2667

for row in rows:

2668

blast: int = row.blast or 0

2669

row.gravity_pct = round(blast / denom * 100, 1)

2670

row.gravity_direct_dependents = row.blast_direct

2671

row.gravity_transitive_dependents = blast

2672

# gravity_max_depth and gravity_depth_distribution not derivable

2673

await session.flush()

2674

2675

return [("intel.code.gravity", {"count": len(rows)})]

2676

2677

# ---------------------------------------------------------------------------

2678

# Registry

2679

# ---------------------------------------------------------------------------

2680

2681

# Maps job_type → provider instance.

2682

# Workers dispatch jobs by looking up job.job_type in this dict.

2683

_PROVIDER_REGISTRY: dict[str, IntelProvider] = {

2684

"intel.structural": StructuralProvider(),

2685

"intel.code": CodeProvider(),

2686

"intel.midi": MidiProvider(),

2687

"intel.mist": MistProvider(),

2688

"profile.snapshot": ProfileSnapshotProvider(),

2689

"intel.code.coupling": CouplingProvider(),

2690

"intel.code.entangle": EntangleProvider(),

2691

"intel.code.dead": DeadProvider(),

2692

"intel.code.blast_risk": BlastRiskProvider(),

2693

"intel.code.stable": StableProvider(),

2694

"intel.code.velocity": VelocityProvider(),

2695

"intel.code.clones": ClonesProvider(),

2696

"intel.code.type": TypeProvider(),

2697

"intel.code.api_surface": ApiSurfaceProvider(),

2698

"intel.code.languages": LanguagesProvider(),

2699

"intel.code.detect_refactor": DetectRefactorProvider(),

2700

"intel.code.gravity": GravityProvider(),

2701

"intel.code.codemap": CodemapProvider(),

2702

"intel.code.breakage": BreakageProvider(),

2703

}

2704

2705

def get_provider(job_type: str) -> IntelProvider | None:

2706

"""Return the provider for a given job_type, or None if not registered."""

2707

return _PROVIDER_REGISTRY.get(job_type)

2708

2709

def job_types_for_push(domain_id: str | None) -> list[str]:

2710

"""Return the job types to enqueue after a push for the given domain.

2711

2712

``intel.structural`` is always enqueued — it works for every domain.

2713

Domain-specific jobs are added based on the repo's ``domain_id``.

2714

``gc`` is always enqueued to prune orphaned objects after any push.

2715

"""

2716

types = ["intel.structural", "push.file_last_commits", "mpack.index", "fetch.mpack.prebuild"]

2717

if domain_id is None or "code" in (domain_id or ""):

2718

types.append("intel.code")

2719

types.extend([

2720

"intel.code.coupling",

2721

"intel.code.entangle",

2722

"intel.code.dead",

2723

"intel.code.blast_risk",

2724

"intel.code.stable",

2725

"intel.code.velocity",

2726

"intel.code.clones",

2727

"intel.code.type",

2728

"intel.code.api_surface",

2729

"intel.code.languages",

2730

"intel.code.detect_refactor",

2731

"intel.code.gravity",

2732

"intel.code.codemap",

2733

"intel.code.breakage",

2734

])

2735

elif "midi" in (domain_id or ""):

2736

types.append("intel.midi")

2737

elif "mist" in (domain_id or ""):

2738

types.append("intel.mist")

2739

types.append("gc")

2740

return types