/**
 * Contract tests for the `feat/bridge-embed-hash-cache` wiring inside
 * `hub/bridge/server.mjs POST /api/v1/index`. The handler is too tightly coupled
 * to Netlify Blobs / canister export / live embedding to boot in a Node test, so
 * we lock in the static wiring that PR 1 introduces with source-string asserts:
 *
 *   - imports the right helpers (computeChunkContentHashTagged, partitionChunksForReindex,
 *     runWithConcurrency, parseEmbedConcurrency, parseEmbedBatchSize);
 *   - calls store.getChunkHashes(vaultId) for cache lookup;
 *   - upsert payload includes content_hash;
 *   - response includes chunksSkippedCached + chunksEmbedded (so the UI can show savings);
 *   - timer logs cache_lookup step (post-mortem signal);
 *   - BATCH_EMBED + EMBED_CONCURRENCY are env-tunable, not hard-coded 10.
 *
 * If any of these regress in a future refactor, post-PR1 re-indexes silently fall
 * back to "embed everything" — the exact failure mode this PR exists to eliminate.
 */

import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { dirname, join } from 'node:path';
import test from 'node:test';
import assert from 'node:assert/strict';

const root = join(dirname(fileURLToPath(import.meta.url)), '..');
const bridgeJs = readFileSync(join(root, 'hub/bridge/server.mjs'), 'utf8');

test('bridge imports content-hash + parallel-pool + partition helpers', () => {
  assert.match(
    bridgeJs,
    /from\s+['"]\.\.\/\.\.\/lib\/chunk-content-hash\.mjs['"]/,
    'must import from lib/chunk-content-hash.mjs',
  );
  assert.match(
    bridgeJs,
    /from\s+['"]\.\.\/\.\.\/lib\/parallel-embed-pool\.mjs['"]/,
    'must import from lib/parallel-embed-pool.mjs',
  );
  assert.match(
    bridgeJs,
    /from\s+['"]\.\.\/\.\.\/lib\/index-partition\.mjs['"]/,
    'must import from lib/index-partition.mjs',
  );
  assert.match(bridgeJs, /\bcomputeChunkContentHashTagged\b/);
  assert.match(bridgeJs, /\brunWithConcurrency\b/);
  assert.match(bridgeJs, /\bpartitionChunksForReindex\b/);
});

test('BATCH_EMBED + EMBED_CONCURRENCY are derived from env (not hard-coded 10)', () => {
  assert.match(
    bridgeJs,
    /parseEmbedBatchSize\(\s*process\.env\.INDEXER_EMBED_BATCH_SIZE\s*\)/,
    'INDEXER_EMBED_BATCH_SIZE must drive the batch size',
  );
  assert.match(
    bridgeJs,
    /parseEmbedConcurrency\(\s*process\.env\.INDEXER_EMBED_CONCURRENCY\s*\)/,
    'INDEXER_EMBED_CONCURRENCY must drive parallelism',
  );
  // The legacy serial-loop constant `const BATCH_EMBED = 10` is gone.
  assert.doesNotMatch(
    bridgeJs,
    /^\s*const\s+BATCH_EMBED\s*=\s*10\s*;/m,
    'BATCH_EMBED = 10 is the pre-PR sequential-loop default; must not return',
  );
});

test('handler queries store.getChunkHashes(vaultId) for cache lookup', () => {
  assert.match(
    bridgeJs,
    /store\.getChunkHashes\s*\(\s*vaultId\s*\)/,
    'index handler must call store.getChunkHashes(vaultId) to populate the cache',
  );
});

test('upsert payload includes content_hash so future runs can hit the cache', () => {
  // Look in a window around the upsert payload object literal.
  const upsertWindow = bridgeJs.match(
    /points\s*=\s*slice\.map\([\s\S]{0,1500}?content_hash:\s*item\.contentHash/,
  );
  assert.ok(
    upsertWindow,
    'upsert payload must include content_hash: item.contentHash for the cache to populate',
  );
});

test('response + timer surface chunksSkippedCached and chunksEmbedded', () => {
  assert.match(
    bridgeJs,
    /chunksSkippedCached:\s*chunks_skipped_cached/,
    'response JSON must expose chunksSkippedCached',
  );
  assert.match(
    bridgeJs,
    /chunksEmbedded:\s*toEmbed\.length/,
    'response JSON must expose chunksEmbedded',
  );
  assert.match(
    bridgeJs,
    /timer\.step\(['"]cache_lookup['"]/,
    'timer must emit cache_lookup step (post-mortem signal for cache hit rate)',
  );
});

test('parallel embed loop uses runWithConcurrency, not a serial for-of/await', () => {
  assert.match(
    bridgeJs,
    /runWithConcurrency\(\s*embedBatches\.map/,
    'embed step must call runWithConcurrency with embedBatches.map of thunks',
  );
});

test('getBridgeStoreConfig sets allow_dimension_migration: true so a provider switch unblocks the indexer', () => {
  // Without this, an OpenAI(1536) → DeepInfra(1024) switch leaves a 1536-dim table on disk and
  // every subsequent `POST /api/v1/index` aborts with "Vector store dimension mismatch" before
  // the new content-hash migration logic can run. The bridge owns the data lifecycle (downloads
  // from blob → re-indexes → uploads to blob), so an automatic drop+recreate is the only correct
  // resolution. CLI keeps the throw — see `lib/vector-store-sqlite.mjs:ensureCollection`.
  assert.match(
    bridgeJs,
    /function getBridgeStoreConfig[\s\S]{0,1500}?allow_dimension_migration:\s*true/,
    'getBridgeStoreConfig must set allow_dimension_migration: true',
  );
});

test('chunk content-hash is bound to the active embedding provider+model', () => {
  // The hash call site MUST pass the embedding config. Without it,
  // `computeChunkContentHashTagged` throws (loud caller bug) — but the test asserts the
  // intentional wiring so a future refactor cannot silently drop the second arg and re-introduce
  // the same-dimension model-swap silent corruption (BGE-large 1024 → BGE-m3 1024 etc.).
  assert.match(
    bridgeJs,
    /computeChunkContentHashTagged\(\s*chunk\s*,\s*embeddingConfigForHash\s*\)/,
    'bridge must call computeChunkContentHashTagged(chunk, embeddingConfigForHash) so a model swap invalidates the cache',
  );
});