indexer-chunk-options.test.mjs file-level

at sha256:3 · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:4 fix(security): pin patched transitive deps to clear Dependabot moderate… · aaronrene · Jun 11, 2026
1 /**
2 * Hosted indexer chunk option tests.
3 *
4 * DeepInfra BGE embedding models enforce a 512-token context window. These
5 * tests keep the bridge defaults below that provider limit before any network
6 * embed request can fail with an over-context error.
7 */
8 import { describe, it } from 'node:test';
9 import assert from 'node:assert/strict';
10
11 import {
12 DEEPINFRA_SAFE_CHUNK_OVERLAP,
13 DEEPINFRA_SAFE_CHUNK_SIZE,
14 DEFAULT_CHUNK_OVERLAP,
15 DEFAULT_CHUNK_SIZE,
16 defaultBridgeEmbeddingModelForProvider,
17 resolveIndexerChunkOptions,
18 } from '../lib/indexer-chunk-options.mjs';
19 import { splitByHeadingOrSize } from '../lib/chunk.mjs';
20
describe('indexer chunk options', () => {
  /**
   * Builds the DeepInfra embedding target used by most cases below.
   * A new object is returned on every call so tests never share an instance.
   *
   * @returns {{ provider: string, model: string }}
   */
  const deepInfraTarget = () => ({ provider: 'deepinfra', model: 'BAAI/bge-large-en-v1.5' });

  it('unit: defaults DeepInfra bridge embeddings to a real DeepInfra model', () => {
    // Provider -> expected default bridge embedding model, checked in order.
    const expectedModels = [
      ['deepinfra', 'BAAI/bge-large-en-v1.5'],
      ['openai', 'text-embedding-3-small'],
      ['voyage', 'voyage-4-lite'],
      ['ollama', 'nomic-embed-text'],
    ];

    for (const [provider, model] of expectedModels) {
      assert.strictEqual(defaultBridgeEmbeddingModelForProvider(provider), model);
    }
  });

  it('integration: clamps hosted DeepInfra chunks below the 512-token model limit margin', () => {
    // Env asks for more than the DeepInfra-safe constants; both values must be clamped.
    const oversizedEnv = { INDEXER_CHUNK_SIZE: '2048', INDEXER_CHUNK_OVERLAP: '256' };
    const { chunkSize, chunkOverlap } = resolveIndexerChunkOptions(oversizedEnv, deepInfraTarget());

    assert.strictEqual(chunkSize, DEEPINFRA_SAFE_CHUNK_SIZE);
    assert.strictEqual(chunkOverlap, DEEPINFRA_SAFE_CHUNK_OVERLAP);
  });

  it('end-to-end: DeepInfra-sized chunking splits a note that old defaults sent as one 2048-char chunk', () => {
    const text = 'a '.repeat(1024).trim();

    // Chunk once with the legacy defaults...
    const legacyOptions = { chunkSize: DEFAULT_CHUNK_SIZE, chunkOverlap: DEFAULT_CHUNK_OVERLAP };
    const oldChunks = splitByHeadingOrSize(text, legacyOptions);

    // ...and once with whatever the resolver yields for DeepInfra with an empty env.
    const safeOptions = resolveIndexerChunkOptions({}, deepInfraTarget());
    const safeChunks = splitByHeadingOrSize(text, safeOptions);

    const fitsSafeSize = (chunk) => chunk.length <= DEEPINFRA_SAFE_CHUNK_SIZE;

    assert.strictEqual(oldChunks.length, 1);
    assert.strictEqual(safeChunks.length > 1, true);
    assert.strictEqual(safeChunks.every(fitsSafeSize), true);
  });

  it('stress: invalid env values fall back to bounded provider defaults', () => {
    // A non-numeric size and a negative overlap should both resolve to the safe constants.
    const invalidEnv = { INDEXER_CHUNK_SIZE: 'not-a-number', INDEXER_CHUNK_OVERLAP: '-5' };
    const resolved = resolveIndexerChunkOptions(invalidEnv, deepInfraTarget());

    const expected = {
      chunkSize: DEEPINFRA_SAFE_CHUNK_SIZE,
      chunkOverlap: DEEPINFRA_SAFE_CHUNK_OVERLAP,
    };
    assert.deepStrictEqual(resolved, expected);
  });

  it('data-integrity: non-DeepInfra providers keep the existing default chunk contract', () => {
    // Empty env + OpenAI provider: the generic defaults are expected unchanged.
    const resolved = resolveIndexerChunkOptions({}, { provider: 'openai' });

    const expected = {
      chunkSize: DEFAULT_CHUNK_SIZE,
      chunkOverlap: DEFAULT_CHUNK_OVERLAP,
    };
    assert.deepStrictEqual(resolved, expected);
  });

  it('performance: explicit smaller DeepInfra chunk sizes are honored', () => {
    // Explicit env values that this test expects to pass through untouched.
    const smallEnv = { INDEXER_CHUNK_SIZE: '768', INDEXER_CHUNK_OVERLAP: '80' };
    const resolved = resolveIndexerChunkOptions(smallEnv, deepInfraTarget());

    assert.deepStrictEqual(resolved, { chunkSize: 768, chunkOverlap: 80 });
  });

  it('security: overlap is capped below chunk size so chunking always advances', () => {
    // Overlap far above the chunk size must come back as chunkSize - 1 (31 for 32).
    const hostileEnv = { INDEXER_CHUNK_SIZE: '32', INDEXER_CHUNK_OVERLAP: '9999' };
    const openAiTarget = { provider: 'openai', model: 'text-embedding-3-small' };
    const resolved = resolveIndexerChunkOptions(hostileEnv, openAiTarget);

    assert.deepStrictEqual(resolved, { chunkSize: 32, chunkOverlap: 31 });
  });
});