indexer-chunk-options.mjs file-level

at sha256:3 · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:9 feat(calendar): hosted bridge/gateway route parity and timeline noteRec… · aaronrene · Jun 19, 2026
1 /**
2 * Hosted indexer chunk sizing policy.
3 *
4 * The bridge indexer uses character chunks before sending text to embedding
5 * providers. DeepInfra's commonly used BGE embedding models reject requests
6 * above a 512-token context window, so the hosted default must keep a real
7 * safety margin instead of relying on the old 2048 chars ~= 512 tokens rule.
8 */
9
/** Chunk size, in characters, used when the provider needs no tighter limit. */
const DEFAULT_CHUNK_SIZE = 2048;

/** Chunk overlap, in characters, that accompanies the default chunk size. */
const DEFAULT_CHUNK_OVERLAP = 256;

/** Chunk size, in characters, applied when the embedding provider is DeepInfra. */
const DEEPINFRA_SAFE_CHUNK_SIZE = 1024;

/** Chunk overlap, in characters, applied when the embedding provider is DeepInfra. */
const DEEPINFRA_SAFE_CHUNK_OVERLAP = 128;
14
/**
 * Convert a loosely typed setting (typically an environment variable) into a
 * positive integer.
 *
 * Strings are converted with `Number()` instead of `Number.parseInt()` so the
 * entire value has to be numeric: `'1e3'` yields 1000 rather than 1, and
 * `'12abc'` is rejected rather than silently truncated to 12. As a side
 * effect, `0x`/`0o`/`0b` prefixed literals are also understood.
 *
 * @param {unknown} raw - Number, numeric string, or an unset value.
 * @returns {number|null} The value rounded down to an integer, or `null` when
 *   `raw` is unset, empty, non-numeric, non-finite, or below 1.
 */
function parsePositiveInteger(raw) {
  if (raw == null || raw === '') return null;
  // Whitespace-only strings trim to '' which Number() maps to 0, so they are
  // rejected by the `< 1` check below.
  const value = typeof raw === 'number' ? raw : Number(String(raw).trim());
  if (!Number.isFinite(value) || value < 1) return null;
  return Math.floor(value);
}
25
/**
 * Pick the default embedding model name for a provider.
 *
 * The provider is compared after trimming and lower-casing. A missing or
 * unrecognised provider receives the same model as `'ollama'`.
 *
 * @param {string|null|undefined} provider
 * @returns {string}
 */
export function defaultBridgeEmbeddingModelForProvider(provider) {
  const normalized = String(provider || 'ollama').trim().toLowerCase();
  switch (normalized) {
    case 'openai':
      return 'text-embedding-3-small';
    case 'voyage':
      return 'voyage-4-lite';
    case 'deepinfra':
      return 'BAAI/bge-large-en-v1.5';
    default:
      return 'nomic-embed-text';
  }
}
37
/**
 * Largest chunk size, in characters, considered safe for an embedding
 * configuration. Only the provider is inspected; `model` is accepted but not
 * read here.
 *
 * @param {{ provider?: string|null, model?: string|null }} embeddingConfig
 * @returns {number} `DEEPINFRA_SAFE_CHUNK_SIZE` for the `deepinfra` provider
 *   (matched after trimming and lower-casing), otherwise `DEFAULT_CHUNK_SIZE`.
 */
export function safeIndexerChunkSizeForEmbedding(embeddingConfig = {}) {
  const providerName = String(embeddingConfig?.provider || '')
    .trim()
    .toLowerCase();
  return providerName === 'deepinfra' ? DEEPINFRA_SAFE_CHUNK_SIZE : DEFAULT_CHUNK_SIZE;
}
47
/**
 * Resolve the chunk size and overlap the indexer should use.
 *
 * Values come from `INDEXER_CHUNK_SIZE` / `INDEXER_CHUNK_OVERLAP` when they
 * parse as usable numbers, otherwise from provider-specific defaults. For the
 * `deepinfra` provider both values are capped at the DeepInfra-safe limits, so
 * an environment override can only shrink them. The overlap is always kept
 * strictly below the chunk size.
 *
 * An explicit overlap of `0` is honoured (it disables overlap) instead of
 * being treated as "unset" and replaced by the default.
 *
 * @param {{
 *   INDEXER_CHUNK_SIZE?: string|number|null,
 *   INDEXER_CHUNK_OVERLAP?: string|number|null,
 * }} env - Environment-like settings; `null`/`undefined` behaves like `{}`.
 * @param {{ provider?: string|null, model?: string|null }} embeddingConfig
 * @returns {{ chunkSize: number, chunkOverlap: number }}
 */
export function resolveIndexerChunkOptions(env = {}, embeddingConfig = {}) {
  const provider = String(embeddingConfig?.provider || '').trim().toLowerCase();
  const isDeepInfra = provider === 'deepinfra';

  const safeChunkSize = safeIndexerChunkSizeForEmbedding(embeddingConfig);
  const requestedChunkSize = parsePositiveInteger(env?.INDEXER_CHUNK_SIZE) ?? safeChunkSize;
  // DeepInfra requests must never exceed the safe size, whatever the override says.
  const chunkSize = isDeepInfra
    ? Math.min(requestedChunkSize, safeChunkSize)
    : requestedChunkSize;

  const overlapSetting = env?.INDEXER_CHUNK_OVERLAP;
  // parsePositiveInteger() rejects 0, so recognise an explicit zero first.
  const requestedOverlap = isExplicitZeroSetting(overlapSetting)
    ? 0
    : parsePositiveInteger(overlapSetting);
  const defaultOverlap = isDeepInfra ? DEEPINFRA_SAFE_CHUNK_OVERLAP : DEFAULT_CHUNK_OVERLAP;

  // Overlap must stay below the chunk size; DeepInfra additionally caps it.
  const largestUsableOverlap = Math.max(0, chunkSize - 1);
  const maxOverlap = isDeepInfra
    ? Math.min(largestUsableOverlap, DEEPINFRA_SAFE_CHUNK_OVERLAP)
    : largestUsableOverlap;
  const chunkOverlap = Math.min(requestedOverlap ?? defaultOverlap, maxOverlap);

  return { chunkSize, chunkOverlap };
}

/**
 * Report whether a setting was explicitly given as the number zero
 * (`0`, `'0'`, `' 0 '`, `'0.0'`), as opposed to being unset or non-numeric.
 *
 * @param {unknown} raw
 * @returns {boolean}
 */
function isExplicitZeroSetting(raw) {
  if (raw == null) return false;
  const text = String(raw).trim();
  // Number('') is 0, so an empty value must be excluded explicitly.
  return text !== '' && Number(text) === 0;
}
75
76 export {
77 DEFAULT_CHUNK_SIZE,
78 DEFAULT_CHUNK_OVERLAP,
79 DEEPINFRA_SAFE_CHUNK_SIZE,
80 DEEPINFRA_SAFE_CHUNK_OVERLAP,
81 };