indexer-chunk-options.mjs
file-level
1
files
1
commits
0
hotspots
0
🧊 dead
0
💥 blast risk
| 1 | /** |
| 2 | * Hosted indexer chunk sizing policy. |
| 3 | * |
| 4 | * The bridge indexer uses character chunks before sending text to embedding |
| 5 | * providers. DeepInfra's commonly used BGE embedding models reject requests |
| 6 | * above a 512-token context window, so the hosted default must keep a real |
| 7 | * safety margin instead of relying on the old 2048 chars ~= 512 tokens rule. |
| 8 | */ |
| 9 | |
/**
 * Chunk size, in characters, used when the embedding provider has no stricter
 * limit and no override is configured.
 * @type {number}
 */
const DEFAULT_CHUNK_SIZE = 2048;

/**
 * Overlap between chunks used for non-DeepInfra providers when no override is
 * configured.
 * @type {number}
 */
const DEFAULT_CHUNK_OVERLAP = 256;

/**
 * Chunk size, in characters, used for DeepInfra. It is both the default and
 * the upper bound applied to a configured chunk size for that provider.
 * @type {number}
 */
const DEEPINFRA_SAFE_CHUNK_SIZE = 1024;

/**
 * Overlap used for DeepInfra. It is both the default and the upper bound
 * applied to a configured overlap for that provider.
 * @type {number}
 */
const DEEPINFRA_SAFE_CHUNK_OVERLAP = 128;
| 14 | |
/**
 * Coerce a loosely typed setting (a number, or a string such as an environment
 * variable) into a positive integer.
 *
 * Strings are trimmed and converted with `Number()`, so the whole string has to
 * be numeric. Partial parses are rejected: inputs such as `'1,000'` or
 * `'2048abc'` yield `null` instead of being truncated to `1` or `2048`, and
 * `'1e3'` is read as 1000 rather than 1. Fractional values are floored, the
 * same way for numeric and string input.
 *
 * @param {unknown} raw
 * @returns {number|null} The floored value, or `null` when `raw` is nullish,
 *   empty, not a finite number, or less than 1.
 */
function parsePositiveInteger(raw) {
  if (raw == null || raw === '') return null;

  let value = raw;
  if (typeof raw !== 'number') {
    const text = String(raw).trim();
    // Number('') is 0, which is rejected below anyway; bail out early so the
    // "blank means unset" rule is explicit.
    if (text === '') return null;
    value = Number(text);
  }

  if (!Number.isFinite(value) || value < 1) return null;
  return Math.floor(value);
}
| 25 | |
/**
 * Pick the default embedding model name for a bridge embedding provider.
 *
 * The provider name is compared after trimming and lower-casing. A missing or
 * empty provider is treated as `'ollama'`; any provider without a dedicated
 * entry gets the same model as `'ollama'`.
 *
 * @param {string|null|undefined} provider
 * @returns {string}
 */
export function defaultBridgeEmbeddingModelForProvider(provider) {
  const normalized = String(provider || 'ollama').trim().toLowerCase();

  switch (normalized) {
    case 'openai':
      return 'text-embedding-3-small';
    case 'voyage':
      return 'voyage-4-lite';
    case 'deepinfra':
      return 'BAAI/bge-large-en-v1.5';
    default:
      return 'nomic-embed-text';
  }
}
| 37 | |
/**
 * Return the chunk size considered safe for the given embedding configuration.
 *
 * Only the provider is inspected (trimmed, case-insensitive): DeepInfra gets
 * `DEEPINFRA_SAFE_CHUNK_SIZE`, every other or missing provider gets
 * `DEFAULT_CHUNK_SIZE`. The `model` field is not read here.
 *
 * @param {{ provider?: string|null, model?: string|null }} embeddingConfig
 * @returns {number}
 */
export function safeIndexerChunkSizeForEmbedding(embeddingConfig = {}) {
  const rawProvider = embeddingConfig?.provider || '';
  const isDeepInfra = String(rawProvider).trim().toLowerCase() === 'deepinfra';
  return isDeepInfra ? DEEPINFRA_SAFE_CHUNK_SIZE : DEFAULT_CHUNK_SIZE;
}
| 47 | |
/**
 * Resolve the chunk size and overlap the indexer should use.
 *
 * Rules, in order:
 * - `INDEXER_CHUNK_SIZE` is used when it parses as a positive integer;
 *   otherwise the provider's safe chunk size is used.
 * - For DeepInfra the chunk size is additionally capped at the safe size, so a
 *   configured value can lower it but never raise it.
 * - `INDEXER_CHUNK_OVERLAP` is used when it parses as a positive integer;
 *   otherwise the provider default overlap is used.
 * - The overlap is always kept below the chunk size, and for DeepInfra it is
 *   also capped at `DEEPINFRA_SAFE_CHUNK_OVERLAP`.
 *
 * A `null` `env` or `embeddingConfig` is treated the same as an empty object.
 *
 * NOTE(review): an explicit overlap of `0` is rejected by
 * `parsePositiveInteger` and therefore falls back to the default overlap —
 * confirm that "0 means unset" is intended.
 *
 * @param {{
 *   INDEXER_CHUNK_SIZE?: string|number|null,
 *   INDEXER_CHUNK_OVERLAP?: string|number|null,
 * }} env
 * @param {{ provider?: string|null, model?: string|null }} embeddingConfig
 * @returns {{ chunkSize: number, chunkOverlap: number }}
 */
export function resolveIndexerChunkOptions(env = {}, embeddingConfig = {}) {
  const isDeepInfra =
    String(embeddingConfig?.provider || '').trim().toLowerCase() === 'deepinfra';

  const safeChunkSize = safeIndexerChunkSizeForEmbedding(embeddingConfig);
  // `env?.` keeps a null env from throwing; the default parameter only covers
  // `undefined`.
  const requestedChunkSize = parsePositiveInteger(env?.INDEXER_CHUNK_SIZE) ?? safeChunkSize;
  const chunkSize = isDeepInfra
    ? Math.min(requestedChunkSize, safeChunkSize)
    : requestedChunkSize;

  const defaultOverlap = isDeepInfra ? DEEPINFRA_SAFE_CHUNK_OVERLAP : DEFAULT_CHUNK_OVERLAP;
  const requestedOverlap = parsePositiveInteger(env?.INDEXER_CHUNK_OVERLAP) ?? defaultOverlap;

  // The overlap must stay strictly below the chunk size.
  const sizeBoundOverlap = Math.max(0, chunkSize - 1);
  const maxOverlap = isDeepInfra
    ? Math.min(sizeBoundOverlap, DEEPINFRA_SAFE_CHUNK_OVERLAP)
    : sizeBoundOverlap;
  const chunkOverlap = Math.min(requestedOverlap, maxOverlap);

  return { chunkSize, chunkOverlap };
}
| 75 | |
| 76 | export { |
| 77 | DEFAULT_CHUNK_SIZE, |
| 78 | DEFAULT_CHUNK_OVERLAP, |
| 79 | DEEPINFRA_SAFE_CHUNK_SIZE, |
| 80 | DEEPINFRA_SAFE_CHUNK_OVERLAP, |
| 81 | }; |