transcribe.mjs
146 lines 5.0 KB
Raw
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor ⚠ breaking 2 days ago
1 /**
2 * Transcription: audio/video → text. Phase 7.
3 * Provider: OpenAI Whisper (OPENAI_API_KEY required).
4 * Optional: ffmpeg transcodes files over 25MB when enabled and ffmpeg is available.
5 */
6
7 import fs from 'fs';
8 import path from 'path';
9 import { readTranscriptionYaml } from './config.mjs';
10 import { getRepoRoot } from './repo-root.mjs';
11
/** URL of the OpenAI audio transcription endpoint that uploads are POSTed to. */
const WHISPER_URL = 'https://api.openai.com/v1/audio/transcriptions';

/**
 * Upload size ceiling in bytes (25 MiB). The OpenAI transcription endpoint rejects
 * anything larger; see the API docs. Matches observed 413 errors.
 */
export const WHISPER_MAX_FILE_BYTES = 25 * 1024 ** 2;

/** Guidance appended to errors raised when an oversized file could not be compressed. */
const FFMPEG_HINT =
  'Install ffmpeg (https://ffmpeg.org/download.html) and ensure it is on PATH, or set FFMPEG_PATH, so Knowtation can compress oversized files automatically. Or export a smaller MP3/M4A, use a shorter clip, or import an existing transcript as Markdown.';

/** Lower-case file extensions accepted for upload (mp3, mp4, mpeg, mpga, m4a, wav, webm). */
const SUPPORTED_EXT = new Set([
  '.mp3',
  '.mp4',
  '.mpeg',
  '.mpga',
  '.m4a',
  '.wav',
  '.webm',
]);
22
/**
 * Build (without throwing) the error describing a file that exceeds the upload limit.
 * The message reports the file size in MB with one decimal, then either the ffmpeg
 * installation hint (compression was allowed) or a note that compression is switched off.
 *
 * @param {boolean} transcodeEnabled - Whether automatic compression was permitted
 * @param {number} sizeBytes - Size of the offending file in bytes
 * @returns {Error}
 */
function oversizeError(transcodeEnabled, sizeBytes) {
  const bytesPerMb = 1024 * 1024;
  const sizeLabel = (sizeBytes / bytesPerMb).toFixed(1);
  const prefix = `File is ${sizeLabel}MB; OpenAI Whisper accepts at most 25MB per request.`;

  // Pick the follow-up advice; the size prefix is shared by both variants.
  const advice = transcodeEnabled
    ? FFMPEG_HINT
    : 'Automatic compression is disabled (transcription.transcode_oversized: false or KNOWTATION_TRANSCODE_OVERSIZED=0). Export a smaller MP3/M4A, use a shorter clip, or import an existing transcript as Markdown.';

  return new Error(`${prefix} ${advice}`);
}
38
/**
 * Decide whether oversized files may be compressed automatically.
 * Precedence: an explicit boolean `options.transcodeOversized`, then the
 * KNOWTATION_TRANSCODE_OVERSIZED environment variable ('0'/'false' or '1'/'true'),
 * then `transcode_oversized` from the transcription YAML. Defaults to true when
 * nothing disables it or the YAML cannot be read.
 *
 * @param {{ transcodeOversized?: boolean }} options
 * @returns {boolean}
 */
function resolveTranscodeOversized(options) {
  const { transcodeOversized: explicit } = options;
  if (explicit === true || explicit === false) {
    return explicit;
  }

  switch (process.env.KNOWTATION_TRANSCODE_OVERSIZED) {
    case '0':
    case 'false':
      return false;
    case '1':
    case 'true':
      return true;
    default:
      // Unset or unrecognised value: defer to the YAML configuration.
      break;
  }

  try {
    const yamlSettings = readTranscriptionYaml(getRepoRoot());
    // Only an explicit `false` in the YAML turns compression off.
    return yamlSettings.transcode_oversized !== false;
  } catch (_) {
    return true;
  }
}
55
/**
 * Transcribe an audio or video file to text via the OpenAI transcription endpoint.
 *
 * Files larger than WHISPER_MAX_FILE_BYTES are first compressed through the helper
 * lazily imported from ./ffmpeg-whisper-transcode.mjs, provided oversized transcoding
 * is enabled. The helper's cleanup callback is always invoked before this function settles.
 *
 * All cheap validation (path, extension, size policy, API key) happens before any
 * transcoding, so a missing key never costs a compression pass.
 *
 * @param {string} filePath - Absolute or cwd-relative path to audio/video file
 * @param {{ apiKey?: string, model?: string, transcodeOversized?: boolean }} options
 *   `apiKey` falls back to the OPENAI_API_KEY environment variable; `model` falls back to
 *   the transcription YAML and finally to 'whisper-1'.
 * @returns {Promise<{ text: string, transcoded?: boolean }>} Trimmed transcript text;
 *   `transcoded: true` is present only when a compressed copy was uploaded.
 * @throws {Error} When the path is not an existing file, the extension is unsupported,
 *   the file is too large and cannot be compressed under the limit, no API key is
 *   available, or the endpoint answers with a non-OK status.
 */
export async function transcribe(filePath, options = {}) {
  // A default parameter does not cover an explicit `null`.
  const opts = options ?? {};
  const absPath = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);

  // One stat answers "exists", "is a regular file" and "how big" without a
  // check-then-use gap between separate filesystem calls.
  let st = null;
  try {
    st = fs.statSync(absPath);
  } catch (_) {
    // Any stat failure is reported as "not found" below, as existsSync would.
  }
  if (!st || !st.isFile()) {
    throw new Error(`File not found: ${filePath}`);
  }

  const ext = path.extname(absPath).toLowerCase();
  if (!SUPPORTED_EXT.has(ext)) {
    throw new Error(
      `Unsupported format: ${ext}. Use mp3, mp4, mpeg, mpga, m4a, wav, or webm.`
    );
  }

  const oversized = st.size > WHISPER_MAX_FILE_BYTES;
  if (oversized && !resolveTranscodeOversized(opts)) {
    throw oversizeError(false, st.size);
  }

  // Validate credentials before the (potentially slow) transcode step.
  const apiKey = opts.apiKey ?? process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error(
      'OPENAI_API_KEY is required for transcription. Set it in the environment or config.'
    );
  }

  let pathForUpload = absPath;
  let transcoded = false;
  /** @type {(() => void) | null} */
  let cleanupTemp = null;

  try {
    if (oversized) {
      // Loaded lazily so the ffmpeg helper is only needed for oversized inputs.
      const { transcodeUnderWhisperLimit } = await import('./ffmpeg-whisper-transcode.mjs');
      const result = await transcodeUnderWhisperLimit(absPath, WHISPER_MAX_FILE_BYTES);
      if (!result) {
        throw oversizeError(true, st.size);
      }
      // Register cleanup first so every later failure still removes the temp output.
      cleanupTemp = result.cleanup;
      pathForUpload = result.path;
      transcoded = true;
      if (fs.statSync(pathForUpload).size > WHISPER_MAX_FILE_BYTES) {
        throw new Error(
          `After compression the file is still over 25MB. Split the recording or reduce length. ${FFMPEG_HINT}`
        );
      }
    }

    let model = opts.model;
    if (model == null || model === '') {
      try {
        model = readTranscriptionYaml(getRepoRoot()).model || 'whisper-1';
      } catch (_) {
        // Unreadable config is not fatal; use the default model.
        model = 'whisper-1';
      }
    }

    // Async read: the payload can be up to 25MB and must not block the event loop.
    const bytes = await fs.promises.readFile(pathForUpload);
    const form = new FormData();
    form.append('file', new Blob([bytes]), path.basename(pathForUpload));
    form.append('model', model);

    const res = await fetch(WHISPER_URL, {
      method: 'POST',
      headers: { Authorization: `Bearer ${apiKey}` },
      body: form,
    });

    if (!res.ok) {
      const err = await res.text();
      throw new Error(`Transcription failed: ${res.status} ${res.statusText} - ${err}`);
    }

    const data = await res.json();
    // A missing or non-string `text` field yields an empty transcript.
    const text = typeof data?.text === 'string' ? data.text.trim() : '';
    return transcoded ? { text, transcoded: true } : { text };
  } finally {
    if (typeof cleanupTemp === 'function') {
      try {
        cleanupTemp();
      } catch (_) {
        // Best-effort temp cleanup; never mask the primary result or error.
      }
    }
  }
}
File History 2 commits
sha256:65ccb454656ea5acdea0a10e559b78bcde1eb6ff753ecc2911bc99d1c3d7cadd feat(calendar): enforce agent context tiers in retrieval AP… Human minor 2 days ago
sha256:9103f98c89257ed2b01c237cea895dabb3e85ea337dccb1161c175e4422355b6 docs: accept Calendar Events v0 spec with Phase 0 security … Human 2 days ago