media-url-extract.mjs
143 lines 3.7 KB
Raw
sha256:8915fe406161f95c1681f9469375e7bae5b28c884f00bedbdef65e4b0cd0738d docs(flow): commit FLOW-V0-SPEC.md hygiene for 7A-INT merge Human 12 hours ago
1 /**
2 * Extract image and video URLs from markdown note bodies.
3 * Foundation for Phase 18 MCP image/video resources and Hub rendering.
4 */
5
/** Upper bound on the number of URLs a single extraction call returns. */
const MAX_URLS_PER_NOTE = 50;

/** Image file extensions (lowercase, no dot) mapped to their MIME types. */
const IMAGE_EXT_MIME = {
  jpg: 'image/jpeg',
  jpeg: 'image/jpeg',
  png: 'image/png',
  gif: 'image/gif',
  webp: 'image/webp',
  svg: 'image/svg+xml',
};

/** Video file extensions (lowercase, no dot) mapped to their MIME types. */
const VIDEO_EXT_MIME = {
  mp4: 'video/mp4',
  webm: 'video/webm',
  mov: 'video/quicktime',
};

// Extension lists captured once at module load, used for membership checks.
const IMAGE_EXTENSIONS = Object.keys(IMAGE_EXT_MIME);
const VIDEO_EXTENSIONS = Object.keys(VIDEO_EXT_MIME);
25
/**
 * Determine the file extension of a URL for media-type detection.
 *
 * The query string and fragment are ignored, and only the final path segment
 * is inspected, so a dot in a directory name (e.g. `/v1.2/photo`) is not
 * mistaken for an extension.
 *
 * @param {string} url - URL to inspect. Strings that `new URL()` rejects
 *   (e.g. relative paths) are handled by a plain-text fallback.
 * @returns {string} Extension without the dot, lowercased; '' when the final
 *   path segment contains no dot.
 */
function extractExtension(url) {
  let path;
  try {
    path = new URL(url).pathname;
  } catch {
    // Not parseable as an absolute URL: drop query and fragment by hand.
    path = url.split('?')[0].split('#')[0];
  }

  // Restrict the search to the last segment so dots in parent directories
  // cannot produce a bogus "extension" that contains a slash.
  const lastSegment = path.slice(path.lastIndexOf('/') + 1);
  const dot = lastSegment.lastIndexOf('.');
  return dot === -1 ? '' : lastSegment.slice(dot + 1).toLowerCase();
}
45
/**
 * Markdown image syntax: ![alt](url) or ![alt](url "title").
 * Only absolute http(s) URLs are matched. An optional single-line quoted
 * title and whitespace before the closing parenthesis are tolerated.
 * Captures: group 1 = alt text, group 2 = URL
 *
 * The regex is global (stateful `lastIndex`); reset it before reuse.
 */
const MD_IMAGE_RE =
  /!\[([^\]]*)\]\((https?:\/\/[^)\s]+)(?:\s+(?:"[^"\n]*"|'[^'\n]*'))?\s*\)/gi;
51
/**
 * Bare URL on its own line (not inside markdown link/image syntax).
 * Matches lines that consist solely of an http(s) URL, optionally surrounded
 * by spaces or tabs. The scheme is matched case-insensitively.
 * Captures: group 1 = URL
 *
 * The regex is global (stateful `lastIndex`); reset it before reuse.
 */
const BARE_URL_LINE_RE = /^[ \t]*(https?:\/\/\S+)[ \t]*$/gim;
57
/**
 * Extract image URLs from a markdown body.
 *
 * URLs are collected from markdown image syntax first, then from bare URLs
 * that occupy a line of their own. A URL is kept only when its path extension
 * is a known image type. Duplicates are dropped (first occurrence wins, so a
 * markdown image keeps its alt text) and at most MAX_URLS_PER_NOTE entries
 * are returned.
 *
 * @param {string} body - Markdown text; empty or non-string input yields [].
 * @returns {Array<{ alt: string, url: string, mimeType: string }>}
 */
export function extractImageUrls(body) {
  if (!body || typeof body !== 'string') return [];

  const seen = new Set();
  const results = [];

  // Record `rawUrl` when it is a not-yet-seen URL with an image extension.
  const addIfImage = (rawUrl, alt = '') => {
    const url = rawUrl.trim();
    if (seen.has(url)) return;
    // Defensive: data URIs are never reported as image resources.
    if (/^data:/i.test(url)) return;
    // The image and video extension tables are disjoint, so this single
    // membership test also rejects video URLs written in image syntax.
    const ext = extractExtension(url);
    if (!IMAGE_EXTENSIONS.includes(ext)) return;
    seen.add(url);
    results.push({
      alt,
      url,
      mimeType: IMAGE_EXT_MIME[ext] || 'image/png',
    });
  };

  // matchAll starts from the regex's current lastIndex, so reset the shared
  // global regexes first. Iteration is lazy: returning early at the cap
  // avoids scanning the rest of the body.
  MD_IMAGE_RE.lastIndex = 0;
  for (const [, alt, url] of body.matchAll(MD_IMAGE_RE)) {
    if (results.length >= MAX_URLS_PER_NOTE) return results;
    addIfImage(url, alt);
  }

  BARE_URL_LINE_RE.lastIndex = 0;
  for (const [, url] of body.matchAll(BARE_URL_LINE_RE)) {
    if (results.length >= MAX_URLS_PER_NOTE) return results;
    addIfImage(url);
  }

  return results;
}
102
/**
 * Extract video URLs from a markdown body.
 *
 * URLs are collected from markdown image syntax first, then from bare URLs
 * that occupy a line of their own. A URL is kept only when its path extension
 * is a known video type. Duplicates are dropped and at most
 * MAX_URLS_PER_NOTE entries are returned.
 *
 * @param {string} body - Markdown text; empty or non-string input yields [].
 * @returns {Array<{ url: string, mimeType: string }>}
 */
export function extractVideoUrls(body) {
  if (!body || typeof body !== 'string') return [];

  const seen = new Set();
  const results = [];

  // Record `rawUrl` when it is a not-yet-seen URL with a video extension.
  const addIfVideo = (rawUrl) => {
    const url = rawUrl.trim();
    if (seen.has(url)) return;
    // Defensive: data URIs are never reported as video resources.
    if (/^data:/i.test(url)) return;
    const ext = extractExtension(url);
    if (!VIDEO_EXTENSIONS.includes(ext)) return;
    seen.add(url);
    results.push({
      url,
      mimeType: VIDEO_EXT_MIME[ext] || 'video/mp4',
    });
  };

  // Each entry pairs a shared global regex with the capture group that holds
  // the URL: group 2 for image syntax, group 1 for a bare URL line.
  const sources = [
    [MD_IMAGE_RE, 2],
    [BARE_URL_LINE_RE, 1],
  ];

  for (const [pattern, urlGroup] of sources) {
    // matchAll starts from the regex's current lastIndex, so reset it first.
    pattern.lastIndex = 0;
    for (const match of body.matchAll(pattern)) {
      // Iteration is lazy: stop as soon as the cap is reached instead of
      // scanning the remainder of the body.
      if (results.length >= MAX_URLS_PER_NOTE) return results;
      addIfVideo(match[urlGroup]);
    }
  }

  return results;
}
142
143 export { MAX_URLS_PER_NOTE, IMAGE_EXT_MIME, VIDEO_EXT_MIME };
File History 1 commit
sha256:8915fe406161f95c1681f9469375e7bae5b28c884f00bedbdef65e4b0cd0738d docs(flow): commit FLOW-V0-SPEC.md hygiene for 7A-INT merge Human 12 hours ago