url.mjs
164 lines 5.6 KB
Raw
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0 fix(security): pin patched transitive deps to clear Dependa… Human minor ⚠ breaking 7 days ago
1 /**
2 * Import a public HTTPS URL into a vault note (article extract or bookmark).
3 */
4
5 import crypto from 'crypto';
6 import path from 'path';
7 import { Readability } from '@mozilla/readability';
8 import { parseHTML } from 'linkedom';
9 import TurndownService from 'turndown';
10 import { writeNote } from '../write.mjs';
11 import { normalizeSlug } from '../vault.mjs';
12 import { fetchUrlForImport } from '../url-fetch-safe.mjs';
13
14 /** @typedef {'auto' | 'bookmark' | 'extract'} UrlImportMode */
15
/**
 * Derive a stable identifier for an imported page from its canonical URL.
 *
 * The identifier is the leading 32 hex characters of the SHA-256 digest of
 * the URL (hashed as UTF-8), so the same URL always yields the same id.
 * @param {string} canonicalUrl
 * @returns {string} 32 hexadecimal characters
 */
function sourceIdFromUrl(canonicalUrl) {
  const ID_HEX_LENGTH = 32;
  const hasher = crypto.createHash('sha256');
  hasher.update(canonicalUrl, 'utf8');
  const fullDigest = hasher.digest('hex');
  return fullDigest.substring(0, ID_HEX_LENGTH);
}
23
/**
 * Run Readability over an HTML document and convert the extracted article
 * to Markdown.
 *
 * The article's HTML content is preferred and converted with Turndown
 * (ATX headings, fenced code blocks); when that content is blank, the
 * article's plain text is used instead. Results shorter than 40 characters
 * are treated as "nothing readable".
 * @param {string} html - raw page markup
 * @param {string} pageUrl - URL handed to Readability as the `url` option
 * @returns {{ title: string, bodyMd: string } | null} null when no usable article body was found
 */
function extractArticleMarkdown(html, pageUrl) {
  const parsed = parseHTML(html);
  const article = new Readability(parsed.document, { url: pageUrl }).parse();
  if (!article) return null;

  const { content, textContent } = article;
  if (!content && !textContent) return null;

  // A missing or whitespace-only title falls back to a generic label.
  const trimmedTitle = (article.title || '').trim();
  const title = trimmedTitle || 'Imported page';

  const hasHtmlContent = Boolean(content) && String(content).trim() !== '';
  let markdown;
  if (hasHtmlContent) {
    const converter = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
    });
    markdown = converter.turndown(content).trim();
  } else {
    markdown = textContent ? String(textContent).trim() : '';
  }

  // Anything under 40 characters is not worth importing as an article.
  return markdown.length >= 40 ? { title, bodyMd: markdown } : null;
}
48
/**
 * Render the Markdown body of a bookmark-style note.
 *
 * The body links to the final URL and records which import mode produced
 * it; when the requested URL differs from the final one, the requested URL
 * is appended on its own paragraph.
 * @param {string} url - URL as requested by the caller
 * @param {string} finalUrl - URL the note links to
 * @param {string} titleGuess - not referenced by the current implementation
 * @param {UrlImportMode} mode - label interpolated into the "Imported as bookmark" line
 * @returns {string}
 */
function bookmarkBody(url, finalUrl, titleGuess, mode) {
  const base = `[Open original](${finalUrl})\n\n_Imported as bookmark (${mode})._`;
  return url === finalUrl ? base : `${base}\n\nRequested: ${url}`;
}
64
/** Maximum number of characters of a non-HTML response embedded in the note. */
const RAW_PREVIEW_LIMIT = 8000;

/** Title for bookmark-style notes: the URL's hostname without a leading "www.". */
function hostnameTitle(canonicalUrl, fallback) {
  return new URL(canonicalUrl).hostname.replace(/^www\./, '') || fallback;
}

/** Article Markdown followed by a source footer (plus the requested URL when it differs). */
function articleNoteBody(bodyMd, canonicalUrl, requestedUrl) {
  const requested = requestedUrl !== canonicalUrl ? `\nRequested URL: ${requestedUrl}\n` : '';
  return `${bodyMd}\n\n---\n\nSource: [${canonicalUrl}](${canonicalUrl})\n${requested}`;
}

/** Fenced code block holding at most `limit` characters of `text`; '' when `text` is blank. */
function fencedPreview(text, limit) {
  const trimmed = text.trim();
  if (!trimmed) return '';
  const snippet = trimmed.slice(0, limit);
  // Decide on the marker from the same (trimmed) string that was sliced, so
  // surrounding whitespace alone never produces a truncation marker.
  const marker = trimmed.length > limit ? '\n…' : '';
  // The fence must be longer than any backtick run in the content, otherwise
  // fetched text containing ``` would close the block early.
  const longestRun = (snippet.match(/`+/g) || []).reduce((max, run) => Math.max(max, run.length), 0);
  const fence = '`'.repeat(Math.max(3, longestRun + 1));
  return `${fence}\n${snippet}${marker}\n${fence}`;
}

/** Tags for the frontmatter: arrays are de-duplicated, strings pass through, anything else becomes []. */
function frontmatterTags(tags) {
  if (typeof tags === 'string') return tags;
  return Array.isArray(tags) ? [...new Set(tags)] : [];
}

/**
 * Fetch a URL and store it in the vault as a single note.
 *
 * Modes:
 * - `bookmark`: always writes a link-only note titled after the hostname.
 * - `extract`: requires HTML and a successful article extraction, else throws.
 * - `auto` (default, also used for unknown values): extracts when the page is
 *   HTML and yields at least 80 characters of Markdown; otherwise falls back
 *   to a bookmark, with a fenced preview of the raw text for non-HTML responses.
 *
 * The note path is `<outputBase>/imports/url/<first 12 chars of source_id>.md`.
 * Nothing is written when `ctx.dryRun` is true, but the same result is returned.
 *
 * @param {string} input - HTTPS URL string
 * @param {{
 *   vaultPath: string,
 *   outputBase: string,
 *   project?: string | null,
 *   tags: string[],
 *   dryRun: boolean,
 *   urlMode?: UrlImportMode,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} ctx
 * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>}
 * @throws {Error} when `input` is empty, when `extract` mode gets non-HTML or
 *   unreadable content, or when fetching/writing fails in the helpers it calls
 */
export async function importUrl(input, ctx) {
  const raw = typeof input === 'string' ? input.trim() : '';
  if (!raw) throw new Error('URL is required');

  const { vaultPath, outputBase, project, dryRun, onProgress } = ctx;
  const mode = ['auto', 'bookmark', 'extract'].includes(ctx.urlMode) ? ctx.urlMode : 'auto';

  if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Fetching URL…' });

  const fetched = await fetchUrlForImport(raw);
  const canonical = fetched.finalUrl;
  const source_id = sourceIdFromUrl(canonical);
  const now = new Date().toISOString().slice(0, 10);
  const short = source_id.slice(0, 12);
  // Vault-relative paths always use forward slashes, also on Windows.
  const outputRel = path.join(outputBase, 'imports', 'url', `${short}.md`).replace(/\\/g, '/');

  const ct = fetched.contentType || '';
  const text = typeof fetched.text === 'string' ? fetched.text : '';
  // Media types and HTML tag names are case-insensitive, so compare accordingly.
  const isHtml =
    ct.toLowerCase().includes('html') ||
    text.trimStart().toLowerCase().startsWith('<!doctype') ||
    /<html/i.test(text);

  const defaultTitle = 'Imported link';
  let title = defaultTitle;
  let body = '';

  if (mode === 'bookmark') {
    title = hostnameTitle(canonical, defaultTitle);
    body = bookmarkBody(raw, canonical, title, 'bookmark');
  } else if (mode === 'extract') {
    if (!isHtml) {
      throw new Error(`Extract mode requires HTML; got content-type "${ct || 'unknown'}"`);
    }
    const extracted = extractArticleMarkdown(text, canonical);
    if (!extracted) throw new Error('Could not extract readable article content from this page');
    title = extracted.title;
    body = articleNoteBody(extracted.bodyMd, canonical, raw);
  } else {
    // auto
    const extracted = isHtml ? extractArticleMarkdown(text, canonical) : null;
    if (extracted && extracted.bodyMd.length >= 80) {
      title = extracted.title;
      body = articleNoteBody(extracted.bodyMd, canonical, raw);
    } else {
      title = hostnameTitle(canonical, defaultTitle);
      body = isHtml
        ? bookmarkBody(raw, canonical, title, 'auto (fallback)')
        : `${bookmarkBody(raw, canonical, title, 'auto (non-HTML)')}\n\n${fencedPreview(text, RAW_PREVIEW_LIMIT)}`;
    }
  }

  const merged = {
    title,
    date: now,
    source: 'url-import',
    source_id,
    canonical_url: canonical,
    ...(project ? { project: normalizeSlug(project) } : {}),
    tags: frontmatterTags(ctx.tags),
  };

  if (!dryRun) {
    // Drop empty scalar fields so they do not appear in the frontmatter.
    const frontmatter = Object.fromEntries(
      Object.entries(merged).filter(([, value]) => value !== undefined && value !== null && value !== ''),
    );
    writeNote(vaultPath, outputRel, { body, frontmatter });
  }

  if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' });

  return { imported: [{ path: outputRel, source_id }], count: 1 };
}
File History 1 commit
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0 fix(security): pin patched transitive deps to clear Dependa… Human minor 7 days ago