url.mjs
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0
fix(security): pin patched transitive deps to clear Dependa…
Human
minor
⚠ breaking
7 days ago
| 1 | /** |
| 2 | * Import a public HTTPS URL into a vault note (article extract or bookmark). |
| 3 | */ |
| 4 | |
| 5 | import crypto from 'crypto'; |
| 6 | import path from 'path'; |
| 7 | import { Readability } from '@mozilla/readability'; |
| 8 | import { parseHTML } from 'linkedom'; |
| 9 | import TurndownService from 'turndown'; |
| 10 | import { writeNote } from '../write.mjs'; |
| 11 | import { normalizeSlug } from '../vault.mjs'; |
| 12 | import { fetchUrlForImport } from '../url-fetch-safe.mjs'; |
| 13 | |
| 14 | /** @typedef {'auto' | 'bookmark' | 'extract'} UrlImportMode */ |
| 15 | |
/**
 * Derive a deterministic identifier for an imported URL.
 *
 * The URL is hashed as UTF-8 with SHA-256 and only the first 32 hex
 * characters of the digest are kept.
 *
 * @param {string} canonicalUrl - URL to hash.
 * @returns {string} 32-character hex string.
 */
function sourceIdFromUrl(canonicalUrl) {
  const hasher = crypto.createHash('sha256');
  hasher.update(canonicalUrl, 'utf8');
  const hexDigest = hasher.digest('hex');
  return hexDigest.substring(0, 32);
}
| 23 | |
/**
 * Run Readability over a page and turn the extracted article into Markdown.
 *
 * The article's HTML `content` is converted with Turndown when it is
 * non-blank; otherwise the plain `textContent` is used as the body.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} pageUrl - Passed to Readability as its `url` option.
 * @returns {{ title: string, bodyMd: string } | null} Title and Markdown body,
 *   or null when Readability yields nothing or the body is shorter than
 *   40 characters.
 */
function extractArticleMarkdown(html, pageUrl) {
  const dom = parseHTML(html);
  const parsed = new Readability(dom.document, { url: pageUrl }).parse();
  if (!parsed) return null;

  const { content, textContent } = parsed;
  if (!content && !textContent) return null;

  // An empty or whitespace-only title falls back to a generic label.
  const title = (parsed.title || '').trim() || 'Imported page';

  const hasHtmlContent = Boolean(content) && String(content).trim() !== '';
  let markdown;
  if (hasHtmlContent) {
    const converter = new TurndownService({
      headingStyle: 'atx',
      codeBlockStyle: 'fenced',
    });
    markdown = converter.turndown(content).trim();
  } else {
    markdown = textContent ? String(textContent).trim() : '';
  }

  // Bodies under 40 characters are treated as a failed extraction.
  return markdown.length >= 40 ? { title, bodyMd: markdown } : null;
}
| 48 | |
/**
 * Build the Markdown body of a bookmark-style note.
 *
 * The body is a link to the final URL followed by a line naming the import
 * mode. When the requested URL differs from the final URL, the requested URL
 * is appended on its own paragraph.
 *
 * @param {string} url - URL as originally requested.
 * @param {string} finalUrl - URL the note links to.
 * @param {string} titleGuess - Accepted for signature compatibility; not used in the body.
 * @param {string} mode - Label inserted into the "Imported as bookmark" line.
 * @returns {string} Markdown body.
 */
function bookmarkBody(url, finalUrl, titleGuess, mode) {
  const head = `[Open original](${finalUrl})\n\n_Imported as bookmark (${mode})._`;
  return url === finalUrl ? head : `${head}\n\nRequested: ${url}`;
}
| 64 | |
/** Import modes accepted by importUrl; any other value falls back to 'auto'. */
const URL_IMPORT_MODES = ['auto', 'bookmark', 'extract'];

/** Maximum number of characters of a non-HTML response embedded in the note. */
const NON_HTML_PREVIEW_LIMIT = 8000;

/**
 * Note title taken from a URL's hostname with a leading "www." removed.
 * @param {string} pageUrl
 * @param {string} fallback - Used when the resulting hostname is empty.
 * @returns {string}
 */
function hostnameTitle(pageUrl, fallback) {
  return new URL(pageUrl).hostname.replace(/^www\./, '') || fallback;
}

/**
 * Markdown body for an extracted article: the article text, a horizontal rule,
 * a source link, and the requested URL when it differs from the final one.
 * @param {string} bodyMd
 * @param {string} canonical
 * @param {string} requested
 * @returns {string}
 */
function articleNoteBody(bodyMd, canonical, requested) {
  const requestedLine = requested !== canonical ? `\nRequested URL: ${requested}\n` : '';
  return `${bodyMd}\n\n---\n\nSource: [${canonical}](${canonical})\n${requestedLine}`;
}

/**
 * Fenced code block previewing raw (non-HTML) response text.
 *
 * The text is trimmed and cut to `limit` characters; a "…" line is appended
 * only when the trimmed text was actually cut. The fence is made one backtick
 * longer than the longest backtick run in the preview (minimum three), so
 * backticks inside the fetched text cannot close the block early.
 *
 * @param {string} text
 * @param {number} [limit]
 * @returns {string} The fenced block, or '' when the text is blank.
 */
function fencedPreview(text, limit = NON_HTML_PREVIEW_LIMIT) {
  const trimmed = text.trim();
  if (!trimmed) return '';

  const truncated = trimmed.length > limit;
  let shown = trimmed.slice(0, limit);
  // Do not leave half of a surrogate pair at the cut point.
  if (truncated) shown = shown.replace(/[\uD800-\uDBFF]$/, '');

  const longestRun = (shown.match(/`+/g) || []).reduce((max, run) => Math.max(max, run.length), 0);
  const fence = '`'.repeat(Math.max(3, longestRun + 1));
  return `${fence}\n${shown}${truncated ? '\n…' : ''}\n${fence}`;
}

/**
 * Fetch a URL and write it into the vault as a note.
 *
 * Modes:
 * - 'bookmark': always write a link-only note titled after the hostname.
 * - 'extract': require HTML and a successful article extraction, else throw.
 * - 'auto' (default): extract when the page is HTML and the extracted body is
 *   at least 80 characters; otherwise fall back to a bookmark. For non-HTML
 *   responses the bookmark is followed by a fenced preview of the text.
 *
 * The note path is `<outputBase>/imports/url/<first 12 chars of source_id>.md`.
 *
 * @param {string} input - HTTPS URL string
 * @param {{
 *   vaultPath: string,
 *   outputBase: string,
 *   project?: string | null,
 *   tags: string[],
 *   dryRun: boolean,
 *   urlMode?: UrlImportMode,
 *   onProgress?: (p: { progress: number, total?: number, message?: string }) => void | Promise<void>
 * }} ctx
 * @returns {Promise<{ imported: { path: string, source_id?: string }[], count: number }>}
 * @throws {Error} When the URL is empty, or in 'extract' mode when the response
 *   is not HTML or no readable article could be extracted. Errors from the
 *   fetch helper propagate unchanged.
 */
export async function importUrl(input, ctx) {
  const raw = typeof input === 'string' ? input.trim() : '';
  if (!raw) throw new Error('URL is required');

  const mode = URL_IMPORT_MODES.includes(ctx.urlMode) ? ctx.urlMode : 'auto';
  const { vaultPath, outputBase, project, tags, dryRun, onProgress } = ctx;

  if (onProgress) await onProgress({ progress: 0, total: 1, message: 'Fetching URL…' });

  const fetched = await fetchUrlForImport(raw);
  const canonical = fetched.finalUrl;
  const source_id = sourceIdFromUrl(canonical);
  const now = new Date().toISOString().slice(0, 10);
  const short = source_id.slice(0, 12);
  // Normalize to forward slashes so the relative path is the same on Windows.
  const outputRel = path.join(outputBase, 'imports', 'url', `${short}.md`).replace(/\\/g, '/');

  const ct = fetched.contentType || '';
  // Sniff the body as well as the header; HTML tag names are case-insensitive.
  const isHtml = ct.includes('html') || /^\s*<!doctype/i.test(fetched.text) || /<html/i.test(fetched.text);

  let title = 'Imported link';
  let body = '';

  if (mode === 'bookmark') {
    title = hostnameTitle(canonical, title);
    body = bookmarkBody(raw, canonical, title, 'bookmark');
  } else if (mode === 'extract') {
    if (!isHtml) {
      throw new Error(`Extract mode requires HTML; got content-type "${ct || 'unknown'}"`);
    }
    const extracted = extractArticleMarkdown(fetched.text, canonical);
    if (!extracted) throw new Error('Could not extract readable article content from this page');
    title = extracted.title;
    body = articleNoteBody(extracted.bodyMd, canonical, raw);
  } else {
    // auto
    const extracted = isHtml ? extractArticleMarkdown(fetched.text, canonical) : null;
    if (extracted && extracted.bodyMd.length >= 80) {
      title = extracted.title;
      body = articleNoteBody(extracted.bodyMd, canonical, raw);
    } else {
      title = hostnameTitle(canonical, title);
      body = isHtml
        ? bookmarkBody(raw, canonical, title, 'auto (fallback)')
        : `${bookmarkBody(raw, canonical, title, 'auto (non-HTML)')}\n\n${fencedPreview(fetched.text)}`;
    }
  }

  // Arrays are de-duplicated; any other value is passed through and dropped
  // by the filter below when it is undefined, null or ''.
  const noteTags = Array.isArray(tags) ? [...new Set(tags)] : tags;

  const merged = {
    title,
    date: now,
    source: 'url-import',
    source_id,
    canonical_url: canonical,
    ...(project && { project: normalizeSlug(project) }),
    tags: noteTags,
  };

  if (!dryRun) {
    // NOTE(review): writeNote is not awaited here; confirm it is synchronous.
    writeNote(vaultPath, outputRel, {
      body,
      frontmatter: Object.fromEntries(
        Object.entries(merged).filter(([, v]) => v !== undefined && v !== null && v !== ''),
      ),
    });
  }

  if (onProgress) await onProgress({ progress: 1, total: 1, message: 'Done' });

  return { imported: [{ path: outputRel, source_id }], count: 1 };
}
File History
1 commit
sha256:41d741fb345c4abdb640838aa3d847de02ccffd7a39fce04894e743e683b50d0
fix(security): pin patched transitive deps to clear Dependa…
Human
minor
⚠
7 days ago