test_markdown_adapter.py
file-level
1
files
1
commits
0
hotspots
0
🧟 dead
0
🔥 blast risk
| 1 | """Tests for the rewritten MarkdownAdapter. |
| 2 | |
| 3 | Coverage: |
| 4 | - Extension routing: only .md / .rst / .txt are accepted. |
| 5 | - Section symbols: flat headings, hierarchical qualified names, level encoding. |
| 6 | - Content-ID correctness: full section bytes hashed, not just heading text. |
| 7 | - Body-hash / signature split: retitle detection, level-change detection. |
| 8 | - Code block symbols: language tag, no-language fallback, content hash. |
| 9 | - GFM table symbols: header signature, data-row body_hash, schema changes. |
| 10 | - Inline markup stripping: bold, italic, inline-code, links in headings. |
| 11 | - Deduplication: identical sibling headings get @L{lineno} suffix. |
| 12 | - Depth limit: sections beyond _MAX_DEPTH are silently dropped. |
| 13 | - Edge cases: empty file, no headings, setext headings (unsupported → skip). |
| 14 | - Real-world shape: README-shaped document exercises all three emitters. |
| 15 | - _plain_heading unit tests: images dropped, markup stripped, truncation. |
| 16 | """ |
| 17 | |
| 18 | from __future__ import annotations |
| 19 | |
| 20 | import pytest |
| 21 | from muse.plugins.code.ast_parser import ( |
| 22 | MarkdownAdapter, |
| 23 | SymbolRecord, |
| 24 | SymbolTree, |
| 25 | _plain_heading, |
| 26 | ) |
| 27 | |
| 28 | |
| 29 | # --------------------------------------------------------------------------- |
| 30 | # Helpers |
| 31 | # --------------------------------------------------------------------------- |
| 32 | |
def _parse(source: str, path: str = "README.md") -> SymbolTree:
    """Parse *source* with a fresh ``MarkdownAdapter`` and return its symbols.

    The text is encoded to bytes before being handed to ``parse_symbols``.
    When the adapter has no parser loaded (``_parser is None``) the calling
    test is skipped instead of failing.
    """
    md = MarkdownAdapter()
    if md._parser is not None:
        return md.parse_symbols(source.encode(), path)
    # pytest.skip raises, so nothing is returned on this path.
    pytest.skip("tree-sitter-markdown not available")
| 38 | |
| 39 | |
| 40 | # --------------------------------------------------------------------------- |
| 41 | # _plain_heading unit tests |
| 42 | # --------------------------------------------------------------------------- |
| 43 | |
class TestPlainHeading:
    """Unit tests for ``_plain_heading``: raw heading text in, plain text out.

    Entity and image inputs are spelled as literal source text (``&amp;``,
    ``![alt](url)``) so that the decoding and stripping paths are what is
    actually exercised.
    """

    def test_plain_text_unchanged(self) -> None:
        assert _plain_heading("Hello World") == "Hello World"

    def test_bold_stripped(self) -> None:
        assert _plain_heading("**Bold** heading") == "Bold heading"

    def test_italic_star_stripped(self) -> None:
        assert _plain_heading("*italic* text") == "italic text"

    def test_bold_italic_combined(self) -> None:
        assert _plain_heading("***bold italic***") == "bold italic"

    def test_italic_underscore_stripped(self) -> None:
        assert _plain_heading("_italic_") == "italic"

    def test_bold_underscore_stripped(self) -> None:
        assert _plain_heading("__bold__") == "bold"

    def test_inline_code_stripped(self) -> None:
        assert _plain_heading("`code` block") == "code block"

    def test_triple_backtick_stripped(self) -> None:
        assert _plain_heading("```code```") == "code"

    def test_link_keeps_text(self) -> None:
        assert _plain_heading("[link text](https://example.com)") == "link text"

    def test_reference_link_keeps_text(self) -> None:
        assert _plain_heading("[link text][ref]") == "link text"

    def test_image_dropped_entirely(self) -> None:
        # NOTE(review): the image target is arbitrary; only the inline-image
        # syntax ``![alt](url)`` matters for this assertion.
        assert _plain_heading("![alt](image.png) caption") == "caption"

    def test_reference_image_dropped(self) -> None:
        assert _plain_heading("![alt][ref] caption") == "caption"

    def test_html_entity_amp(self) -> None:
        # The input must carry the *encoded* entity, otherwise nothing is decoded.
        assert _plain_heading("foo &amp; bar") == "foo & bar"

    def test_html_entity_lt_gt(self) -> None:
        assert _plain_heading("a &lt; b &gt; c") == "a < b > c"

    def test_html_entity_quot(self) -> None:
        assert _plain_heading("say &quot;hi&quot;") == 'say "hi"'

    def test_html_entity_apos(self) -> None:
        # NOTE(review): uses the numeric form ``&#39;``; confirm whether the
        # implementation is also expected to accept ``&apos;``.
        assert _plain_heading("it&#39;s") == "it's"

    def test_whitespace_collapsed(self) -> None:
        # Leading/trailing padding and interior runs of spaces must all be
        # reduced, so the input needs more than one space between words.
        assert _plain_heading("  too   many   spaces  ") == "too many spaces"

    def test_truncation_at_120_chars(self) -> None:
        long = "A" * 200
        result = _plain_heading(long)
        assert len(result) == 120

    def test_empty_string(self) -> None:
        assert _plain_heading("") == ""

    def test_mixed_markup(self) -> None:
        # Realistic heading: "**API** `Reference` Guide"
        result = _plain_heading("**API** `Reference` Guide")
        assert result == "API Reference Guide"
| 108 | |
| 109 | |
| 110 | # --------------------------------------------------------------------------- |
| 111 | # Extension routing |
| 112 | # --------------------------------------------------------------------------- |
| 113 | |
class TestExtensionRouting:
    """Which file extensions ``MarkdownAdapter.supported_extensions()`` reports."""

    @staticmethod
    def _extensions():
        # One fresh adapter per test, as each test calls this exactly once.
        return MarkdownAdapter().supported_extensions()

    def test_md_supported(self) -> None:
        assert ".md" in self._extensions()

    def test_rst_supported(self) -> None:
        assert ".rst" in self._extensions()

    def test_txt_supported(self) -> None:
        assert ".txt" in self._extensions()

    def test_py_not_supported(self) -> None:
        assert ".py" not in self._extensions()

    def test_html_not_supported(self) -> None:
        assert ".html" not in self._extensions()
| 134 | |
| 135 | |
| 136 | # --------------------------------------------------------------------------- |
| 137 | # Section symbols: flat headings |
| 138 | # --------------------------------------------------------------------------- |
| 139 | |
class TestFlatSections:
    """A lone heading of any level produces a ``section`` symbol."""

    # Single-H1 document shared by the tests that only need "some heading".
    _HELLO_DOC = "# Hello\n\nContent.\n"

    def test_h1_emitted(self) -> None:
        addresses = list(_parse(self._HELLO_DOC))
        assert any("Hello" in addr for addr in addresses)

    def test_h1_kind_is_section(self) -> None:
        symbols = _parse(self._HELLO_DOC)
        hello = next(rec for addr, rec in symbols.items() if "Hello" in addr)
        assert hello["kind"] == "section"

    def test_h2_emitted(self) -> None:
        addresses = list(_parse("## Setup\n\nDo the thing.\n"))
        assert any("Setup" in addr for addr in addresses)

    def test_h3_emitted(self) -> None:
        addresses = list(_parse("### Detail\n\nMore detail.\n"))
        assert any("Detail" in addr for addr in addresses)

    def test_address_contains_file_path(self) -> None:
        symbols = _parse("# Hello\n", "docs/guide.md")
        assert any(addr.startswith("docs/guide.md::") for addr in symbols)

    def test_lineno_is_one_based(self) -> None:
        symbols = _parse(self._HELLO_DOC)
        hello = next(rec for addr, rec in symbols.items() if "Hello" in addr)
        assert hello["lineno"] == 1

    def test_end_lineno_greater_than_lineno(self) -> None:
        symbols = _parse("# Hello\n\nSome content.\n")
        hello = next(rec for addr, rec in symbols.items() if "Hello" in addr)
        # Despite the test name, an equal end line is accepted.
        assert hello["end_lineno"] >= hello["lineno"]

    def test_name_is_plain_text(self) -> None:
        symbols = _parse("# **Bold** Heading\n\nContent.\n")
        heading = next(
            rec for addr, rec in symbols.items() if "Bold Heading" in addr
        )
        assert heading["name"] == "Bold Heading"
| 179 | |
| 180 | |
| 181 | # --------------------------------------------------------------------------- |
| 182 | # Section symbols: hierarchy |
| 183 | # --------------------------------------------------------------------------- |
| 184 | |
class TestSectionHierarchy:
    """Nested headings get dot-joined qualified names scoped to their parents."""

    def test_h2_under_h1_has_qualified_name(self) -> None:
        symbols = _parse("# Parent\n\n## Child\n\nText.\n")
        assert any("Parent.Child" in addr for addr in symbols)

    def test_h3_under_h2_under_h1(self) -> None:
        symbols = _parse("# A\n\n## B\n\n### C\n\nText.\n")
        assert any("A.B.C" in addr for addr in symbols)

    def test_sibling_h2s_are_distinct(self) -> None:
        symbols = _parse("# Root\n\n## Alpha\n\nFoo.\n\n## Beta\n\nBar.\n")
        for title in ("Alpha", "Beta"):
            assert any(title in addr for addr in symbols)

    def test_h2_address_does_not_bleed_into_sibling(self) -> None:
        symbols = _parse("# Root\n\n## A\n\nFoo.\n\n## B\n\nBar.\n")
        # B follows A at the same level, so it must not be nested under A.
        assert not any("A.B" in addr for addr in symbols)

    def test_parent_section_includes_child_in_content_id(self) -> None:
        with_child = _parse("# Parent\n\n## Child\n\nText.\n")
        without_child = _parse("# Parent\n\nText.\n")
        parent_a = next(
            rec for addr, rec in with_child.items() if addr.endswith("::Parent")
        )
        parent_b = next(
            rec for addr, rec in without_child.items() if addr.endswith("::Parent")
        )
        # The parent's content_id covers its child sections as well.
        assert parent_a["content_id"] != parent_b["content_id"]

    def test_parallel_h2s_in_separate_h1_sections_dont_collide(self) -> None:
        symbols = _parse(
            "# Intro\n\n## Overview\n\nX.\n\n# Usage\n\n## Overview\n\nY.\n"
        )
        # Same H2 title under two different H1s: two separate addresses.
        overview = [addr for addr in symbols if "Overview" in addr]
        assert len(overview) == 2
        first, second = overview
        assert first != second
| 225 | |
| 226 | |
| 227 | # --------------------------------------------------------------------------- |
| 228 | # Content-ID correctness — the core bug fix |
| 229 | # --------------------------------------------------------------------------- |
| 230 | |
class TestContentIDCorrectness:
    """How a section's ``content_id``, ``body_hash``, ``signature_id`` and
    ``metadata_id`` respond to body edits, retitles and level changes."""

    @staticmethod
    def _section(source: str, needle: str):
        """Parse *source*; return the first record whose address contains *needle*."""
        symbols = _parse(source)
        address = next(addr for addr in symbols if needle in addr)
        return symbols[address]

    def test_changing_body_changes_content_id(self) -> None:
        before = self._section("# Intro\n\nFirst paragraph.\n", "Intro")
        after = self._section(
            "# Intro\n\nFirst paragraph changed entirely.\n", "Intro"
        )
        assert before["content_id"] != after["content_id"]

    def test_same_content_produces_same_content_id(self) -> None:
        text = "# Hello\n\nSame content.\n"
        first = _parse(text)
        second = _parse(text)
        # Look the address up once and use it against both parses.
        address = next(addr for addr in first if "Hello" in addr)
        assert first[address]["content_id"] == second[address]["content_id"]

    def test_adding_paragraph_changes_content_id(self) -> None:
        before = self._section("# Section\n\nParagraph one.\n", "Section")
        after = self._section(
            "# Section\n\nParagraph one.\n\nParagraph two.\n", "Section"
        )
        assert before["content_id"] != after["content_id"]

    def test_heading_retitle_changes_content_id(self) -> None:
        # The two documents yield different addresses (different titles), so
        # each record is located by its own title.
        old = self._section("# Old Title\n\nSame body.\n", "Old Title")
        new = self._section("# New Title\n\nSame body.\n", "New Title")
        # The heading text changed, so content_id must differ.
        assert old["content_id"] != new["content_id"]

    def test_retitle_with_same_body_has_same_body_hash(self) -> None:
        """Retitle detection: body_hash stable, signature_id changes."""
        old = self._section("# Old Title\n\nIdentical body content.\n", "Old Title")
        new = self._section("# New Title\n\nIdentical body content.\n", "New Title")
        # Same body text below heading → same body_hash.
        assert old["body_hash"] == new["body_hash"]
        # Different heading text → different signature_id.
        assert old["signature_id"] != new["signature_id"]

    def test_level_change_changes_metadata_id(self) -> None:
        """Promoting a heading level is visible in metadata_id, not body_hash."""
        h2 = self._section("## Section\n\nBody.\n", "Section")
        h1 = self._section("# Section\n\nBody.\n", "Section")
        assert h2["metadata_id"] != h1["metadata_id"]
        # The body text is untouched, so body_hash stays equal.
        assert h2["body_hash"] == h1["body_hash"]

    def test_level_change_changes_signature_id(self) -> None:
        h2 = self._section("## Section\n\nBody.\n", "Section")
        h1 = self._section("# Section\n\nBody.\n", "Section")
        assert h2["signature_id"] != h1["signature_id"]
| 301 | |
| 302 | |
| 303 | # --------------------------------------------------------------------------- |
| 304 | # Fenced code blocks |
| 305 | # --------------------------------------------------------------------------- |
| 306 | |
class TestCodeBlockSymbols:
    """Fenced code blocks become ``code[lang]`` (or bare ``code``) symbols."""

    # One python block under a single H1; reused by the simplest tests.
    _HELLO_BLOCK = "# Section\n\n```python\nprint('hello')\n```\n"
    # One fenced block with no info string.
    _PLAIN_BLOCK = "# Section\n\n```\nplain text\n```\n"

    def test_python_block_emitted(self) -> None:
        symbols = _parse(self._HELLO_BLOCK)
        assert any("code[python]" in addr for addr in symbols)

    def test_code_block_kind_is_variable(self) -> None:
        symbols = _parse(self._HELLO_BLOCK)
        block = next(
            rec for addr, rec in symbols.items() if "code[python]" in addr
        )
        assert block["kind"] == "variable"

    def test_no_language_block_emitted(self) -> None:
        symbols = _parse(self._PLAIN_BLOCK)
        assert any(addr.endswith((".code", "::code")) for addr in symbols)

    def test_no_language_not_in_symbol_name(self) -> None:
        symbols = _parse(self._PLAIN_BLOCK)
        # A missing language must not leave empty brackets in the name.
        assert not any("code[]" in addr for addr in symbols)

    def test_code_block_scoped_to_section(self) -> None:
        symbols = _parse("# Intro\n\n```python\nx = 1\n```\n")
        # The block's address carries the name of the enclosing section.
        assert any(
            "Intro" in addr and "code[python]" in addr for addr in symbols
        )

    def test_code_content_change_changes_content_id(self) -> None:
        before = _parse("# S\n\n```python\nx = 1\n```\n")
        after = _parse("# S\n\n```python\nx = 2\n```\n")
        addr_before = next(a for a in before if "code[python]" in a)
        addr_after = next(a for a in after if "code[python]" in a)
        assert before[addr_before]["content_id"] != after[addr_after]["content_id"]

    def test_lang_change_changes_signature_id(self) -> None:
        as_python = _parse("# S\n\n```python\nx = 1\n```\n")
        as_js = _parse("# S\n\n```javascript\nx = 1\n```\n")
        addr_py = next(a for a in as_python if "code[python]" in a)
        addr_js = next(a for a in as_js if "code[javascript]" in a)
        assert as_python[addr_py]["signature_id"] != as_js[addr_js]["signature_id"]

    def test_lang_tag_is_lowercased(self) -> None:
        symbols = _parse("# S\n\n```Python\npass\n```\n")
        # "Python" in the fence info string shows up as "python" in the name.
        assert any("code[python]" in addr for addr in symbols)

    def test_multiple_code_blocks_are_distinct(self) -> None:
        text = (
            "# Section\n\n"
            "```python\nblock_one = 1\n```\n\n"
            "```python\nblock_two = 2\n```\n"
        )
        blocks = [addr for addr in _parse(text) if "code[python]" in addr]
        assert len(blocks) == 2
        assert blocks[0] != blocks[1]

    def test_code_block_lineno_populated(self) -> None:
        symbols = _parse("# Section\n\n```python\npass\n```\n")
        block = next(
            rec for addr, rec in symbols.items() if "code[python]" in addr
        )
        assert block["lineno"] > 0
| 376 | |
| 377 | |
| 378 | # --------------------------------------------------------------------------- |
| 379 | # GFM pipe tables |
| 380 | # --------------------------------------------------------------------------- |
| 381 | |
class TestTableSymbols:
    """GFM pipe tables become ``table`` symbols with a header/data hash split."""

    _TABLE_SRC = (
        "# Section\n\n"
        "| Name | Value |\n"
        "| ---- | ----- |\n"
        "| foo | 1 |\n"
        "| bar | 2 |\n"
    )

    @staticmethod
    def _is_table(address: str) -> bool:
        """True when *address* ends in a ``table`` symbol (nested or root-level)."""
        return address.endswith((".table", "::table"))

    @classmethod
    def _table(cls, source: str):
        """Parse *source* and return the record of its first table symbol."""
        symbols = _parse(source)
        return symbols[next(addr for addr in symbols if cls._is_table(addr))]

    def test_table_emitted(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        assert any(self._is_table(addr) for addr in symbols)

    def test_table_kind_is_section(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        table = next(rec for addr, rec in symbols.items() if self._is_table(addr))
        assert table["kind"] == "section"

    def test_table_scoped_to_section(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        assert any(
            "Section" in addr and self._is_table(addr) for addr in symbols
        )

    def test_adding_data_row_changes_content_id(self) -> None:
        one_row = self._table(
            "# S\n\n"
            "| A | B |\n| - | - |\n| 1 | 2 |\n"
        )
        two_rows = self._table(
            "# S\n\n"
            "| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n"
        )
        assert one_row["content_id"] != two_rows["content_id"]

    def test_adding_data_row_changes_body_hash(self) -> None:
        one_row = self._table("# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n")
        two_rows = self._table(
            "# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n"
        )
        assert one_row["body_hash"] != two_rows["body_hash"]

    def test_column_rename_changes_signature_id(self) -> None:
        original = self._table(
            "# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n"
        )
        renamed = self._table(
            "# S\n\n| Label | Value |\n| ----- | ----- |\n| x | 1 |\n"
        )
        assert original["signature_id"] != renamed["signature_id"]

    def test_column_rename_does_not_change_body_hash(self) -> None:
        """Renaming a column header should change signature_id but not body_hash."""
        original = self._table(
            "# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n"
        )
        renamed = self._table(
            "# S\n\n| Label | Value |\n| ------ | ----- |\n| x | 1 |\n"
        )
        # Only the header row differs; the data rows are identical.
        assert original["body_hash"] == renamed["body_hash"]

    def test_table_lineno_populated(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        table = next(rec for addr, rec in symbols.items() if self._is_table(addr))
        assert table["lineno"] > 0
| 452 | |
| 453 | |
| 454 | # --------------------------------------------------------------------------- |
| 455 | # Inline markup stripping — address stability |
| 456 | # --------------------------------------------------------------------------- |
| 457 | |
class TestInlineMarkupStripping:
    """Inline markup in a heading must not leak into symbol names or addresses."""

    def test_bold_heading_address_matches_plain(self) -> None:
        src_bold = "# **Setup**\n\nContent.\n"
        src_plain = "# Setup\n\nContent.\n"
        syms_bold = _parse(src_bold)
        syms_plain = _parse(src_plain)
        # Both should produce a key containing "Setup" (not **Setup**).
        assert any("Setup" in k for k in syms_bold)
        assert any("Setup" in k for k in syms_plain)
        # The recorded name in both should be identical.
        name_bold = next(v for k, v in syms_bold.items() if "Setup" in k)["name"]
        name_plain = next(v for k, v in syms_plain.items() if "Setup" in k)["name"]
        assert name_bold == name_plain

    def test_inline_code_heading_stripped(self) -> None:
        src = "# `muse init` Command\n\nContent.\n"
        syms = _parse(src)
        assert any("muse init Command" in k for k in syms)

    def test_link_heading_keeps_text(self) -> None:
        src = "# [API Reference](https://example.com/api)\n\nContent.\n"
        syms = _parse(src)
        assert any("API Reference" in k for k in syms)

    def test_image_in_heading_dropped(self) -> None:
        # The heading must actually contain an inline image; without it the
        # "logo.png" assertion below could never fail.
        src = "# ![logo](logo.png) Intro\n\nContent.\n"
        syms = _parse(src)
        # The logo image should be gone; "Intro" should remain.
        assert any("Intro" in k for k in syms)
        assert not any("logo.png" in k for k in syms)
| 488 | |
| 489 | |
| 490 | # --------------------------------------------------------------------------- |
| 491 | # Deduplication |
| 492 | # --------------------------------------------------------------------------- |
| 493 | |
class TestDeduplication:
    """Identical sibling headings are disambiguated; cousins need no suffix."""

    # Two H2s with the same title under one H1.
    _TWIN_EXAMPLES = (
        "# Root\n\n"
        "## Examples\n\nFirst.\n\n"
        "## Examples\n\nSecond.\n"
    )

    def test_two_identical_h2s_get_unique_addresses(self) -> None:
        twins = [addr for addr in _parse(self._TWIN_EXAMPLES) if "Examples" in addr]
        assert len(twins) == 2
        assert twins[0] != twins[1]

    def test_deduplicated_key_contains_lineno(self) -> None:
        twins = [addr for addr in _parse(self._TWIN_EXAMPLES) if "Examples" in addr]
        # At least one of the colliding addresses carries the @L marker.
        assert any("@L" in addr for addr in twins)

    def test_identical_headings_in_different_parents_not_deduped(self) -> None:
        text = (
            "# Alpha\n\n## Notes\n\nFoo.\n\n"
            "# Beta\n\n## Notes\n\nBar.\n"
        )
        notes = [addr for addr in _parse(text) if "Notes" in addr]
        assert len(notes) == 2
        # The parent name already separates them: Alpha.Notes vs Beta.Notes.
        for qualified in ("Alpha.Notes", "Beta.Notes"):
            assert any(qualified in addr for addr in notes)
| 528 | |
| 529 | |
| 530 | # --------------------------------------------------------------------------- |
| 531 | # Depth limit |
| 532 | # --------------------------------------------------------------------------- |
| 533 | |
class TestDepthLimit:
    """Pathologically deep input must not crash; shallow nesting is kept in full."""

    def test_deep_nesting_does_not_crash(self) -> None:
        # Emit heading-like lines with 1..20 leading '#' characters.  CommonMark
        # only recognises ATX headings up to level 6, so the deeper lines are
        # not headings per the spec; this test only checks that the adapter
        # handles such input without raising and still returns a mapping.
        levels = ["#" * i + f" Level{i}\n\nText.\n\n" for i in range(1, 21)]
        src = "".join(levels)
        syms = _parse(src)
        assert isinstance(syms, dict)

    def test_symbols_within_limit_are_extracted(self) -> None:
        # Only 3 levels, so every one of them should be extracted.
        src = "# A\n\n## A B\n\n### A B C\n\nText.\n"
        syms = _parse(src)
        # Match on the full qualified suffix: a bare substring check for "A"
        # would already be satisfied by the default path "README.md".
        for qualified in ("A", "A.A B", "A.A B.A B C"):
            assert any(k.endswith(f"::{qualified}") for k in syms), (
                f"missing section {qualified!r}; got {sorted(syms)}"
            )
| 548 | |
| 549 | |
| 550 | # --------------------------------------------------------------------------- |
| 551 | # Edge cases |
| 552 | # --------------------------------------------------------------------------- |
| 553 | |
class TestEdgeCases:
    """Degenerate and unusual inputs: empty, heading-less, non-UTF-8, root-level blocks."""

    def test_empty_file_returns_empty(self) -> None:
        # "" encodes to b"", so the shared helper covers the empty-file case.
        assert _parse("", "empty.md") == {}

    def test_no_headings_returns_empty(self) -> None:
        assert _parse("Just a paragraph with no headings.\n") == {}

    def test_only_horizontal_rule_returns_empty(self) -> None:
        assert _parse("---\n") == {}

    def test_binary_like_content_does_not_crash(self) -> None:
        md = MarkdownAdapter()
        if md._parser is None:
            pytest.skip("tree-sitter-markdown not available")
        # Bytes that are not valid UTF-8 are passed straight to the adapter.
        symbols = md.parse_symbols(b"\xff\xfe# Title\n", "weird.md")
        assert isinstance(symbols, dict)

    def test_very_long_heading_truncated_in_name(self) -> None:
        title = "Word " * 50  # 250 chars
        symbols = _parse(f"# {title}\n\nContent.\n")
        assert len(symbols) == 1
        (only,) = symbols.values()
        # The stored name is capped at 120 characters.
        assert len(only["name"]) <= 120

    def test_file_content_id_changes_on_any_change(self) -> None:
        md = MarkdownAdapter()
        original = md.file_content_id(b"# Hello\n\nWorld.\n")
        # Same text, but the final newline is replaced by a trailing space.
        altered = md.file_content_id(b"# Hello\n\nWorld. ")
        assert original != altered

    def test_file_content_id_is_hex_sha256(self) -> None:
        cid = MarkdownAdapter().file_content_id(b"# Hello\n")
        assert cid.startswith("sha256:")
        assert len(cid) == 71
        digest = cid[len("sha256:"):]
        assert set(digest) <= set("0123456789abcdef")

    def test_headings_only_no_body(self) -> None:
        symbols = _parse("# Title\n## Subtitle\n")
        assert any("Title" in addr for addr in symbols)

    def test_code_block_at_root_level(self) -> None:
        """A code block not inside any section gets a root-level address."""
        symbols = _parse("```python\nprint('hi')\n```\n")
        # No parent heading is required for the block to be emitted.
        assert any("code[python]" in addr for addr in symbols)

    def test_table_at_root_level(self) -> None:
        symbols = _parse("| A | B |\n| - | - |\n| 1 | 2 |\n")
        assert any(addr.endswith((".table", "::table")) for addr in symbols)
| 619 | |
| 620 | |
| 621 | # --------------------------------------------------------------------------- |
| 622 | # Real-world README shape |
| 623 | # --------------------------------------------------------------------------- |
| 624 | |
class TestRealWorldShape:
    """A README-shaped document that exercises sections, code blocks and a table."""

    _README = """\
# Muse

A domain-agnostic version control system.

## Installation

```bash
pip install muse-vcs
```

## Usage

Run `muse init` to initialise a repository.

### Commands

| Command | Description |
| -------------- | ------------------------- |
| `muse init` | Initialise a repository |
| `muse commit` | Record a new snapshot |
| `muse log` | Show commit history |

## API Reference

### `muse.core.snapshot`

Snapshot hashing and workdir diffing.

```python
from muse.core import snapshot
snap = snapshot.build(root)
```

## Contributing

See CONTRIBUTING.md for guidelines.
"""

    def _addresses(self):
        """Parse the README fixture and return its symbol mapping."""
        return _parse(self._README)

    def test_top_level_sections_extracted(self) -> None:
        # Take whatever follows the last "::" of every address that has one.
        tails = [addr.split("::")[-1] for addr in self._addresses() if "::" in addr]
        # An exact "Muse" entry also satisfies the substring check.
        assert any("Muse" in tail for tail in tails)

    def test_installation_section_extracted(self) -> None:
        assert any("Installation" in addr for addr in self._addresses())

    def test_usage_commands_table_extracted(self) -> None:
        assert any(
            addr.endswith((".table", "::table")) for addr in self._addresses()
        )

    def test_bash_code_block_extracted(self) -> None:
        assert any("code[bash]" in addr for addr in self._addresses())

    def test_python_code_block_extracted(self) -> None:
        assert any("code[python]" in addr for addr in self._addresses())

    def test_api_reference_subsection_extracted(self) -> None:
        assert any("API Reference" in addr for addr in self._addresses())

    def test_all_symbol_records_have_required_keys(self) -> None:
        required = {
            "kind", "name", "qualified_name", "content_id", "body_hash",
            "signature_id", "metadata_id", "canonical_key", "lineno", "end_lineno",
        }
        for addr, rec in self._addresses().items():
            missing = required.difference(rec.keys())
            assert not missing, f"{addr!r} missing keys: {missing}"

    def test_no_symbol_has_empty_content_id(self) -> None:
        for addr, rec in self._addresses().items():
            assert rec["content_id"], f"{addr!r} has empty content_id"

    def test_all_linenos_positive(self) -> None:
        for addr, rec in self._addresses().items():
            assert rec["lineno"] > 0, f"{addr!r} lineno={rec['lineno']}"

    def test_all_end_linenos_gte_lineno(self) -> None:
        for addr, rec in self._addresses().items():
            assert rec["end_lineno"] >= rec["lineno"], (
                f"{addr!r} end_lineno={rec['end_lineno']} < lineno={rec['lineno']}"
            )

    def test_contributing_section_extracted(self) -> None:
        assert any("Contributing" in addr for addr in self._addresses())

    def test_commands_subsection_qualified_under_usage(self) -> None:
        # "Commands" is an H3 beneath the "Usage" H2, hence "Usage.Commands".
        assert any("Usage.Commands" in addr for addr in self._addresses())
| 727 | |
| 728 | |
| 729 | # --------------------------------------------------------------------------- |
| 730 | # MB Block naming stability — line number as collision suffix only |
| 731 | # |
| 732 | # A code block or table that is unique within its scope (section prefix) |
| 733 | # must NOT carry @L{n} in its name. Line numbers are unstable: moving a |
| 734 | # block by one line would change its address and produce a spurious |
| 735 | # delete+add diff instead of recognising it as the same block. |
| 736 | # |
| 737 | # @L{n} is still appended when two blocks of the same language (or two |
| 738 | # tables) exist within the same section — it acts as a disambiguator, not |
| 739 | # a primary identifier. |
| 740 | # |
| 741 | # MB1 Single code block in section → name is code[lang], no @L |
| 742 | # MB2 Two same-lang blocks in same section → both get @L |
| 743 | # MB3 Three same-lang blocks in same section → all three get @L |
| 744 | # MB4 Single code block at document root → code[lang], no @L |
| 745 | # MB5 Same lang in different sections → each gets clean name (no @L) |
| 746 | # MB6 Single table in section → name is "table", no @L |
| 747 | # MB7 Two tables in same section → both get @L |
| 748 | # MB8 Moving a block (line shift) → address unchanged (stability guarantee) |
| 749 | # MB9 No-language block, unique → name is "code", no @L |
| 750 | # MB10 Different langs in same section → each gets clean name |
| 751 | # --------------------------------------------------------------------------- |
| 752 | |
| 753 | |
class TestBlockNamingStability:
    """Naming-stability tests for code-block and table symbols (MB1-MB10).

    A code block or table that is unique within its scope must be addressed
    without an ``@L`` line-number suffix, so that shifting it by a few lines
    does not change its address.  The suffix is only expected when several
    blocks of the same language (or several tables) share one section.
    """

    def test_MB1_single_code_block_no_line_number(self) -> None:
        """MB1: single python block in a section → code[python], no @L."""
        src = "# Setup\n\n```python\nprint('hi')\n```\n"
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 1
        assert "@L" not in code_keys[0], (
            f"single code block must not carry @L; got {code_keys[0]!r}"
        )

    def test_MB2_two_same_lang_blocks_both_get_line_number(self) -> None:
        """MB2: two python blocks in the same section → both names carry @L."""
        src = (
            "# Section\n\n"
            "```python\nfirst = 1\n```\n\n"
            "```python\nsecond = 2\n```\n"
        )
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 2, f"expected 2 code blocks, got {code_keys}"
        assert all("@L" in k for k in code_keys), (
            f"both blocks must carry @L when there are two; got {code_keys}"
        )

    def test_MB3_three_same_lang_blocks_all_get_line_number(self) -> None:
        """MB3: three python blocks in the same section → all carry @L."""
        src = (
            "# Section\n\n"
            "```python\na = 1\n```\n\n"
            "```python\nb = 2\n```\n\n"
            "```python\nc = 3\n```\n"
        )
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 3
        assert all("@L" in k for k in code_keys)

    def test_MB4_single_code_block_at_root_no_line_number(self) -> None:
        """MB4: code block at document root (no section) → code[python], no @L."""
        src = "```python\nprint('hi')\n```\n"
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 1
        assert "@L" not in code_keys[0], (
            f"root-level single block must not carry @L; got {code_keys[0]!r}"
        )

    def test_MB5_same_lang_in_different_sections_each_clean(self) -> None:
        """MB5: one python block per section → each gets code[python] with no @L."""
        src = (
            "# Alpha\n\n```python\na = 1\n```\n\n"
            "# Beta\n\n```python\nb = 2\n```\n"
        )
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 2
        assert all("@L" not in k for k in code_keys), (
            f"blocks in different sections must not get @L; got {code_keys}"
        )

    def test_MB6_single_table_no_line_number(self) -> None:
        """MB6: single table in a section → name is 'table', no @L."""
        src = (
            "# Section\n\n"
            "| A | B |\n| - | - |\n| 1 | 2 |\n"
        )
        syms = _parse(src)
        table_keys = [k for k in syms if "table" in k]
        assert len(table_keys) == 1
        assert "@L" not in table_keys[0], (
            f"single table must not carry @L; got {table_keys[0]!r}"
        )

    def test_MB7_two_tables_in_same_section_both_get_line_number(self) -> None:
        """MB7: two tables in the same section → both names carry @L."""
        src = (
            "# Section\n\n"
            "| A | B |\n| - | - |\n| 1 | 2 |\n\n"
            "| X | Y |\n| - | - |\n| 3 | 4 |\n"
        )
        syms = _parse(src)
        table_keys = [k for k in syms if "table" in k]
        assert len(table_keys) == 2
        assert all("@L" in k for k in table_keys), (
            f"both tables must carry @L when there are two; got {table_keys}"
        )

    def test_MB8_moving_block_address_stable(self) -> None:
        """MB8: adding a paragraph above a code block (line shift) → address unchanged."""
        src_before = "# Section\n\n```python\nx = 1\n```\n"
        src_after = "# Section\n\nA new paragraph.\n\n```python\nx = 1\n```\n"
        syms_before = _parse(src_before)
        syms_after = _parse(src_after)
        key_before = next(k for k in syms_before if "code[python]" in k)
        key_after = next(k for k in syms_after if "code[python]" in k)
        # The arrow in the message separates the old address from the new one.
        assert key_before == key_after, (
            f"address changed after line shift: {key_before!r} → {key_after!r}"
        )

    def test_MB9_no_language_single_block_no_line_number(self) -> None:
        """MB9: no-language block, unique in scope → name is 'code', no @L."""
        src = "# Section\n\n```\nplain text\n```\n"
        syms = _parse(src)
        # Should contain a key whose last component is exactly "code" (no lang, no @L)
        bare_keys = [k for k in syms if k.split("::")[-1].endswith(".code") or k.endswith("::code")]
        assert len(bare_keys) >= 1, f"expected a bare 'code' key; got {list(syms.keys())}"
        assert all("@L" not in k for k in bare_keys), (
            f"single no-lang block must not carry @L; got {bare_keys}"
        )

    def test_MB10_different_langs_in_same_section_each_clean(self) -> None:
        """MB10: python and bash blocks in same section → each gets clean name."""
        src = (
            "# Section\n\n"
            "```python\nprint('hi')\n```\n\n"
            "```bash\necho hi\n```\n"
        )
        syms = _parse(src)
        py_keys = [k for k in syms if "code[python]" in k]
        sh_keys = [k for k in syms if "code[bash]" in k]
        assert len(py_keys) == 1 and "@L" not in py_keys[0], (
            f"unique python block must have no @L; got {py_keys}"
        )
        assert len(sh_keys) == 1 and "@L" not in sh_keys[0], (
            f"unique bash block must have no @L; got {sh_keys}"
        )