gabriel / muse public

test_markdown_adapter.py file-level

at main · View file ↗ · Intel ↗

History
1 files
1 commits
0 hotspots
0 🧊 dead
0 💥 blast risk
sha256:4 Merge branch 'dev' into main · gabriel · Jun 17, 2026
1 """Tests for the rewritten MarkdownAdapter.
2
3 Coverage:
4 - Extension routing: only .md / .rst / .txt are accepted.
5 - Section symbols: flat headings, hierarchical qualified names, level encoding.
6 - Content-ID correctness: full section bytes hashed, not just heading text.
7 - Body-hash / signature split: retitle detection, level-change detection.
8 - Code block symbols: language tag, no-language fallback, content hash.
9 - GFM table symbols: header signature, data-row body_hash, schema changes.
10 - Inline markup stripping: bold, italic, inline-code, links in headings.
11 - Deduplication: identical sibling headings get @L{lineno} suffix.
12 - Depth limit: sections beyond _MAX_DEPTH are silently dropped.
13 - Edge cases: empty file, no headings, setext headings (unsupported → skip).
14 - Real-world shape: README-shaped document exercises all three emitters.
15 - _plain_heading unit tests: images dropped, markup stripped, truncation.
16 """
17
18 from __future__ import annotations
19
20 import pytest
21 from muse.plugins.code.ast_parser import (
22 MarkdownAdapter,
23 SymbolRecord,
24 SymbolTree,
25 _plain_heading,
26 )
27
28
29 # ---------------------------------------------------------------------------
30 # Helpers
31 # ---------------------------------------------------------------------------
32
def _parse(source: str, path: str = "README.md") -> SymbolTree:
    """Run *source* through a fresh ``MarkdownAdapter`` and return its symbols.

    The text is encoded to bytes before being handed to ``parse_symbols``.
    The calling test is skipped when the adapter has no parser attached.
    """
    md_adapter = MarkdownAdapter()
    parser_missing = md_adapter._parser is None
    if parser_missing:
        pytest.skip("tree-sitter-markdown not available")
    raw = source.encode()
    return md_adapter.parse_symbols(raw, path)
38
39
40 # ---------------------------------------------------------------------------
41 # _plain_heading unit tests
42 # ---------------------------------------------------------------------------
43
class TestPlainHeading:
    """Unit tests for ``_plain_heading``.

    Each test feeds one raw heading string to the helper and pins the plain
    text it is expected to return: inline markup removed, images dropped,
    HTML entities decoded, whitespace normalised, and long input truncated.
    """

    def test_plain_text_unchanged(self) -> None:
        assert _plain_heading("Hello World") == "Hello World"

    def test_bold_stripped(self) -> None:
        assert _plain_heading("**Bold** heading") == "Bold heading"

    def test_italic_star_stripped(self) -> None:
        assert _plain_heading("*italic* text") == "italic text"

    def test_bold_italic_combined(self) -> None:
        assert _plain_heading("***bold italic***") == "bold italic"

    def test_italic_underscore_stripped(self) -> None:
        assert _plain_heading("_italic_") == "italic"

    def test_bold_underscore_stripped(self) -> None:
        assert _plain_heading("__bold__") == "bold"

    def test_inline_code_stripped(self) -> None:
        assert _plain_heading("`code` block") == "code block"

    def test_triple_backtick_stripped(self) -> None:
        assert _plain_heading("```code```") == "code"

    def test_link_keeps_text(self) -> None:
        assert _plain_heading("[link text](https://example.com)") == "link text"

    def test_reference_link_keeps_text(self) -> None:
        assert _plain_heading("[link text][ref]") == "link text"

    def test_image_dropped_entirely(self) -> None:
        assert _plain_heading("![alt text](img.png) caption") == "caption"

    def test_reference_image_dropped(self) -> None:
        assert _plain_heading("![alt][ref] caption") == "caption"

    def test_html_entity_amp(self) -> None:
        # The input must carry the escaped entity; a bare "&" would pass even
        # if entity decoding were missing, making the test vacuous.
        assert _plain_heading("foo &amp; bar") == "foo & bar"

    def test_html_entity_lt_gt(self) -> None:
        assert _plain_heading("a &lt; b &gt; c") == "a < b > c"

    def test_html_entity_quot(self) -> None:
        assert _plain_heading("say &quot;hi&quot;") == 'say "hi"'

    def test_html_entity_apos(self) -> None:
        assert _plain_heading("it&#39;s") == "it's"

    def test_whitespace_collapsed(self) -> None:
        # Use runs of spaces both inside and around the text so that interior
        # collapsing is exercised, not just trimming of the outer whitespace.
        assert _plain_heading("  too   many   spaces  ") == "too many spaces"

    def test_truncation_at_120_chars(self) -> None:
        oversized = "A" * 200
        result = _plain_heading(oversized)
        assert len(result) == 120

    def test_empty_string(self) -> None:
        assert _plain_heading("") == ""

    def test_mixed_markup(self) -> None:
        # Realistic heading: "**API** `Reference` Guide"
        result = _plain_heading("**API** `Reference` Guide")
        assert result == "API Reference Guide"
108
109
110 # ---------------------------------------------------------------------------
111 # Extension routing
112 # ---------------------------------------------------------------------------
113
class TestExtensionRouting:
    """Checks which file extensions ``MarkdownAdapter`` reports as supported."""

    def test_md_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".md" in extensions

    def test_rst_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".rst" in extensions

    def test_txt_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".txt" in extensions

    def test_py_not_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".py" not in extensions

    def test_html_not_supported(self) -> None:
        extensions = MarkdownAdapter().supported_extensions()
        assert ".html" not in extensions
134
135
136 # ---------------------------------------------------------------------------
137 # Section symbols: flat headings
138 # ---------------------------------------------------------------------------
139
class TestFlatSections:
    """Section symbols produced for documents with a single heading."""

    def test_h1_emitted(self) -> None:
        symbols = _parse("# Hello\n\nContent.\n")
        assert any("Hello" in address for address in symbols)

    def test_h1_kind_is_section(self) -> None:
        symbols = _parse("# Hello\n\nContent.\n")
        address = next(a for a in symbols if "Hello" in a)
        assert symbols[address]["kind"] == "section"

    def test_h2_emitted(self) -> None:
        symbols = _parse("## Setup\n\nDo the thing.\n")
        assert any("Setup" in address for address in symbols)

    def test_h3_emitted(self) -> None:
        symbols = _parse("### Detail\n\nMore detail.\n")
        assert any("Detail" in address for address in symbols)

    def test_address_contains_file_path(self) -> None:
        symbols = _parse("# Hello\n", "docs/guide.md")
        prefix = "docs/guide.md::"
        assert any(address.startswith(prefix) for address in symbols)

    def test_lineno_is_one_based(self) -> None:
        symbols = _parse("# Hello\n\nContent.\n")
        address = next(a for a in symbols if "Hello" in a)
        # The heading sits on the first line of the document.
        assert symbols[address]["lineno"] == 1

    def test_end_lineno_greater_than_lineno(self) -> None:
        symbols = _parse("# Hello\n\nSome content.\n")
        address = next(a for a in symbols if "Hello" in a)
        record = symbols[address]
        assert record["end_lineno"] >= record["lineno"]

    def test_name_is_plain_text(self) -> None:
        symbols = _parse("# **Bold** Heading\n\nContent.\n")
        address = next(a for a in symbols if "Bold Heading" in a)
        # Bold markers must not leak into the recorded name.
        assert symbols[address]["name"] == "Bold Heading"
179
180
181 # ---------------------------------------------------------------------------
182 # Section symbols: hierarchy
183 # ---------------------------------------------------------------------------
184
class TestSectionHierarchy:
    """Qualified-name behaviour for nested and sibling headings."""

    def test_h2_under_h1_has_qualified_name(self) -> None:
        symbols = _parse("# Parent\n\n## Child\n\nText.\n")
        assert any("Parent.Child" in address for address in symbols)

    def test_h3_under_h2_under_h1(self) -> None:
        symbols = _parse("# A\n\n## B\n\n### C\n\nText.\n")
        assert any("A.B.C" in address for address in symbols)

    def test_sibling_h2s_are_distinct(self) -> None:
        symbols = _parse("# Root\n\n## Alpha\n\nFoo.\n\n## Beta\n\nBar.\n")
        assert any("Alpha" in address for address in symbols)
        assert any("Beta" in address for address in symbols)

    def test_h2_address_does_not_bleed_into_sibling(self) -> None:
        symbols = _parse("# Root\n\n## A\n\nFoo.\n\n## B\n\nBar.\n")
        # B is a sibling of A, never its child, so no address may contain "A.B".
        assert all("A.B" not in address for address in symbols)

    def test_parent_section_includes_child_in_content_id(self) -> None:
        nested = _parse("# Parent\n\n## Child\n\nText.\n")
        flat = _parse("# Parent\n\nText.\n")
        nested_key = next(a for a in nested if a.endswith("::Parent"))
        flat_key = next(a for a in flat if a.endswith("::Parent"))
        # The presence of a child section must show up in the parent's content_id.
        assert nested[nested_key]["content_id"] != flat[flat_key]["content_id"]

    def test_parallel_h2s_in_separate_h1_sections_dont_collide(self) -> None:
        symbols = _parse(
            "# Intro\n\n## Overview\n\nX.\n\n# Usage\n\n## Overview\n\nY.\n"
        )
        # Both Overview headings must be present under different addresses.
        overview = [address for address in symbols if "Overview" in address]
        assert len(overview) == 2
        first, second = overview
        assert first != second
225
226
227 # ---------------------------------------------------------------------------
228 # Content-ID correctness — the core bug fix
229 # ---------------------------------------------------------------------------
230
class TestContentIDCorrectness:
    """How section hashes react to body, title and heading-level edits."""

    @staticmethod
    def _record(symbols, needle: str):
        """Return the record of the first address that contains *needle*."""
        address = next(a for a in symbols if needle in a)
        return symbols[address]

    def test_changing_body_changes_content_id(self) -> None:
        before = _parse("# Intro\n\nFirst paragraph.\n")
        after = _parse("# Intro\n\nFirst paragraph changed entirely.\n")
        rec_before = self._record(before, "Intro")
        rec_after = self._record(after, "Intro")
        assert rec_before["content_id"] != rec_after["content_id"]

    def test_same_content_produces_same_content_id(self) -> None:
        text = "# Hello\n\nSame content.\n"
        first = _parse(text)
        second = _parse(text)
        # The address found in the first parse is reused to index the second.
        address = next(a for a in first if "Hello" in a)
        assert first[address]["content_id"] == second[address]["content_id"]

    def test_adding_paragraph_changes_content_id(self) -> None:
        before = _parse("# Section\n\nParagraph one.\n")
        after = _parse("# Section\n\nParagraph one.\n\nParagraph two.\n")
        rec_before = self._record(before, "Section")
        rec_after = self._record(after, "Section")
        assert rec_before["content_id"] != rec_after["content_id"]

    def test_heading_retitle_changes_content_id(self) -> None:
        before = _parse("# Old Title\n\nSame body.\n")
        after = _parse("# New Title\n\nSame body.\n")
        # The titles differ, so the addresses differ; look each one up by title.
        rec_before = self._record(before, "Old Title")
        rec_after = self._record(after, "New Title")
        # The heading text changed, so the content_id must change too.
        assert rec_before["content_id"] != rec_after["content_id"]

    def test_retitle_with_same_body_has_same_body_hash(self) -> None:
        """A retitle keeps body_hash stable while signature_id changes."""
        before = _parse("# Old Title\n\nIdentical body content.\n")
        after = _parse("# New Title\n\nIdentical body content.\n")
        rec_before = self._record(before, "Old Title")
        rec_after = self._record(after, "New Title")
        # Same text below the heading → same body_hash.
        assert rec_before["body_hash"] == rec_after["body_hash"]
        # Different heading text → different signature_id.
        assert rec_before["signature_id"] != rec_after["signature_id"]

    def test_level_change_changes_metadata_id(self) -> None:
        """A heading-level change shows up in metadata_id, not in body_hash."""
        as_h2 = _parse("## Section\n\nBody.\n")
        as_h1 = _parse("# Section\n\nBody.\n")
        rec_h2 = self._record(as_h2, "Section")
        rec_h1 = self._record(as_h1, "Section")
        assert rec_h2["metadata_id"] != rec_h1["metadata_id"]
        # The body is untouched, so body_hash must agree.
        assert rec_h2["body_hash"] == rec_h1["body_hash"]

    def test_level_change_changes_signature_id(self) -> None:
        as_h2 = _parse("## Section\n\nBody.\n")
        as_h1 = _parse("# Section\n\nBody.\n")
        rec_h2 = self._record(as_h2, "Section")
        rec_h1 = self._record(as_h1, "Section")
        assert rec_h2["signature_id"] != rec_h1["signature_id"]
301
302
303 # ---------------------------------------------------------------------------
304 # Fenced code blocks
305 # ---------------------------------------------------------------------------
306
class TestCodeBlockSymbols:
    """Symbols emitted for fenced code blocks."""

    def test_python_block_emitted(self) -> None:
        symbols = _parse("# Section\n\n```python\nprint('hello')\n```\n")
        assert any("code[python]" in address for address in symbols)

    def test_code_block_kind_is_variable(self) -> None:
        symbols = _parse("# Section\n\n```python\nprint('hello')\n```\n")
        address = next(a for a in symbols if "code[python]" in a)
        assert symbols[address]["kind"] == "variable"

    def test_no_language_block_emitted(self) -> None:
        symbols = _parse("# Section\n\n```\nplain text\n```\n")
        bare_suffixes = (".code", "::code")
        assert any(address.endswith(bare_suffixes) for address in symbols)

    def test_no_language_not_in_symbol_name(self) -> None:
        symbols = _parse("# Section\n\n```\nplain text\n```\n")
        # An untagged fence must never be rendered with empty brackets.
        assert all("code[]" not in address for address in symbols)

    def test_code_block_scoped_to_section(self) -> None:
        symbols = _parse("# Intro\n\n```python\nx = 1\n```\n")
        # The block's address has to mention the section that contains it.
        assert any(
            "Intro" in address and "code[python]" in address
            for address in symbols
        )

    def test_code_content_change_changes_content_id(self) -> None:
        before = _parse("# S\n\n```python\nx = 1\n```\n")
        after = _parse("# S\n\n```python\nx = 2\n```\n")
        addr_before = next(a for a in before if "code[python]" in a)
        addr_after = next(a for a in after if "code[python]" in a)
        assert before[addr_before]["content_id"] != after[addr_after]["content_id"]

    def test_lang_change_changes_signature_id(self) -> None:
        as_python = _parse("# S\n\n```python\nx = 1\n```\n")
        as_js = _parse("# S\n\n```javascript\nx = 1\n```\n")
        addr_python = next(a for a in as_python if "code[python]" in a)
        addr_js = next(a for a in as_js if "code[javascript]" in a)
        assert (
            as_python[addr_python]["signature_id"]
            != as_js[addr_js]["signature_id"]
        )

    def test_lang_tag_is_lowercased(self) -> None:
        symbols = _parse("# S\n\n```Python\npass\n```\n")
        # "Python" in the fence must surface as "python" in the symbol name.
        assert any("code[python]" in address for address in symbols)

    def test_multiple_code_blocks_are_distinct(self) -> None:
        document = "".join(
            [
                "# Section\n\n",
                "```python\nblock_one = 1\n```\n\n",
                "```python\nblock_two = 2\n```\n",
            ]
        )
        symbols = _parse(document)
        python_blocks = [a for a in symbols if "code[python]" in a]
        assert len(python_blocks) == 2
        first, second = python_blocks
        assert first != second

    def test_code_block_lineno_populated(self) -> None:
        symbols = _parse("# Section\n\n```python\npass\n```\n")
        address = next(a for a in symbols if "code[python]" in a)
        assert symbols[address]["lineno"] > 0
376
377
378 # ---------------------------------------------------------------------------
379 # GFM pipe tables
380 # ---------------------------------------------------------------------------
381
class TestTableSymbols:
    """Symbols emitted for GFM pipe tables."""

    _TABLE_SRC = (
        "# Section\n\n"
        "| Name | Value |\n"
        "| ---- | ----- |\n"
        "| foo | 1 |\n"
        "| bar | 2 |\n"
    )

    # An address counts as a table when it ends with one of these suffixes.
    _TABLE_SUFFIXES = (".table", "::table")

    def _is_table(self, address: str) -> bool:
        """Tell whether *address* ends with a table suffix."""
        return address.endswith(self._TABLE_SUFFIXES)

    def _table_record(self, symbols):
        """Return the record of the first table address in *symbols*."""
        address = next(a for a in symbols if self._is_table(a))
        return symbols[address]

    def test_table_emitted(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        assert any(self._is_table(address) for address in symbols)

    def test_table_kind_is_section(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        assert self._table_record(symbols)["kind"] == "section"

    def test_table_scoped_to_section(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        assert any(
            "Section" in address and self._is_table(address)
            for address in symbols
        )

    def test_adding_data_row_changes_content_id(self) -> None:
        header = "# S\n\n"
        one_row = header + "| A | B |\n| - | - |\n| 1 | 2 |\n"
        two_rows = header + "| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n"
        before = _parse(one_row)
        after = _parse(two_rows)
        rec_before = self._table_record(before)
        rec_after = self._table_record(after)
        assert rec_before["content_id"] != rec_after["content_id"]

    def test_adding_data_row_changes_body_hash(self) -> None:
        before = _parse("# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n")
        after = _parse("# S\n\n| A | B |\n| - | - |\n| 1 | 2 |\n| 3 | 4 |\n")
        rec_before = self._table_record(before)
        rec_after = self._table_record(after)
        assert rec_before["body_hash"] != rec_after["body_hash"]

    def test_column_rename_changes_signature_id(self) -> None:
        before = _parse("# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n")
        after = _parse("# S\n\n| Label | Value |\n| ----- | ----- |\n| x | 1 |\n")
        rec_before = self._table_record(before)
        rec_after = self._table_record(after)
        assert rec_before["signature_id"] != rec_after["signature_id"]

    def test_column_rename_does_not_change_body_hash(self) -> None:
        """A renamed column header must leave body_hash untouched."""
        before = _parse("# S\n\n| Name | Value |\n| ---- | ----- |\n| x | 1 |\n")
        after = _parse("# S\n\n| Label | Value |\n| ------ | ----- |\n| x | 1 |\n")
        rec_before = self._table_record(before)
        rec_after = self._table_record(after)
        # Identical data rows → identical body_hash.
        assert rec_before["body_hash"] == rec_after["body_hash"]

    def test_table_lineno_populated(self) -> None:
        symbols = _parse(self._TABLE_SRC)
        assert self._table_record(symbols)["lineno"] > 0
452
453
454 # ---------------------------------------------------------------------------
455 # Inline markup stripping — address stability
456 # ---------------------------------------------------------------------------
457
class TestInlineMarkupStripping:
    """Heading markup must not leak into symbol addresses or names."""

    def test_bold_heading_address_matches_plain(self) -> None:
        bold_symbols = _parse("# **Setup**\n\nContent.\n")
        plain_symbols = _parse("# Setup\n\nContent.\n")
        # Each parse has to yield an address mentioning "Setup".
        assert any("Setup" in address for address in bold_symbols)
        assert any("Setup" in address for address in plain_symbols)
        # The recorded name must be the same with or without the bold markers.
        bold_key = next(a for a in bold_symbols if "Setup" in a)
        plain_key = next(a for a in plain_symbols if "Setup" in a)
        assert bold_symbols[bold_key]["name"] == plain_symbols[plain_key]["name"]

    def test_inline_code_heading_stripped(self) -> None:
        symbols = _parse("# `muse init` Command\n\nContent.\n")
        assert any("muse init Command" in address for address in symbols)

    def test_link_heading_keeps_text(self) -> None:
        symbols = _parse(
            "# [API Reference](https://example.com/api)\n\nContent.\n"
        )
        assert any("API Reference" in address for address in symbols)

    def test_image_in_heading_dropped(self) -> None:
        symbols = _parse("# ![logo](logo.png) Intro\n\nContent.\n")
        # "Intro" survives while every trace of the image target is gone.
        assert any("Intro" in address for address in symbols)
        assert all("logo.png" not in address for address in symbols)
488
489
490 # ---------------------------------------------------------------------------
491 # Deduplication
492 # ---------------------------------------------------------------------------
493
class TestDeduplication:
    """Address disambiguation for headings that share a title."""

    # Two sibling H2 headings with identical text under one H1.
    _TWIN_EXAMPLES = "".join(
        [
            "# Root\n\n",
            "## Examples\n\nFirst.\n\n",
            "## Examples\n\nSecond.\n",
        ]
    )

    def test_two_identical_h2s_get_unique_addresses(self) -> None:
        symbols = _parse(self._TWIN_EXAMPLES)
        matches = [address for address in symbols if "Examples" in address]
        assert len(matches) == 2
        first, second = matches
        assert first != second

    def test_deduplicated_key_contains_lineno(self) -> None:
        symbols = _parse(self._TWIN_EXAMPLES)
        matches = [address for address in symbols if "Examples" in address]
        # At least one of the twins has to carry the @L marker.
        assert any("@L" in address for address in matches)

    def test_identical_headings_in_different_parents_not_deduped(self) -> None:
        document = (
            "# Alpha\n\n## Notes\n\nFoo.\n\n"
            "# Beta\n\n## Notes\n\nBar.\n"
        )
        symbols = _parse(document)
        matches = [address for address in symbols if "Notes" in address]
        assert len(matches) == 2
        # The parent prefix already separates them: Alpha.Notes and Beta.Notes.
        assert any("Alpha.Notes" in address for address in matches)
        assert any("Beta.Notes" in address for address in matches)
528
529
530 # ---------------------------------------------------------------------------
531 # Depth limit
532 # ---------------------------------------------------------------------------
533
class TestDepthLimit:
    """Behaviour of section extraction for shallow and very deep heading runs."""

    def test_deep_nesting_does_not_crash(self) -> None:
        # Emit twenty heading-like lines carrying 1..20 leading '#' characters.
        # NOTE(review): CommonMark only recognises ATX headings with 1-6 '#',
        # so lines 7-20 are presumably parsed as paragraphs rather than deeper
        # sections — confirm whether this input can actually reach _MAX_DEPTH.
        levels = ["#" * i + f" Level{i}\n\nText.\n\n" for i in range(1, 21)]
        src = "".join(levels)
        # Must not raise; the number of returned symbols is not constrained.
        syms = _parse(src)
        assert isinstance(syms, dict)

    def test_symbols_within_limit_are_extracted(self) -> None:
        # Only three levels of nesting: every one of them must be extracted.
        src = "# A\n\n## A B\n\n### A B C\n\nText.\n"
        syms = _parse(src)
        # Compare qualified names rather than testing '"A" in key': every
        # address starts with the default "README.md" path, which itself
        # contains "A", so the substring check could never fail.
        qualified = {key.split("::", 1)[-1] for key in syms}
        expected = {"A", "A.A B", "A.A B.A B C"}
        assert expected <= qualified, (
            f"missing sections: {sorted(expected - qualified)}; got {sorted(qualified)}"
        )
548
549
550 # ---------------------------------------------------------------------------
551 # Edge cases
552 # ---------------------------------------------------------------------------
553
class TestEdgeCases:
    """Degenerate and unusual inputs: empty, heading-free, binary-like, oversized."""

    @staticmethod
    def _adapter_or_skip() -> MarkdownAdapter:
        """Return a fresh adapter, skipping the test when it has no parser."""
        md_adapter = MarkdownAdapter()
        if md_adapter._parser is None:
            pytest.skip("tree-sitter-markdown not available")
        return md_adapter

    def test_empty_file_returns_empty(self) -> None:
        md_adapter = self._adapter_or_skip()
        symbols = md_adapter.parse_symbols(b"", "empty.md")
        assert symbols == {}

    def test_no_headings_returns_empty(self) -> None:
        symbols = _parse("Just a paragraph with no headings.\n")
        assert symbols == {}

    def test_only_horizontal_rule_returns_empty(self) -> None:
        symbols = _parse("---\n")
        assert symbols == {}

    def test_binary_like_content_does_not_crash(self) -> None:
        md_adapter = self._adapter_or_skip()
        # The leading bytes are not valid UTF-8; parsing must still not raise.
        payload = b"\xff\xfe# Title\n"
        symbols = md_adapter.parse_symbols(payload, "weird.md")
        assert isinstance(symbols, dict)

    def test_very_long_heading_truncated_in_name(self) -> None:
        heading_text = "Word " * 50  # 250 chars
        symbols = _parse(f"# {heading_text}\n\nContent.\n")
        assert len(symbols) == 1
        (record,) = symbols.values()
        # The recorded name is capped at 120 characters.
        assert len(record["name"]) <= 120

    def test_file_content_id_changes_on_any_change(self) -> None:
        md_adapter = MarkdownAdapter()
        original = b"# Hello\n\nWorld.\n"
        altered = b"# Hello\n\nWorld. "  # trailing space
        id_original = md_adapter.file_content_id(original)
        id_altered = md_adapter.file_content_id(altered)
        assert id_original != id_altered

    def test_file_content_id_is_hex_sha256(self) -> None:
        prefix = "sha256:"
        hex_alphabet = "0123456789abcdef"
        cid = MarkdownAdapter().file_content_id(b"# Hello\n")
        assert cid.startswith(prefix)
        assert len(cid) == 71
        digest = cid[len(prefix):]
        assert all(ch in hex_alphabet for ch in digest)

    def test_headings_only_no_body(self) -> None:
        symbols = _parse("# Title\n## Subtitle\n")
        assert any("Title" in address for address in symbols)

    def test_code_block_at_root_level(self) -> None:
        """A code block outside every section is still emitted."""
        symbols = _parse("```python\nprint('hi')\n```\n")
        # No parent section exists, yet the block must appear.
        assert any("code[python]" in address for address in symbols)

    def test_table_at_root_level(self) -> None:
        symbols = _parse("| A | B |\n| - | - |\n| 1 | 2 |\n")
        table_suffixes = (".table", "::table")
        assert any(address.endswith(table_suffixes) for address in symbols)
619
620
621 # ---------------------------------------------------------------------------
622 # Real-world README shape
623 # ---------------------------------------------------------------------------
624
class TestRealWorldShape:
    """End-to-end checks against a README-shaped document.

    The fixture mixes nested headings, two fenced code blocks and one pipe
    table, so sections, code blocks and tables are all exercised together.
    """

    _README = """\
# Muse

A domain-agnostic version control system.

## Installation

```bash
pip install muse-vcs
```

## Usage

Run `muse init` to initialise a repository.

### Commands

| Command | Description |
| -------------- | ------------------------- |
| `muse init` | Initialise a repository |
| `muse commit` | Record a new snapshot |
| `muse log` | Show commit history |

## API Reference

### `muse.core.snapshot`

Snapshot hashing and workdir diffing.

```python
from muse.core import snapshot
snap = snapshot.build(root)
```

## Contributing

See CONTRIBUTING.md for guidelines.
"""

    def test_top_level_sections_extracted(self) -> None:
        symbols = _parse(self._README)
        # Take the part after the last "::" of every address that has one.
        tails = [
            address.split("::")[-1] for address in symbols if "::" in address
        ]
        assert any("Muse" in tail for tail in tails)

    def test_installation_section_extracted(self) -> None:
        symbols = _parse(self._README)
        assert any("Installation" in address for address in symbols)

    def test_usage_commands_table_extracted(self) -> None:
        symbols = _parse(self._README)
        table_suffixes = (".table", "::table")
        assert any(address.endswith(table_suffixes) for address in symbols)

    def test_bash_code_block_extracted(self) -> None:
        symbols = _parse(self._README)
        assert any("code[bash]" in address for address in symbols)

    def test_python_code_block_extracted(self) -> None:
        symbols = _parse(self._README)
        assert any("code[python]" in address for address in symbols)

    def test_api_reference_subsection_extracted(self) -> None:
        symbols = _parse(self._README)
        assert any("API Reference" in address for address in symbols)

    def test_all_symbol_records_have_required_keys(self) -> None:
        symbols = _parse(self._README)
        required = {
            "kind",
            "name",
            "qualified_name",
            "content_id",
            "body_hash",
            "signature_id",
            "metadata_id",
            "canonical_key",
            "lineno",
            "end_lineno",
        }
        for addr, record in symbols.items():
            missing = required.difference(record.keys())
            assert not missing, f"{addr!r} missing keys: {missing}"

    def test_no_symbol_has_empty_content_id(self) -> None:
        symbols = _parse(self._README)
        for addr, record in symbols.items():
            assert record["content_id"], f"{addr!r} has empty content_id"

    def test_all_linenos_positive(self) -> None:
        symbols = _parse(self._README)
        for addr, rec in symbols.items():
            assert rec["lineno"] > 0, f"{addr!r} lineno={rec['lineno']}"

    def test_all_end_linenos_gte_lineno(self) -> None:
        symbols = _parse(self._README)
        for addr, rec in symbols.items():
            assert rec["end_lineno"] >= rec["lineno"], (
                f"{addr!r} end_lineno={rec['end_lineno']} < lineno={rec['lineno']}"
            )

    def test_contributing_section_extracted(self) -> None:
        symbols = _parse(self._README)
        assert any("Contributing" in address for address in symbols)

    def test_commands_subsection_qualified_under_usage(self) -> None:
        symbols = _parse(self._README)
        # "Commands" is nested below "Usage", so some address must contain
        # the dotted form "Usage.Commands".
        assert any("Usage.Commands" in address for address in symbols)
727
728
729 # ---------------------------------------------------------------------------
730 # MB Block naming stability — line number as collision suffix only
731 #
732 # A code block or table that is unique within its scope (section prefix)
733 # must NOT carry @L{n} in its name. Line numbers are unstable: moving a
734 # block by one line would change its address and produce a spurious
735 # delete+add diff instead of recognising it as the same block.
736 #
737 # @L{n} is still appended when two blocks of the same language (or two
738 # tables) exist within the same section — it acts as a disambiguator, not
739 # a primary identifier.
740 #
741 # MB1 Single code block in section → name is code[lang], no @L
742 # MB2 Two same-lang blocks in same section → both get @L
743 # MB3 Three same-lang blocks in same section → all three get @L
744 # MB4 Single code block at document root → code[lang], no @L
745 # MB5 Same lang in different sections → each gets clean name (no @L)
746 # MB6 Single table in section → name is "table", no @L
747 # MB7 Two tables in same section → both get @L
748 # MB8 Moving a block (line shift) → address unchanged (stability guarantee)
749 # MB9 No-language block, unique → name is "code", no @L
750 # MB10 Different langs in same section → each gets clean name
751 # ---------------------------------------------------------------------------
752
753
class TestBlockNamingStability:
    """MB1-MB10: the ``@L`` suffix appears only when block names collide.

    A code block or table that is unique within its section must have an
    address without ``@L``; when several blocks of the same language (or
    several tables) share a section, every one of them carries ``@L``.
    """

    def test_MB1_single_code_block_no_line_number(self) -> None:
        """MB1: single python block in a section → code[python], no @L."""
        src = "# Setup\n\n```python\nprint('hi')\n```\n"
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 1
        assert "@L" not in code_keys[0], (
            f"single code block must not carry @L; got {code_keys[0]!r}"
        )

    def test_MB2_two_same_lang_blocks_both_get_line_number(self) -> None:
        """MB2: two python blocks in the same section → both names carry @L."""
        src = (
            "# Section\n\n"
            "```python\nfirst = 1\n```\n\n"
            "```python\nsecond = 2\n```\n"
        )
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 2, f"expected 2 code blocks, got {code_keys}"
        assert all("@L" in k for k in code_keys), (
            f"both blocks must carry @L when there are two; got {code_keys}"
        )

    def test_MB3_three_same_lang_blocks_all_get_line_number(self) -> None:
        """MB3: three python blocks in the same section → all carry @L."""
        src = (
            "# Section\n\n"
            "```python\na = 1\n```\n\n"
            "```python\nb = 2\n```\n\n"
            "```python\nc = 3\n```\n"
        )
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 3
        assert all("@L" in k for k in code_keys)

    def test_MB4_single_code_block_at_root_no_line_number(self) -> None:
        """MB4: code block at document root (no section) → code[python], no @L."""
        src = "```python\nprint('hi')\n```\n"
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 1
        assert "@L" not in code_keys[0], (
            f"root-level single block must not carry @L; got {code_keys[0]!r}"
        )

    def test_MB5_same_lang_in_different_sections_each_clean(self) -> None:
        """MB5: one python block per section → each gets code[python] with no @L."""
        src = (
            "# Alpha\n\n```python\na = 1\n```\n\n"
            "# Beta\n\n```python\nb = 2\n```\n"
        )
        syms = _parse(src)
        code_keys = [k for k in syms if "code[python]" in k]
        assert len(code_keys) == 2
        assert all("@L" not in k for k in code_keys), (
            f"blocks in different sections must not get @L; got {code_keys}"
        )

    def test_MB6_single_table_no_line_number(self) -> None:
        """MB6: single table in a section → name is 'table', no @L."""
        src = (
            "# Section\n\n"
            "| A | B |\n| - | - |\n| 1 | 2 |\n"
        )
        syms = _parse(src)
        table_keys = [k for k in syms if "table" in k]
        assert len(table_keys) == 1
        assert "@L" not in table_keys[0], (
            f"single table must not carry @L; got {table_keys[0]!r}"
        )

    def test_MB7_two_tables_in_same_section_both_get_line_number(self) -> None:
        """MB7: two tables in the same section → both names carry @L."""
        src = (
            "# Section\n\n"
            "| A | B |\n| - | - |\n| 1 | 2 |\n\n"
            "| X | Y |\n| - | - |\n| 3 | 4 |\n"
        )
        syms = _parse(src)
        table_keys = [k for k in syms if "table" in k]
        assert len(table_keys) == 2
        assert all("@L" in k for k in table_keys), (
            f"both tables must carry @L when there are two; got {table_keys}"
        )

    def test_MB8_moving_block_address_stable(self) -> None:
        """MB8: adding a paragraph above a code block (line shift) → address unchanged."""
        src_before = "# Section\n\n```python\nx = 1\n```\n"
        src_after = "# Section\n\nA new paragraph.\n\n```python\nx = 1\n```\n"
        syms_before = _parse(src_before)
        syms_after = _parse(src_after)
        key_before = next(k for k in syms_before if "code[python]" in k)
        key_after = next(k for k in syms_after if "code[python]" in k)
        # The failure message uses a real arrow; the previous text contained
        # mis-encoded bytes that rendered as garbage in test output.
        assert key_before == key_after, (
            f"address changed after line shift: {key_before!r} → {key_after!r}"
        )

    def test_MB9_no_language_single_block_no_line_number(self) -> None:
        """MB9: no-language block, unique in scope → name is 'code', no @L."""
        src = "# Section\n\n```\nplain text\n```\n"
        syms = _parse(src)
        # Keep the keys whose last component is exactly "code" (no lang, no @L).
        bare_keys = [
            k for k in syms
            if k.split("::")[-1].endswith(".code") or k.endswith("::code")
        ]
        assert len(bare_keys) >= 1, f"expected a bare 'code' key; got {list(syms.keys())}"
        assert all("@L" not in k for k in bare_keys), (
            f"single no-lang block must not carry @L; got {bare_keys}"
        )

    def test_MB10_different_langs_in_same_section_each_clean(self) -> None:
        """MB10: python and bash blocks in same section → each gets clean name."""
        src = (
            "# Section\n\n"
            "```python\nprint('hi')\n```\n\n"
            "```bash\necho hi\n```\n"
        )
        syms = _parse(src)
        py_keys = [k for k in syms if "code[python]" in k]
        sh_keys = [k for k in syms if "code[bash]" in k]
        assert len(py_keys) == 1 and "@L" not in py_keys[0], (
            f"unique python block must have no @L; got {py_keys}"
        )
        assert len(sh_keys) == 1 and "@L" not in sh_keys[0], (
            f"unique bash block must have no @L; got {sh_keys}"
        )