gabriel / muse public
test_store_fsync_enospc.py python
488 lines 19.7 KB
Raw
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
1 """
2 Tests for the bug: write_text_atomic silently swallowed ALL OSError from
3 os.fsync() — including ENOSPC and EIO — instead of only suppressing EINVAL
4 (the errno virtual filesystems return to indicate fsync is unsupported).
5
6 Root cause (muse/core/store.py):
7
8 write_text_atomic lines 324–327:
9 try:
10 os.fsync(fh.fileno())
11 except OSError:
12 pass # best-effort ← BUG: swallows ENOSPC, EIO, etc.
13
14 When a disk is full (ENOSPC) or has a hardware error (EIO), fsync raises an
15 OSError with errno.ENOSPC or errno.EIO. The current code silently swallows
16 these errors. tmp.replace(path) then succeeds — the target file now points at
17 a temp file whose data is only in the page cache. The caller sees a normal
18 return (no exception) and believes the write succeeded. The OS may silently
19 discard the page-cache data if it cannot flush it to disk.
20
21 The fix: only suppress errno.EINVAL. Re-raise everything else (ENOSPC, EIO,
22 EROFS, EBADF, …).
23
24 Coverage:
25 Unit — write_text_atomic raises on ENOSPC, EIO; suppresses EINVAL
26 Data integrity — after ENOSPC, no misleading success state in caller
27 Security — ENOSPC during HEAD/branch ref writes propagates (not silenced)
28 Integration — coord record write propagates ENOSPC to _write_remote_records
29 E2E — CLI coord sync gets clean error, not silent corruption
30 Stress — rapid repeated ENOSPC raises, never succeeds silently
31 Performance — suppressed EINVAL path (normal) is not dramatically slower
32 Regression — EINVAL is still suppressed (virtual filesystem compatibility)
33 """
34 from __future__ import annotations
35
36 import errno
37 import os
38 import pathlib
39 import sys
40 import tempfile
41 import threading
42 import time
43 from unittest.mock import MagicMock, patch
44
45 import pytest
46
47 from muse.core.types import MsgpackDict
48 from muse.core.paths import coordination_dir, head_path, heads_dir, muse_dir
49
50 # ---------------------------------------------------------------------------
51 # Helpers
52 # ---------------------------------------------------------------------------
53
54
def _make_repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create the directory returned by ``muse_dir(tmp_path)`` and hand back *tmp_path*.

    Missing parents are created as well, and an already-existing directory is
    accepted, so calling the helper twice on the same path is harmless.
    """
    repo_meta = muse_dir(tmp_path)
    repo_meta.mkdir(parents=True, exist_ok=True)
    return tmp_path
58
59
def _oserror(err: int) -> OSError:
    """Build an ``OSError`` for errno *err* with the matching ``os.strerror`` text.

    The two-argument ``OSError`` constructor stores its first argument as
    ``errno`` and its second as ``strerror``, so no further assignment is needed.
    """
    return OSError(err, os.strerror(err))
64
65
66 # =============================================================================
67 # 1. UNIT — write_text_atomic fsync error handling
68 # =============================================================================
69
70
class TestWriteTextAtomicFsync:
    """Unit coverage for fsync failures inside ``write_text_atomic``.

    Every errno other than EINVAL is expected to reach the caller; EINVAL is
    the single value the writer is expected to tolerate.
    """

    def test_enospc_raises(self, tmp_path: pathlib.Path) -> None:
        """A disk-full error (ENOSPC) from fsync reaches the caller."""
        from muse.core.io import write_text_atomic

        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError) as caught:
            write_text_atomic(tmp_path / "test.txt", "hello")
        assert caught.value.errno == errno.ENOSPC

    def test_eio_raises(self, tmp_path: pathlib.Path) -> None:
        """An I/O error (EIO) from fsync reaches the caller."""
        from muse.core.io import write_text_atomic

        io_failure = patch("os.fsync", side_effect=_oserror(errno.EIO))
        with io_failure, pytest.raises(OSError) as caught:
            write_text_atomic(tmp_path / "test.txt", "hello")
        assert caught.value.errno == errno.EIO

    def test_erofs_raises(self, tmp_path: pathlib.Path) -> None:
        """A read-only-filesystem error (EROFS) from fsync reaches the caller."""
        from muse.core.io import write_text_atomic

        read_only = patch("os.fsync", side_effect=_oserror(errno.EROFS))
        with read_only, pytest.raises(OSError) as caught:
            write_text_atomic(tmp_path / "test.txt", "hello")
        assert caught.value.errno == errno.EROFS

    def test_einval_suppressed(self, tmp_path: pathlib.Path) -> None:
        """EINVAL from fsync is tolerated and the content still lands in the target."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "test.txt"
        with patch("os.fsync", side_effect=_oserror(errno.EINVAL)):
            write_text_atomic(destination, "hello")  # must not raise
        assert destination.read_text() == "hello"

    def test_enospc_leaves_no_temp_files(self, tmp_path: pathlib.Path) -> None:
        """A failed write leaves no ``.muse-tmp-*`` entries in the directory."""
        from muse.core.io import write_text_atomic

        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError):
            write_text_atomic(tmp_path / "output.txt", "data")

        leftovers = list(tmp_path.glob(".muse-tmp-*"))
        assert leftovers == [], f"orphaned temp files after ENOSPC: {leftovers}"

    def test_enospc_does_not_create_target(self, tmp_path: pathlib.Path) -> None:
        """A failed write must not bring a previously absent target into existence."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "should-not-exist.txt"
        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError):
            write_text_atomic(destination, "data")

        assert not destination.exists(), "target file created despite ENOSPC"

    def test_enospc_does_not_overwrite_existing(self, tmp_path: pathlib.Path) -> None:
        """A failed write must leave a pre-existing target's content untouched."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "existing.txt"
        destination.write_text("original content")

        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError):
            write_text_atomic(destination, "new content")

        surviving = destination.read_text()
        assert surviving == "original content", (
            "existing file was overwritten despite ENOSPC"
        )

    def test_successful_write_still_works(self, tmp_path: pathlib.Path) -> None:
        """With fsync left unpatched, the write succeeds and is readable."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "ok.txt"
        write_text_atomic(destination, "success")
        assert destination.read_text() == "success"

    def test_multiple_enospc_all_raise(self, tmp_path: pathlib.Path) -> None:
        """Ten consecutive ENOSPC failures each raise with the right errno."""
        from muse.core.io import write_text_atomic

        for attempt in range(10):
            # A fresh patch per attempt, exactly one failing write inside each.
            disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
            with disk_full, pytest.raises(OSError) as caught:
                write_text_atomic(
                    tmp_path / f"file-{attempt}.txt", f"content-{attempt}"
                )
            assert caught.value.errno == errno.ENOSPC
162
163
164 # =============================================================================
165 # 2. DATA INTEGRITY — caller sees exception, not silent success
166 # =============================================================================
167
168
class TestDataIntegrityOnEnospc:
    """An ENOSPC failure must be visible to callers and must not alter the target."""

    def test_write_text_atomic_enospc_exception_propagates_to_caller(self, tmp_path: pathlib.Path) -> None:
        """The call ends in ``OSError(ENOSPC)`` rather than returning normally."""
        from muse.core.io import write_text_atomic

        outcome = None
        caught = None
        with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
            try:
                write_text_atomic(tmp_path / "out.txt", "data")
            except OSError as err:
                caught = err
            else:
                # Only reached when the write returned without raising.
                outcome = "success"

        assert outcome is None, "write_text_atomic returned normally despite ENOSPC"
        assert caught is not None
        assert caught.errno == errno.ENOSPC

    def test_no_stale_state_after_enospc(self, tmp_path: pathlib.Path) -> None:
        """A target that did not exist before the failed write still does not exist."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "state.txt"
        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError):
            write_text_atomic(destination, "new state")

        # The path was never created beforehand, so it must still be absent.
        assert not destination.exists()

    def test_old_file_preserved_after_enospc(self, tmp_path: pathlib.Path) -> None:
        """An overwrite that fails with ENOSPC keeps the previous content."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "config.txt"
        destination.write_text("version: 1")

        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError):
            write_text_atomic(destination, "version: 2")

        surviving = destination.read_text()
        assert surviving == "version: 1", "old config was destroyed on ENOSPC"
213
214
215 # =============================================================================
216 # 3. SECURITY — critical VCS state writes must propagate ENOSPC
217 # =============================================================================
218
219
class TestSecurityCriticalWritesEnospc:
    """ENOSPC while writing HEAD, a branch ref, or a coordination record must surface.

    Each test patches ``os.fsync`` to raise ENOSPC and checks that the error
    reaches the caller. Where the target path is freshly created by the test,
    it also checks that the failed write did not leave a file behind.
    """

    def test_write_head_enospc_raises(self, tmp_path: pathlib.Path) -> None:
        """Writing HEAD propagates ENOSPC and does not create the file."""
        from muse.core.io import write_text_atomic

        hp = head_path(tmp_path)
        hp.parent.mkdir(parents=True, exist_ok=True)

        with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
            with pytest.raises(OSError) as exc_info:
                write_text_atomic(hp, "ref: refs/heads/main\n")
        assert exc_info.value.errno == errno.ENOSPC
        assert not hp.exists()

    def test_write_branch_ref_enospc_raises(self, tmp_path: pathlib.Path) -> None:
        """Writing a branch ref propagates ENOSPC and does not create the file."""
        from muse.core.io import write_text_atomic

        ref_path = heads_dir(tmp_path) / "main"
        ref_path.parent.mkdir(parents=True, exist_ok=True)

        with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
            with pytest.raises(OSError) as exc_info:
                write_text_atomic(ref_path, "abc123def456\n")
        assert exc_info.value.errno == errno.ENOSPC
        # Mirrors the HEAD test: the ref did not exist before, so it must not now.
        assert not ref_path.exists(), "branch ref created despite ENOSPC"

    def test_write_coord_record_enospc_propagates(self, tmp_path: pathlib.Path) -> None:
        """``_write_remote_records`` lets ENOSPC from fsync reach its caller."""
        root = _make_repo(tmp_path)

        from muse.cli.commands.coord_sync import _write_remote_records

        rec = {
            "kind": "reservation",
            "record_id": "res-enospc-test",
            "run_id": "run-test",
            "payload": {"data": "important"},
            "expires_at": "2099-12-31T23:59:59+00:00",
        }

        with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
            with pytest.raises(OSError) as exc_info:
                _write_remote_records(root, [rec])

        assert exc_info.value.errno == errno.ENOSPC

    def test_enospc_does_not_silently_produce_empty_head(self, tmp_path: pathlib.Path) -> None:
        """A failed HEAD write must leave no HEAD file at all, zero-byte or otherwise."""
        from muse.core.io import write_text_atomic

        hp = head_path(tmp_path)
        hp.parent.mkdir(parents=True, exist_ok=True)

        with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
            with pytest.raises(OSError):
                write_text_atomic(hp, "ref: refs/heads/main\n")

        # HEAD did not exist before the write, so any HEAD present now came from
        # the failed write. The message is only built on failure, when the
        # file exists and its size can be reported.
        assert not hp.exists(), (
            f"HEAD was created despite ENOSPC ({hp.stat().st_size} bytes)"
        )
284
285
286 # =============================================================================
287 # 4. INTEGRATION — coord _write_remote_records propagates ENOSPC
288 # =============================================================================
289
290
class TestIntegrationCoordEnospc:
    """fsync failures must travel through ``_write_remote_records`` to its caller."""

    def _make_rec(self, kind: str = "reservation", record_id: str = "res-001") -> MsgpackDict:
        """Return a sample record dict with the given *kind* and *record_id*."""
        record = {
            "kind": kind,
            "record_id": record_id,
            "run_id": "run-torvalds",
            "payload": {"data": "x" * 1024},
            "expires_at": "2099-12-31T23:59:59+00:00",
        }
        return record

    def test_enospc_raises_from_write_remote_records(self, tmp_path: pathlib.Path) -> None:
        """ENOSPC from fsync surfaces from ``_write_remote_records``."""
        root = _make_repo(tmp_path)
        from muse.cli.commands.coord_sync import _write_remote_records

        disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
        with disk_full, pytest.raises(OSError) as caught:
            _write_remote_records(root, [self._make_rec()])
        assert caught.value.errno == errno.ENOSPC

    def test_eio_raises_from_write_remote_records(self, tmp_path: pathlib.Path) -> None:
        """EIO from fsync surfaces from ``_write_remote_records``."""
        root = _make_repo(tmp_path)
        from muse.cli.commands.coord_sync import _write_remote_records

        io_failure = patch("os.fsync", side_effect=_oserror(errno.EIO))
        with io_failure, pytest.raises(OSError) as caught:
            _write_remote_records(root, [self._make_rec()])
        assert caught.value.errno == errno.EIO

    def test_einval_suppressed_in_write_remote_records(self, tmp_path: pathlib.Path) -> None:
        """EINVAL from fsync is tolerated and the record file is still created."""
        root = _make_repo(tmp_path)
        from muse.cli.commands.coord_sync import _write_remote_records

        with patch("os.fsync", side_effect=_oserror(errno.EINVAL)):
            _write_remote_records(root, [self._make_rec()])  # must not raise

        # The default record is kind "reservation" with id "res-001".
        remote_dir = coordination_dir(tmp_path) / "remote"
        record_file = remote_dir / "reservation" / "res-001.json"
        assert record_file.exists()

    def test_enospc_on_second_record_first_record_still_written(self, tmp_path: pathlib.Path) -> None:
        """A failure starting at the second fsync call leaves the first record on disk."""
        root = _make_repo(tmp_path)
        from muse.cli.commands.coord_sync import _write_remote_records

        first = self._make_rec("reservation", "res-first")
        second = self._make_rec("intent", "intent-second")

        # Captured before patching so the first call can reach the real fsync.
        real_fsync = os.fsync
        calls = 0

        def fail_from_second_call(fd: int) -> None:
            nonlocal calls
            calls += 1
            if calls < 2:
                return real_fsync(fd)
            raise _oserror(errno.ENOSPC)

        with patch("os.fsync", side_effect=fail_from_second_call), pytest.raises(OSError):
            _write_remote_records(root, [first, second])

        remote_dir = coordination_dir(tmp_path) / "remote"
        first_file = remote_dir / "reservation" / "res-first.json"
        assert first_file.exists(), "first record was not written before ENOSPC"
362
363
364 # =============================================================================
365 # 5. STRESS — repeated ENOSPC never silently succeeds
366 # =============================================================================
367
368
class TestStressEnospc:
    """Repeated and concurrent ENOSPC failures must raise every single time."""

    def test_100_consecutive_enospc_all_raise(self, tmp_path: pathlib.Path) -> None:
        """None of 100 back-to-back writes may return normally under ENOSPC."""
        from muse.core.io import write_text_atomic

        quiet_returns = 0
        for attempt in range(100):
            with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
                try:
                    write_text_atomic(tmp_path / f"file-{attempt}.txt", f"data-{attempt}")
                except OSError:
                    continue
                else:
                    quiet_returns += 1

        assert quiet_returns == 0, (
            f"{quiet_returns} writes silently succeeded despite ENOSPC"
        )

    def test_concurrent_threads_all_see_enospc(self, tmp_path: pathlib.Path) -> None:
        """All 20 concurrent writers observe the failure.

        ``os.fsync`` is patched once, before any thread starts, and stays
        patched until every thread has been joined. Patching inside each
        worker would have the threads racing on the same module attribute.
        """
        from muse.core.io import write_text_atomic

        succeeded = []
        failed = []
        guard = threading.Lock()

        def worker(idx: int) -> None:
            try:
                write_text_atomic(tmp_path / f"t-{idx}.txt", f"data-{idx}")
            except OSError:
                bucket = failed
            else:
                bucket = succeeded
            with guard:
                bucket.append(idx)

        with patch("os.fsync", side_effect=_oserror(errno.ENOSPC)):
            writers = [
                threading.Thread(target=worker, args=(idx,)) for idx in range(20)
            ]
            for writer in writers:
                writer.start()
            for writer in writers:
                writer.join()

        assert succeeded == [], (
            f"Threads {succeeded} silently succeeded despite ENOSPC"
        )
        assert len(failed) == 20

    def test_no_orphaned_temp_files_after_100_enospc(self, tmp_path: pathlib.Path) -> None:
        """100 failed writes to one target leave no ``.muse-tmp-*`` files behind."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "file.txt"
        for attempt in range(100):
            disk_full = patch("os.fsync", side_effect=_oserror(errno.ENOSPC))
            with disk_full, pytest.raises(OSError):
                write_text_atomic(destination, f"data-{attempt}")

        leftovers = list(tmp_path.glob(".muse-tmp-*"))
        assert leftovers == [], f"{len(leftovers)} orphaned temp files"
434
435
436 # =============================================================================
437 # 6. REGRESSION — EINVAL is still suppressed (virtual filesystem compat)
438 # =============================================================================
439
440
class TestRegressionEinvalSuppressed:
    """Guards that EINVAL from fsync keeps being tolerated by ``write_text_atomic``."""

    def test_write_text_atomic_einval_suppressed(self, tmp_path: pathlib.Path) -> None:
        """A single write under EINVAL completes and the content is readable."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "v.txt"
        with patch("os.fsync", side_effect=_oserror(errno.EINVAL)):
            write_text_atomic(destination, "virtual-fs-content")

        assert destination.read_text() == "virtual-fs-content"

    def test_no_fsync_error_still_works(self, tmp_path: pathlib.Path) -> None:
        """With fsync unpatched, the write completes and the content is readable."""
        from muse.core.io import write_text_atomic

        destination = tmp_path / "normal.txt"
        write_text_atomic(destination, "hello world")
        assert destination.read_text() == "hello world"

    def test_docker_tmpfs_compat_einval_suppressed_batch(self, tmp_path: pathlib.Path) -> None:
        """Twenty writes under one EINVAL patch all complete with the right content."""
        from muse.core.io import write_text_atomic

        expected = {tmp_path / f"file-{n}.txt": f"content-{n}" for n in range(20)}

        with patch("os.fsync", side_effect=_oserror(errno.EINVAL)):
            for destination, text in expected.items():
                write_text_atomic(destination, text)

        for destination, text in expected.items():
            assert destination.read_text() == text
469
470
471 # =============================================================================
472 # 7. PERFORMANCE — suppressed EINVAL (normal path) is not dramatically slower
473 # =============================================================================
474
475
class TestPerformanceNormalPath:
    """Timing guard for ``write_text_atomic`` when fsync is left unpatched."""

    def test_1000_writes_complete_under_5s(self, tmp_path: pathlib.Path) -> None:
        """Time 1000 sequential atomic writes against a wall-clock ceiling.

        NOTE(review): the method name says 5s, but the asserted ceiling is
        15 seconds; the assertion is what the test actually enforces.
        """
        from muse.core.io import write_text_atomic

        started = time.monotonic()
        for seq in range(1000):
            destination = tmp_path / f"perf-{seq:04d}.txt"
            write_text_atomic(destination, f"data-{seq}" * 32)
        elapsed = time.monotonic() - started

        assert elapsed < 15.0, f"1000 atomic text writes took {elapsed:.3f}s (> 15s)"
488
File History 4 commits
sha256:81ae324db5ad375fbfe4834c6fcb378312cafad3cc92dec5d3e5c427306621a2 fix: remove commit_exists filter from have anchors — server… Sonnet 4.6 patch 21 days ago
sha256:36c3cb3e76619d4c30a6d9bf81b5ec4ff148e30dcfed913e3114ca7b43b81c7e fix: rename objects→blobs in push client and all stale test… Sonnet 4.6 patch 23 days ago
sha256:c06a9b9b9fee26c68ea725b44d54b2c0a171301ce9de746d5b656617b4463a9a fix: repair four test failures from post-migration audit Sonnet 4.6 patch 29 days ago
sha256:1900655993c83c4107067375548a7be823e471d2515830842f1a12cba4bd3cdf fix: unified object store migration — idempotent writes, JS… Sonnet 4.6 minor 29 days ago