From d60cf236593740b5d61f1ec531a20914789fdac4 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:18:44 -0400 Subject: [PATCH] Arm `dump_on_hang` on `test_stale_entry_is_deleted` Wrap the test's `trio.run(main)` in `dump_on_hang(seconds=20)` so any future hang regression captures a stack dump for triage instead of wedging CI silently; under the default backends it's a no-op safety net. Includes a "KNOWN ISSUE" comment block documenting the (future) `subint` backend hang classes observed against this test during Phase B bringup (#379). (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit 4a3254583b1168382314fdad1b268c2498bbf729) (factored: kept only the tests/discovery/test_registrar.py part of "Doc `subint` backend hang classes + arm `dump_on_hang`"; dropped subint conc-anal docs + tests/test_subint_cancellation.py) --- tests/discovery/test_registrar.py | 52 ++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/tests/discovery/test_registrar.py b/tests/discovery/test_registrar.py index 60b2b10c..6f34b117 100644 --- a/tests/discovery/test_registrar.py +++ b/tests/discovery/test_registrar.py @@ -14,6 +14,7 @@ import psutil import pytest import subprocess import tractor +from tractor.devx import dump_on_hang from tractor.trionics import collapse_eg from tractor._testing import tractor_test from tractor.discovery._addr import wrap_address @@ -562,4 +563,53 @@ def test_stale_entry_is_deleted( await ptl.cancel_actor() await an.cancel() - trio.run(main) + # TODO, remove once the `[subint]` variant no longer hangs. + # + # Status (as of Phase B hard-kill landing): + # + # - `[trio]`/`[mp_*]` variants: completes normally; `dump_on_hang` + # is a no-op safety net here. + # + # - `[subint]` variant: hangs indefinitely AND is un-Ctrl-C-able. + # `strace -p ` while in the hang reveals a silently- + # dropped SIGINT — the C signal handler tries to write the + # signum byte to Python's signal-wakeup fd and gets `EAGAIN`, + # meaning the pipe is full (nobody's draining it). + # + # Root-cause chain: our hard-kill in `spawn._subint` abandoned + # the driver OS-thread (which is `daemon=True`) after the soft- + # kill timeout, but the *sub-interpreter* inside that thread is + # still running `trio.run()` — `_interpreters.destroy()` can't + # force-stop a running subint (raises `InterpreterError`), and + # legacy-config subints share the main GIL. The abandoned subint + # starves the parent's trio event loop from iterating often + # enough to drain its wakeup pipe → SIGINT silently drops. + # + # This is structurally a CPython-level limitation: there's no + # public force-destroy primitive for a running subint. We + # escape on the harness side via a SIGINT-loop in the `daemon` + # fixture teardown (killing the bg registrar subproc closes its + # end of the IPC, which eventually unblocks a recv in main trio, + # which lets the loop drain the wakeup pipe). Long-term fix path: + # msgspec PEP 684 support (jcrist/msgspec#563) → isolated-mode + # subints with per-interp GIL. + # + # Full analysis: + # `ai/conc-anal/subint_sigint_starvation_issue.md` + # + # See also the *sibling* hang class documented in + # `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — same + # subint backend, different root cause (Ctrl-C-able hang, main + # trio loop iterating fine; ours to fix, not CPython's). + # Reproduced by `tests/test_subint_cancellation.py + # ::test_subint_non_checkpointing_child`. + # + # Kept here (and not behind a `pytestmark.skip`) so we can still + # inspect the dump file if the hang ever returns after a refactor. + # `pytest`'s stderr capture eats `faulthandler` output otherwise, + # so we route `dump_on_hang` to a file. + with dump_on_hang( + seconds=20, + path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump', + ): + trio.run(main)