From 253e7cbd1cc161c82e9c58ba740c156eb4b110f5 Mon Sep 17 00:00:00 2001 From: goodboy Date: Wed, 22 Apr 2026 19:39:41 -0400 Subject: [PATCH] Add DRAFT `subint_forkserver` orphan-SIGINT test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier-4 test `test_orphaned_subactor_sigint_cleanup_DRAFT` documents an empirical SIGINT-delivery gap in the `subint_forkserver` backend: when the parent dies via `SIGKILL` (no IPC `Portal.cancel_actor()` possible) and `SIGINT` is sent to the orphan child, the child DOES NOT unwind — CPython's default `KeyboardInterrupt` is delivered to `threading.main_thread()`, whose tstate is dead in the post-fork child bc fork inherited the worker thread, not main. Trio running on the fork-inherited worker thread therefore never observes the signal. Marked `xfail(strict=True)` so the mark flips to XPASS→fail once the backend grows explicit SIGINT plumbing. Deats, - harness runs the failure-mode sequence out-of-process: 1. harness subprocess runs a fresh Python script that calls `try_set_start_method('subint_forkserver')` then opens a root actor + one `sleep_forever` subactor 2. parse `PARENT_READY=` + `CHILD_PID=` markers off harness `stdout` to confirm IPC handshake completed 3. `SIGKILL` the parent, `proc.wait()` to reap the zombie (otherwise `os.kill(pid, 0)` keeps reporting it alive) 4. assert the child survived the parent-reap (i.e. was actually orphaned, not reaped too) before moving on 5. 
`SIGINT` the orphan child, poll `os.kill(child_pid, 0)` every 100ms for up to 10s - supporting helpers: `_read_marker()` with per-proc bytes-buffer to carry partial lines across calls, `_process_alive()` liveness probe via `kill(pid, 0)` - Linux-only via `platform.system() != 'Linux'` skip — orphan-reparenting semantics don't generalize to other platforms - port offset (`reg_addr[1] + 17`) so the harness listener doesn't race concurrently-running backend tests - best-effort `finally:` cleanup: `SIGKILL` any still-alive pids + `proc.kill()` + bounded `proc.wait()` to avoid leaking orphans across the session Also, tier-4 header comment documents the cross-backend generalization path: applicable to any multi-process backend (`trio`, `mp_spawn`, `mp_forkserver`, `subint_forkserver`), NOT to plain `subint` (in-process subints have no orphan OS-child). Move path: lift harness into `tests/_orphan_harness.py`, parametrize on session `_spawn_method`, add `skipif _spawn_method == 'subint'`. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tests/spawn/test_subint_forkserver.py | 256 ++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) diff --git a/tests/spawn/test_subint_forkserver.py b/tests/spawn/test_subint_forkserver.py index bb601cfe..c38d0ad2 100644 --- a/tests/spawn/test_subint_forkserver.py +++ b/tests/spawn/test_subint_forkserver.py @@ -43,6 +43,12 @@ Gating from __future__ import annotations from functools import partial import os +import platform +import select +import signal +import subprocess +import sys +import time import pytest import trio @@ -327,3 +333,253 @@ def test_subint_forkserver_spawn_basic( deadline, ), ) + + +# ---------------------------------------------------------------- +# tier-4 DRAFT: orphaned-subactor SIGINT survivability +# +# Motivating question: with `subint_forkserver`, the child's +# `trio.run()` lives on the fork-inherited worker thread 
which +# is NOT `threading.main_thread()` — so trio cannot install its +# `signal.set_wakeup_fd`-based SIGINT handler. If the parent +# goes away via `SIGKILL` (no IPC `Portal.cancel_actor()` +# possible), does SIGINT on the orphan child cleanly tear it +# down via CPython's default `KeyboardInterrupt` delivery, or +# does it hang? +# +# Initial hypothesis (disproven by the test below): post-fork the +# child is effectively single-threaded (only the fork-worker +# tstate survived), so SIGINT → default handler → raises +# `KeyboardInterrupt` on the only thread — which happens to be +# the one driving trio's event loop — so trio observes it at +# the next checkpoint. If so, we'd be "fine" on this backend +# despite the missing trio SIGINT handler. +# +# Cross-backend generalization (decide once the gap is closed): +# - applicable to any backend whose subactors are separate OS +# processes: `trio`, `mp_spawn`, `mp_forkserver`, +# `subint_forkserver`. +# - NOT applicable to plain `subint` (subactors are in-process +# subinterpreters, no orphan child process to SIGINT). +# - move path: lift the harness script into +# `tests/_orphan_harness.py`, parametrize on the session's +# `_spawn_method`, add `skipif _spawn_method == 'subint'`. 
# ----------------------------------------------------------------


# Source of the out-of-process harness: a root actor plus one
# `sleep_forever` subactor. Each side announces itself on stdout
# with a `<MARKER>=<pid>` line which the test parses.
_ORPHAN_HARNESS_SCRIPT: str = '''
import os
import sys
import trio
import tractor
from tractor.spawn._spawn import try_set_start_method

async def _sleep_forever() -> None:
    print(f"CHILD_PID={os.getpid()}", flush=True)
    await trio.sleep_forever()

async def _main(reg_addr):
    async with (
        tractor.open_root_actor(registry_addrs=[reg_addr]),
        tractor.open_nursery() as an,
    ):
        portal = await an.run_in_actor(
            _sleep_forever,
            name="orphan-test-child",
        )
        print(f"PARENT_READY={os.getpid()}", flush=True)
        await trio.sleep_forever()

if __name__ == "__main__":
    backend = sys.argv[1]
    host = sys.argv[2]
    port = int(sys.argv[3])
    try_set_start_method(backend)
    trio.run(_main, (host, port))
'''


def _read_marker(
    proc: subprocess.Popen,
    marker: str,
    timeout: float,
    _buf: dict,
) -> str:
    '''
    Block until a newline-terminated `<marker>=<value>` line
    appears on `proc.stdout` and return `<value>` (decoded and
    whitespace-stripped).

    `_buf` is a per-proc dict whose `'remainder'` entry carries
    all not-yet-consumed bytes across calls: partial lines AND
    complete lines which did not match `marker`. Keeping the
    non-matching lines matters bc the harness parent and child
    write to the same pipe with no ordering guarantee, so a later
    call for a different marker must still be able to find a
    line which arrived "too early".

    Raises `TimeoutError` if the marker is not seen within
    `timeout` seconds or the pipe hits EOF first; the message
    includes the unconsumed output to aid debugging.

    '''
    deadline: float = time.monotonic() + timeout
    pending: bytes = _buf.get('remainder', b'')
    prefix: bytes = f'{marker}='.encode()
    # read the raw fd directly (not the buffered file object) so
    # `select()` readiness and the bytes we consume stay in sync.
    fd: int = proc.stdout.fileno()
    hit_eof: bool = False
    while True:
        # scan every complete line currently buffered; `tail` is
        # the trailing partial line (possibly empty).
        *lines, tail = pending.split(b'\n')
        for idx, line in enumerate(lines):
            if line.startswith(prefix):
                # consume ONLY the matched line, keep the rest
                # for subsequent calls.
                leftover: list[bytes] = (
                    lines[:idx] + lines[idx + 1:] + [tail]
                )
                _buf['remainder'] = b'\n'.join(leftover)
                return line[len(prefix):].decode().strip()

        if time.monotonic() >= deadline:
            break

        ready, _, _ = select.select([fd], [], [], 0.2)
        if not ready:
            continue

        chunk: bytes = os.read(fd, 4096)
        if not chunk:
            # writer side closed; nothing more will ever arrive.
            hit_eof = True
            break

        pending += chunk

    _buf['remainder'] = pending
    why: str = (
        'stdout hit EOF' if hit_eof
        else f'within {timeout}s'
    )
    raise TimeoutError(
        f'Never observed marker {marker!r} on harness stdout '
        f'({why}); unconsumed output:\n'
        f'{pending.decode(errors="replace")}'
    )


def _process_alive(pid: int) -> bool:
    '''
    Liveness probe for a pid we do NOT parent (post-orphan).

    Sends the null signal via `os.kill(pid, 0)`; returns `False`
    only when the kernel reports no such process.

    '''
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM: the pid exists but we may not signal it, which
        # still means "alive" for probing purposes.
        return True
    return True


@pytest.mark.xfail(
    strict=True,
    reason=(
        'subint_forkserver orphan-child SIGINT gap: trio on the '
        'fork-inherited non-main thread has no SIGINT wakeup-fd '
        'handler installed, and CPython\'s default '
        'KeyboardInterrupt delivery does NOT reach the trio '
        'loop on this thread post-fork. Fix path TBD — see '
        'TODO in `tractor.spawn._subint_forkserver`. Flip this '
        'mark (or drop it) once the gap is closed.'
    ),
)
@pytest.mark.timeout(60, method='thread')
def test_orphaned_subactor_sigint_cleanup_DRAFT(
    reg_addr: tuple[str, int | str],
    tmp_path,
) -> None:
    '''
    DRAFT — orphaned-subactor SIGINT survivability under the
    `subint_forkserver` backend.

    Sequence:
      1. Spawn a harness subprocess that brings up a root
         actor + one `sleep_forever` subactor via
         `subint_forkserver`.
      2. Read the harness's stdout for `PARENT_READY=<pid>`
         and `CHILD_PID=<pid>` markers (confirms the
         parent→child IPC handshake completed).
      3. `SIGKILL` the parent (no IPC cancel possible — the
         whole point of this test) and reap it.
      4. Assert the child survived, i.e. is truly orphaned.
      5. `SIGINT` the orphan child.
      6. Poll `os.kill(child_pid, 0)` for up to 10s — assert
         the child exits.

    Empirical result (2026-04): currently **FAILS** — the
    "post-fork single-thread → default KeyboardInterrupt lands
    on trio's thread" hypothesis from the class-A/B
    conc-anal discussion turned out to be wrong. SIGINT on the
    orphan child doesn't unwind the trio loop. Most likely
    CPython delivers `KeyboardInterrupt` specifically to
    `threading.main_thread()`, whose tstate is dead in the
    post-fork child (fork inherited the worker thread, not the
    original main thread). Marked `xfail(strict=True)` so the
    mark flips to XPASS→fail once the gap is closed and we'll
    know to drop the mark.

    '''
    if platform.system() != 'Linux':
        pytest.skip(
            'orphan-reparenting semantics only exercised on Linux'
        )

    script_path = tmp_path / '_orphan_harness.py'
    script_path.write_text(_ORPHAN_HARNESS_SCRIPT)

    # Offset the port so we don't race the session reg_addr with
    # any concurrently-running backend test's listener.
    host: str = reg_addr[0]
    port: int = int(reg_addr[1]) + 17

    proc: subprocess.Popen = subprocess.Popen(
        [
            sys.executable,
            str(script_path),
            'subint_forkserver',
            host,
            str(port),
        ],
        stdout=subprocess.PIPE,
        # merge stderr so harness tracebacks land in the same
        # stream the marker reader captures.
        stderr=subprocess.STDOUT,
    )
    parent_pid: int | None = None
    child_pid: int | None = None
    buf: dict = {}
    try:
        child_pid = int(_read_marker(proc, 'CHILD_PID', 15.0, buf))
        parent_pid = int(_read_marker(proc, 'PARENT_READY', 15.0, buf))

        # sanity: both alive before we start killing stuff
        assert _process_alive(parent_pid), (
            f'harness parent pid={parent_pid} gone before '
            f'SIGKILL — test premise broken'
        )
        assert _process_alive(child_pid), (
            f'orphan-candidate child pid={child_pid} gone '
            f'before test started'
        )

        # step 3: kill parent — no IPC cancel arrives at child.
        # `proc.wait()` reaps the zombie so it truly disappears
        # from the process table (otherwise `os.kill(pid, 0)`
        # keeps reporting it as alive).
        os.kill(parent_pid, signal.SIGKILL)
        try:
            proc.wait(timeout=3.0)
        except subprocess.TimeoutExpired:
            pytest.fail(
                f'harness parent pid={parent_pid} did not die '
                f'after SIGKILL — test premise broken'
            )

        # step 4: the child must have outlived the parent.
        assert _process_alive(child_pid), (
            f'child pid={child_pid} died along with parent — '
            f'did the parent reap it before SIGKILL took? '
            f'test premise requires an orphan.'
        )

        # step 5+6: SIGINT the orphan, poll for exit.
        os.kill(child_pid, signal.SIGINT)
        cleanup_deadline: float = time.monotonic() + 10.0
        while time.monotonic() < cleanup_deadline:
            if not _process_alive(child_pid):
                return  # <- success path
            time.sleep(0.1)

        pytest.fail(
            f'Orphan subactor (pid={child_pid}) did NOT exit '
            f'within 10s of SIGINT under `subint_forkserver` '
            f'→ trio on non-main thread did not observe the '
            f'default CPython KeyboardInterrupt; backend needs '
            f'explicit SIGINT plumbing.'
        )
    finally:
        # best-effort cleanup to avoid leaking orphans across
        # the test session regardless of outcome.
        for pid in (parent_pid, child_pid):
            # the `Popen`-owned pid is handled by `proc.kill()`
            # below, which is a no-op once the proc was reaped;
            # a raw `os.kill()` on it could instead hit an
            # unrelated process that recycled the pid.
            if pid is None or pid == proc.pid:
                continue
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        try:
            proc.kill()
        except OSError:
            pass
        try:
            proc.wait(timeout=2.0)
        except subprocess.TimeoutExpired:
            pass
        # release our end of the harness stdout pipe.
        if proc.stdout is not None:
            proc.stdout.close()