Add DRAFT `subint_forkserver` orphan-SIGINT test
Tier-4 test `test_orphaned_subactor_sigint_cleanup_DRAFT`
documents an empirical SIGINT-delivery gap in the
`subint_forkserver` backend: when the parent dies via
`SIGKILL` (no IPC `Portal.cancel_actor()` possible) and
`SIGINT` is sent to the orphan child, the child DOES NOT
unwind — CPython's default `KeyboardInterrupt` is delivered
to `threading.main_thread()`, whose tstate is dead in the
post-fork child because fork inherited the worker thread, not
main. Trio running on the fork-inherited worker thread
therefore never observes the signal. Marked
`xfail(strict=True)` so the mark flips to XPASS→fail once
the backend grows explicit SIGINT plumbing.
Details:
- harness runs the failure-mode sequence out-of-process:
1. harness subprocess runs a fresh Python script
that calls `try_set_start_method('subint_forkserver')`
then opens a root actor + one `sleep_forever` subactor
2. parse `PARENT_READY=<pid>` + `CHILD_PID=<pid>` markers
off harness `stdout` to confirm IPC handshake
completed
3. `SIGKILL` the parent, `proc.wait()` to reap the
zombie (otherwise `os.kill(pid, 0)` keeps reporting
it alive)
4. assert the child survived the parent-reap (i.e. was
actually orphaned, not reaped too) before moving on
5. `SIGINT` the orphan child, poll `os.kill(child_pid, 0)`
every 100ms for up to 10s
- supporting helpers: `_read_marker()` with per-proc
bytes-buffer to carry partial lines across calls,
`_process_alive()` liveness probe via `kill(pid, 0)`
- Linux-only via `platform.system() != 'Linux'` skip —
orphan-reparenting semantics don't generalize to
other platforms
- port offset (`reg_addr[1] + 17`) so the harness listener
doesn't race concurrently-running backend tests
- best-effort `finally:` cleanup: `SIGKILL` any still-alive
pids + `proc.kill()` + bounded `proc.wait()` to avoid
leaking orphans across the session
Also, tier-4 header comment documents the cross-backend
generalization path: applicable to any multi-process
backend (`trio`, `mp_spawn`, `mp_forkserver`,
`subint_forkserver`), NOT to plain `subint` (in-process
subints have no orphan OS-child). Move path: lift
harness into `tests/_orphan_harness.py`, parametrize on
session `_spawn_method`, add
`skipif _spawn_method == 'subint'`.
(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver
parent
27cc02e83d
commit
253e7cbd1c
|
|
@ -43,6 +43,12 @@ Gating
|
|||
from __future__ import annotations
|
||||
from functools import partial
|
||||
import os
|
||||
import platform
|
||||
import select
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import trio
|
||||
|
|
@ -327,3 +333,253 @@ def test_subint_forkserver_spawn_basic(
|
|||
deadline,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# tier-4 DRAFT: orphaned-subactor SIGINT survivability
|
||||
#
|
||||
# Motivating question: with `subint_forkserver`, the child's
|
||||
# `trio.run()` lives on the fork-inherited worker thread which
|
||||
# is NOT `threading.main_thread()` — so trio cannot install its
|
||||
# `signal.set_wakeup_fd`-based SIGINT handler. If the parent
|
||||
# goes away via `SIGKILL` (no IPC `Portal.cancel_actor()`
|
||||
# possible), does SIGINT on the orphan child cleanly tear it
|
||||
# down via CPython's default `KeyboardInterrupt` delivery, or
|
||||
# does it hang?
|
||||
#
|
||||
# Working hypothesis (unverified pre-this-test): post-fork the
# child is effectively single-threaded (only the fork-worker
# tstate survived), so SIGINT → default handler → raises
# `KeyboardInterrupt` on the only thread — which happens to be
# the one driving trio's event loop — so trio observes it at
# the next checkpoint. If so, we're "fine" on this backend
# despite the missing trio SIGINT handler.
#
# UPDATE: the test below empirically refutes this hypothesis —
# the orphan child does NOT unwind on SIGINT, hence the
# `xfail(strict=True)` mark on the test (see its docstring).
|
||||
#
|
||||
# Cross-backend generalization (decide after this passes):
|
||||
# - applicable to any backend whose subactors are separate OS
|
||||
# processes: `trio`, `mp_spawn`, `mp_forkserver`,
|
||||
# `subint_forkserver`.
|
||||
# - NOT applicable to plain `subint` (subactors are in-process
|
||||
# subinterpreters, no orphan child process to SIGINT).
|
||||
# - move path: lift the harness script into
|
||||
# `tests/_orphan_harness.py`, parametrize on the session's
|
||||
# `_spawn_method`, add `skipif _spawn_method == 'subint'`.
|
||||
# ----------------------------------------------------------------
|
||||
|
||||
|
||||
_ORPHAN_HARNESS_SCRIPT: str = '''
|
||||
import os
|
||||
import sys
|
||||
import trio
|
||||
import tractor
|
||||
from tractor.spawn._spawn import try_set_start_method
|
||||
|
||||
async def _sleep_forever() -> None:
|
||||
print(f"CHILD_PID={os.getpid()}", flush=True)
|
||||
await trio.sleep_forever()
|
||||
|
||||
async def _main(reg_addr):
|
||||
async with (
|
||||
tractor.open_root_actor(registry_addrs=[reg_addr]),
|
||||
tractor.open_nursery() as an,
|
||||
):
|
||||
portal = await an.run_in_actor(
|
||||
_sleep_forever,
|
||||
name="orphan-test-child",
|
||||
)
|
||||
print(f"PARENT_READY={os.getpid()}", flush=True)
|
||||
await trio.sleep_forever()
|
||||
|
||||
if __name__ == "__main__":
|
||||
backend = sys.argv[1]
|
||||
host = sys.argv[2]
|
||||
port = int(sys.argv[3])
|
||||
try_set_start_method(backend)
|
||||
trio.run(_main, (host, port))
|
||||
'''
|
||||
|
||||
|
||||
def _read_marker(
    proc: subprocess.Popen,
    marker: str,
    timeout: float,
    _buf: dict,
) -> str:
    '''
    Block until `<marker>=<value>\\n` appears on `proc.stdout`
    and return `<value>` (decoded and whitespace-stripped).

    Uses a per-proc byte buffer (`_buf`, stored under the
    `'remainder'` key) to carry unconsumed output across calls.
    Only the matched line is removed from that buffer: any other
    complete lines (and a trailing partial line) are retained so
    that markers may be requested in a different order than the
    harness emitted them.

    Parameters:
    - `proc`: any object whose `.stdout` exposes `fileno()` and
      is accepted by `select.select()`.
    - `marker`: the name to the left of the `=`.
    - `timeout`: max seconds to wait for new output.
    - `_buf`: mutable dict owned by the caller, one per `proc`.

    Raises `TimeoutError` when the marker is not seen before the
    deadline, or when stdout hits EOF first.

    '''
    deadline: float = time.monotonic() + timeout
    pending: bytes = _buf.get('remainder', b'')
    prefix: bytes = f'{marker}='.encode()
    # offset of the first byte in `pending` not yet checked for
    # the marker; avoids rescanning old lines after each read.
    scanned: int = 0
    hit_eof: bool = False

    while True:
        # scan every complete line we have not looked at yet;
        # done before the deadline check so already-buffered
        # markers are found even with a tiny/expired timeout.
        while True:
            nl: int = pending.find(b'\n', scanned)
            if nl < 0:
                break
            line: bytes = pending[scanned:nl]
            if line.startswith(prefix):
                # drop ONLY the matched line, keep the rest for
                # later calls asking for other markers.
                _buf['remainder'] = (
                    pending[:scanned] + pending[nl + 1:]
                )
                return line[len(prefix):].decode().strip()
            scanned = nl + 1

        if hit_eof:
            break

        remaining: float = deadline - time.monotonic()
        if remaining <= 0:
            break

        # cap the poll interval so we never overshoot `deadline`
        ready, _, _ = select.select(
            [proc.stdout], [], [], min(0.2, remaining),
        )
        if not ready:
            continue

        chunk: bytes = os.read(proc.stdout.fileno(), 4096)
        if chunk:
            pending += chunk
        else:
            # writer side(s) closed: no more output will arrive
            hit_eof = True

    _buf['remainder'] = pending
    if hit_eof:
        raise TimeoutError(
            f'Harness stdout hit EOF before marker {marker!r} '
            f'was observed'
        )
    raise TimeoutError(
        f'Never observed marker {marker!r} on harness stdout '
        f'within {timeout}s'
    )
|
||||
|
||||
|
||||
def _process_alive(pid: int) -> bool:
    '''
    Liveness probe for a pid we do NOT parent (post-orphan).

    Sends the null signal (`kill(pid, 0)`), which performs the
    existence/permission checks without delivering anything.
    Returns `False` only when the OS reports no such process; a
    permission error still proves the pid exists, so that case
    is reported as alive rather than raised.

    '''
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM: the process exists but we may not signal it.
        return True
    return True
|
||||
|
||||
|
||||
@pytest.mark.xfail(
    strict=True,
    reason=(
        'subint_forkserver orphan-child SIGINT gap: trio on the '
        'fork-inherited non-main thread has no SIGINT wakeup-fd '
        'handler installed, and CPython\'s default '
        'KeyboardInterrupt delivery does NOT reach the trio '
        'loop on this thread post-fork. Fix path TBD — see '
        'TODO in `tractor.spawn._subint_forkserver`. Flip this '
        'mark (or drop it) once the gap is closed.'
    ),
)
@pytest.mark.timeout(60, method='thread')
def test_orphaned_subactor_sigint_cleanup_DRAFT(
    reg_addr: tuple[str, int | str],
    tmp_path,
) -> None:
    '''
    DRAFT — orphaned-subactor SIGINT survivability under the
    `subint_forkserver` backend.

    Sequence:
      1. Spawn a harness subprocess that brings up a root
         actor + one `sleep_forever` subactor via
         `subint_forkserver`.
      2. Read the harness's stdout for `PARENT_READY=<pid>`
         and `CHILD_PID=<pid>` markers (confirms the
         parent→child IPC handshake completed).
      3. `SIGKILL` the parent (no IPC cancel possible — the
         whole point of this test).
      4. `SIGINT` the orphan child.
      5. Poll `os.kill(child_pid, 0)` for up to 10s — assert
         the child exits.

    Empirical result (2026-04): currently **FAILS** — the
    "post-fork single-thread → default KeyboardInterrupt lands
    on trio's thread" hypothesis from the class-A/B
    conc-anal discussion turned out to be wrong. SIGINT on the
    orphan child doesn't unwind the trio loop. Most likely
    CPython delivers `KeyboardInterrupt` specifically to
    `threading.main_thread()`, whose tstate is dead in the
    post-fork child (fork inherited the worker thread, not the
    original main thread). Marked `xfail(strict=True)` so the
    mark flips to XPASS→fail once the gap is closed and we'll
    know to drop the mark.

    Fixtures:
    - `reg_addr`: `(host, port)` pair; the harness listens on
      `port + 17` of it.
    - `tmp_path`: directory the harness script is written to.

    '''
    if platform.system() != 'Linux':
        pytest.skip(
            'orphan-reparenting semantics only exercised on Linux'
        )

    script_path = tmp_path / '_orphan_harness.py'
    script_path.write_text(_ORPHAN_HARNESS_SCRIPT)

    # Offset the port so we don't race the session reg_addr with
    # any concurrently-running backend test's listener.
    host: str = reg_addr[0]
    port: int = int(reg_addr[1]) + 17

    proc: subprocess.Popen = subprocess.Popen(
        [
            sys.executable,
            str(script_path),
            'subint_forkserver',
            host,
            str(port),
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    parent_pid: int | None = None
    child_pid: int | None = None
    buf: dict = {}
    try:
        child_pid = int(_read_marker(proc, 'CHILD_PID', 15.0, buf))
        parent_pid = int(_read_marker(proc, 'PARENT_READY', 15.0, buf))

        # sanity: both alive before we start killing stuff
        assert _process_alive(parent_pid), (
            f'harness parent pid={parent_pid} gone before '
            f'SIGKILL — test premise broken'
        )
        assert _process_alive(child_pid), (
            f'orphan-candidate child pid={child_pid} gone '
            f'before test started'
        )

        # step 3: kill parent — no IPC cancel arrives at child.
        # `proc.wait()` reaps the zombie so it truly disappears
        # from the process table (otherwise `os.kill(pid, 0)`
        # keeps reporting it as alive).
        os.kill(parent_pid, signal.SIGKILL)
        try:
            proc.wait(timeout=3.0)
        except subprocess.TimeoutExpired:
            pytest.fail(
                f'harness parent pid={parent_pid} did not die '
                f'after SIGKILL — test premise broken'
            )
        assert _process_alive(child_pid), (
            f'child pid={child_pid} died along with parent — '
            f'did the parent reap it before SIGKILL took? '
            f'test premise requires an orphan.'
        )

        # step 4+5: SIGINT the orphan, poll for exit.
        os.kill(child_pid, signal.SIGINT)
        cleanup_deadline: float = time.monotonic() + 10.0
        while time.monotonic() < cleanup_deadline:
            if not _process_alive(child_pid):
                return  # <- success path
            time.sleep(0.1)

        pytest.fail(
            f'Orphan subactor (pid={child_pid}) did NOT exit '
            f'within 10s of SIGINT under `subint_forkserver` '
            f'→ trio on non-main thread did not observe the '
            f'default CPython KeyboardInterrupt; backend needs '
            f'explicit SIGINT plumbing.'
        )
    finally:
        # best-effort cleanup to avoid leaking orphans across
        # the test session regardless of outcome.
        for pid in (parent_pid, child_pid):
            if pid is None:
                continue
            if (
                pid == proc.pid
                and proc.returncode is not None
            ):
                # already reaped via `proc.wait()`: the kernel
                # may recycle this pid, so never signal it again.
                continue
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        try:
            proc.kill()
        except OSError:
            pass
        try:
            proc.wait(timeout=2.0)
        except subprocess.TimeoutExpired:
            pass
        # release our read end of the harness stdout pipe so the
        # fd isn't leaked (and no `ResourceWarning` is emitted).
        if proc.stdout is not None:
            proc.stdout.close()
|
||||
|
|
|
|||
Loading…
Reference in New Issue