tractor/tests/test_subint_cancellation.py

'''
Cancellation + hard-kill semantics audit for the `subint` spawn
backend.

Exercises the escape-hatch machinery added to
`tractor.spawn._subint` (module-level `_HARD_KILL_TIMEOUT`,
bounded shields around the soft-kill / thread-join sites, daemon
driver-thread abandonment) so that future stdlib regressions or
our own refactors don't silently re-introduce the hangs first
diagnosed during the Phase B.2/B.3 bringup (issue #379).

Every test in this module:
- is wrapped in `trio.fail_after()` for a deterministic per-test
  wall-clock ceiling (the whole point of these tests is to fail
  fast when our escape hatches regress; an unbounded test would
  defeat itself),
- arms `tractor.devx.dump_on_hang()` to capture a stack dump on
  failure — without it, a hang here is opaque because pytest's
  stderr capture swallows `faulthandler` output by default
  (hard-won lesson from the original diagnosis),
- skips on py<3.14 (no public `concurrent.interpreters` stdlib
  wrapper) and on any `--spawn-backend` other than `'subint'`
  (these tests are subint-specific by design — they'd be
  nonsense under `trio` or `mp_*`).

'''
from __future__ import annotations
from functools import partial

import pytest
import trio
import tractor
from tractor.devx import dump_on_hang


# Gate: the `subint` backend requires py3.14+. Check the
# public stdlib wrapper's presence (added in 3.14) rather than
# the private `_interpreters` module (which exists on 3.13 but
# wedges under tractor's usage — see `tractor.spawn._subint`).
# `importorskip()` skips this whole module when the import
# fails, so nothing below it runs on older interpreters.
pytest.importorskip('concurrent.interpreters')

# Subint-only: read the spawn method that `pytest_configure`
# committed via `try_set_start_method()`. By the time this module
# imports, the CLI backend choice has been applied.
# NOTE: this `from`-import binds the value as it is at import
# time; a later re-assignment of the attribute on
# `tractor.spawn._spawn` would not be observed here.
from tractor.spawn._spawn import _spawn_method  # noqa: E402

# A module-level `pytestmark` applies the mark to every test
# collected from this file, so under any other backend the tests
# are reported as skipped (with the reason below) instead of run.
if _spawn_method != 'subint':
    pytestmark = pytest.mark.skip(
        reason=(
            "subint-specific cancellation audit — "
            "pass `--spawn-backend=subint` to run."
        ),
    )


# ----------------------------------------------------------------
# child-side task bodies (run inside the spawned subint)
# ----------------------------------------------------------------


async def _trivial_rpc() -> str:
    '''
    Child-side RPC target for the baseline happy-teardown test:
    hands a fixed greeting straight back to the caller without
    awaiting anything.
    '''
    greeting: str = 'hello from subint'
    return greeting


async def _spin_without_trio_checkpoints() -> None:
    '''
    Child-side RPC target which parks forever WITHOUT ever
    reaching a trio checkpoint.

    The body never `await`s: it only blocks in
    `threading.Event.wait(timeout)` on an event that nothing ever
    sets. That blocking wait releases the GIL (so other threads
    can keep running) but it is not a trio checkpoint, so a
    cancel request such as `Portal.cancel_actor()` is never
    observed by this task.

    This is the "stuck subint" scenario the hard-kill shields
    exist to survive.
    '''
    from threading import Event

    parked = Event()
    # `Event.wait()` only returns `True` once the flag is set,
    # which never happens here, so this loops indefinitely.
    #
    # 1s re-check granularity; low enough not to waste CPU,
    # high enough that even a pathologically slow
    # `_HARD_KILL_TIMEOUT` won't accidentally align with a
    # wake.
    while not parked.wait(timeout=1.0):
        pass


# ----------------------------------------------------------------
# parent-side harnesses (driven inside `trio.run(...)`)
# ----------------------------------------------------------------


async def _happy_path(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    '''
    Parent-side harness for the baseline test.

    Opens a root actor using `reg_addr` as its only registry
    address, spawns `_trivial_rpc` via `run_in_actor()` and
    asserts on the value it returns; everything runs under a
    `trio.fail_after(deadline)` ceiling.
    '''
    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as actor_nursery:
                happy_portal: tractor.Portal = (
                    await actor_nursery.run_in_actor(
                        _trivial_rpc,
                        name='subint-happy',
                    )
                )
                greeting: str = await happy_portal.wait_for_result()
                assert greeting == 'hello from subint'


async def _spawn_stuck_then_cancel(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    '''
    Parent-side harness for the stuck-child test.

    Opens a root actor using `reg_addr` as its only registry
    address, spawns `_spin_without_trio_checkpoints` via
    `run_in_actor()`, waits briefly, then cancels the actor
    nursery's cancel scope; everything runs under a
    `trio.fail_after(deadline)` ceiling.
    '''
    # Settle time so the child can reach its non-checkpointing
    # loop before the cancel lands; the exact value is not
    # important so long as it spans a handful of trio schedule
    # ticks.
    settle_secs: float = 0.5

    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as actor_nursery:
                await actor_nursery.run_in_actor(
                    _spin_without_trio_checkpoints,
                    name='subint-stuck',
                )
                await trio.sleep(settle_secs)
                actor_nursery.cancel_scope.cancel()


# ----------------------------------------------------------------
# tests
# ----------------------------------------------------------------


def test_subint_happy_teardown(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Baseline sanity check: bring up one subactor, make a single
    portal RPC and let the nursery close on its own — no
    cancellation and no injected faults.

    A failure here points at the spawn/teardown layer itself
    rather than at the hard-kill escape hatches.

    '''
    deadline: float = 10.0
    # Bind the harness args up front; `trio.run()` is handed the
    # resulting zero-arg callable.
    harness = partial(
        _happy_path,
        reg_addr,
        deadline,
    )
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/subint_cancellation_happy.dump',
    ):
        trio.run(harness)


# Inner ceiling (seconds) shared by `trio.fail_after()` and
# `dump_on_hang()` in `test_subint_non_checkpointing_child`.
# Sized to comfortably cover the teardown budget documented in
# that test's docstring (~0.5s settle + two ~3s
# `_HARD_KILL_TIMEOUT` waits + margin).
_STUCK_CHILD_DEADLINE: float = 15.0


# Wall-clock bound via `pytest-timeout` (`method='thread'`)
# as defense-in-depth over the inner `trio.fail_after()`.
# Under the orphaned-channel hang class described in
# `ai/conc-anal/subint_cancel_delivery_hang_issue.md`, SIGINT
# is still deliverable and this test *should* be unwedgeable
# by the inner trio timeout — but sibling subint-backend
# tests in this repo have also exhibited the
# `subint_sigint_starvation_issue.md` GIL-starvation flavor,
# so `method='thread'` keeps us safe in case ordering or
# load shifts the failure mode.
#
# The outer cap MUST sit above the inner deadline: a cap below
# it (it was previously `3`) fires before the ~6.5s teardown
# budget can elapse, so the test could never pass, and the inner
# `trio.fail_after()` / `dump_on_hang()` could never trigger and
# leave a stack dump behind.
@pytest.mark.timeout(
    # inner deadline + slack for the timeout error to unwind.
    # NOTE(review): the original inline note said this test never
    # passes without py3.14+ subint support — confirm whether
    # that still holds once the known issue below is fixed.
    _STUCK_CHILD_DEADLINE + 5,
    method='thread',
)
def test_subint_non_checkpointing_child(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Cancel a subactor whose main task is stuck in a non-
    checkpointing Python loop.

    `Portal.cancel_actor()` may be delivered over IPC but the
    main task never checkpoints to observe the Cancelled —
    so the subint's `trio.run()` can't exit gracefully.

    The parent `subint_proc` bounded-shield + daemon-driver-
    thread combo should abandon the thread after
    `_HARD_KILL_TIMEOUT` and let the parent return cleanly.

    Wall-clock budget:
    - ~0.5s: settle time for child to enter the stuck loop
    - ~3s: `_HARD_KILL_TIMEOUT` (soft-kill wait)
    - ~3s: `_HARD_KILL_TIMEOUT` (thread-join wait)
    - margin

    Timeout layering (inner to outer):
    - `trio.fail_after(_STUCK_CHILD_DEADLINE)` inside the harness,
    - `dump_on_hang(seconds=_STUCK_CHILD_DEADLINE)` around
      `trio.run()`,
    - `pytest-timeout` at `_STUCK_CHILD_DEADLINE + 5` as the
      last-resort backstop.

    KNOWN ISSUE (Ctrl-C-able hang):
    -------------------------------
    This test currently hangs past the hard-kill timeout for
    reasons unrelated to the subint teardown itself — after
    the subint is destroyed, a parent-side trio task appears
    to park on an orphaned IPC channel (no clean EOF
    delivered to a waiting receive). Unlike the
    SIGINT-starvation sibling case in
    `test_stale_entry_is_deleted`, this hang IS Ctrl-C-able
    (`strace` shows SIGINT wakeup-fd `write() = 1`, not
    `EAGAIN`) — i.e. the main trio loop is still iterating
    normally. That makes this *our* bug to fix, not a
    CPython-level limitation.

    See `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
    for the full analysis + candidate fix directions
    (explicit parent-side channel abort in `subint_proc`
    teardown being the most likely surgical fix).

    The sibling `ai/conc-anal/subint_sigint_starvation_issue.md`
    documents the *other* hang class (abandoned-legacy-subint
    thread + shared-GIL starvation → signal-wakeup-fd pipe
    fills → SIGINT silently dropped) — that one is
    structurally blocked on msgspec PEP 684 adoption and is
    NOT what this test is hitting.

    '''
    deadline: float = _STUCK_CHILD_DEADLINE
    with dump_on_hang(
        seconds=deadline,
        path='/tmp/subint_cancellation_stuck.dump',
    ):
        trio.run(
            partial(
                _spawn_stuck_then_cancel,
                reg_addr,
                deadline,
            ),
        )
Add `subint` cancellation + hard-kill test audit Lock in the escape-hatch machinery added to `tractor.spawn._subint` during the Phase B.2/B.3 bringup (issue #379) so future stdlib regressions or our own refactors don't silently re-introduce the mid-suite hangs. Deats, - `test_subint_happy_teardown`: baseline — spawn a subactor, one portal RPC, clean teardown. If this breaks, something's wrong unrelated to the hard-kill shields. - `test_subint_non_checkpointing_child`: cancel a subactor stuck in a non-checkpointing Python loop (`threading.Event.wait()` releases the GIL but never inserts a trio checkpoint). Validates the bounded-shield + daemon-driver-thread combo abandons the thread after `_HARD_KILL_TIMEOUT`. Every test is wrapped in `trio.fail_after()` for a deterministic per-test wall-clock ceiling (an unbounded audit would defeat itself) and arms `tractor.devx.dump_on_hang()` so a hang captures a stack dump — pytest's stderr capture swallows `faulthandler` output by default. Gated via `pytest.importorskip('concurrent.interpreters')` and a module-level skip when `--spawn-backend` isn't `'subint'`. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code 2026-04-20 18:06:54 +00:00			`'''`
			Cancellation + hard-kill semantics audit for the `subint` spawn
			`backend.`

			`Exercises the escape-hatch machinery added to`
			`tractor.spawn._subint` (module-level `_HARD_KILL_TIMEOUT`,
			`bounded shields around the soft-kill / thread-join sites, daemon`
			`driver-thread abandonment) so that future stdlib regressions or`
			`our own refactors don't silently re-introduce the hangs first`
			`diagnosed during the Phase B.2/B.3 bringup (issue #379).`

			`Every test in this module:`
			- is wrapped in `trio.fail_after()` for a deterministic per-test
			`wall-clock ceiling (the whole point of these tests is to fail`
			`fast when our escape hatches regress; an unbounded test would`
			`defeat itself),`
			- arms `tractor.devx.dump_on_hang()` to capture a stack dump on
			`failure — without it, a hang here is opaque because pytest's`
			stderr capture swallows `faulthandler` output by default
			`(hard-won lesson from the original diagnosis),`
			- skips on py<3.13 (no `_interpreters`) and on any
			`--spawn-backend` other than `'subint'` (these tests are
			subint-specific by design — they'd be nonsense under `trio` or
			`mp_*`).

			`'''`
			`from __future__ import annotations`
			`from functools import partial`

			`import pytest`
			`import trio`
			`import tractor`
			`from tractor.devx import dump_on_hang`


			# Gate: the `subint` backend requires py3.14+. Check the
			`# public stdlib wrapper's presence (added in 3.14) rather than`
			# the private `_interpreters` module (which exists on 3.13 but
			# wedges under tractor's usage — see `tractor.spawn._subint`).
			`pytest.importorskip('concurrent.interpreters')`

			# Subint-only: read the spawn method that `pytest_configure`
			# committed via `try_set_start_method()`. By the time this module
			`# imports, the CLI backend choice has been applied.`
			`from tractor.spawn._spawn import _spawn_method # noqa: E402`

			`if _spawn_method != 'subint':`
			`pytestmark = pytest.mark.skip(`
			`reason=(`
			`"subint-specific cancellation audit — "`
			"pass `--spawn-backend=subint` to run."
			`),`
			`)`


			`# ----------------------------------------------------------------`
			`# child-side task bodies (run inside the spawned subint)`
			`# ----------------------------------------------------------------`


			`async def _trivial_rpc() -> str:`
			`'''`
			`Minimal RPC body for the baseline happy-teardown test.`
			`'''`
			`return 'hello from subint'`


			`async def _spin_without_trio_checkpoints() -> None:`
			`'''`
			`Block the main task with NO trio-visible checkpoints so any`
			`Portal.cancel_actor()` arriving over IPC has nothing to hand
			`off to.`

			`threading.Event.wait(timeout)` releases the GIL (so other
			`threads — including trio's IO/RPC tasks — can progress) but`
			`does NOT insert a trio checkpoint, so the subactor's main`
			`task never notices cancellation.`

			`This is the exact "stuck subint" scenario the hard-kill`
			`shields exist to survive.`
			`'''`
			`import threading`
			`never_set = threading.Event()`
			`while not never_set.is_set():`
			`# 1s re-check granularity; low enough not to waste CPU,`
			`# high enough that even a pathologically slow`
			# `_HARD_KILL_TIMEOUT` won't accidentally align with a
			`# wake.`
			`never_set.wait(timeout=1.0)`


			`# ----------------------------------------------------------------`
			# parent-side harnesses (driven inside `trio.run(...)`)
			`# ----------------------------------------------------------------`


			`async def _happy_path(`
			`reg_addr: tuple[str, int\|str],`
			`deadline: float,`
			`) -> None:`
			`with trio.fail_after(deadline):`
			`async with (`
			`tractor.open_root_actor(`
			`registry_addrs=[reg_addr],`
			`),`
			`tractor.open_nursery() as an,`
			`):`
			`portal: tractor.Portal = await an.run_in_actor(`
			`_trivial_rpc,`
			`name='subint-happy',`
			`)`
			`result: str = await portal.wait_for_result()`
			`assert result == 'hello from subint'`


			`async def _spawn_stuck_then_cancel(`
			`reg_addr: tuple[str, int\|str],`
			`deadline: float,`
			`) -> None:`
			`with trio.fail_after(deadline):`
			`async with (`
			`tractor.open_root_actor(`
			`registry_addrs=[reg_addr],`
			`),`
			`tractor.open_nursery() as an,`
			`):`
			`await an.run_in_actor(`
			`_spin_without_trio_checkpoints,`
			`name='subint-stuck',`
			`)`
			`# Give the child time to reach its non-checkpointing`
			`# loop before we cancel; the precise value doesn't`
			`# matter as long as it's a handful of trio schedule`
			`# ticks.`
			`await trio.sleep(0.5)`
			`an.cancel_scope.cancel()`


			`# ----------------------------------------------------------------`
			`# tests`
			`# ----------------------------------------------------------------`


			`def test_subint_happy_teardown(`
			`reg_addr: tuple[str, int\|str],`
			`) -> None:`
			`'''`
			`Baseline: spawn a subactor, do one portal RPC, close nursery`
			`cleanly. No cancel, no faults.`

			`If this regresses we know something's wrong at the`
			`spawn/teardown layer unrelated to the hard-kill escape`
			`hatches.`

			`'''`
			`deadline: float = 10.0`
			`with dump_on_hang(`
			`seconds=deadline,`
			`path='/tmp/subint_cancellation_happy.dump',`
			`):`
			`trio.run(partial(_happy_path, reg_addr, deadline))`


Wall-cap `subint` audit tests via `pytest-timeout` Add a hard process-level wall-clock bound on the two known-hanging subint-backend tests so an unattended suite run can't wedge indefinitely in either of the hang classes doc'd in `ai/conc-anal/`. Deats, - New `testing` dep: `pytest-timeout>=2.3`. - `test_stale_entry_is_deleted`: `@pytest.mark.timeout(3, method='thread')`. The `method='thread'` choice is deliberate — `method='signal'` routes via `SIGALRM` which is starved by the same GIL-hostage path that drops `SIGINT` (see `subint_sigint_starvation_issue.md`), so it'd never actually fire in the starvation case. - `test_subint_non_checkpointing_child`: same decorator, same reasoning (defense-in-depth over the inner `trio.fail_after(15)`). At timeout, `pytest-timeout` hard-kills the pytest process itself — that's the intended behavior here; the alternative is the suite never returning. (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code 2026-04-21 00:45:56 +00:00			# Wall-clock bound via `pytest-timeout` (`method='thread'`)
			# as defense-in-depth over the inner `trio.fail_after(15)`.
			`# Under the orphaned-channel hang class described in`
			# `ai/conc-anal/subint_cancel_delivery_hang_issue.md`, SIGINT
			`# is still deliverable and this test should be unwedgeable`
			`# by the inner trio timeout — but sibling subint-backend`
			`# tests in this repo have also exhibited the`
			# `subint_sigint_starvation_issue.md` GIL-starvation flavor,
			# so `method='thread'` keeps us safe in case ordering or
			`# load shifts the failure mode.`
			`@pytest.mark.timeout(`
			`3, # NOTE never passes pre-3.14+ subints support.`
			`method='thread',`
			`)`
Add `subint` cancellation + hard-kill test audit Lock in the escape-hatch machinery added to `tractor.spawn._subint` during the Phase B.2/B.3 bringup (issue #379) so future stdlib regressions or our own refactors don't silently re-introduce the mid-suite hangs. Deats, - `test_subint_happy_teardown`: baseline — spawn a subactor, one portal RPC, clean teardown. If this breaks, something's wrong unrelated to the hard-kill shields. - `test_subint_non_checkpointing_child`: cancel a subactor stuck in a non-checkpointing Python loop (`threading.Event.wait()` releases the GIL but never inserts a trio checkpoint). Validates the bounded-shield + daemon-driver-thread combo abandons the thread after `_HARD_KILL_TIMEOUT`. Every test is wrapped in `trio.fail_after()` for a deterministic per-test wall-clock ceiling (an unbounded audit would defeat itself) and arms `tractor.devx.dump_on_hang()` so a hang captures a stack dump — pytest's stderr capture swallows `faulthandler` output by default. Gated via `pytest.importorskip('concurrent.interpreters')` and a module-level skip when `--spawn-backend` isn't `'subint'`. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code 2026-04-20 18:06:54 +00:00			`def test_subint_non_checkpointing_child(`
			`reg_addr: tuple[str, int\|str],`
			`) -> None:`
			`'''`
			`Cancel a subactor whose main task is stuck in a non-`
			`checkpointing Python loop.`

			`Portal.cancel_actor()` may be delivered over IPC but the
			`main task never checkpoints to observe the Cancelled —`
			so the subint's `trio.run()` can't exit gracefully.

			The parent `subint_proc` bounded-shield + daemon-driver-
			`thread combo should abandon the thread after`
			`_HARD_KILL_TIMEOUT` and let the parent return cleanly.

			`Wall-clock budget:`
			`- ~0.5s: settle time for child to enter the stuck loop`
			- ~3s: `_HARD_KILL_TIMEOUT` (soft-kill wait)
			- ~3s: `_HARD_KILL_TIMEOUT` (thread-join wait)
			`- margin`

Doc `subint` backend hang classes + arm `dump_on_hang` Classify and write up the two distinct hang modes hit during Phase B subint bringup (issue #379) so future triage doesn't re-derive them from scratch. Deats, two new `ai/conc-anal/` docs, - `subint_sigint_starvation_issue.md`: abandoned legacy-subint thread + shared GIL → main trio loop starves → signal-wakeup-fd pipe fills → `SIGINT` silently dropped (`strace` shows `write() = EAGAIN` on the wakeup-fd). Un- Ctrl-C-able. Structurally a CPython limit; blocked on `msgspec` PEP 684 (jcrist/msgspec#563) - `subint_cancel_delivery_hang_issue.md`: parent-side trio task parks on an orphaned IPC channel after subint teardown — no clean EOF delivered to the waiting receive. Ctrl-C-able (main loop iterates fine); OUR bug to fix. Candidate fix: explicit parent-side channel abort in `subint_proc`'s hard-kill teardown Cross-link the docs from their test reproducers, - `test_stale_entry_is_deleted` (→ starvation class): wrap `trio.run(main)` in `dump_on_hang(seconds=20)` so a future regression captures a stack dump. Kept un- skipped so the dump file is inspectable - `test_subint_non_checkpointing_child` (→ delivery class): extend docstring with a "KNOWN ISSUE" block pointing at the analysis (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code 2026-04-20 19:28:00 +00:00			`KNOWN ISSUE (Ctrl-C-able hang):`
			`-------------------------------`
			`This test currently hangs past the hard-kill timeout for`
			`reasons unrelated to the subint teardown itself — after`
			`the subint is destroyed, a parent-side trio task appears`
			`to park on an orphaned IPC channel (no clean EOF`
			`delivered to a waiting receive). Unlike the`
			`SIGINT-starvation sibling case in`
			`test_stale_entry_is_deleted`, this hang IS Ctrl-C-able
			(`strace` shows SIGINT wakeup-fd `write() = 1`, not
			`EAGAIN`) — i.e. the main trio loop is still iterating
			`normally. That makes this our bug to fix, not a`
			`CPython-level limitation.`

			See `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
			`for the full analysis + candidate fix directions`
			(explicit parent-side channel abort in `subint_proc`
			`teardown being the most likely surgical fix).`

			The sibling `ai/conc-anal/subint_sigint_starvation_issue.md`
			`documents the other hang class (abandoned-legacy-subint`
			`thread + shared-GIL starvation → signal-wakeup-fd pipe`
			`fills → SIGINT silently dropped) — that one is`
			`structurally blocked on msgspec PEP 684 adoption and is`
			`NOT what this test is hitting.`

Add `subint` cancellation + hard-kill test audit Lock in the escape-hatch machinery added to `tractor.spawn._subint` during the Phase B.2/B.3 bringup (issue #379) so future stdlib regressions or our own refactors don't silently re-introduce the mid-suite hangs. Deats, - `test_subint_happy_teardown`: baseline — spawn a subactor, one portal RPC, clean teardown. If this breaks, something's wrong unrelated to the hard-kill shields. - `test_subint_non_checkpointing_child`: cancel a subactor stuck in a non-checkpointing Python loop (`threading.Event.wait()` releases the GIL but never inserts a trio checkpoint). Validates the bounded-shield + daemon-driver-thread combo abandons the thread after `_HARD_KILL_TIMEOUT`. Every test is wrapped in `trio.fail_after()` for a deterministic per-test wall-clock ceiling (an unbounded audit would defeat itself) and arms `tractor.devx.dump_on_hang()` so a hang captures a stack dump — pytest's stderr capture swallows `faulthandler` output by default. Gated via `pytest.importorskip('concurrent.interpreters')` and a module-level skip when `--spawn-backend` isn't `'subint'`. (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code 2026-04-20 18:06:54 +00:00			`'''`
			`deadline: float = 15.0`
			`with dump_on_hang(`
			`seconds=deadline,`
			`path='/tmp/subint_cancellation_stuck.dump',`
			`):`
			`trio.run(`
			`partial(`
			`_spawn_stuck_then_cancel,`
			`reg_addr,`
			`deadline,`
			`),`
			`)`