From 8ac3dfeb85e64be48b00b28a1731784ae83fda56 Mon Sep 17 00:00:00 2001 From: goodboy Date: Thu, 23 Apr 2026 16:27:38 -0400 Subject: [PATCH] Break parent-chan shield during teardown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the nested-cancel deadlock fix started in 0cd0b633 (fork-child FD scrub) and fe540d02 (pidfd-cancellable wait). The remaining piece: the parent-channel `process_messages` loop runs under `shield=True` (so normal cancel cascades don't kill it prematurely), and relies on EOF arriving when the parent closes the socket to exit naturally. Under exec-spawn backends (`trio_proc`, mp) that EOF arrival is reliable — parent's teardown closes the handler-task socket deterministically. But fork-based backends like `subint_forkserver` share enough process-image state that EOF delivery becomes racy: the loop parks waiting for an EOF that only arrives after the parent finishes its own teardown, but the parent is itself blocked on `os.waitpid()` for THIS actor's exit. Mutual wait → deadlock. Deats, - `async_main` stashes the cancel-scope returned by `root_tn.start(...)` for the parent-chan `process_messages` task onto the actor as `_parent_chan_cs` - `Actor.cancel()`'s teardown path (after `ipc_server.cancel()` + `wait_for_shutdown()`) calls `self._parent_chan_cs.cancel()` to explicitly break the shield — no more waiting for EOF delivery, unwinding proceeds deterministically regardless of backend - inline comments on both sites explain the mutual-wait deadlock + why the explicit cancel is backend-agnostic rather than a forkserver-specific workaround With this + the prior two fixes, the `subint_forkserver` nested-cancel cascade unwinds cleanly end-to-end. 
(this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/runtime/_runtime.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index cbfaa313..12b2473e 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -1216,6 +1216,23 @@ class Actor: ipc_server.cancel() await ipc_server.wait_for_shutdown() + # Break the shield on the parent-channel + # `process_messages` loop (started with `shield=True` + # in `async_main` below). Required to avoid a + # deadlock during teardown of fork-spawned subactors: + # without this cancel, the loop parks waiting for + # EOF on the parent channel, but the parent is + # blocked on `os.waitpid()` for THIS actor's exit + # — mutual wait. For exec-spawn backends the EOF + # arrives naturally when the parent closes its + # handler-task socket during its own teardown, but + # in fork backends the shared-process-image makes + # that delivery racy / not guaranteed. Explicit + # cancel here gives us deterministic unwinding + # regardless of backend. + if self._parent_chan_cs is not None: + self._parent_chan_cs.cancel() + # cancel all rpc tasks permanently if self._service_tn: self._service_tn.cancel_scope.cancel() @@ -1736,7 +1753,16 @@ async def async_main( # start processing parent requests until our channel # server is 100% up and running. if actor._parent_chan: - await root_tn.start( + # Capture the shielded `loop_cs` for the + # parent-channel `process_messages` task so + # `Actor.cancel()` has a handle to break the + # shield during teardown — without this, the + # shielded loop would park on the parent chan + # indefinitely waiting for EOF that only arrives + # after the PARENT tears down, but under + # fork-based backends (e.g. `subint_forkserver`) + # the parent waits on THIS actor's exit — deadlock. 
+ actor._parent_chan_cs = await root_tn.start( partial( _rpc.process_messages, chan=actor._parent_chan,