From fe753f9656ddfca9d849bce43791db17c7ef59be Mon Sep 17 00:00:00 2001
From: goodboy <goodboy_foss@protonmail.com>
Date: Sat, 18 Apr 2026 16:30:04 -0400
Subject: [PATCH] Bound subint teardown shields with hard-kill timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unbounded `trio.CancelScope(shield=True)` at the
soft-kill and thread-join sites can wedge the parent
trio loop indefinitely when a stuck subint ignores
portal-cancel (e.g. bc the IPC channel is already
broken).

Deats,
- add `_HARD_KILL_TIMEOUT` (3s) module-level const
- wrap both shield sites with
  `trio.move_on_after()` so we abandon a stuck
  subint after the deadline
- flip driver thread to `daemon=True` so proc-exit
  also isn't blocked by a wedged subint
- pass `abandon_on_cancel=True` to
  `trio.to_thread.run_sync(driver_thread.join)`
  — load-bearing for `move_on_after` to actually
  fire
- log warnings when either timeout triggers
- improve `InterpreterError` log msg to explain
  the abandoned-thread scenario

(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
---
 tractor/spawn/_subint.py | 94 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 6 deletions(-)

diff --git a/tractor/spawn/_subint.py b/tractor/spawn/_subint.py
index 4b4afa5f..e740bbb9 100644
--- a/tractor/spawn/_subint.py
+++ b/tractor/spawn/_subint.py
@@ -115,6 +115,17 @@ if TYPE_CHECKING:
 log = get_logger('tractor')
 
 
+# How long we'll wait (in seconds) inside the shielded soft-kill
+# / teardown blocks before abandoning the sub-interpreter to its
+# fate. See the "hard-kill" comments at the two shield sites.
+#
+# Unbounded shields are a Bad Idea with subints: because CPython
+# doesn't deliver SIGINT into sub-interpreters and the legacy
+# config shares the main GIL, a stuck subint can otherwise lock
+# the parent trio loop (and the user's Ctrl-C) indefinitely.
+_HARD_KILL_TIMEOUT: float = 3.0
+
+
 async def subint_proc(
     name: str,
     actor_nursery: ActorNursery,
@@ -220,10 +231,15 @@ async def subint_proc(
                 # teardown); nothing to signal.
                 pass
 
+    # NOTE: `daemon=True` so a stuck subint can never block
+    # process exit — if our `_HARD_KILL_TIMEOUT` paths below
+    # have to abandon this thread, Python's interpreter
+    # shutdown won't wait for it forever. Tradeoff: any
+    # subint state still live at abandon-time may leak.
     driver_thread = threading.Thread(
         target=_subint_target,
         name=f'subint-driver[{interp_id}]',
-        daemon=False,
+        daemon=True,
     )
 
     try:
@@ -283,7 +299,23 @@ async def subint_proc(
                 try:
                     await subint_exited.wait()
                 except trio.Cancelled:
-                    with trio.CancelScope(shield=True):
+                    # Bounded shield: we want to ATTEMPT a
+                    # graceful cancel via the portal, but we
+                    # MUST NOT let the shield trap user
+                    # Ctrl-C / parent teardown forever if the
+                    # subint is already unreachable (e.g., the
+                    # IPC channel was broken — which is exactly
+                    # what `test_ipc_channel_break_during_stream`
+                    # exercises). After `_HARD_KILL_TIMEOUT` we
+                    # drop the shield and let `Cancelled`
+                    # propagate; the outer teardown will force
+                    # things along.
+                    with (
+                        trio.CancelScope(shield=True),
+                        trio.move_on_after(
+                            _HARD_KILL_TIMEOUT,
+                        ) as cs,
+                    ):
                         log.cancel(
                             f'Soft-killing subint sub-actor\n'
                             f'c)=> {chan.aid.reprol()}\n'
@@ -296,9 +328,20 @@ async def subint_proc(
                             trio.ClosedResourceError,
                         ):
                             # channel already down — subint will
-                            # exit on its own timeline
+                            # exit on its own timeline (or won't,
+                            # in which case the timeout below
+                            # is our escape).
                             pass
                         await subint_exited.wait()
+                    if cs.cancelled_caught:
+                        log.warning(
+                            f'Soft-kill of subint sub-actor timed '
+                            f'out after {_HARD_KILL_TIMEOUT}s — '
+                            f'subint may still be running; '
+                            f'escalating to thread-abandon.\n'
+                            f'   |_interp_id={interp_id}\n'
+                            f'   |_aid={chan.aid.reprol()}\n'
+                        )
                     raise
                 finally:
                     lifecycle_n.cancel_scope.cancel()
@@ -312,9 +355,44 @@ async def subint_proc(
             # teardown. Off-load the blocking `.join()` to a
             # cache thread (which carries no subint tstate of
             # its own, so no cache conflict).
-            with trio.CancelScope(shield=True):
+            #
+            # Bounded shield: if the driver thread never exits
+            # (soft-kill failed above, subint stuck in
+            # non-checkpointing Python, etc.) we MUST abandon
+            # it rather than wedge the parent forever. The
+            # thread is `daemon=True` so proc-exit won't block
+            # on it either. Subsequent `_interpreters.destroy()`
+            # on a still-running subint raises `InterpreterError`
+            # which we log and swallow — the abandoned subint
+            # will be torn down by process exit.
+            with (
+                trio.CancelScope(shield=True),
+                trio.move_on_after(_HARD_KILL_TIMEOUT) as cs,
+            ):
                 if driver_thread.is_alive():
-                    await trio.to_thread.run_sync(driver_thread.join)
+                    # XXX `abandon_on_cancel=True` is load-bearing:
+                    # the default (False) makes `to_thread.run_sync`
+                    # ignore the enclosing `move_on_after` and
+                    # block until `driver_thread.join()` returns —
+                    # which is exactly what we can't wait for here.
+                    await trio.to_thread.run_sync(
+                        driver_thread.join,
+                        abandon_on_cancel=True,
+                    )
+            if cs.cancelled_caught:
+                log.warning(
+                    f'Subint driver thread did not exit within '
+                    f'{_HARD_KILL_TIMEOUT}s — abandoning.\n'
+                    f'   |_interp_id={interp_id}\n'
+                    f'   |_thread={driver_thread.name}\n'
+                    f'(This usually means portal-cancel could '
+                    f'not be delivered — e.g., IPC channel was '
+                    f'already broken. The subint will continue '
+                    f'running until process exit terminates the '
+                    f'daemon thread.)'
+                )
+
+            with trio.CancelScope(shield=True):
                 try:
                     _interpreters.destroy(interp_id)
                     log.runtime(
@@ -325,7 +403,11 @@ async def subint_proc(
                 except _interpreters.InterpreterError as e:
                     log.warning(
                         f'Could not destroy sub-interpreter '
-                        f'{interp_id}: {e}'
+                        f'{interp_id}: {e}\n'
+                        f'(expected if the driver thread was '
+                        f'abandoned above; the subint is still '
+                        f'running and will be reaped at process '
+                        f'exit.)'
                     )
 
     finally: