Fix subint destroy race via dedicated OS thread
`trio.to_thread.run_sync(_interpreters.exec, ...)` runs `exec()` on a cached worker thread — and when that thread is returned to the cache after the subint's `trio.run()` exits, CPython still keeps the subint's tstate attached to the (now idle) worker. Result: the teardown `_interpreters.destroy(interp_id)` in the `finally` block can block the parent's trio loop indefinitely, waiting for a tstate release that only happens when the worker either picks up a new job or exits. Manifested as intermittent mid-suite hangs under `--spawn-backend=subint` — caught by a `faulthandler.dump_traceback_later()` showing the main thread stuck in `_interpreters.destroy()` at `_subint.py:293` with only an idle trio-cache worker as the other live thread. Details: - drive the subint on a plain `threading.Thread` (not `trio.to_thread`) so the OS thread truly exits after `_interpreters.exec()` returns, releasing the tstate and unblocking destroy - signal `subint_exited.set()` back to the parent trio loop from the driver thread via `trio.from_thread.run_sync(..., trio_token=...)` — capture the token at `subint_proc` entry - swallow `trio.RunFinishedError` in that signal path for the case where the parent trio loop has already exited (proc teardown) - in the teardown `finally`, off-load the sync `driver_thread.join()` to `trio.to_thread.run_sync` (a cache thread with no subint tstate → safe) so we actually wait for the driver to exit before `_interpreters.destroy()` (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code
subint_forkserver_backend
parent
8a8d01e076
commit
31cbd11a5b
|
|
@ -36,7 +36,7 @@ introspectable) but `subint_proc()` raises.
|
|||
'''
|
||||
from __future__ import annotations
|
||||
import sys
|
||||
from functools import partial
|
||||
import threading
|
||||
from typing import (
|
||||
Any,
|
||||
TYPE_CHECKING,
|
||||
|
|
@ -188,28 +188,47 @@ async def subint_proc(
|
|||
subint_exited = trio.Event()
|
||||
ipc_server: _server.Server = actor_nursery._actor.ipc_server
|
||||
|
||||
async def _drive_subint() -> None:
|
||||
# Capture a trio token so the driver thread can signal
|
||||
# `subint_exited.set()` back into the parent trio loop.
|
||||
trio_token = trio.lowlevel.current_trio_token()
|
||||
|
||||
def _subint_target() -> None:
|
||||
'''
|
||||
Block a worker OS-thread on `_interpreters.exec()` for
|
||||
the lifetime of the sub-actor. When the subint's inner
|
||||
`trio.run()` exits, `exec()` returns and the thread
|
||||
naturally joins.
|
||||
Dedicated OS-thread target: runs `_interpreters.exec()`
|
||||
once and exits.
|
||||
|
||||
We intentionally use a plain `threading.Thread` here
|
||||
rather than `trio.to_thread.run_sync()` because trio's
|
||||
thread cache would *recycle* the same OS thread for
|
||||
subsequent jobs — leaving CPython's subinterpreter
|
||||
tstate attached to that cached worker and blocking
|
||||
`_interpreters.destroy()` in the teardown block below.
|
||||
A dedicated thread truly exits after `exec()` returns,
|
||||
releasing the tstate so destroy can proceed.
|
||||
|
||||
'''
|
||||
try:
|
||||
await trio.to_thread.run_sync(
|
||||
_interpreters.exec,
|
||||
interp_id,
|
||||
bootstrap,
|
||||
abandon_on_cancel=False,
|
||||
)
|
||||
_interpreters.exec(interp_id, bootstrap)
|
||||
finally:
|
||||
subint_exited.set()
|
||||
try:
|
||||
trio.from_thread.run_sync(
|
||||
subint_exited.set,
|
||||
trio_token=trio_token,
|
||||
)
|
||||
except trio.RunFinishedError:
|
||||
# parent trio loop has already exited (proc
|
||||
# teardown); nothing to signal.
|
||||
pass
|
||||
|
||||
driver_thread = threading.Thread(
|
||||
target=_subint_target,
|
||||
name=f'subint-driver[{interp_id}]',
|
||||
daemon=False,
|
||||
)
|
||||
|
||||
try:
|
||||
try:
|
||||
async with trio.open_nursery() as thread_n:
|
||||
thread_n.start_soon(_drive_subint)
|
||||
driver_thread.start()
|
||||
|
||||
try:
|
||||
event, chan = await ipc_server.wait_for_peer(uid)
|
||||
|
|
@ -259,7 +278,7 @@ async def subint_proc(
|
|||
# Soft-kill analog: wait for the subint to exit
|
||||
# naturally; on cancel, send a graceful cancel
|
||||
# via the IPC portal and then wait for the
|
||||
# driver thread to finish so `interp.close()`
|
||||
# driver thread to finish so `_interpreters.destroy()`
|
||||
# won't race with a running interpreter.
|
||||
try:
|
||||
await subint_exited.wait()
|
||||
|
|
@ -285,10 +304,17 @@ async def subint_proc(
|
|||
lifecycle_n.cancel_scope.cancel()
|
||||
|
||||
finally:
|
||||
# The driver thread has exited (either natural subint
|
||||
# completion or post-cancel teardown) so the subint is
|
||||
# no longer running — safe to destroy.
|
||||
# Ensure the driver thread is *fully* joined before
|
||||
# destroying the subint. `subint_exited.set()` fires
|
||||
# from inside the thread but returns to trio before
|
||||
# the thread's bootstrap cleanup finishes; calling
|
||||
# `destroy()` too eagerly can race with tstate
|
||||
# teardown. Off-load the blocking `.join()` to a
|
||||
# cache thread (which carries no subint tstate of
|
||||
# its own, so no cache conflict).
|
||||
with trio.CancelScope(shield=True):
|
||||
if driver_thread.is_alive():
|
||||
await trio.to_thread.run_sync(driver_thread.join)
|
||||
try:
|
||||
_interpreters.destroy(interp_id)
|
||||
log.runtime(
|
||||
|
|
|
|||
Loading…
Reference in New Issue