Compare commits
4 Commits
0cd0b633f1
...
a4d6318ca7
| Author | SHA1 | Date |
|---|---|---|
|
|
a4d6318ca7 | |
|
|
fe89169f1c | |
|
|
57935804e2 | |
|
|
fe540d0228 |
|
|
@ -1,8 +1,16 @@
|
||||||
{
|
{
|
||||||
"permissions": {
|
"permissions": {
|
||||||
"allow": [
|
"allow": [
|
||||||
"Bash(date *)",
|
|
||||||
"Bash(cp .claude/*)",
|
"Bash(cp .claude/*)",
|
||||||
|
"Read(.claude/**)",
|
||||||
|
"Read(.claude/skills/run-tests/**)",
|
||||||
|
"Write(.claude/**/*commit_msg*)",
|
||||||
|
"Write(.claude/git_commit_msg_LATEST.md)",
|
||||||
|
"Skill(run-tests)",
|
||||||
|
"Skill(close-wkt)",
|
||||||
|
"Skill(open-wkt)",
|
||||||
|
"Skill(prompt-io)",
|
||||||
|
"Bash(date *)",
|
||||||
"Bash(git diff *)",
|
"Bash(git diff *)",
|
||||||
"Bash(git log *)",
|
"Bash(git log *)",
|
||||||
"Bash(git status)",
|
"Bash(git status)",
|
||||||
|
|
@ -23,14 +31,12 @@
|
||||||
"Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)",
|
"Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)",
|
||||||
"Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)",
|
"Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)",
|
||||||
"Bash(echo EXIT:$?:*)",
|
"Bash(echo EXIT:$?:*)",
|
||||||
"Write(.claude/*commit_msg*)",
|
"Bash(echo \"EXIT=$?\")",
|
||||||
"Write(.claude/git_commit_msg_LATEST.md)",
|
"Read(//tmp/**)"
|
||||||
"Skill(run-tests)",
|
|
||||||
"Skill(close-wkt)",
|
|
||||||
"Skill(open-wkt)",
|
|
||||||
"Skill(prompt-io)"
|
|
||||||
],
|
],
|
||||||
"deny": [],
|
"deny": [],
|
||||||
"ask": []
|
"ask": []
|
||||||
}
|
},
|
||||||
|
"prefersReducedMotion": false,
|
||||||
|
"outputStyle": "default"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -306,13 +306,103 @@ root's `open_nursery` receives the
|
||||||
`BaseExceptionGroup` containing the `AssertionError`
|
`BaseExceptionGroup` containing the `AssertionError`
|
||||||
from the errorer and unwinds cleanly.
|
from the errorer and unwinds cleanly.
|
||||||
|
|
||||||
|
## Update — 2026-04-23: partial fix landed, deeper layer surfaced
|
||||||
|
|
||||||
|
Three improvements landed as separate commits in the
|
||||||
|
`subint_forkserver_backend` branch (see `git log`):
|
||||||
|
|
||||||
|
1. **`_close_inherited_fds()` in fork-child prelude**
|
||||||
|
(`tractor/spawn/_subint_forkserver.py`). POSIX
|
||||||
|
close-fds-equivalent enumeration via
|
||||||
|
`/proc/self/fd` (or `RLIMIT_NOFILE` fallback), keep
|
||||||
|
only stdio. This is fix-direction (1) from the list
|
||||||
|
above — went with the blunt form rather than the
|
||||||
|
targeted enum-via-`actor.ipc_server` form, turns
|
||||||
|
out the aggressive close is safe because every
|
||||||
|
inheritable resource the fresh child needs
|
||||||
|
(IPC-channel socket, etc.) is opened AFTER the
|
||||||
|
fork anyway.
|
||||||
|
2. **`_ForkedProc.wait()` via `os.pidfd_open()` +
|
||||||
|
`trio.lowlevel.wait_readable()`** — matches the
|
||||||
|
`trio.Process.wait` / `mp.Process.sentinel` pattern
|
||||||
|
used by `trio_proc` and `proc_waiter`. Gives us
|
||||||
|
fully trio-cancellable child-wait (prior impl
|
||||||
|
blocked a cache thread on a sync `os.waitpid` that
|
||||||
|
was NOT trio-cancellable due to
|
||||||
|
`abandon_on_cancel=False`).
|
||||||
|
3. **`_parent_chan_cs` wiring** in
|
||||||
|
`tractor/runtime/_runtime.py`: capture the shielded
|
||||||
|
`loop_cs` for the parent-channel `process_messages`
|
||||||
|
task in `async_main`; explicitly cancel it in
|
||||||
|
`Actor.cancel()` teardown. This breaks the shield
|
||||||
|
during teardown so the parent-chan loop exits when
|
||||||
|
cancel is issued, instead of parking on a parent-
|
||||||
|
socket EOF that might never arrive under fork
|
||||||
|
semantics.
|
||||||
|
|
||||||
|
**Concrete wins from (1):** the sibling
|
||||||
|
`subint_forkserver_orphan_sigint_hang_issue.md` class
|
||||||
|
is **now fixed** — `test_orphaned_subactor_sigint_cleanup_DRAFT`
|
||||||
|
went from strict-xfail to pass. The xfail mark was
|
||||||
|
removed; the test remains as a regression guard.
|
||||||
|
|
||||||
|
**test_nested_multierrors STILL hangs** though.
|
||||||
|
|
||||||
|
### Updated diagnosis (narrowed)
|
||||||
|
|
||||||
|
DIAGDEBUG instrumentation of `process_messages` ENTER/
|
||||||
|
EXIT pairs + `_parent_chan_cs.cancel()` call sites
|
||||||
|
showed (captured during a 20s-timeout repro):
|
||||||
|
|
||||||
|
- 80 `process_messages` ENTERs, 75 EXITs → 5 stuck.
|
||||||
|
- **All 40 `shield=True` ENTERs matched EXIT** — every
|
||||||
|
shielded parent-chan loop exits cleanly. The
|
||||||
|
`_parent_chan_cs` wiring works as intended.
|
||||||
|
- **The 5 stuck loops are all `shield=False`** — peer-
|
||||||
|
channel handlers (inbound connections handled by
|
||||||
|
`handle_stream_from_peer` in stream_handler_tn).
|
||||||
|
- After our `_parent_chan_cs.cancel()` fires, NEW
|
||||||
|
shielded process_messages loops start (on the
|
||||||
|
session reg_addr port — probably discovery-layer
|
||||||
|
reconnection attempts). These don't block teardown
|
||||||
|
(they all exit) but indicate the cancel cascade has
|
||||||
|
more moving parts than expected.
|
||||||
|
|
||||||
|
### Remaining unknown
|
||||||
|
|
||||||
|
Why don't the 5 peer-channel loops exit when
|
||||||
|
`service_tn.cancel_scope.cancel()` fires? They're in
|
||||||
|
`stream_handler_tn` which IS `service_tn` in the
|
||||||
|
current configuration (`open_ipc_server(parent_tn=
|
||||||
|
service_tn, stream_handler_tn=service_tn)`). A
|
||||||
|
standard nursery-scope-cancel should propagate through
|
||||||
|
them — no shield, no special handler. Something
|
||||||
|
specific to the fork-spawned configuration keeps them
|
||||||
|
alive.
|
||||||
|
|
||||||
|
Candidate follow-up experiments:
|
||||||
|
|
||||||
|
- Dump the trio task tree at the hang point (via
|
||||||
|
`stackscope` or direct trio introspection) to see
|
||||||
|
what each stuck loop is awaiting. `chan.__anext__`
|
||||||
|
on a socket recv? An inner lock? A shielded sub-task?
|
||||||
|
- Compare peer-channel handler lifecycle under
|
||||||
|
`trio_proc` vs `subint_forkserver` with equivalent
|
||||||
|
logging to spot the divergence.
|
||||||
|
- Investigate whether the peer handler is caught in
|
||||||
|
the `except trio.Cancelled:` path at
|
||||||
|
`tractor/ipc/_server.py:448` that re-raises — but
|
||||||
|
re-raise means it should still exit. Unless
|
||||||
|
something higher up swallows it.
|
||||||
|
|
||||||
## Stopgap (landed)
|
## Stopgap (landed)
|
||||||
|
|
||||||
Until the fix lands, `test_nested_multierrors` +
|
`test_nested_multierrors` skip-marked under
|
||||||
related multi-level-spawn tests can be skip-marked
|
`subint_forkserver` via
|
||||||
under `subint_forkserver` via
|
|
||||||
`@pytest.mark.skipon_spawn_backend('subint_forkserver',
|
`@pytest.mark.skipon_spawn_backend('subint_forkserver',
|
||||||
reason='...')`. Cross-ref this doc.
|
reason='...')`, cross-referenced to this doc. Mark
|
||||||
|
should be dropped once the peer-channel-loop exit
|
||||||
|
issue is fixed.
|
||||||
|
|
||||||
## References
|
## References
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -446,22 +446,15 @@ def _process_alive(pid: int) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(
|
# NOTE: was previously `@pytest.mark.xfail(strict=True, ...)`
|
||||||
strict=True,
|
# for the orphan-SIGINT hang documented in
|
||||||
reason=(
|
# `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
|
||||||
'subint_forkserver orphan-child SIGINT hang: trio\'s '
|
# — now passes after the fork-child FD-hygiene fix in
|
||||||
'event loop stays wedged in `epoll_wait` despite the '
|
# `tractor.spawn._subint_forkserver._close_inherited_fds()`:
|
||||||
'SIGINT handler being correctly installed and the '
|
# closing all inherited FDs (including the parent's IPC
|
||||||
'signal being delivered at the kernel level. NOT a '
|
# listener + trio-epoll + wakeup-pipe FDs) lets the child's
|
||||||
'"handler missing on non-main thread" issue — post-'
|
# trio event loop respond cleanly to external SIGINT.
|
||||||
'fork the worker IS `threading.main_thread()` and '
|
# Leaving the test in place as a regression guard.
|
||||||
'trio\'s `KIManager` handler is confirmed installed. '
|
|
||||||
'Full analysis + ruled-out hypotheses + fix directions '
|
|
||||||
'in `ai/conc-anal/'
|
|
||||||
'subint_forkserver_orphan_sigint_hang_issue.md`. '
|
|
||||||
'Flip this mark (or drop it) once the gap is closed.'
|
|
||||||
),
|
|
||||||
)
|
|
||||||
@pytest.mark.timeout(
|
@pytest.mark.timeout(
|
||||||
30,
|
30,
|
||||||
method='thread',
|
method='thread',
|
||||||
|
|
|
||||||
|
|
@ -452,6 +452,19 @@ async def spawn_and_error(
|
||||||
await nursery.run_in_actor(*args, **kwargs)
|
await nursery.run_in_actor(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipon_spawn_backend(
|
||||||
|
'subint_forkserver',
|
||||||
|
reason=(
|
||||||
|
'Multi-level fork-spawn cancel cascade hang — '
|
||||||
|
'peer-channel `process_messages` loops do not '
|
||||||
|
'exit on `service_tn.cancel_scope.cancel()`. '
|
||||||
|
'See `ai/conc-anal/'
|
||||||
|
'subint_forkserver_test_cancellation_leak_issue.md` '
|
||||||
|
'for the full diagnosis + candidate fix directions. '
|
||||||
|
'Drop this mark once the peer-chan-loop exit issue '
|
||||||
|
'is closed.'
|
||||||
|
),
|
||||||
|
)
|
||||||
@pytest.mark.timeout(
|
@pytest.mark.timeout(
|
||||||
10,
|
10,
|
||||||
method='thread',
|
method='thread',
|
||||||
|
|
|
||||||
|
|
@ -1216,6 +1216,23 @@ class Actor:
|
||||||
ipc_server.cancel()
|
ipc_server.cancel()
|
||||||
await ipc_server.wait_for_shutdown()
|
await ipc_server.wait_for_shutdown()
|
||||||
|
|
||||||
|
# Break the shield on the parent-channel
|
||||||
|
# `process_messages` loop (started with `shield=True`
|
||||||
|
# in `async_main` above). Required to avoid a
|
||||||
|
# deadlock during teardown of fork-spawned subactors:
|
||||||
|
# without this cancel, the loop parks waiting for
|
||||||
|
# EOF on the parent channel, but the parent is
|
||||||
|
# blocked on `os.waitpid()` for THIS actor's exit
|
||||||
|
# — mutual wait. For exec-spawn backends the EOF
|
||||||
|
# arrives naturally when the parent closes its
|
||||||
|
# handler-task socket during its own teardown, but
|
||||||
|
# in fork backends the shared-process-image makes
|
||||||
|
# that delivery racy / not guaranteed. Explicit
|
||||||
|
# cancel here gives us deterministic unwinding
|
||||||
|
# regardless of backend.
|
||||||
|
if self._parent_chan_cs is not None:
|
||||||
|
self._parent_chan_cs.cancel()
|
||||||
|
|
||||||
# cancel all rpc tasks permanently
|
# cancel all rpc tasks permanently
|
||||||
if self._service_tn:
|
if self._service_tn:
|
||||||
self._service_tn.cancel_scope.cancel()
|
self._service_tn.cancel_scope.cancel()
|
||||||
|
|
@ -1736,7 +1753,16 @@ async def async_main(
|
||||||
# start processing parent requests until our channel
|
# start processing parent requests until our channel
|
||||||
# server is 100% up and running.
|
# server is 100% up and running.
|
||||||
if actor._parent_chan:
|
if actor._parent_chan:
|
||||||
await root_tn.start(
|
# Capture the shielded `loop_cs` for the
|
||||||
|
# parent-channel `process_messages` task so
|
||||||
|
# `Actor.cancel()` has a handle to break the
|
||||||
|
# shield during teardown — without this, the
|
||||||
|
# shielded loop would park on the parent chan
|
||||||
|
# indefinitely waiting for EOF that only arrives
|
||||||
|
# after the PARENT tears down, which under
|
||||||
|
# fork-based backends (e.g. `subint_forkserver`)
|
||||||
|
# it waits on THIS actor's exit — deadlock.
|
||||||
|
actor._parent_chan_cs = await root_tn.start(
|
||||||
partial(
|
partial(
|
||||||
_rpc.process_messages,
|
_rpc.process_messages,
|
||||||
chan=actor._parent_chan,
|
chan=actor._parent_chan,
|
||||||
|
|
|
||||||
|
|
@ -526,6 +526,18 @@ class _ForkedProc:
|
||||||
self.stdin = None
|
self.stdin = None
|
||||||
self.stdout = None
|
self.stdout = None
|
||||||
self.stderr = None
|
self.stderr = None
|
||||||
|
# pidfd (Linux 5.3+, Python 3.9+) — a file descriptor
|
||||||
|
# referencing this child process which becomes readable
|
||||||
|
# once the child exits. Enables a fully trio-cancellable
|
||||||
|
# wait via `trio.lowlevel.wait_readable()` — same
|
||||||
|
# pattern `trio.Process.wait()` uses under the hood, and
|
||||||
|
# the same pattern `multiprocessing.Process.sentinel`
|
||||||
|
# uses for `tractor.spawn._spawn.proc_waiter()`. Without
|
||||||
|
# this, waiting via `trio.to_thread.run_sync(os.waitpid,
|
||||||
|
# ...)` blocks a cache thread on a sync syscall that is
|
||||||
|
# NOT trio-cancellable, which prevents outer cancel
|
||||||
|
# scopes from unwedging a stuck-child cancel cascade.
|
||||||
|
self._pidfd: int = os.pidfd_open(pid)
|
||||||
|
|
||||||
def poll(self) -> int | None:
|
def poll(self) -> int | None:
|
||||||
'''
|
'''
|
||||||
|
|
@ -555,22 +567,40 @@ class _ForkedProc:
|
||||||
|
|
||||||
async def wait(self) -> int:
|
async def wait(self) -> int:
|
||||||
'''
|
'''
|
||||||
Async blocking wait for the child's exit, off-loaded
|
Async, fully-trio-cancellable wait for the child's
|
||||||
to a trio cache thread so we don't block the event
|
exit. Uses `trio.lowlevel.wait_readable()` on the
|
||||||
loop on `waitpid()`. Safe to call multiple times;
|
`pidfd` sentinel — same pattern as `trio.Process.wait`
|
||||||
subsequent calls return the cached rc without
|
and `tractor.spawn._spawn.proc_waiter` (mp backend).
|
||||||
re-issuing the syscall.
|
|
||||||
|
Safe to call multiple times; subsequent calls return
|
||||||
|
the cached rc without re-issuing the syscall.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
if self._returncode is not None:
|
if self._returncode is not None:
|
||||||
return self._returncode
|
return self._returncode
|
||||||
_, status = await trio.to_thread.run_sync(
|
# Park until the pidfd becomes readable — the OS
|
||||||
os.waitpid,
|
# signals this exactly once on child exit. Cancellable
|
||||||
self.pid,
|
# via any outer trio cancel scope (this was the key
|
||||||
0,
|
# fix vs. the prior `to_thread.run_sync(os.waitpid,
|
||||||
abandon_on_cancel=False,
|
# abandon_on_cancel=False)` which blocked a thread on
|
||||||
)
|
# a sync syscall and swallowed cancels).
|
||||||
|
await trio.lowlevel.wait_readable(self._pidfd)
|
||||||
|
# pidfd signaled → reap non-blocking to collect the
|
||||||
|
# exit status. `WNOHANG` here is correct: by the time
|
||||||
|
# the pidfd is readable, `waitpid()` won't block.
|
||||||
|
try:
|
||||||
|
_, status = os.waitpid(self.pid, os.WNOHANG)
|
||||||
|
except ChildProcessError:
|
||||||
|
# already reaped by something else
|
||||||
|
status = 0
|
||||||
self._returncode = self._parse_status(status)
|
self._returncode = self._parse_status(status)
|
||||||
|
# pidfd is one-shot; close it so we don't leak fds
|
||||||
|
# across many spawns.
|
||||||
|
try:
|
||||||
|
os.close(self._pidfd)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
self._pidfd = -1
|
||||||
return self._returncode
|
return self._returncode
|
||||||
|
|
||||||
def kill(self) -> None:
|
def kill(self) -> None:
|
||||||
|
|
@ -584,6 +614,16 @@ class _ForkedProc:
|
||||||
except ProcessLookupError:
|
except ProcessLookupError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def __del__(self) -> None:
|
||||||
|
# belt-and-braces: close the pidfd if `wait()` wasn't
|
||||||
|
# called (e.g. unexpected teardown path).
|
||||||
|
fd: int = getattr(self, '_pidfd', -1)
|
||||||
|
if fd >= 0:
|
||||||
|
try:
|
||||||
|
os.close(fd)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
def _parse_status(self, status: int) -> int:
|
def _parse_status(self, status: int) -> int:
|
||||||
if os.WIFEXITED(status):
|
if os.WIFEXITED(status):
|
||||||
return os.WEXITSTATUS(status)
|
return os.WEXITSTATUS(status)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue