Compare commits
No commits in common. "e5e2afb5f4890cb0c8f7f3bb38ad12429f4e0b59" and "aa7450974cd5811cd3d87b1d1ffeb2f1ecdab4ec" have entirely different histories.
e5e2afb5f4
...
aa7450974c
|
|
@ -249,38 +249,22 @@ ls -la /tmp/registry@*.sock 2>/dev/null \
|
||||||
surface PIDs + cmdlines to the user, offer cleanup:
|
surface PIDs + cmdlines to the user, offer cleanup:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# 1. GRACEFUL FIRST (tractor is structured concurrent — it
|
# 1. kill test zombies scoped to THIS repo's python only
|
||||||
# catches SIGINT as an OS-cancel in `_trio_main` and
|
# (don't pkill by bare pattern — that'd nuke legit
|
||||||
# cascades Portal.cancel_actor via IPC to every descendant.
|
# long-running tractor apps like piker)
|
||||||
# So always try SIGINT first with a bounded timeout; only
|
pkill -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
||||||
# escalate to SIGKILL if graceful cleanup doesn't complete).
|
|
||||||
pkill -INT -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
|
||||||
|
|
||||||
# 2. bounded wait for graceful teardown (usually sub-second).
|
|
||||||
# Loop until the processes exit, or timeout. Keep the
|
|
||||||
# bound tight — hung/abrupt-killed descendants usually
|
|
||||||
# hang forever, so don't wait more than a few seconds.
|
|
||||||
for i in $(seq 1 10); do
|
|
||||||
pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null || break
|
|
||||||
sleep 0.3
|
sleep 0.3
|
||||||
done
|
pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" 2>/dev/null
|
||||||
|
|
||||||
# 3. ESCALATE TO SIGKILL only if graceful didn't finish.
|
# 2. if a test zombie holds :1616 specifically and doesn't
|
||||||
if pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null; then
|
|
||||||
echo 'graceful teardown timed out — escalating to SIGKILL'
|
|
||||||
pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 4. if a test zombie holds :1616 specifically and doesn't
|
|
||||||
# match the above pattern, find its PID the hard way:
|
# match the above pattern, find its PID the hard way:
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' # prints `users:(("<name>",pid=NNNN,...))`
|
ss -tlnp 2>/dev/null | grep ':1616' # prints `users:(("<name>",pid=NNNN,...))`
|
||||||
# then (same SIGINT-first ladder):
|
# then: kill <NNNN>
|
||||||
# kill -INT <NNNN>; sleep 1; kill -9 <NNNN> 2>/dev/null
|
|
||||||
|
|
||||||
# 5. remove stale UDS sockets
|
# 3. remove stale UDS sockets
|
||||||
rm -f /tmp/registry@*.sock
|
rm -f /tmp/registry@*.sock
|
||||||
|
|
||||||
# 6. re-verify
|
# 4. re-verify
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free'
|
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,165 +0,0 @@
|
||||||
# `subint_forkserver` backend leaks subactor descendants in `test_cancellation.py`
|
|
||||||
|
|
||||||
Follow-up tracker: surfaced while wiring the new
|
|
||||||
`subint_forkserver` spawn backend into the full tractor
|
|
||||||
test matrix (step 2 of the post-backend-lands plan;
|
|
||||||
see also
|
|
||||||
`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`).
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
Running `tests/test_cancellation.py` under
|
|
||||||
`--spawn-backend=subint_forkserver` reproducibly leaks
|
|
||||||
**exactly 5 `subint-forkserv` comm-named child processes**
|
|
||||||
after the pytest session exits. Both previously-run
|
|
||||||
sessions produced the same 5-process signature — not a
|
|
||||||
flake. Each leaked process holds a `LISTEN` on the
|
|
||||||
default registry TCP addr (`127.0.0.1:1616`), which
|
|
||||||
poisons any subsequent tractor test session that
|
|
||||||
defaults to that addr.
|
|
||||||
|
|
||||||
## Stopgap (not the real fix)
|
|
||||||
|
|
||||||
Multiple tests in `test_cancellation.py` were calling
|
|
||||||
`tractor.open_nursery()` **without** passing
|
|
||||||
`registry_addrs=[reg_addr]`, i.e. falling back on the
|
|
||||||
default `:1616`. The commit accompanying this doc wires
|
|
||||||
the `reg_addr` fixture through those tests so each run
|
|
||||||
gets a session-unique port — leaked zombies can no
|
|
||||||
longer poison **other** tests (they hold their own
|
|
||||||
unique port instead).
|
|
||||||
|
|
||||||
Tests touched (in `tests/test_cancellation.py`):
|
|
||||||
|
|
||||||
- `test_cancel_infinite_streamer`
|
|
||||||
- `test_some_cancels_all`
|
|
||||||
- `test_nested_multierrors`
|
|
||||||
- `test_cancel_via_SIGINT`
|
|
||||||
- `test_cancel_via_SIGINT_other_task`
|
|
||||||
|
|
||||||
This is a **suite-hygiene fix** — it doesn't close the
|
|
||||||
actual leak; it just stops the leak from blast-radiusing.
|
|
||||||
Zombie descendants still accumulate per run.
|
|
||||||
|
|
||||||
## The real bug (unfixed)
|
|
||||||
|
|
||||||
`subint_forkserver_proc`'s teardown — `_ForkedProc.kill()`
|
|
||||||
(plain `os.kill(SIGKILL)` to the direct child pid) +
|
|
||||||
`proc.wait()` — does **not** reap grandchildren or
|
|
||||||
deeper descendants. When a cancellation test causes a
|
|
||||||
multi-level actor tree to tear down, the direct child
|
|
||||||
dies but its own children survive and get reparented to
|
|
||||||
init (PID 1), where they stay running with their
|
|
||||||
inherited FDs (including the registry listen socket).
|
|
||||||
|
|
||||||
**Symptom on repro:**
|
|
||||||
|
|
||||||
```
|
|
||||||
$ ss -tlnp 2>/dev/null | grep ':1616'
|
|
||||||
LISTEN 0 4096 127.0.0.1:1616 0.0.0.0:* \
|
|
||||||
users:(("subint-forkserv",pid=211595,fd=17),
|
|
||||||
("subint-forkserv",pid=211585,fd=17),
|
|
||||||
("subint-forkserv",pid=211583,fd=17),
|
|
||||||
("subint-forkserv",pid=211576,fd=17),
|
|
||||||
("subint-forkserv",pid=211572,fd=17))
|
|
||||||
|
|
||||||
$ for p in 211572 211576 211583 211585 211595; do
|
|
||||||
cat /proc/$p/cmdline | tr '\0' ' '; echo; done
|
|
||||||
./py314/bin/python -m pytest --spawn-backend=subint_forkserver \
|
|
||||||
tests/test_cancellation.py --timeout=30 --timeout-method=signal \
|
|
||||||
--tb=no -q --no-header
|
|
||||||
... (x5, all same cmdline — inherited from fork)
|
|
||||||
```
|
|
||||||
|
|
||||||
All 5 share the pytest cmdline because `os.fork()`
|
|
||||||
without `exec()` preserves the parent's argv. Their
|
|
||||||
comm-name (`subint-forkserv`) is the `thread_name` we
|
|
||||||
pass to the fork-worker thread in
|
|
||||||
`tractor.spawn._subint_forkserver.fork_from_worker_thread`.
|
|
||||||
|
|
||||||
## Why 5?
|
|
||||||
|
|
||||||
Not confirmed; guess is 5 = the parametrize cardinality
|
|
||||||
of one of the leaky tests (e.g. `test_some_cancels_all`
|
|
||||||
has 5 parametrize cases). Each param-case spawns a
|
|
||||||
nested tree; each leaks exactly one descendant. Worth
|
|
||||||
verifying by running each parametrize-case individually
|
|
||||||
and counting leaked procs per case.
|
|
||||||
|
|
||||||
## Ruled out
|
|
||||||
|
|
||||||
- **`:1616` collision from a different repo** (e.g.
|
|
||||||
piker): `/proc/$pid/cmdline` + `cwd` both resolve to
|
|
||||||
the tractor repo's `py314/` venv for all 5. These are
|
|
||||||
definitively spawned by our test run.
|
|
||||||
- **Parent-side `_ForkedProc.wait()` regressed**: the
|
|
||||||
direct child's teardown completes cleanly (exit-code
|
|
||||||
captured, `waitpid` returns); the 5 survivors are
|
|
||||||
deeper-descendants whose parent-side shim has no
|
|
||||||
handle on them. So the bug isn't in
|
|
||||||
`_ForkedProc.wait()` — it's in the lack of tree-
|
|
||||||
level descendant enumeration + reaping during nursery
|
|
||||||
teardown.
|
|
||||||
|
|
||||||
## Likely fix directions
|
|
||||||
|
|
||||||
1. **Process-group-scoped spawn + tree kill.** Put each
|
|
||||||
forkserver-spawned subactor into its own process
|
|
||||||
group (`os.setpgrp()` in the fork child), then on
|
|
||||||
teardown `os.killpg(pgid, SIGKILL)` to reap the
|
|
||||||
whole tree atomically. Simplest, most surgical.
|
|
||||||
2. **Subreaper registration.** Use
|
|
||||||
`PR_SET_CHILD_SUBREAPER` on the tractor root so
|
|
||||||
orphaned grandchildren reparent to the root rather
|
|
||||||
than init — then we can `waitpid` them from the
|
|
||||||
parent-side nursery teardown. More invasive.
|
|
||||||
3. **Explicit descendant enumeration at teardown.**
|
|
||||||
In `subint_forkserver_proc`'s finally block, walk
|
|
||||||
`/proc/<pid>/task/*/children` before issuing SIGKILL
|
|
||||||
to build a descendant-pid set; then kill + reap all
|
|
||||||
of them. Fragile (Linux-only, proc-fs-scan race).
|
|
||||||
|
|
||||||
Vote: **(1)** — clean, POSIX-standard, aligns with how
|
|
||||||
`subprocess.Popen` (and by extension `trio.lowlevel.
|
|
||||||
open_process`) handle tree-kill semantics on
|
|
||||||
kwargs-supplied `start_new_session=True`.
|
|
||||||
|
|
||||||
## Reproducer
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# before: ensure clean env
|
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' || echo 'clean'
|
|
||||||
|
|
||||||
# run the leaky tests
|
|
||||||
./py314/bin/python -m pytest \
|
|
||||||
--spawn-backend=subint_forkserver \
|
|
||||||
tests/test_cancellation.py \
|
|
||||||
--timeout=30 --timeout-method=signal --tb=no -q --no-header
|
|
||||||
|
|
||||||
# observe: 5 leaked children now holding :1616
|
|
||||||
ss -tlnp 2>/dev/null | grep ':1616'
|
|
||||||
```
|
|
||||||
|
|
||||||
Expected output: `subint-forkserv` processes listed as
|
|
||||||
listeners on `:1616`. Cleanup:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
pkill -9 -f \
|
|
||||||
"$(pwd)/py314/bin/python.*pytest.*spawn-backend=subint_forkserver"
|
|
||||||
```
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- `tractor/spawn/_subint_forkserver.py::_ForkedProc`
|
|
||||||
— the current teardown shim; PID-scoped, not tree-
|
|
||||||
scoped.
|
|
||||||
- `tractor/spawn/_subint_forkserver.py::subint_forkserver_proc`
|
|
||||||
— the spawn backend whose `finally` block needs the
|
|
||||||
tree-kill fix.
|
|
||||||
- `tests/test_cancellation.py` — the surface where the
|
|
||||||
leak surfaces.
|
|
||||||
- `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
|
|
||||||
— sibling tracker for a different forkserver-teardown
|
|
||||||
class (orphaned child doesn't respond to SIGINT); may
|
|
||||||
share root cause with this one once the fix lands.
|
|
||||||
- tractor issue #379 — subint backend tracking.
|
|
||||||
|
|
@ -275,8 +275,7 @@ async def stream_forever():
|
||||||
timeout=6,
|
timeout=6,
|
||||||
)
|
)
|
||||||
async def test_cancel_infinite_streamer(
|
async def test_cancel_infinite_streamer(
|
||||||
reg_addr: tuple,
|
start_method: str
|
||||||
start_method: str,
|
|
||||||
):
|
):
|
||||||
# stream for at most 1 seconds
|
# stream for at most 1 seconds
|
||||||
with (
|
with (
|
||||||
|
|
@ -342,7 +341,6 @@ async def test_cancel_infinite_streamer(
|
||||||
)
|
)
|
||||||
async def test_some_cancels_all(
|
async def test_some_cancels_all(
|
||||||
num_actors_and_errs: tuple,
|
num_actors_and_errs: tuple,
|
||||||
reg_addr: tuple,
|
|
||||||
start_method: str,
|
start_method: str,
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
):
|
):
|
||||||
|
|
@ -452,13 +450,8 @@ async def spawn_and_error(
|
||||||
await nursery.run_in_actor(*args, **kwargs)
|
await nursery.run_in_actor(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.timeout(
|
|
||||||
10,
|
|
||||||
method='thread',
|
|
||||||
)
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_nested_multierrors(
|
async def test_nested_multierrors(
|
||||||
reg_addr: tuple,
|
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
start_method: str,
|
start_method: str,
|
||||||
):
|
):
|
||||||
|
|
@ -548,7 +541,6 @@ async def test_nested_multierrors(
|
||||||
|
|
||||||
@no_windows
|
@no_windows
|
||||||
def test_cancel_via_SIGINT(
|
def test_cancel_via_SIGINT(
|
||||||
reg_addr: tuple,
|
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
start_method: str,
|
start_method: str,
|
||||||
):
|
):
|
||||||
|
|
@ -561,9 +553,7 @@ def test_cancel_via_SIGINT(
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(2):
|
with trio.fail_after(2):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as tn:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as tn:
|
|
||||||
await tn.start_actor('sucka')
|
await tn.start_actor('sucka')
|
||||||
if 'mp' in start_method:
|
if 'mp' in start_method:
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
@ -576,7 +566,6 @@ def test_cancel_via_SIGINT(
|
||||||
|
|
||||||
@no_windows
|
@no_windows
|
||||||
def test_cancel_via_SIGINT_other_task(
|
def test_cancel_via_SIGINT_other_task(
|
||||||
reg_addr: tuple,
|
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
start_method: str,
|
start_method: str,
|
||||||
spawn_backend: str,
|
spawn_backend: str,
|
||||||
|
|
@ -605,9 +594,7 @@ def test_cancel_via_SIGINT_other_task(
|
||||||
async def spawn_and_sleep_forever(
|
async def spawn_and_sleep_forever(
|
||||||
task_status=trio.TASK_STATUS_IGNORED
|
task_status=trio.TASK_STATUS_IGNORED
|
||||||
):
|
):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as tn:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as tn:
|
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
sleep_forever,
|
sleep_forever,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue