From 2e2977b74cfce3b37edc110e204e8f3d5e4dc7d2 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 9 Jun 2026 20:22:38 -0400 Subject: [PATCH] Fix `SharedMemory` under `subint_forkserver` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the resolution described in c99d475d's `subint_forkserver_mp_shared_memory_issue.md` (now updated with the resolution post-mortem). Two-part fix that side-steps `mp.resource_tracker` entirely rather than try to make it fork-safe — turns out that's both simpler AND more correct given tractor already SC-manages allocation lifetimes. Deats, - `tractor/ipc/_mp_bs.py::disable_mantracker()`: drop the `platform.python_version_tuple()[:-1] >= ('3', '13')` branch — patches now run unconditionally: * monkey-patch `mp.resource_tracker. _resource_tracker` to a no-op `ManTracker` subclass (empty `register` / `unregister` / `ensure_running`). * return `partial(SharedMemory, track=False)` for the per-allocation opt-out. * belt + suspenders: even if something dodges the wrapper, the singleton can't talk to the inherited (broken) parent fd. - `tractor/ipc/_shm.py::open_shm_list()`: drop the 3.13+ conditional skip of the unlink-callback; install a `try_unlink()` wrapper that swallows `FileNotFoundError` (sibling-already-cleaned race in shared-key setups). Without `mp.resource_tracker` doing it for us, we own the unlink — `actor. lifetime_stack` is the right place since tractor already controls actor lifecycle. - `tests/test_shm.py`: uncomment-out `subint_forkserver` from the module-level skip- list (tests pass now). Inline comment cross-refs the two `_mp_bs` / `_shm` workarounds. - `ai/conc-anal/subint_forkserver_mp_shared_memory_ issue.md`: heavy rewrite — flips status from "open / unresolvable in tractor" to "resolved, kept as decision record". Adds Resolution section, "Why this is the right call" rationale (mp tracker is widely criticized; tractor already owns lifecycle), trade-offs (crash-leaked segments, lost mp leak warning), verification (7 passed under both `subint_forkserver` and `trio` backends), and upstream issue links (this patch was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code (cherry picked from commit aa3e2309266f29329bc31edeef3c6af2b3111f34) (factored: dropped subint_forkserver conc-anal doc update) --- tests/test_shm.py | 4 +++ tractor/ipc/_mp_bs.py | 68 ++++++++++++++++++------------------------- tractor/ipc/_shm.py | 25 +++++++++++----- 3 files changed, 50 insertions(+), 47 deletions(-) diff --git a/tests/test_shm.py b/tests/test_shm.py index 8ea43457..d6ad93f4 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -17,6 +17,10 @@ from tractor.ipc._shm import ( pytestmark = pytest.mark.skipon_spawn_backend( 'subint', # 'subint_forkserver', + # XXX we hack around this stdlib limitation by both, + # - setting `ShareMemory(track=False)` + # - overriding the `mp.ResourceTracker` nonsense in + # `.ipc._mp_bs`. reason=( 'subint: GIL-contention hanging class.\n' 'subint_forkserver: `multiprocessing.SharedMemory` ' diff --git a/tractor/ipc/_mp_bs.py b/tractor/ipc/_mp_bs.py index 462291c6..7f2092d2 100644 --- a/tractor/ipc/_mp_bs.py +++ b/tractor/ipc/_mp_bs.py @@ -17,7 +17,7 @@ Utils to tame mp non-SC madeness ''' -import platform +from functools import partial def disable_mantracker(): @@ -27,49 +27,37 @@ def disable_mantracker(): ''' from multiprocessing.shared_memory import SharedMemory + from multiprocessing import ( + resource_tracker as mantracker, + ) + # XXX ALWAYS disable the stdlib's "resource tracker"; it prevents + # fork backends and never was useful to us since we're SC + # lifetime managing all allocations. + class ManTracker(mantracker.ResourceTracker): + def register(self, name, rtype): + pass + + def unregister(self, name, rtype): + pass + + def ensure_running(self): + pass + + # "know your land and know your prey" + # https://www.dailymotion.com/video/x6ozzco + mantracker._resource_tracker = ManTracker() + mantracker.register = mantracker._resource_tracker.register + mantracker.ensure_running = mantracker._resource_tracker.ensure_running + mantracker.unregister = mantracker._resource_tracker.unregister + mantracker.getfd = mantracker._resource_tracker.getfd # 3.13+ only.. can pass `track=False` to disable # all the resource tracker bs. # https://docs.python.org/3/library/multiprocessing.shared_memory.html - if (_py_313 := ( - platform.python_version_tuple()[:-1] - >= - ('3', '13') - ) - ): - from functools import partial - return partial( - SharedMemory, - track=False, - ) - - # !TODO, once we drop 3.12- we can obvi remove all this! - else: - from multiprocessing import ( - resource_tracker as mantracker, - ) - - # Tell the "resource tracker" thing to fuck off. - class ManTracker(mantracker.ResourceTracker): - def register(self, name, rtype): - pass - - def unregister(self, name, rtype): - pass - - def ensure_running(self): - pass - - # "know your land and know your prey" - # https://www.dailymotion.com/video/x6ozzco - mantracker._resource_tracker = ManTracker() - mantracker.register = mantracker._resource_tracker.register - mantracker.ensure_running = mantracker._resource_tracker.ensure_running - mantracker.unregister = mantracker._resource_tracker.unregister - mantracker.getfd = mantracker._resource_tracker.getfd - - # use std type verbatim - shmT = SharedMemory + shmT = partial( + SharedMemory, + track=False, + ) return shmT diff --git a/tractor/ipc/_shm.py b/tractor/ipc/_shm.py index b60fafcc..f0225d70 100644 --- a/tractor/ipc/_shm.py +++ b/tractor/ipc/_shm.py @@ -929,15 +929,26 @@ def open_shm_list( # "close" attached shm on actor teardown try: actor = tractor.current_actor() - actor.lifetime_stack.callback(shml.shm.close) - # XXX on 3.13+ we don't need to call this? - # -> bc we pass `track=False` for `SharedMemeory` orr? - if ( - platform.python_version_tuple()[:-1] < ('3', '13') - ): - actor.lifetime_stack.callback(shml.shm.unlink) + # >XXX NOTE< on 3.13+ we need to call this AS WELL AS pass + # `track=False` for `mp.SharedMemeory` otherwise fork based + # backends will error out due to long lived stdlib + # limitations, + # - https://bugs.python.org/issue38119 + # - https://bugs.python.org/issue45209 + # + def try_unlink(): + try: + shml.shm.unlink() + except FileNotFoundError as fne: + log.debug( + f'ShmList already deallocated pre-actor-shutdown.\n' + f'{fne!r}\n' + ) + + actor.lifetime_stack.callback(try_unlink) + except RuntimeError: log.warning('tractor runtime not active, skipping teardown steps')