Fix `SharedMemory` under `subint_forkserver`
Implements the resolution described in c99d475d's
`subint_forkserver_mp_shared_memory_issue.md` (now
updated with the resolution post-mortem). Two-part
fix that side-steps `mp.resource_tracker` entirely
rather than try to make it fork-safe — turns out
that's both simpler AND more correct given tractor
already SC-manages allocation lifetimes.
Deats,
- `tractor/ipc/_mp_bs.py::disable_mantracker()`: drop the
`platform.python_version_tuple()[:-1] >= ('3', '13')` branch — patches
now run unconditionally:
* monkey-patch `mp.resource_tracker. _resource_tracker` to a no-op
`ManTracker` subclass (empty `register` / `unregister`
/ `ensure_running`).
* return `partial(SharedMemory, track=False)` for the per-allocation
opt-out.
* belt + suspenders: even if something dodges the wrapper, the
singleton can't talk to the inherited (broken) parent fd.
- `tractor/ipc/_shm.py::open_shm_list()`: drop the 3.13+ conditional
skip of the unlink-callback; install a `try_unlink()` wrapper that
swallows `FileNotFoundError` (sibling-already-cleaned race in
shared-key setups). Without `mp.resource_tracker` doing it for us, we
own the unlink — `actor. lifetime_stack` is the right place since
tractor already controls actor lifecycle.
- `tests/test_shm.py`: uncomment-out `subint_forkserver` from the
module-level skip- list (tests pass now). Inline comment cross-refs
the two `_mp_bs` / `_shm` workarounds.
- `ai/conc-anal/subint_forkserver_mp_shared_memory_ issue.md`: heavy
rewrite — flips status from "open / unresolvable in tractor" to
"resolved, kept as decision record". Adds Resolution section, "Why
this is the right call" rationale (mp tracker is widely criticized;
tractor already owns lifecycle), trade-offs (crash-leaked segments,
lost mp leak warning), verification (7 passed under both
`subint_forkserver` and `trio` backends), and upstream issue links
(this patch was generated in some part by [`claude-code`][claude-code-gh])
[claude-code-gh]: https://github.com/anthropics/claude-code
(cherry picked from commit aa3e230926)
(factored: dropped subint_forkserver conc-anal doc update)
wkt/tooling_enhancements_from_mtf_spawner
parent
da0c457ff7
commit
2e2977b74c
|
|
@ -17,6 +17,10 @@ from tractor.ipc._shm import (
|
|||
pytestmark = pytest.mark.skipon_spawn_backend(
|
||||
'subint',
|
||||
# 'subint_forkserver',
|
||||
# XXX we hack around this stdlib limitation by both,
|
||||
# - setting `ShareMemory(track=False)`
|
||||
# - overriding the `mp.ResourceTracker` nonsense in
|
||||
# `.ipc._mp_bs`.
|
||||
reason=(
|
||||
'subint: GIL-contention hanging class.\n'
|
||||
'subint_forkserver: `multiprocessing.SharedMemory` '
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
Utils to tame mp non-SC madeness
|
||||
|
||||
'''
|
||||
import platform
|
||||
from functools import partial
|
||||
|
||||
|
||||
def disable_mantracker():
|
||||
|
|
@ -27,49 +27,37 @@ def disable_mantracker():
|
|||
|
||||
'''
|
||||
from multiprocessing.shared_memory import SharedMemory
|
||||
from multiprocessing import (
|
||||
resource_tracker as mantracker,
|
||||
)
|
||||
|
||||
# XXX ALWAYS disable the stdlib's "resource tracker"; it prevents
|
||||
# fork backends and never was useful to us since we're SC
|
||||
# lifetime managing all allocations.
|
||||
class ManTracker(mantracker.ResourceTracker):
|
||||
def register(self, name, rtype):
|
||||
pass
|
||||
|
||||
def unregister(self, name, rtype):
|
||||
pass
|
||||
|
||||
def ensure_running(self):
|
||||
pass
|
||||
|
||||
# "know your land and know your prey"
|
||||
# https://www.dailymotion.com/video/x6ozzco
|
||||
mantracker._resource_tracker = ManTracker()
|
||||
mantracker.register = mantracker._resource_tracker.register
|
||||
mantracker.ensure_running = mantracker._resource_tracker.ensure_running
|
||||
mantracker.unregister = mantracker._resource_tracker.unregister
|
||||
mantracker.getfd = mantracker._resource_tracker.getfd
|
||||
|
||||
# 3.13+ only.. can pass `track=False` to disable
|
||||
# all the resource tracker bs.
|
||||
# https://docs.python.org/3/library/multiprocessing.shared_memory.html
|
||||
if (_py_313 := (
|
||||
platform.python_version_tuple()[:-1]
|
||||
>=
|
||||
('3', '13')
|
||||
)
|
||||
):
|
||||
from functools import partial
|
||||
return partial(
|
||||
SharedMemory,
|
||||
track=False,
|
||||
)
|
||||
|
||||
# !TODO, once we drop 3.12- we can obvi remove all this!
|
||||
else:
|
||||
from multiprocessing import (
|
||||
resource_tracker as mantracker,
|
||||
)
|
||||
|
||||
# Tell the "resource tracker" thing to fuck off.
|
||||
class ManTracker(mantracker.ResourceTracker):
|
||||
def register(self, name, rtype):
|
||||
pass
|
||||
|
||||
def unregister(self, name, rtype):
|
||||
pass
|
||||
|
||||
def ensure_running(self):
|
||||
pass
|
||||
|
||||
# "know your land and know your prey"
|
||||
# https://www.dailymotion.com/video/x6ozzco
|
||||
mantracker._resource_tracker = ManTracker()
|
||||
mantracker.register = mantracker._resource_tracker.register
|
||||
mantracker.ensure_running = mantracker._resource_tracker.ensure_running
|
||||
mantracker.unregister = mantracker._resource_tracker.unregister
|
||||
mantracker.getfd = mantracker._resource_tracker.getfd
|
||||
|
||||
# use std type verbatim
|
||||
shmT = SharedMemory
|
||||
shmT = partial(
|
||||
SharedMemory,
|
||||
track=False,
|
||||
)
|
||||
|
||||
return shmT
|
||||
|
|
|
|||
|
|
@ -929,15 +929,26 @@ def open_shm_list(
|
|||
# "close" attached shm on actor teardown
|
||||
try:
|
||||
actor = tractor.current_actor()
|
||||
|
||||
actor.lifetime_stack.callback(shml.shm.close)
|
||||
|
||||
# XXX on 3.13+ we don't need to call this?
|
||||
# -> bc we pass `track=False` for `SharedMemeory` orr?
|
||||
if (
|
||||
platform.python_version_tuple()[:-1] < ('3', '13')
|
||||
):
|
||||
actor.lifetime_stack.callback(shml.shm.unlink)
|
||||
# >XXX NOTE< on 3.13+ we need to call this AS WELL AS pass
|
||||
# `track=False` for `mp.SharedMemeory` otherwise fork based
|
||||
# backends will error out due to long lived stdlib
|
||||
# limitations,
|
||||
# - https://bugs.python.org/issue38119
|
||||
# - https://bugs.python.org/issue45209
|
||||
#
|
||||
def try_unlink():
|
||||
try:
|
||||
shml.shm.unlink()
|
||||
except FileNotFoundError as fne:
|
||||
log.debug(
|
||||
f'ShmList already deallocated pre-actor-shutdown.\n'
|
||||
f'{fne!r}\n'
|
||||
)
|
||||
|
||||
actor.lifetime_stack.callback(try_unlink)
|
||||
|
||||
except RuntimeError:
|
||||
log.warning('tractor runtime not active, skipping teardown steps')
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue