From 70dc60a199ff5bbd476ffa1d6b9e7fe6251efe39 Mon Sep 17 00:00:00 2001 From: goodboy Date: Tue, 14 Apr 2026 15:32:04 -0400 Subject: [PATCH] Bump UDS `listen()` backlog 1 -> 128 for multi-actor unreg A backlog of 1 caused `ECONNREFUSED` when multiple sub-actors simultaneously connect to deregister from a remote-daemon registrar. Now in line with the TCP transport's default backlog (the system maximum, historically 128 on Linux). Also, - add cross-ref comments between `_uds.close_listener()` and `async_main()`'s `parent_is_reg` deregistration path explaining the UDS socket-file lifecycle (this commit msg was generated in some part by [`claude-code`][claude-code-gh]) [claude-code-gh]: https://github.com/anthropics/claude-code --- tractor/ipc/_uds.py | 28 +++++++++++++++++++++++++++- tractor/runtime/_runtime.py | 16 +++++++++++----- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/tractor/ipc/_uds.py b/tractor/ipc/_uds.py index 45100f97..3b214f6a 100644 --- a/tractor/ipc/_uds.py +++ b/tractor/ipc/_uds.py @@ -300,7 +300,23 @@ async def start_listener( ): await sock.bind(str(bindpath)) - sock.listen(1) + # NOTE, the backlog must be large enough to handle + # concurrent connection attempts during actor teardown. + # Previously this was `listen(1)` which caused + # deregistration failures in the remote-daemon registrar + # case: when multiple sub-actors simultaneously try to + # connect to deregister, a backlog of 1 overflows and + # connections get ECONNREFUSED. This is in line with the TCP + # transport, which uses `trio.open_tcp_listeners()` whose default + # backlog is the system maximum (historically 128 on Linux). + # + # For details see the `close_listener()` below which + # `os.unlink()`s the socket file on teardown — meaning + # any NEW connection attempts after that point will fail + # with `FileNotFoundError` regardless of backlog size. + # The backlog only matters while the listener is alive + # and accepting. 
+ sock.listen(128) log.info( f'Listening on UDS socket\n' f'[>\n' @@ -316,6 +332,16 @@ def close_listener( ''' Close and remove the listening unix socket's path. + NOTE, the `os.unlink()` here removes the socket file from + the filesystem immediately, which means any subsequent + connection attempts (e.g. sub-actors trying to deregister + with a registrar whose listener is tearing down) will fail + with `FileNotFoundError`. For the local-registrar case + (parent IS the registrar), `_runtime.async_main()` works + around this by reusing the existing `_parent_chan` instead + of opening a new connection; see the `parent_is_reg` logic + in the deregistration path. + ''' lstnr.socket.close() os.unlink(addr.sockpath) diff --git a/tractor/runtime/_runtime.py b/tractor/runtime/_runtime.py index 7cf20ec2..6381c840 100644 --- a/tractor/runtime/_runtime.py +++ b/tractor/runtime/_runtime.py @@ -1848,11 +1848,17 @@ async def async_main( failed_unreg: bool = False rent_chan: Channel|None = actor._parent_chan - # XXX check if the parent IS the registrar so we can - # reuse the existing channel (avoids opening a new - # connection which fails when the listener socket is - # already closed, e.g. UDS transport unlinks the socket - # file during teardown). + # XXX check if the parent IS the registrar so we + # can reuse the existing `_parent_chan` (avoids + # opening a new connection which fails when the + # listener socket is already closed, e.g. UDS + # transport `os.unlink()`s the socket file during + # teardown). + # + # See `ipc._uds.close_listener()` for details on + # the UDS socket-file lifecycle and why this + # channel reuse is necessary for the local-registrar + # case. parent_is_reg: bool = False if ( rent_chan is not None