From 9716d86825e3378701ba3acc7f9504142a113c3d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 15 Oct 2022 16:35:32 -0400 Subject: [PATCH 001/378] Initial module import from `piker.data._sharemem` More or less a verbatim copy-paste minus some edgy variable naming and internal `piker` module imports. There is a bunch of OHLC related defaults that need to be dropped and we need to adjust to an optional dependence on `numpy` by supporting shared lists as per the mp docs. --- tractor/_shm.py | 706 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 706 insertions(+) create mode 100644 tractor/_shm.py diff --git a/tractor/_shm.py b/tractor/_shm.py new file mode 100644 index 00000000..dca9d5a5 --- /dev/null +++ b/tractor/_shm.py @@ -0,0 +1,706 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +""" +SC friendly shared memory management geared at real-time +processing. + +Support for ``numpy`` compatible array-buffers is provided but is +considered optional within the context of this runtime-library. + +""" +from __future__ import annotations +from sys import byteorder +import time +from typing import Optional +from multiprocessing.shared_memory import ( + SharedMemory, + _USE_POSIX, +) + +if _USE_POSIX: + from _posixshmem import shm_unlink + +from msgspec import Struct +import numpy as np +from numpy.lib import recfunctions as rfn +import tractor + +from .log import get_logger + + +log = get_logger(__name__) + + +# how much is probably dependent on lifestyle +_secs_in_day = int(60 * 60 * 24) +# we try for a buncha times, but only on a run-every-other-day kinda week. +_days_worth = 16 +_default_size = _days_worth * _secs_in_day +# where to start the new data append index +_rt_buffer_start = int((_days_worth - 1) * _secs_in_day) + + +def disable_mantracker(): + ''' + Disable all ``multiprocessing``` "resource tracking" machinery since + it's an absolute multi-threaded mess of non-SC madness. + + ''' + from multiprocessing import resource_tracker as mantracker + + # Tell the "resource tracker" thing to fuck off. + class ManTracker(mantracker.ResourceTracker): + def register(self, name, rtype): + pass + + def unregister(self, name, rtype): + pass + + def ensure_running(self): + pass + + # "know your land and know your prey" + # https://www.dailymotion.com/video/x6ozzco + mantracker._resource_tracker = ManTracker() + mantracker.register = mantracker._resource_tracker.register + mantracker.ensure_running = mantracker._resource_tracker.ensure_running + # ensure_running = mantracker._resource_tracker.ensure_running + mantracker.unregister = mantracker._resource_tracker.unregister + mantracker.getfd = mantracker._resource_tracker.getfd + + +disable_mantracker() + + +class SharedInt: + """Wrapper around a single entry shared memory array which + holds an ``int`` value used as an index counter. 
+ + """ + def __init__( + self, + shm: SharedMemory, + ) -> None: + self._shm = shm + + @property + def value(self) -> int: + return int.from_bytes(self._shm.buf, byteorder) + + @value.setter + def value(self, value) -> None: + self._shm.buf[:] = value.to_bytes(self._shm.size, byteorder) + + def destroy(self) -> None: + if _USE_POSIX: + # We manually unlink to bypass all the "resource tracker" + # nonsense meant for non-SC systems. + name = self._shm.name + try: + shm_unlink(name) + except FileNotFoundError: + # might be a teardown race here? + log.warning(f'Shm for {name} already unlinked?') + + +class _Token(Struct, frozen=True): + ''' + Internal represenation of a shared memory "token" + which can be used to key a system wide post shm entry. + + ''' + shm_name: str # this servers as a "key" value + shm_first_index_name: str + shm_last_index_name: str + dtype_descr: tuple + size: int # in struct-array index / row terms + + @property + def dtype(self) -> np.dtype: + return np.dtype(list(map(tuple, self.dtype_descr))).descr + + def as_msg(self): + return self.to_dict() + + @classmethod + def from_msg(cls, msg: dict) -> _Token: + if isinstance(msg, _Token): + return msg + + # TODO: native struct decoding + # return _token_dec.decode(msg) + + msg['dtype_descr'] = tuple(map(tuple, msg['dtype_descr'])) + return _Token(**msg) + + +# _token_dec = msgspec.msgpack.Decoder(_Token) + +# TODO: this api? +# _known_tokens = tractor.ActorVar('_shm_tokens', {}) +# _known_tokens = tractor.ContextStack('_known_tokens', ) +# _known_tokens = trio.RunVar('shms', {}) + +# process-local store of keys to tokens +_known_tokens = {} + + +def get_shm_token(key: str) -> _Token: + """Convenience func to check if a token + for the provided key is known by this process. + """ + return _known_tokens.get(key) + + +def _make_token( + key: str, + size: int, + dtype: np.dtype, + +) -> _Token: + ''' + Create a serializable token that can be used + to access a shared array. + + ''' + return _Token( + shm_name=key, + shm_first_index_name=key + "_first", + shm_last_index_name=key + "_last", + dtype_descr=tuple(np.dtype(dtype).descr), + size=size, + ) + + +class ShmArray: + ''' + A shared memory ``numpy`` (compatible) array API. + + An underlying shared memory buffer is allocated based on + a user specified ``numpy.ndarray``. This fixed size array + can be read and written to by pushing data both onto the "front" + or "back" of a set index range. The indexes for the "first" and + "last" index are themselves stored in shared memory (accessed via + ``SharedInt`` interfaces) values such that multiple processes can + interact with the same array using a synchronized-index. + + ''' + def __init__( + self, + shmarr: np.ndarray, + first: SharedInt, + last: SharedInt, + shm: SharedMemory, + # readonly: bool = True, + ) -> None: + self._array = shmarr + + # indexes for first and last indices corresponding + # to fille data + self._first = first + self._last = last + + self._len = len(shmarr) + self._shm = shm + self._post_init: bool = False + + # pushing data does not write the index (aka primary key) + dtype = shmarr.dtype + if dtype.fields: + self._write_fields = list(shmarr.dtype.fields.keys())[1:] + else: + self._write_fields = None + + # TODO: ringbuf api? 
+ + @property + def _token(self) -> _Token: + return _Token( + shm_name=self._shm.name, + shm_first_index_name=self._first._shm.name, + shm_last_index_name=self._last._shm.name, + dtype_descr=tuple(self._array.dtype.descr), + size=self._len, + ) + + @property + def token(self) -> dict: + """Shared memory token that can be serialized and used by + another process to attach to this array. + """ + return self._token.as_msg() + + @property + def index(self) -> int: + return self._last.value % self._len + + @property + def array(self) -> np.ndarray: + ''' + Return an up-to-date ``np.ndarray`` view of the + so-far-written data to the underlying shm buffer. + + ''' + a = self._array[self._first.value:self._last.value] + + # first, last = self._first.value, self._last.value + # a = self._array[first:last] + + # TODO: eventually comment this once we've not seen it in the + # wild in a long time.. + # XXX: race where first/last indexes cause a reader + # to load an empty array.. + if len(a) == 0 and self._post_init: + raise RuntimeError('Empty array race condition hit!?') + # breakpoint() + + return a + + def ustruct( + self, + fields: Optional[list[str]] = None, + + # type that all field values will be cast to + # in the returned view. + common_dtype: np.dtype = np.float, + + ) -> np.ndarray: + + array = self._array + + if fields: + selection = array[fields] + # fcount = len(fields) + else: + selection = array + # fcount = len(array.dtype.fields) + + # XXX: manual ``.view()`` attempt that also doesn't work. + # uview = selection.view( + # dtype=' np.ndarray: + ''' + Return the last ``length``'s worth of ("row") entries from the + array. + + ''' + return self.array[-length:] + + def push( + self, + data: np.ndarray, + + field_map: Optional[dict[str, str]] = None, + prepend: bool = False, + update_first: bool = True, + start: Optional[int] = None, + + ) -> int: + ''' + Ring buffer like "push" to append data + into the buffer and return updated "last" index. + + NB: no actual ring logic yet to give a "loop around" on overflow + condition, lel. + + ''' + length = len(data) + + if prepend: + index = (start or self._first.value) - length + + if index < 0: + raise ValueError( + f'Array size of {self._len} was overrun during prepend.\n' + f'You have passed {abs(index)} too many datums.' + ) + + else: + index = start if start is not None else self._last.value + + end = index + length + + if field_map: + src_names, dst_names = zip(*field_map.items()) + else: + dst_names = src_names = self._write_fields + + try: + self._array[ + list(dst_names) + ][index:end] = data[list(src_names)][:] + + # NOTE: there was a race here between updating + # the first and last indices and when the next reader + # tries to access ``.array`` (which due to the index + # overlap will be empty). Pretty sure we've fixed it now + # but leaving this here as a reminder. + if prepend and update_first and length: + assert index < self._first.value + + if ( + index < self._first.value + and update_first + ): + assert prepend, 'prepend=True not passed but index decreased?' 
+ self._first.value = index + + elif not prepend: + self._last.value = end + + self._post_init = True + return end + + except ValueError as err: + if field_map: + raise + + # should raise if diff detected + self.diff_err_fields(data) + raise err + + def diff_err_fields( + self, + data: np.ndarray, + ) -> None: + # reraise with any field discrepancy + our_fields, their_fields = ( + set(self._array.dtype.fields), + set(data.dtype.fields), + ) + + only_in_ours = our_fields - their_fields + only_in_theirs = their_fields - our_fields + + if only_in_ours: + raise TypeError( + f"Input array is missing field(s): {only_in_ours}" + ) + elif only_in_theirs: + raise TypeError( + f"Input array has unknown field(s): {only_in_theirs}" + ) + + # TODO: support "silent" prepends that don't update ._first.value? + def prepend( + self, + data: np.ndarray, + ) -> int: + end = self.push(data, prepend=True) + assert end + + def close(self) -> None: + self._first._shm.close() + self._last._shm.close() + self._shm.close() + + def destroy(self) -> None: + if _USE_POSIX: + # We manually unlink to bypass all the "resource tracker" + # nonsense meant for non-SC systems. + shm_unlink(self._shm.name) + + self._first.destroy() + self._last.destroy() + + def flush(self) -> None: + # TODO: flush to storage backend like markestore? + ... + + +def open_shm_array( + + key: Optional[str] = None, + size: int = _default_size, # see above + dtype: Optional[np.dtype] = None, + readonly: bool = False, + +) -> ShmArray: + '''Open a memory shared ``numpy`` using the standard library. + + This call unlinks (aka permanently destroys) the buffer on teardown + and thus should be used from the parent-most accessor (process). + + ''' + # create new shared mem segment for which we + # have write permission + a = np.zeros(size, dtype=dtype) + a['index'] = np.arange(len(a)) + + shm = SharedMemory( + name=key, + create=True, + size=a.nbytes + ) + array = np.ndarray( + a.shape, + dtype=a.dtype, + buffer=shm.buf + ) + array[:] = a[:] + array.setflags(write=int(not readonly)) + + token = _make_token( + key=key, + size=size, + dtype=dtype, + ) + + # create single entry arrays for storing an first and last indices + first = SharedInt( + shm=SharedMemory( + name=token.shm_first_index_name, + create=True, + size=4, # std int + ) + ) + + last = SharedInt( + shm=SharedMemory( + name=token.shm_last_index_name, + create=True, + size=4, # std int + ) + ) + + # start the "real-time" updated section after 3-days worth of 1s + # sampled OHLC. this allows appending up to a days worth from + # tick/quote feeds before having to flush to a (tsdb) storage + # backend, and looks something like, + # ------------------------- + # | | i + # _________________________ + # <-------------> <-------> + # history real-time + # + # Once fully "prepended", the history section will leave the + # ``ShmArray._start.value: int = 0`` and the yet-to-be written + # real-time section will start at ``ShmArray.index: int``. + + # this sets the index to 3/4 of the length of the buffer + # leaving a "days worth of second samples" for the real-time + # section. 
+ last.value = first.value = _rt_buffer_start + + shmarr = ShmArray( + array, + first, + last, + shm, + ) + + assert shmarr._token == token + _known_tokens[key] = shmarr.token + + # "unlink" created shm on process teardown by + # pushing teardown calls onto actor context stack + + stack = tractor.current_actor().lifetime_stack + stack.callback(shmarr.close) + stack.callback(shmarr.destroy) + + return shmarr + + +def attach_shm_array( + token: tuple[str, str, tuple[str, str]], + readonly: bool = True, + +) -> ShmArray: + ''' + Attach to an existing shared memory array previously + created by another process using ``open_shared_array``. + + No new shared mem is allocated but wrapper types for read/write + access are constructed. + + ''' + token = _Token.from_msg(token) + key = token.shm_name + + if key in _known_tokens: + assert _Token.from_msg(_known_tokens[key]) == token, "WTF" + + # XXX: ugh, looks like due to the ``shm_open()`` C api we can't + # actually place files in a subdir, see discussion here: + # https://stackoverflow.com/a/11103289 + + # attach to array buffer and view as per dtype + _err: Optional[Exception] = None + for _ in range(3): + try: + shm = SharedMemory( + name=key, + create=False, + ) + break + except OSError as oserr: + _err = oserr + time.sleep(0.1) + else: + if _err: + raise _err + + shmarr = np.ndarray( + (token.size,), + dtype=token.dtype, + buffer=shm.buf + ) + shmarr.setflags(write=int(not readonly)) + + first = SharedInt( + shm=SharedMemory( + name=token.shm_first_index_name, + create=False, + size=4, # std int + ), + ) + last = SharedInt( + shm=SharedMemory( + name=token.shm_last_index_name, + create=False, + size=4, # std int + ), + ) + + # make sure we can read + first.value + + sha = ShmArray( + shmarr, + first, + last, + shm, + ) + # read test + sha.array + + # Stash key -> token knowledge for future queries + # via `maybe_opepn_shm_array()` but only after we know + # we can attach. + if key not in _known_tokens: + _known_tokens[key] = token + + # "close" attached shm on actor teardown + tractor.current_actor().lifetime_stack.callback(sha.close) + + return sha + + +def maybe_open_shm_array( + key: str, + dtype: Optional[np.dtype] = None, + **kwargs, + +) -> tuple[ShmArray, bool]: + ''' + Attempt to attach to a shared memory block using a "key" lookup + to registered blocks in the users overall "system" registry + (presumes you don't have the block's explicit token). + + This function is meant to solve the problem of discovering whether + a shared array token has been allocated or discovered by the actor + running in **this** process. Systems where multiple actors may seek + to access a common block can use this function to attempt to acquire + a token as discovered by the actors who have previously stored + a "key" -> ``_Token`` map in an actor local (aka python global) + variable. + + If you know the explicit ``_Token`` for your memory segment instead + use ``attach_shm_array``. + + ''' + size = kwargs.pop('size', _default_size) + try: + # see if we already know this key + token = _known_tokens[key] + return attach_shm_array(token=token, **kwargs), False + except KeyError: + log.warning(f"Could not find {key} in shms cache") + if dtype: + token = _make_token( + key, + size=size, + dtype=dtype, + ) + try: + return attach_shm_array(token=token, **kwargs), False + except FileNotFoundError: + log.warning(f"Could not attach to shm with token {token}") + + # This actor does not know about memory + # associated with the provided "key". 
+ # Attempt to open a block and expect + # to fail if a block has been allocated + # on the OS by someone else. + return open_shm_array(key=key, dtype=dtype, **kwargs), True + + +def try_read( + array: np.ndarray + +) -> Optional[np.ndarray]: + ''' + Try to read the last row from a shared mem array or ``None`` + if the array read returns a zero-length array result. + + Can be used to check for backfilling race conditions where an array + is currently being (re-)written by a writer actor but the reader is + unaware and reads during the window where the first and last indexes + are being updated. + + ''' + try: + return array[-1] + except IndexError: + # XXX: race condition with backfilling shm. + # + # the underlying issue is that a backfill (aka prepend) and subsequent + # shm array first/last index update could result in an empty array + # read here since the indices may be updated in such a way that + # a read delivers an empty array (though it seems like we + # *should* be able to prevent that?). also, as and alt and + # something we need anyway, maybe there should be some kind of + # signal that a prepend is taking place and this consumer can + # respond (eg. redrawing graphics) accordingly. + + # the array read was emtpy + return None -- 2.34.1 From 71477290fc8ee4e740200ab7acdac31c0fd71b6c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 16 Oct 2022 18:06:07 -0400 Subject: [PATCH 002/378] Add `ShmList` wrapping the stdlib's `ShareableList` First attempt at getting `multiprocessing.shared_memory.ShareableList` working; we wrap the stdlib type with a readonly attr and a `.key` for cross-actor lookup. Also, rename all `numpy` specific routines to have a `ndarray` suffix in the func names. --- tractor/_shm.py | 206 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 141 insertions(+), 65 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index dca9d5a5..63d18411 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -28,6 +28,7 @@ import time from typing import Optional from multiprocessing.shared_memory import ( SharedMemory, + ShareableList, _USE_POSIX, ) @@ -87,10 +88,11 @@ disable_mantracker() class SharedInt: - """Wrapper around a single entry shared memory array which + ''' + Wrapper around a single entry shared memory array which holds an ``int`` value used as an index counter. - """ + ''' def __init__( self, shm: SharedMemory, @@ -117,10 +119,13 @@ class SharedInt: log.warning(f'Shm for {name} already unlinked?') -class _Token(Struct, frozen=True): +class _NpToken(Struct, frozen=True): ''' - Internal represenation of a shared memory "token" - which can be used to key a system wide post shm entry. + Internal represenation of a shared memory ``numpy`` array "token" + which can be used to key and load a system (OS) wide shm entry + and correctly read the array by type signature. + + This type is msg safe. ''' shm_name: str # this servers as a "key" value @@ -137,18 +142,18 @@ class _Token(Struct, frozen=True): return self.to_dict() @classmethod - def from_msg(cls, msg: dict) -> _Token: - if isinstance(msg, _Token): + def from_msg(cls, msg: dict) -> _NpToken: + if isinstance(msg, _NpToken): return msg # TODO: native struct decoding # return _token_dec.decode(msg) msg['dtype_descr'] = tuple(map(tuple, msg['dtype_descr'])) - return _Token(**msg) + return _NpToken(**msg) -# _token_dec = msgspec.msgpack.Decoder(_Token) +# _token_dec = msgspec.msgpack.Decoder(_NpToken) # TODO: this api? 
# _known_tokens = tractor.ActorVar('_shm_tokens', {}) @@ -159,10 +164,14 @@ class _Token(Struct, frozen=True): _known_tokens = {} -def get_shm_token(key: str) -> _Token: - """Convenience func to check if a token +def get_shm_token(key: str) -> _NpToken | str: + ''' + Convenience func to check if a token for the provided key is known by this process. - """ + + Returns either the ``numpy`` token or a string for a shared list. + + ''' return _known_tokens.get(key) @@ -171,13 +180,13 @@ def _make_token( size: int, dtype: np.dtype, -) -> _Token: +) -> _NpToken: ''' Create a serializable token that can be used to access a shared array. ''' - return _Token( + return _NpToken( shm_name=key, shm_first_index_name=key + "_first", shm_last_index_name=key + "_last", @@ -188,7 +197,7 @@ def _make_token( class ShmArray: ''' - A shared memory ``numpy`` (compatible) array API. + A shared memory ``numpy.ndarray`` API. An underlying shared memory buffer is allocated based on a user specified ``numpy.ndarray``. This fixed size array @@ -228,8 +237,8 @@ class ShmArray: # TODO: ringbuf api? @property - def _token(self) -> _Token: - return _Token( + def _token(self) -> _NpToken: + return _NpToken( shm_name=self._shm.name, shm_first_index_name=self._first._shm.name, shm_last_index_name=self._last._shm.name, @@ -446,15 +455,17 @@ class ShmArray: ... -def open_shm_array( +def open_shm_ndarray( key: Optional[str] = None, - size: int = _default_size, # see above - dtype: Optional[np.dtype] = None, + size: int = int(2 ** 10), + dtype: np.dtype | None = None, + append_start_index: int = 0, readonly: bool = False, ) -> ShmArray: - '''Open a memory shared ``numpy`` using the standard library. + ''' + Open a memory shared ``numpy`` using the standard library. This call unlinks (aka permanently destroys) the buffer on teardown and thus should be used from the parent-most accessor (process). @@ -501,10 +512,10 @@ def open_shm_array( ) ) - # start the "real-time" updated section after 3-days worth of 1s - # sampled OHLC. this allows appending up to a days worth from - # tick/quote feeds before having to flush to a (tsdb) storage - # backend, and looks something like, + # Start the "real-time" append-updated (or "pushed-to") section + # after some start index: ``append_start_index``. This allows appending + # from a start point in the array which isn't the 0 index and looks + # something like, # ------------------------- # | | i # _________________________ @@ -518,7 +529,7 @@ def open_shm_array( # this sets the index to 3/4 of the length of the buffer # leaving a "days worth of second samples" for the real-time # section. - last.value = first.value = _rt_buffer_start + last.value = first.value = append_start_index shmarr = ShmArray( array, @@ -540,7 +551,7 @@ def open_shm_array( return shmarr -def attach_shm_array( +def attach_shm_ndarray( token: tuple[str, str, tuple[str, str]], readonly: bool = True, @@ -553,11 +564,11 @@ def attach_shm_array( access are constructed. 
''' - token = _Token.from_msg(token) + token = _NpToken.from_msg(token) key = token.shm_name if key in _known_tokens: - assert _Token.from_msg(_known_tokens[key]) == token, "WTF" + assert _NpToken.from_msg(_known_tokens[key]) == token, "WTF" # XXX: ugh, looks like due to the ``shm_open()`` C api we can't # actually place files in a subdir, see discussion here: @@ -625,10 +636,14 @@ def attach_shm_array( return sha -def maybe_open_shm_array( - key: str, - dtype: Optional[np.dtype] = None, - **kwargs, +def maybe_open_shm_ndarray( + key: str, # unique identifier for segment + + # from ``open_shm_array()`` + size: int = int(2 ** 10), # array length in index terms + dtype: np.dtype | None = None, + append_start_index: int = 0, + readonly: bool = True, ) -> tuple[ShmArray, bool]: ''' @@ -641,18 +656,23 @@ def maybe_open_shm_array( running in **this** process. Systems where multiple actors may seek to access a common block can use this function to attempt to acquire a token as discovered by the actors who have previously stored - a "key" -> ``_Token`` map in an actor local (aka python global) + a "key" -> ``_NpToken`` map in an actor local (aka python global) variable. - If you know the explicit ``_Token`` for your memory segment instead + If you know the explicit ``_NpToken`` for your memory segment instead use ``attach_shm_array``. ''' - size = kwargs.pop('size', _default_size) try: # see if we already know this key token = _known_tokens[key] - return attach_shm_array(token=token, **kwargs), False + return ( + attach_shm_ndarray( + token=token, + readonly=readonly, + ), + False, # not newly opened + ) except KeyError: log.warning(f"Could not find {key} in shms cache") if dtype: @@ -661,8 +681,16 @@ def maybe_open_shm_array( size=size, dtype=dtype, ) + else: + try: - return attach_shm_array(token=token, **kwargs), False + return ( + attach_shm_ndarray( + token=token, + readonly=readonly, + ), + False, + ) except FileNotFoundError: log.warning(f"Could not attach to shm with token {token}") @@ -671,36 +699,84 @@ def maybe_open_shm_array( # Attempt to open a block and expect # to fail if a block has been allocated # on the OS by someone else. - return open_shm_array(key=key, dtype=dtype, **kwargs), True + return ( + open_shm_ndarray( + key=key, + size=size, + dtype=dtype, + append_start_index=append_start_index, + readonly=readonly, + ), + True, + ) -def try_read( - array: np.ndarray - -) -> Optional[np.ndarray]: +class ShmList(ShareableList): ''' - Try to read the last row from a shared mem array or ``None`` - if the array read returns a zero-length array result. - - Can be used to check for backfilling race conditions where an array - is currently being (re-)written by a writer actor but the reader is - unaware and reads during the window where the first and last indexes - are being updated. + Carbon copy of ``.shared_memory.ShareableList`` but add a + readonly state instance var. ''' - try: - return array[-1] - except IndexError: - # XXX: race condition with backfilling shm. - # - # the underlying issue is that a backfill (aka prepend) and subsequent - # shm array first/last index update could result in an empty array - # read here since the indices may be updated in such a way that - # a read delivers an empty array (though it seems like we - # *should* be able to prevent that?). also, as and alt and - # something we need anyway, maybe there should be some kind of - # signal that a prepend is taking place and this consumer can - # respond (eg. redrawing graphics) accordingly. 
+ def __init__( + self, + sequence: list | None = None, + *, + name: str | None = None, + readonly: bool = True - # the array read was emtpy - return None + ) -> None: + self._readonly = readonly + self._key = name + return super().__init__( + sequence=sequence, + name=name, + ) + + @property + def key(self) -> str: + return self._key + + def __setitem__( + self, + position, + value, + + ) -> None: + + # mimick ``numpy`` error + if self._readonly: + raise ValueError('assignment destination is read-only') + + return super().__setitem__(position, value) + + +def open_shm_list( + key: str, + sequence: list | None = None, + size: int = int(2 ** 10), + dtype: np.dtype | None = None, + readonly: bool = True, + +) -> ShmList: + + if sequence is None: + sequence = list(map(float, range(size))) + + shml = ShmList( + sequence=sequence, + name=key, + readonly=readonly, + ) + + # "close" attached shm on actor teardown + tractor.current_actor().lifetime_stack.callback(shml.shm.close) + tractor.current_actor().lifetime_stack.callback(shml.shm.unlink) + + return shml + + +def attach_shm_list( + key: str, +) -> ShmList: + + return ShmList(name=key) -- 2.34.1 From c32b21b4b1f573a56c8acd00db1a3ea6d248b44b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 16 Oct 2022 18:16:58 -0400 Subject: [PATCH 003/378] Add initial readers-writer shm list tests --- tests/test_shm.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/test_shm.py diff --git a/tests/test_shm.py b/tests/test_shm.py new file mode 100644 index 00000000..83ce7e21 --- /dev/null +++ b/tests/test_shm.py @@ -0,0 +1,84 @@ +""" +Shared mem primitives and APIs. + +""" + +# import numpy +import pytest +import trio +import tractor +from tractor._shm import ( + open_shm_list, + attach_shm_list, +) + + +@tractor.context +async def child_read_shm_list( + ctx: tractor.Context, + shm_key: str, + use_str: bool, +) -> None: + + shml = attach_shm_list(key=shm_key) + await ctx.started(shml.key) + + async with ctx.open_stream() as stream: + async for i in stream: + print(f'reading shm list index: {i}') + + if use_str: + expect = str(float(i)) + else: + expect = float(i) + + assert expect == shml[i] + + +@pytest.mark.parametrize( + 'use_str', [False, True], +) +def test_parent_writer_child_reader( + use_str: bool, +): + + async def main(): + async with tractor.open_nursery() as an: + + # allocate writeable list in parent + key = 'shm_list' + shml = open_shm_list( + key=key, + readonly=False, + ) + + portal = await an.start_actor( + 'shm_reader', + enable_modules=[__name__], + ) + + async with ( + portal.open_context( + child_read_shm_list, # taken from pytest parameterization + shm_key=key, + use_str=use_str, + ) as (ctx, sent), + + ctx.open_stream() as stream, + ): + + assert sent == key + + for i in range(2 ** 10): + + val = float(i) + if use_str: + val = str(val) + + print(f'writing {val}') + shml[i] = val + await stream.send(i) + + await portal.cancel_actor() + + trio.run(main) -- 2.34.1 From 339d787cf8d7164df39b1ae7051bd2a76b7c179d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Oct 2022 15:13:05 -0400 Subject: [PATCH 004/378] Add repetitive attach to existing segment test --- tests/test_shm.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/test_shm.py b/tests/test_shm.py index 83ce7e21..850ccb3e 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -2,6 +2,7 @@ Shared mem primitives and APIs. 
""" +import uuid # import numpy import pytest @@ -13,6 +14,50 @@ from tractor._shm import ( ) +@tractor.context +async def child_attach_shml_alot( + ctx: tractor.Context, + shm_key: str, +) -> None: + + await ctx.started(shm_key) + + # now try to attach a boatload of times in a loop.. + for _ in range(1000): + shml = attach_shm_list(key=shm_key) + assert shml.shm.name == shm_key + await trio.sleep(0.001) + + +def test_child_attaches_alot(): + async def main(): + async with tractor.open_nursery() as an: + + # allocate writeable list in parent + key = f'shml_{uuid.uuid4()}' + shml = open_shm_list( + key=key, + ) + + portal = await an.start_actor( + 'shm_attacher', + enable_modules=[__name__], + ) + + async with ( + portal.open_context( + child_attach_shml_alot, # taken from pytest parameterization + shm_key=key, + ) as (ctx, start_val), + ): + assert start_val == key + await ctx.result() + + await portal.cancel_actor() + + trio.run(main) + + @tractor.context async def child_read_shm_list( ctx: tractor.Context, -- 2.34.1 From edb82fdd784f345a30ae5dc1c145cfb879796ccb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Oct 2022 15:13:58 -0400 Subject: [PATCH 005/378] Don't require runtime (for now), type annot fixing --- tractor/_shm.py | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index 63d18411..80ca49d1 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -29,7 +29,7 @@ from typing import Optional from multiprocessing.shared_memory import ( SharedMemory, ShareableList, - _USE_POSIX, + _USE_POSIX, # type: ignore ) if _USE_POSIX: @@ -46,15 +46,6 @@ from .log import get_logger log = get_logger(__name__) -# how much is probably dependent on lifestyle -_secs_in_day = int(60 * 60 * 24) -# we try for a buncha times, but only on a run-every-other-day kinda week. -_days_worth = 16 -_default_size = _days_worth * _secs_in_day -# where to start the new data append index -_rt_buffer_start = int((_days_worth - 1) * _secs_in_day) - - def disable_mantracker(): ''' Disable all ``multiprocessing``` "resource tracking" machinery since @@ -79,7 +70,6 @@ def disable_mantracker(): mantracker._resource_tracker = ManTracker() mantracker.register = mantracker._resource_tracker.register mantracker.ensure_running = mantracker._resource_tracker.ensure_running - # ensure_running = mantracker._resource_tracker.ensure_running mantracker.unregister = mantracker._resource_tracker.unregister mantracker.getfd = mantracker._resource_tracker.getfd @@ -134,9 +124,14 @@ class _NpToken(Struct, frozen=True): dtype_descr: tuple size: int # in struct-array index / row terms + # TODO: use nptyping here on dtypes @property - def dtype(self) -> np.dtype: - return np.dtype(list(map(tuple, self.dtype_descr))).descr + def dtype(self) -> list[tuple[str, str, tuple[int, ...]]]: + return np.dtype( + list( + map(tuple, self.dtype_descr) + ) + ).descr def as_msg(self): return self.to_dict() @@ -161,10 +156,10 @@ class _NpToken(Struct, frozen=True): # _known_tokens = trio.RunVar('shms', {}) # process-local store of keys to tokens -_known_tokens = {} +_known_tokens: dict[str, _NpToken] = {} -def get_shm_token(key: str) -> _NpToken | str: +def get_shm_token(key: str) -> _NpToken | None: ''' Convenience func to check if a token for the provided key is known by this process. 
@@ -228,11 +223,10 @@ class ShmArray: self._post_init: bool = False # pushing data does not write the index (aka primary key) + self._write_fields: list[str] | None = None dtype = shmarr.dtype if dtype.fields: self._write_fields = list(shmarr.dtype.fields.keys())[1:] - else: - self._write_fields = None # TODO: ringbuf api? @@ -283,9 +277,9 @@ class ShmArray: self, fields: Optional[list[str]] = None, - # type that all field values will be cast to - # in the returned view. - common_dtype: np.dtype = np.float, + # type that all field values will be cast to in the returned + # view. + common_dtype: np.dtype = np.float64, # type: ignore ) -> np.ndarray: @@ -543,7 +537,6 @@ def open_shm_ndarray( # "unlink" created shm on process teardown by # pushing teardown calls onto actor context stack - stack = tractor.current_actor().lifetime_stack stack.callback(shmarr.close) stack.callback(shmarr.destroy) @@ -769,14 +762,19 @@ def open_shm_list( ) # "close" attached shm on actor teardown - tractor.current_actor().lifetime_stack.callback(shml.shm.close) - tractor.current_actor().lifetime_stack.callback(shml.shm.unlink) + try: + actor = tractor.current_actor() + actor.lifetime_stack.callback(shml.shm.close) + actor.lifetime_stack.callback(shml.shm.unlink) + except RuntimeError: + log.warning('tractor runtime not active, skipping teardown steps') return shml def attach_shm_list( key: str, + ) -> ShmList: return ShmList(name=key) -- 2.34.1 From 1713ecd9f8414862918550724659475c6aa9f23b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Oct 2022 17:21:14 -0400 Subject: [PATCH 006/378] Rename token type to `NDToken` in the style of `nptyping` --- tractor/_shm.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index 80ca49d1..3f415c52 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -109,7 +109,7 @@ class SharedInt: log.warning(f'Shm for {name} already unlinked?') -class _NpToken(Struct, frozen=True): +class NDToken(Struct, frozen=True): ''' Internal represenation of a shared memory ``numpy`` array "token" which can be used to key and load a system (OS) wide shm entry @@ -137,18 +137,18 @@ class _NpToken(Struct, frozen=True): return self.to_dict() @classmethod - def from_msg(cls, msg: dict) -> _NpToken: - if isinstance(msg, _NpToken): + def from_msg(cls, msg: dict) -> NDToken: + if isinstance(msg, NDToken): return msg # TODO: native struct decoding # return _token_dec.decode(msg) msg['dtype_descr'] = tuple(map(tuple, msg['dtype_descr'])) - return _NpToken(**msg) + return NDToken(**msg) -# _token_dec = msgspec.msgpack.Decoder(_NpToken) +# _token_dec = msgspec.msgpack.Decoder(NDToken) # TODO: this api? # _known_tokens = tractor.ActorVar('_shm_tokens', {}) @@ -156,10 +156,10 @@ class _NpToken(Struct, frozen=True): # _known_tokens = trio.RunVar('shms', {}) # process-local store of keys to tokens -_known_tokens: dict[str, _NpToken] = {} +_known_tokens: dict[str, NDToken] = {} -def get_shm_token(key: str) -> _NpToken | None: +def get_shm_token(key: str) -> NDToken | None: ''' Convenience func to check if a token for the provided key is known by this process. @@ -175,13 +175,13 @@ def _make_token( size: int, dtype: np.dtype, -) -> _NpToken: +) -> NDToken: ''' Create a serializable token that can be used to access a shared array. ''' - return _NpToken( + return NDToken( shm_name=key, shm_first_index_name=key + "_first", shm_last_index_name=key + "_last", @@ -231,8 +231,8 @@ class ShmArray: # TODO: ringbuf api? 
@property - def _token(self) -> _NpToken: - return _NpToken( + def _token(self) -> NDToken: + return NDToken( shm_name=self._shm.name, shm_first_index_name=self._first._shm.name, shm_last_index_name=self._last._shm.name, @@ -557,11 +557,11 @@ def attach_shm_ndarray( access are constructed. ''' - token = _NpToken.from_msg(token) + token = NDToken.from_msg(token) key = token.shm_name if key in _known_tokens: - assert _NpToken.from_msg(_known_tokens[key]) == token, "WTF" + assert NDToken.from_msg(_known_tokens[key]) == token, "WTF" # XXX: ugh, looks like due to the ``shm_open()`` C api we can't # actually place files in a subdir, see discussion here: @@ -649,10 +649,10 @@ def maybe_open_shm_ndarray( running in **this** process. Systems where multiple actors may seek to access a common block can use this function to attempt to acquire a token as discovered by the actors who have previously stored - a "key" -> ``_NpToken`` map in an actor local (aka python global) + a "key" -> ``NDToken`` map in an actor local (aka python global) variable. - If you know the explicit ``_NpToken`` for your memory segment instead + If you know the explicit ``NDToken`` for your memory segment instead use ``attach_shm_array``. ''' -- 2.34.1 From b52ff270c57eb579576ff2b71013daf5cc8e7522 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Oct 2022 11:01:02 -0400 Subject: [PATCH 007/378] Add `ShmList` slice support in `.__getitem__()` --- tractor/_shm.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index 3f415c52..c26c9911 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -26,20 +26,26 @@ from __future__ import annotations from sys import byteorder import time from typing import Optional +from multiprocessing import shared_memory as shm from multiprocessing.shared_memory import ( SharedMemory, ShareableList, - _USE_POSIX, # type: ignore + # _USE_POSIX, # type: ignore ) -if _USE_POSIX: +if getattr(shm, '_USE_POSIX', False): from _posixshmem import shm_unlink from msgspec import Struct -import numpy as np -from numpy.lib import recfunctions as rfn import tractor +try: + import numpy as np + from numpy.lib import recfunctions as rfn + import nptyping +except ImportError: + pass + from .log import get_logger @@ -742,6 +748,15 @@ class ShmList(ShareableList): return super().__setitem__(position, value) + def __getitem__( + self, + indexish, + ) -> list: + if isinstance(indexish, slice): + return list(self)[indexish] + + return super().__getitem__(indexish) + def open_shm_list( key: str, @@ -774,7 +789,11 @@ def open_shm_list( def attach_shm_list( key: str, + readonly: bool = False, ) -> ShmList: - return ShmList(name=key) + return ShmList( + name=key, + readonly=readonly, + ) -- 2.34.1 From a9fc4c1b91decacea89595483e3be0deb3cc3c4d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Oct 2022 11:01:30 -0400 Subject: [PATCH 008/378] Parametrize rw test with variable frame sizes Demonstrates fixed size frame-oriented reads by the child where the parent only transmits a "read" stream msg on "frame fill events" such that the child incrementally reads the shm list data (much like in a real-time-buffered streaming system). 
--- tests/test_shm.py | 64 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/tests/test_shm.py b/tests/test_shm.py index 850ccb3e..c183040c 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -24,7 +24,10 @@ async def child_attach_shml_alot( # now try to attach a boatload of times in a loop.. for _ in range(1000): - shml = attach_shm_list(key=shm_key) + shml = attach_shm_list( + key=shm_key, + readonly=False, + ) assert shml.shm.name == shm_key await trio.sleep(0.001) @@ -46,8 +49,8 @@ def test_child_attaches_alot(): async with ( portal.open_context( - child_attach_shml_alot, # taken from pytest parameterization - shm_key=key, + child_attach_shml_alot, + shm_key=shml.key, ) as (ctx, start_val), ): assert start_val == key @@ -63,50 +66,70 @@ async def child_read_shm_list( ctx: tractor.Context, shm_key: str, use_str: bool, + frame_size: int, ) -> None: + # attach in child shml = attach_shm_list(key=shm_key) await ctx.started(shml.key) async with ctx.open_stream() as stream: async for i in stream: - print(f'reading shm list index: {i}') + print(f'(child): reading shm list index: {i}') if use_str: expect = str(float(i)) else: expect = float(i) - assert expect == shml[i] + if frame_size == 1: + val = shml[i] + assert expect == val + print(f'(child): reading value: {val}') + else: + frame = shml[i - frame_size:i] + print(f'(child): reading frame: {frame}') @pytest.mark.parametrize( 'use_str', [False, True], ) +@pytest.mark.parametrize( + 'frame_size', + [1, 2**6, 2**10], + ids=lambda i: f'frame_size={i}', +) def test_parent_writer_child_reader( use_str: bool, + frame_size: int, ): async def main(): - async with tractor.open_nursery() as an: - - # allocate writeable list in parent - key = 'shm_list' - shml = open_shm_list( - key=key, - readonly=False, - ) + async with tractor.open_nursery( + debug_mode=True, + ) as an: portal = await an.start_actor( 'shm_reader', enable_modules=[__name__], + debug_mode=True, + ) + + # allocate writeable list in parent + key = 'shm_list' + seq_size = int(2 * 2 ** 10) + shml = open_shm_list( + key=key, + size=seq_size, + readonly=False, ) async with ( portal.open_context( - child_read_shm_list, # taken from pytest parameterization + child_read_shm_list, shm_key=key, use_str=use_str, + frame_size=frame_size, ) as (ctx, sent), ctx.open_stream() as stream, @@ -114,14 +137,23 @@ def test_parent_writer_child_reader( assert sent == key - for i in range(2 ** 10): + for i in range(seq_size): val = float(i) if use_str: val = str(val) - print(f'writing {val}') + print(f'(parent): writing {val}') shml[i] = val + + # only on frame fills do we + # signal to the child that a frame's + # worth is ready. 
+ if (i % frame_size) == 0: + print(f'(parent): signalling frame full on {val}') + await stream.send(i) + else: + print(f'(parent): signalling final frame on {val}') await stream.send(i) await portal.cancel_actor() -- 2.34.1 From e0bf964ff0f8e44d180bb9ab345b41258ee45af9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Oct 2022 16:28:57 -0400 Subject: [PATCH 009/378] Mod define `_USE_POSIX`, add a of of todos --- tractor/_shm.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index c26c9911..79ac8969 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -30,15 +30,19 @@ from multiprocessing import shared_memory as shm from multiprocessing.shared_memory import ( SharedMemory, ShareableList, - # _USE_POSIX, # type: ignore ) -if getattr(shm, '_USE_POSIX', False): - from _posixshmem import shm_unlink - from msgspec import Struct import tractor +from .log import get_logger + + +_USE_POSIX = getattr(shm, '_USE_POSIX', False) +if _USE_POSIX: + from _posixshmem import shm_unlink + + try: import numpy as np from numpy.lib import recfunctions as rfn @@ -46,8 +50,6 @@ try: except ImportError: pass -from .log import get_logger - log = get_logger(__name__) @@ -161,6 +163,8 @@ class NDToken(Struct, frozen=True): # _known_tokens = tractor.ContextStack('_known_tokens', ) # _known_tokens = trio.RunVar('shms', {}) +# TODO: this should maybe be provided via +# a `.trionics.maybe_open_context()` wrapper factory? # process-local store of keys to tokens _known_tokens: dict[str, NDToken] = {} @@ -712,8 +716,12 @@ def maybe_open_shm_ndarray( class ShmList(ShareableList): ''' - Carbon copy of ``.shared_memory.ShareableList`` but add a - readonly state instance var. + Carbon copy of ``.shared_memory.ShareableList`` with a few + enhancements: + + - readonly mode via instance var flag + - ``.__getitem__()`` accepts ``slice`` inputs + - exposes the underlying buffer "name" as a ``.key: str`` ''' def __init__( @@ -752,11 +760,22 @@ class ShmList(ShareableList): self, indexish, ) -> list: + + # NOTE: this is a non-writeable view (copy?) of the buffer + # in a new list instance. if isinstance(indexish, slice): return list(self)[indexish] return super().__getitem__(indexish) + # TODO: should we offer a `.array` and `.push()` equivalent + # to the `ShmArray`? + # currently we have the following limitations: + # - can't write slices of input using traditional slice-assign + # syntax due to the ``ShareableList.__setitem__()`` implementation. + # - ``list(shmlist)`` returns a non-mutable copy instead of + # a writeable view which would be handier numpy-style ops. 
+ def open_shm_list( key: str, -- 2.34.1 From f9a84f073266ab5e0137e76d06515cd4a8dacefd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 19 Oct 2022 14:20:50 -0400 Subject: [PATCH 010/378] Allocate size-specced "empty" sequence from default values by type --- tractor/_shm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index 79ac8969..2ce148da 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -719,7 +719,7 @@ class ShmList(ShareableList): Carbon copy of ``.shared_memory.ShareableList`` with a few enhancements: - - readonly mode via instance var flag + - readonly mode via instance var flag `._readonly: bool` - ``.__getitem__()`` accepts ``slice`` inputs - exposes the underlying buffer "name" as a ``.key: str`` @@ -743,6 +743,10 @@ class ShmList(ShareableList): def key(self) -> str: return self._key + @property + def readonly(self) -> bool: + return self._readonly + def __setitem__( self, position, @@ -781,13 +785,21 @@ def open_shm_list( key: str, sequence: list | None = None, size: int = int(2 ** 10), - dtype: np.dtype | None = None, + dtype: float | int | bool | str | bytes | None = float, readonly: bool = True, ) -> ShmList: if sequence is None: - sequence = list(map(float, range(size))) + default = { + float: 0., + int: 0, + bool: True, + str: 'doggy', + None: None, + }[dtype] + sequence = [default] * size + # sequence = [0.] * size shml = ShmList( sequence=sequence, -- 2.34.1 From 4f442efbd79ac8d106f290056c3cf2a578f09d94 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 20 Oct 2022 16:08:28 -0400 Subject: [PATCH 011/378] Pass `str` dtype for `use_str` case --- tests/test_shm.py | 14 ++++++++++---- tractor/_shm.py | 2 -- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_shm.py b/tests/test_shm.py index c183040c..2b7a382f 100644 --- a/tests/test_shm.py +++ b/tests/test_shm.py @@ -70,7 +70,10 @@ async def child_read_shm_list( ) -> None: # attach in child - shml = attach_shm_list(key=shm_key) + shml = attach_shm_list( + key=shm_key, + # dtype=str if use_str else float, + ) await ctx.started(shml.key) async with ctx.open_stream() as stream: @@ -92,7 +95,9 @@ async def child_read_shm_list( @pytest.mark.parametrize( - 'use_str', [False, True], + 'use_str', + [False, True], + ids=lambda i: f'use_str_values={i}', ) @pytest.mark.parametrize( 'frame_size', @@ -106,7 +111,7 @@ def test_parent_writer_child_reader( async def main(): async with tractor.open_nursery( - debug_mode=True, + # debug_mode=True, ) as an: portal = await an.start_actor( @@ -121,6 +126,7 @@ def test_parent_writer_child_reader( shml = open_shm_list( key=key, size=seq_size, + dtype=str if use_str else float, readonly=False, ) @@ -143,7 +149,7 @@ def test_parent_writer_child_reader( if use_str: val = str(val) - print(f'(parent): writing {val}') + # print(f'(parent): writing {val}') shml[i] = val # only on frame fills do we diff --git a/tractor/_shm.py b/tractor/_shm.py index 2ce148da..c4c17335 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -460,7 +460,6 @@ class ShmArray: def open_shm_ndarray( - key: Optional[str] = None, size: int = int(2 ** 10), dtype: np.dtype | None = None, @@ -799,7 +798,6 @@ def open_shm_list( None: None, }[dtype] sequence = [default] * size - # sequence = [0.] 
* size shml = ShmList( sequence=sequence, -- 2.34.1 From f745da9fb2783c7b55ef7c7ae25d79d18bf530c8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 26 Oct 2022 12:00:14 -0400 Subject: [PATCH 012/378] Add `numpy` for testing optional integrated shm API layer --- requirements-test.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-test.txt b/requirements-test.txt index 8070f2c7..b589bd12 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -6,3 +6,4 @@ mypy trio_typing pexpect towncrier +numpy -- 2.34.1 From ebcb275cd8b768226ecf014a4628d042ebc6b7f9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 7 Mar 2023 17:37:06 -0500 Subject: [PATCH 013/378] Add (first-draft) infected-`asyncio` actor task uses debugger example --- examples/debugging/asyncio_bp.py | 79 ++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 examples/debugging/asyncio_bp.py diff --git a/examples/debugging/asyncio_bp.py b/examples/debugging/asyncio_bp.py new file mode 100644 index 00000000..882ebbed --- /dev/null +++ b/examples/debugging/asyncio_bp.py @@ -0,0 +1,79 @@ +import asyncio + +import trio +import tractor + + +async def bp_then_error( + to_trio: trio.MemorySendChannel, + from_trio: asyncio.Queue, + +) -> None: + + # sync with ``trio``-side (caller) task + to_trio.send_nowait('start') + + # NOTE: what happens here inside the hook needs some refinement.. + # => seems like it's still `._debug._set_trace()` but + # we set `Lock.local_task_in_debug = 'sync'`, we probably want + # some further, at least, meta-data about the task/actoq in debug + # in terms of making it clear it's asyncio mucking about. + + breakpoint() + + await asyncio.sleep(0.5) + raise ValueError('blah') + + +async def aio_sleep_forever(): + await asyncio.sleep(float('inf')) + + +@tractor.context +async def trio_ctx( + ctx: tractor.Context, +): + + # this will block until the ``asyncio`` task sends a "first" + # message, see first line in above func. + async with ( + tractor.to_asyncio.open_channel_from(bp_then_error) as (first, chan), + trio.open_nursery() as n, + ): + + assert first == 'start' + await ctx.started(first) + + n.start_soon( + tractor.to_asyncio.run_task, + aio_sleep_forever, + ) + await trio.sleep_forever() + + +async def main(): + + async with tractor.open_nursery() as n: + + p = await n.start_actor( + 'aio_daemon', + enable_modules=[__name__], + infect_asyncio=True, + debug_mode=True, + loglevel='cancel', + ) + + async with p.open_context(trio_ctx) as (ctx, first): + + assert first == 'start' + await trio.sleep_forever() + + assert 0 + + # TODO: case where we cancel from trio-side while asyncio task + # has debugger lock? + # await p.cancel_actor() + + +if __name__ == '__main__': + trio.run(main) -- 2.34.1 From ee87cf0e29b022b5b755961204ac5a7a7c227580 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 27 Mar 2023 19:05:00 -0400 Subject: [PATCH 014/378] Add a debug-mode-breakpoint-causes-hang case! Only found this by luck more or less (while working on something in a client project) and it turns out we can actually get to (yet another) hang state where SIGINT will be ignored by the root actor on teardown.. I've added all the necessary logic flags to reproduce. We obviously need a follow up bug issue and a test suite to replicate! 
It appears as though the following are required based on very light tinkering: - infected asyncio mode active - debug mode active - the `trio` context must breakpoint *before* `.started()`-ing - the `asyncio` must **not** error --- examples/debugging/asyncio_bp.py | 56 +++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/examples/debugging/asyncio_bp.py b/examples/debugging/asyncio_bp.py index 882ebbed..b32ad1d8 100644 --- a/examples/debugging/asyncio_bp.py +++ b/examples/debugging/asyncio_bp.py @@ -2,12 +2,19 @@ import asyncio import trio import tractor +from tractor import to_asyncio + + +async def aio_sleep_forever(): + await asyncio.sleep(float('inf')) async def bp_then_error( to_trio: trio.MemorySendChannel, from_trio: asyncio.Queue, + raise_after_bp: bool = True, + ) -> None: # sync with ``trio``-side (caller) task @@ -18,40 +25,57 @@ async def bp_then_error( # we set `Lock.local_task_in_debug = 'sync'`, we probably want # some further, at least, meta-data about the task/actoq in debug # in terms of making it clear it's asyncio mucking about. - breakpoint() + # short checkpoint / delay await asyncio.sleep(0.5) - raise ValueError('blah') + if raise_after_bp: + raise ValueError('blah') -async def aio_sleep_forever(): - await asyncio.sleep(float('inf')) + # TODO: test case with this so that it gets cancelled? + else: + # XXX NOTE: this is required in order to get the SIGINT-ignored + # hang case documented in the module script section! + await aio_sleep_forever() @tractor.context async def trio_ctx( ctx: tractor.Context, + bp_before_started: bool = False, ): # this will block until the ``asyncio`` task sends a "first" # message, see first line in above func. async with ( - tractor.to_asyncio.open_channel_from(bp_then_error) as (first, chan), + + to_asyncio.open_channel_from( + bp_then_error, + raise_after_bp=not bp_before_started, + ) as (first, chan), + trio.open_nursery() as n, ): assert first == 'start' + + if bp_before_started: + await tractor.breakpoint() + await ctx.started(first) n.start_soon( - tractor.to_asyncio.run_task, + to_asyncio.run_task, aio_sleep_forever, ) await trio.sleep_forever() -async def main(): +async def main( + bps_all_over: bool = False, + +) -> None: async with tractor.open_nursery() as n: @@ -63,11 +87,18 @@ async def main(): loglevel='cancel', ) - async with p.open_context(trio_ctx) as (ctx, first): + async with p.open_context( + trio_ctx, + bp_before_started=bps_all_over, + ) as (ctx, first): assert first == 'start' - await trio.sleep_forever() + if bps_all_over: + await tractor.breakpoint() + + # await trio.sleep_forever() + await ctx.cancel() assert 0 # TODO: case where we cancel from trio-side while asyncio task @@ -76,4 +107,11 @@ async def main(): if __name__ == '__main__': + + # works fine B) trio.run(main) + + # will hang and ignores SIGINT !! + # NOTE: you'll need to send a SIGQUIT (via ctl-\) to kill it + # manually.. + # trio.run(main, True) -- 2.34.1 From fc56971a2da3c2360795ab10c5a8311fe2b4b2c1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 21 Jun 2023 16:08:18 -0400 Subject: [PATCH 015/378] First proto: use `greenback` for sync func breakpointing This works now for supporting a new `tractor.pause_from_sync()` `tractor`-aware-replacement for `Pdb.set_trace()` from sync functions which are also scheduled from our runtime. 
Uses `greenback` to do all the magic of scheduling the bg `tractor._debug._pause()` task and engaging the normal TTY locking machinery triggered by `await tractor.breakpoint()` Further this starts some public API renaming, making a switch to `tractor.pause()` from `.breakpoint()` which IMO much better expresses the semantics of the runtime intervention required to suffice multi-process "breakpointing"; it also is an alternate name for the same in computer science more generally: https://en.wikipedia.org/wiki/Breakpoint It also avoids using the same name as the `breakpoint()` built-in which is important since there **is alot more going on** when you call our equivalent API. Deats of that: - add deprecation warning for `tractor.breakpoint()` - add `tractor.pause()` and a shorthand, easier-to-type, alias `.pp()` for "pause-point" B) - add `pause_from_sync()` as the new `breakpoint()`-from-sync-function hack which does all the `greenback` stuff for the user. Still TODO: - figure out where in the runtime and when to call `greenback.ensure_portal()`. - fix the frame selection issue where `trio._core._ki._ki_protection_decorator:wrapper` seems to be always shown on REPL start as the selected frame.. --- tractor/__init__.py | 8 ++- tractor/_debug.py | 116 ++++++++++++++++++++++++++++++++++++-------- tractor/_runtime.py | 8 ++- 3 files changed, 110 insertions(+), 22 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index aa262105..8781943a 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -48,6 +48,9 @@ from ._exceptions import ( ) from ._debug import ( breakpoint, + pause, + pp, + pause_from_sync, post_mortem, ) from . import msg @@ -61,12 +64,12 @@ from ._runtime import Actor __all__ = [ 'Actor', + 'BaseExceptionGroup', 'Channel', 'Context', 'ContextCancelled', 'ModuleNotExposed', 'MsgStream', - 'BaseExceptionGroup', 'Portal', 'RemoteActorError', 'breakpoint', @@ -79,7 +82,10 @@ __all__ = [ 'open_actor_cluster', 'open_nursery', 'open_root_actor', + 'pause', 'post_mortem', + 'pp', + 'pause_from_sync' 'query_actor', 'run_daemon', 'stream', diff --git a/tractor/_debug.py b/tractor/_debug.py index b0482f18..eec6fc50 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -374,7 +374,7 @@ async def wait_for_parent_stdin_hijack( This function is used by any sub-actor to acquire mutex access to the ``pdb`` REPL and thus the root's TTY for interactive debugging - (see below inside ``_breakpoint()``). It can be used to ensure that + (see below inside ``_pause()``). It can be used to ensure that an intermediate nursery-owning actor does not clobber its children if they are in debug (see below inside ``maybe_wait_for_debugger()``). @@ -440,17 +440,29 @@ def mk_mpdb() -> tuple[MultiActorPdb, Callable]: return pdb, Lock.unshield_sigint -async def _breakpoint( +async def _pause( - debug_func, + debug_func: Callable | None = None, + release_lock_signal: trio.Event | None = None, # TODO: # shield: bool = False + task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED ) -> None: ''' - Breakpoint entry for engaging debugger instance sync-interaction, - from async code, executing in actor runtime (task). + A pause point (more commonly known as a "breakpoint") interrupt + instruction for engaging a blocking debugger instance to + conduct manual console-based-REPL-interaction from within + `tractor`'s async runtime, normally from some single-threaded + and currently executing actor-hosted-`trio`-task in some + (remote) process. 
+ + NOTE: we use the semantics "pause" since it better encompasses + the entirety of the necessary global-runtime-state-mutation any + actor-task must access and lock in order to get full isolated + control over the process tree's root TTY: + https://en.wikipedia.org/wiki/Breakpoint ''' __tracebackhide__ = True @@ -559,10 +571,21 @@ async def _breakpoint( Lock.repl = pdb try: - # block here one (at the appropriate frame *up*) where - # ``breakpoint()`` was awaited and begin handling stdio. - log.debug("Entering the synchronous world of pdb") - debug_func(actor, pdb) + # breakpoint() + if debug_func is None: + assert release_lock_signal, ( + 'Must pass `release_lock_signal: trio.Event` if no ' + 'trace func provided!' + ) + print(f"{actor.uid} ENTERING WAIT") + task_status.started() + await release_lock_signal.wait() + + else: + # block here one (at the appropriate frame *up*) where + # ``breakpoint()`` was awaited and begin handling stdio. + log.debug("Entering the synchronous world of pdb") + debug_func(actor, pdb) except bdb.BdbQuit: Lock.release() @@ -708,8 +731,8 @@ def shield_sigint_handler( # elif debug_mode(): else: # XXX: shouldn't ever get here? - print("WTFWTFWTF") - raise KeyboardInterrupt + raise RuntimeError("WTFWTFWTF") + # raise KeyboardInterrupt("WTFWTFWTF") # NOTE: currently (at least on ``fancycompleter`` 0.9.2) # it looks to be that the last command that was run (eg. ll) @@ -737,21 +760,18 @@ def shield_sigint_handler( # https://github.com/goodboy/tractor/issues/130#issuecomment-663752040 # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py - # XXX LEGACY: lol, see ``pdbpp`` issue: - # https://github.com/pdbpp/pdbpp/issues/496 - def _set_trace( actor: tractor.Actor | None = None, pdb: MultiActorPdb | None = None, ): __tracebackhide__ = True - actor = actor or tractor.current_actor() + actor: tractor.Actor = actor or tractor.current_actor() # start 2 levels up in user code - frame: Optional[FrameType] = sys._getframe() + frame: FrameType | None = sys._getframe() if frame: - frame = frame.f_back # type: ignore + frame: FrameType = frame.f_back # type: ignore if ( frame @@ -773,10 +793,66 @@ def _set_trace( pdb.set_trace(frame=frame) -breakpoint = partial( - _breakpoint, +# TODO: allow pausing from sync code, normally by remapping +# python's builtin breakpoint() hook to this runtime aware version. +def pause_from_sync() -> None: + import greenback + + actor: tractor.Actor = tractor.current_actor() + task_can_release_tty_lock = trio.Event() + + # spawn bg task which will lock out the TTY, we poll + # just below until the release event is reporting that task as + # waiting.. not the most ideal but works for now ;) + greenback.await_( + actor._service_n.start(partial( + _pause, + debug_func=None, + release_lock_signal=task_can_release_tty_lock, + )) + ) + print("ENTER SYNC PAUSE") + pdb, undo_sigint = mk_mpdb() + try: + print("ENTER SYNC PAUSE") + # _set_trace(actor=actor) + + # we entered the global ``breakpoint()`` built-in from sync + # code? 
+ Lock.local_task_in_debug = 'sync' + frame: FrameType | None = sys._getframe() + print(f'FRAME: {str(frame)}') + + frame: FrameType = frame.f_back # type: ignore + print(f'FRAME: {str(frame)}') + + frame: FrameType = frame.f_back # type: ignore + print(f'FRAME: {str(frame)}') + + pdb.set_trace(frame=frame) + # pdb.do_frame( + # pdb.curindex + + finally: + task_can_release_tty_lock.set() + undo_sigint() + +# using the "pause" semantics instead since +# that better covers actually somewhat "pausing the runtime" +# for this particular paralell task to do debugging B) +pause = partial( + _pause, _set_trace, ) +pp = pause # short-hand for "pause point" + + +async def breakpoint(**kwargs): + log.warning( + '`tractor.breakpoint()` is deprecated!\n' + 'Please use `tractor.pause()` instead!\n' + ) + await pause(**kwargs) def _post_mortem( @@ -801,7 +877,7 @@ def _post_mortem( post_mortem = partial( - _breakpoint, + _pause, _post_mortem, ) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 08ddabc4..268f059d 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -95,6 +95,10 @@ async def _invoke( treat_as_gen: bool = False failed_resp: bool = False + if _state.debug_mode(): + import greenback + await greenback.ensure_portal() + # possibly a traceback (not sure what typing is for this..) tb = None @@ -1862,4 +1866,6 @@ class Arbiter(Actor): ) -> None: uid = (str(uid[0]), str(uid[1])) - self._registry.pop(uid) + entry: tuple = self._registry.pop(uid, None) + if entry is None: + log.warning(f'Request to de-register {uid} failed?') -- 2.34.1 From ac695a05bfe9eb13055688f41b145138df050e68 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Jun 2023 17:16:17 -0400 Subject: [PATCH 016/378] Updates from latest `piker.data._sharedmem` changes --- tractor/_shm.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tractor/_shm.py b/tractor/_shm.py index c4c17335..f8295105 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -287,9 +287,9 @@ class ShmArray: self, fields: Optional[list[str]] = None, - # type that all field values will be cast to in the returned - # view. - common_dtype: np.dtype = np.float64, # type: ignore + # type that all field values will be cast to + # in the returned view. + common_dtype: np.dtype = float, ) -> np.ndarray: @@ -344,7 +344,7 @@ class ShmArray: field_map: Optional[dict[str, str]] = None, prepend: bool = False, update_first: bool = True, - start: Optional[int] = None, + start: int | None = None, ) -> int: ''' @@ -386,7 +386,11 @@ class ShmArray: # tries to access ``.array`` (which due to the index # overlap will be empty). Pretty sure we've fixed it now # but leaving this here as a reminder. - if prepend and update_first and length: + if ( + prepend + and update_first + and length + ): assert index < self._first.value if ( @@ -460,10 +464,10 @@ class ShmArray: def open_shm_ndarray( - key: Optional[str] = None, - size: int = int(2 ** 10), + size: int, + key: str | None = None, dtype: np.dtype | None = None, - append_start_index: int = 0, + append_start_index: int | None = None, readonly: bool = False, ) -> ShmArray: @@ -529,9 +533,12 @@ def open_shm_ndarray( # ``ShmArray._start.value: int = 0`` and the yet-to-be written # real-time section will start at ``ShmArray.index: int``. - # this sets the index to 3/4 of the length of the buffer - # leaving a "days worth of second samples" for the real-time - # section. 
+ # this sets the index to nearly 2/3rds into the the length of + # the buffer leaving at least a "days worth of second samples" + # for the real-time section. + if append_start_index is None: + append_start_index = round(size * 0.616) + last.value = first.value = append_start_index shmarr = ShmArray( @@ -640,9 +647,7 @@ def attach_shm_ndarray( def maybe_open_shm_ndarray( key: str, # unique identifier for segment - - # from ``open_shm_array()`` - size: int = int(2 ** 10), # array length in index terms + size: int, dtype: np.dtype | None = None, append_start_index: int = 0, readonly: bool = True, -- 2.34.1 From 565d7c3ee5f646941e6bd1a988aa2fece4d22424 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 7 Jul 2023 14:47:42 -0400 Subject: [PATCH 017/378] Add longer "required reading" list B) --- docs/README.rst | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/README.rst b/docs/README.rst index 9dfe2f60..9dd7faf4 100644 --- a/docs/README.rst +++ b/docs/README.rst @@ -3,8 +3,8 @@ |gh_actions| |docs| -``tractor`` is a `structured concurrent`_, multi-processing_ runtime -built on trio_. +``tractor`` is a `structured concurrent`_, (optionally +distributed_) multi-processing_ runtime built on trio_. Fundamentally, ``tractor`` gives you parallelism via ``trio``-"*actors*": independent Python processes (aka @@ -17,11 +17,20 @@ protocol" constructed on top of multiple Pythons each running a ``trio`` scheduled runtime - a call to ``trio.run()``. We believe the system adheres to the `3 axioms`_ of an "`actor model`_" -but likely *does not* look like what *you* probably think an "actor -model" looks like, and that's *intentional*. +but likely **does not** look like what **you** probably *think* an "actor +model" looks like, and that's **intentional**. -The first step to grok ``tractor`` is to get the basics of ``trio`` down. -A great place to start is the `trio docs`_ and this `blog post`_. + +Where do i start!? +------------------ +The first step to grok ``tractor`` is to get an intermediate +knowledge of ``trio`` and **structured concurrency** B) + +Some great places to start are, +- the seminal `blog post`_ +- obviously the `trio docs`_ +- wikipedia's nascent SC_ page +- the fancy diagrams @ libdill-docs_ Features @@ -593,6 +602,7 @@ matrix seems too hip, we're also mostly all in the the `trio gitter channel`_! .. _structured concurrent: https://trio.discourse.group/t/concise-definition-of-structured-concurrency/228 +.. _distributed: https://en.wikipedia.org/wiki/Distributed_computing .. _multi-processing: https://en.wikipedia.org/wiki/Multiprocessing .. _trio: https://github.com/python-trio/trio .. _nurseries: https://vorpus.org/blog/notes-on-structured-concurrency-or-go-statement-considered-harmful/#nurseries-a-structured-replacement-for-go-statements @@ -611,8 +621,9 @@ channel`_! .. _trio docs: https://trio.readthedocs.io/en/latest/ .. _blog post: https://vorpus.org/blog/notes-on-structured-concurrency-or-go-statement-considered-harmful/ .. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency +.. _SC: https://en.wikipedia.org/wiki/Structured_concurrency +.. _libdill-docs: https://sustrik.github.io/libdill/structured-concurrency.html .. _structured chadcurrency: https://en.wikipedia.org/wiki/Structured_concurrency -.. _structured concurrency: https://en.wikipedia.org/wiki/Structured_concurrency .. _unrequirements: https://en.wikipedia.org/wiki/Actor_model#Direct_communication_and_asynchrony .. 
_async generators: https://www.python.org/dev/peps/pep-0525/ .. _trio-parallel: https://github.com/richardsheridan/trio-parallel -- 2.34.1 From 46972df041c4a414c744bb4b57728123c739f0d7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 7 Jul 2023 14:48:37 -0400 Subject: [PATCH 018/378] .log: more correct handling for `get_logger(__name__)` usage --- tractor/log.py | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 1ea99315..5710e83e 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -193,15 +193,39 @@ def get_logger( ''' log = rlog = logging.getLogger(_root_name) - if name and name != _proj_name: + if ( + name + and name != _proj_name + ): - # handling for modules that use ``get_logger(__name__)`` to - # avoid duplicate project-package token in msg output - rname, _, tail = name.partition('.') - if rname == _root_name: - name = tail + # NOTE: for handling for modules that use ``get_logger(__name__)`` + # we make the following stylistic choice: + # - always avoid duplicate project-package token + # in msg output: i.e. tractor.tractor _ipc.py in header + # looks ridiculous XD + # - never show the leaf module name in the {name} part + # since in python the {filename} is always this same + # module-file. + + sub_name: None | str = None + rname, _, sub_name = name.partition('.') + pkgpath, _, modfilename = sub_name.rpartition('.') + + # NOTE: for tractor itself never include the last level + # module key in the name such that something like: eg. + # 'tractor.trionics._broadcast` only includes the first + # 2 tokens in the (coloured) name part. + if rname == 'tractor': + sub_name = pkgpath + + if _root_name in sub_name: + duplicate, _, sub_name = sub_name.partition('.') + + if not sub_name: + log = rlog + else: + log = rlog.getChild(sub_name) - log = rlog.getChild(name) log.level = rlog.level # add our actor-task aware adapter which will dynamically look up @@ -254,3 +278,7 @@ def get_console_log( def get_loglevel() -> str: return _default_loglevel + + +# global module logger for tractor itself +log = get_logger('tractor') -- 2.34.1 From 98a7326c855a23f1b3bb8d8826b697202329ead1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 7 Jul 2023 14:49:23 -0400 Subject: [PATCH 019/378] ._runtime: log level tweaks, use crit for stale debug lock detection --- tractor/_runtime.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 268f059d..244fffb9 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -760,6 +760,7 @@ class Actor: # deliver response to local caller/waiter await self._push_result(chan, cid, msg) + log.runtime('Waiting on actor nursery to exit..') await local_nursery.exited.wait() if disconnected: @@ -814,7 +815,7 @@ class Actor: db_cs and not db_cs.cancel_called ): - log.warning( + log.critical( f'STALE DEBUG LOCK DETECTED FOR {uid}' ) # TODO: figure out why this breaks tests.. -- 2.34.1 From 4ace8f603713dd1654a6145fd6c35cefaf9b995a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 7 Jul 2023 14:51:44 -0400 Subject: [PATCH 020/378] Fix frame-selection display on first REPL entry For whatever reason pdb(p), and in general, will show the frame of the *next* python instruction/LOC on initial entry (at least using `.set_trace()`), as such remove the `try/finally` block in the sync code entrypoint `.pause_from_sync()`, and also since doesn't seem like we really need it anyway. 
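For context, the "frame selection" in question is just standard `sys._getframe()` / `f_back` walking to land the REPL on the caller's frame; a tiny standalone sketch (illustrative only, not code from this patch):

    import sys
    from types import FrameType

    def current_and_caller_frames() -> tuple[FrameType, FrameType | None]:
        # frame executing *this* function..
        frame: FrameType = sys._getframe()
        # ..and one level up: the caller's frame, which is what we
        # normally want the REPL to show on entry from sync code.
        return frame, frame.f_back

    def some_sync_code() -> None:
        here, caller = current_and_caller_frames()
        print(here.f_code.co_name, caller.f_code.co_name if caller else None)

    some_sync_code()
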
Further, and to this end: - enable hidden frames support in our default config. - fix/drop/mask all the frame ref-ing/mangling we had prior since it's no longer needed as well as manual `Lock` releasing which seems to work already by having the `greenback` spawned task do it's normal thing? - move to no `Union` type annots. - hide all frames that can add "this is the runtime confusion" to traces. --- tractor/_debug.py | 132 +++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 60 deletions(-) diff --git a/tractor/_debug.py b/tractor/_debug.py index eec6fc50..d5f5f4f1 100644 --- a/tractor/_debug.py +++ b/tractor/_debug.py @@ -30,7 +30,6 @@ from functools import ( from contextlib import asynccontextmanager as acm from typing import ( Any, - Optional, Callable, AsyncIterator, AsyncGenerator, @@ -40,7 +39,10 @@ from types import FrameType import pdbp import tractor import trio -from trio_typing import TaskStatus +from trio_typing import ( + TaskStatus, + # Task, +) from .log import get_logger from ._discovery import get_root @@ -69,10 +71,10 @@ class Lock: ''' repl: MultiActorPdb | None = None # placeholder for function to set a ``trio.Event`` on debugger exit - # pdb_release_hook: Optional[Callable] = None + # pdb_release_hook: Callable | None = None _trio_handler: Callable[ - [int, Optional[FrameType]], Any + [int, FrameType | None], Any ] | int | None = None # actor-wide variable pointing to current task name using debugger @@ -83,23 +85,23 @@ class Lock: # and must be cancelled if this actor is cancelled via IPC # request-message otherwise deadlocks with the parent actor may # ensure - _debugger_request_cs: Optional[trio.CancelScope] = None + _debugger_request_cs: trio.CancelScope | None = None # NOTE: set only in the root actor for the **local** root spawned task # which has acquired the lock (i.e. this is on the callee side of # the `lock_tty_for_child()` context entry). - _root_local_task_cs_in_debug: Optional[trio.CancelScope] = None + _root_local_task_cs_in_debug: trio.CancelScope | None = None # actor tree-wide actor uid that supposedly has the tty lock - global_actor_in_debug: Optional[tuple[str, str]] = None + global_actor_in_debug: tuple[str, str] = None - local_pdb_complete: Optional[trio.Event] = None - no_remote_has_tty: Optional[trio.Event] = None + local_pdb_complete: trio.Event | None = None + no_remote_has_tty: trio.Event | None = None # lock in root actor preventing multi-access to local tty _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock() - _orig_sigint_handler: Optional[Callable] = None + _orig_sigint_handler: Callable | None = None _blocked: set[tuple[str, str]] = set() @classmethod @@ -110,6 +112,7 @@ class Lock: ) @classmethod + @pdbp.hideframe # XXX NOTE XXX see below in `.pause_from_sync()` def unshield_sigint(cls): # always restore ``trio``'s sigint handler. see notes below in # the pdb factory about the nightmare that is that code swapping @@ -129,10 +132,6 @@ class Lock: if owner: raise - # actor-local state, irrelevant for non-root. - cls.global_actor_in_debug = None - cls.local_task_in_debug = None - try: # sometimes the ``trio`` might already be terminated in # which case this call will raise. @@ -143,6 +142,11 @@ class Lock: cls.unshield_sigint() cls.repl = None + # actor-local state, irrelevant for non-root. 
+ cls.global_actor_in_debug = None + cls.local_task_in_debug = None + + class TractorConfig(pdbp.DefaultConfig): ''' @@ -151,7 +155,7 @@ class TractorConfig(pdbp.DefaultConfig): ''' use_pygments: bool = True sticky_by_default: bool = False - enable_hidden_frames: bool = False + enable_hidden_frames: bool = True # much thanks @mdmintz for the hot tip! # fixes line spacing issue when resizing terminal B) @@ -228,26 +232,23 @@ async def _acquire_debug_lock_from_root_task( to the ``pdb`` repl. ''' - task_name = trio.lowlevel.current_task().name + task_name: str = trio.lowlevel.current_task().name + we_acquired: bool = False log.runtime( f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}" ) - - we_acquired = False - try: log.runtime( f"entering lock checkpoint, remote task: {task_name}:{uid}" ) - we_acquired = True - # NOTE: if the surrounding cancel scope from the # `lock_tty_for_child()` caller is cancelled, this line should # unblock and NOT leave us in some kind of # a "child-locked-TTY-but-child-is-uncontactable-over-IPC" # condition. await Lock._debug_lock.acquire() + we_acquired = True if Lock.no_remote_has_tty is None: # mark the tty lock as being in use so that the runtime @@ -573,13 +574,15 @@ async def _pause( try: # breakpoint() if debug_func is None: - assert release_lock_signal, ( - 'Must pass `release_lock_signal: trio.Event` if no ' - 'trace func provided!' - ) + # assert release_lock_signal, ( + # 'Must pass `release_lock_signal: trio.Event` if no ' + # 'trace func provided!' + # ) print(f"{actor.uid} ENTERING WAIT") task_status.started() - await release_lock_signal.wait() + + # with trio.CancelScope(shield=True): + # await release_lock_signal.wait() else: # block here one (at the appropriate frame *up*) where @@ -606,7 +609,7 @@ async def _pause( def shield_sigint_handler( signum: int, frame: 'frame', # type: ignore # noqa - # pdb_obj: Optional[MultiActorPdb] = None, + # pdb_obj: MultiActorPdb | None = None, *args, ) -> None: @@ -620,7 +623,7 @@ def shield_sigint_handler( ''' __tracebackhide__ = True - uid_in_debug = Lock.global_actor_in_debug + uid_in_debug: tuple[str, str] | None = Lock.global_actor_in_debug actor = tractor.current_actor() # print(f'{actor.uid} in HANDLER with ') @@ -638,14 +641,14 @@ def shield_sigint_handler( else: raise KeyboardInterrupt - any_connected = False + any_connected: bool = False if uid_in_debug is not None: # try to see if the supposed (sub)actor in debug still # has an active connection to *this* actor, and if not # it's likely they aren't using the TTY lock / debugger # and we should propagate SIGINT normally. - chans = actor._peers.get(tuple(uid_in_debug)) + chans: list[tractor.Channel] = actor._peers.get(tuple(uid_in_debug)) if chans: any_connected = any(chan.connected() for chan in chans) if not any_connected: @@ -658,7 +661,7 @@ def shield_sigint_handler( return do_cancel() # only set in the actor actually running the REPL - pdb_obj = Lock.repl + pdb_obj: MultiActorPdb | None = Lock.repl # root actor branch that reports whether or not a child # has locked debugger. @@ -716,7 +719,7 @@ def shield_sigint_handler( ) return do_cancel() - task = Lock.local_task_in_debug + task: str | None = Lock.local_task_in_debug if ( task and pdb_obj @@ -791,15 +794,18 @@ def _set_trace( Lock.local_task_in_debug = 'sync' pdb.set_trace(frame=frame) + # undo_ # TODO: allow pausing from sync code, normally by remapping # python's builtin breakpoint() hook to this runtime aware version. 
def pause_from_sync() -> None: + print("ENTER SYNC PAUSE") import greenback + __tracebackhide__ = True actor: tractor.Actor = tractor.current_actor() - task_can_release_tty_lock = trio.Event() + # task_can_release_tty_lock = trio.Event() # spawn bg task which will lock out the TTY, we poll # just below until the release event is reporting that task as @@ -808,34 +814,39 @@ def pause_from_sync() -> None: actor._service_n.start(partial( _pause, debug_func=None, - release_lock_signal=task_can_release_tty_lock, + # release_lock_signal=task_can_release_tty_lock, )) ) - print("ENTER SYNC PAUSE") - pdb, undo_sigint = mk_mpdb() - try: - print("ENTER SYNC PAUSE") - # _set_trace(actor=actor) - # we entered the global ``breakpoint()`` built-in from sync - # code? - Lock.local_task_in_debug = 'sync' - frame: FrameType | None = sys._getframe() - print(f'FRAME: {str(frame)}') + db, undo_sigint = mk_mpdb() + Lock.local_task_in_debug = 'sync' + # db.config.enable_hidden_frames = True - frame: FrameType = frame.f_back # type: ignore - print(f'FRAME: {str(frame)}') + # we entered the global ``breakpoint()`` built-in from sync + # code? + frame: FrameType | None = sys._getframe() + # print(f'FRAME: {str(frame)}') + # assert not db._is_hidden(frame) - frame: FrameType = frame.f_back # type: ignore - print(f'FRAME: {str(frame)}') + frame: FrameType = frame.f_back # type: ignore + # print(f'FRAME: {str(frame)}') + # if not db._is_hidden(frame): + # pdbp.set_trace() + # db._hidden_frames.append( + # (frame, frame.f_lineno) + # ) + db.set_trace(frame=frame) + # NOTE XXX: see the `@pdbp.hideframe` decoration + # on `Lock.unshield_sigint()`.. I have NO CLUE why + # the next instruction's def frame is being shown + # in the tb but it seems to be something wonky with + # the way `pdb` core works? + # undo_sigint() - pdb.set_trace(frame=frame) - # pdb.do_frame( - # pdb.curindex + # Lock.global_actor_in_debug = actor.uid + # Lock.release() + # task_can_release_tty_lock.set() - finally: - task_can_release_tty_lock.set() - undo_sigint() # using the "pause" semantics instead since # that better covers actually somewhat "pausing the runtime" @@ -959,8 +970,7 @@ async def maybe_wait_for_debugger( # will make the pdb repl unusable. # Instead try to wait for pdb to be released before # tearing down. - - sub_in_debug = None + sub_in_debug: tuple[str, str] | None = None for _ in range(poll_steps): @@ -980,13 +990,15 @@ async def maybe_wait_for_debugger( debug_complete = Lock.no_remote_has_tty if ( - (debug_complete and - not debug_complete.is_set()) + debug_complete + and sub_in_debug is not None + and not debug_complete.is_set() ): - log.debug( + log.pdb( 'Root has errored but pdb is in use by ' f'child {sub_in_debug}\n' - 'Waiting on tty lock to release..') + 'Waiting on tty lock to release..' 
+ ) await debug_complete.wait() -- 2.34.1 From b36b3d522f41b4441468b4beafcfb11ddd9dfc8f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 7 Jul 2023 15:35:52 -0400 Subject: [PATCH 021/378] Map `breakpoint()` built-in to new `.pause_from_sync()` ep --- tractor/_root.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index a2d31586..a19652df 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -89,7 +89,7 @@ async def open_root_actor( # https://github.com/python-trio/trio/issues/1155#issuecomment-742964018 builtin_bp_handler = sys.breakpointhook orig_bp_path: str | None = os.environ.get('PYTHONBREAKPOINT', None) - os.environ['PYTHONBREAKPOINT'] = 'tractor._debug._set_trace' + os.environ['PYTHONBREAKPOINT'] = 'tractor._debug.pause_from_sync' # attempt to retreive ``trio``'s sigint handler and stash it # on our debugger lock state. @@ -235,9 +235,10 @@ async def open_root_actor( BaseExceptionGroup, ) as err: - entered = await _debug._maybe_enter_pm(err) - - if not entered and not is_multi_cancelled(err): + if ( + not (await _debug._maybe_enter_pm(err)) + and not is_multi_cancelled(err) + ): logger.exception("Root actor crashed:") # always re-raise -- 2.34.1 From bee2c36072939d7e537ec1fce9f297d37721db63 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 12 Jul 2023 13:07:30 -0400 Subject: [PATCH 022/378] Make `NamespacePath` work on object refs Detect if the input ref is a non-func (like an `object` instance) in which case grab its type name using `type()`. Wrap all the name-getting into a new `_mk_fqpn()` static meth: gets the "fully qualified path name" and returns path and name in tuple; port other methds to use it. Refine and update the docs B) --- tractor/msg.py | 56 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/tractor/msg.py b/tractor/msg.py index 9af3ccd7..ca34dba8 100644 --- a/tractor/msg.py +++ b/tractor/msg.py @@ -43,38 +43,62 @@ Built-in messaging patterns, types, APIs and helpers. # - https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type from __future__ import annotations +from inspect import isfunction from pkgutil import resolve_name class NamespacePath(str): ''' - A serializeable description of a (function) Python object location - described by the target's module path and namespace key meant as - a message-native "packet" to allows actors to point-and-load objects - by absolute reference. + A serializeable description of a (function) Python object + location described by the target's module path and namespace + key meant as a message-native "packet" to allows actors to + point-and-load objects by an absolute ``str`` (and thus + serializable) reference. ''' - _ref: object = None + _ref: object | type | None = None - def load_ref(self) -> object: + def load_ref(self) -> object | type: if self._ref is None: self._ref = resolve_name(self) return self._ref - def to_tuple( - self, + @staticmethod + def _mk_fqnp(ref: type | object) -> tuple[str, str]: + ''' + Generate a minial ``str`` pair which describes a python + object's namespace path and object/type name. 
- ) -> tuple[str, str]: - ref = self.load_ref() - return ref.__module__, getattr(ref, '__name__', '') + In more precise terms something like: + - 'py.namespace.path:object_name', + - eg.'tractor.msg:NamespacePath' will be the ``str`` form + of THIS type XD + + ''' + if ( + isinstance(ref, object) + and not isfunction(ref) + ): + name: str = type(ref).__name__ + else: + name: str = getattr(ref, '__name__') + + # fully qualified namespace path, tuple. + fqnp: tuple[str, str] = ( + ref.__module__, + name, + ) + return fqnp @classmethod def from_ref( cls, - ref, + ref: type | object, ) -> NamespacePath: - return cls(':'.join( - (ref.__module__, - getattr(ref, '__name__', '')) - )) + + fqnp: tuple[str, str] = cls._mk_fqnp(ref) + return cls(':'.join(fqnp)) + + def to_tuple(self) -> tuple[str, str]: + return self._mk_fqnp(self.load_ref()) -- 2.34.1 From e03bec5efc9b8739809e7b6d51ca3a13735745d1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 21 Jul 2023 15:08:46 -0400 Subject: [PATCH 023/378] Move `.to_asyncio` to modern optional value type annots --- tractor/to_asyncio.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tractor/to_asyncio.py b/tractor/to_asyncio.py index be3ac8d3..788181e6 100644 --- a/tractor/to_asyncio.py +++ b/tractor/to_asyncio.py @@ -28,7 +28,6 @@ from typing import ( Callable, AsyncIterator, Awaitable, - Optional, ) import trio @@ -65,9 +64,9 @@ class LinkedTaskChannel(trio.abc.Channel): _trio_exited: bool = False # set after ``asyncio.create_task()`` - _aio_task: Optional[asyncio.Task] = None - _aio_err: Optional[BaseException] = None - _broadcaster: Optional[BroadcastReceiver] = None + _aio_task: asyncio.Task | None = None + _aio_err: BaseException | None = None + _broadcaster: BroadcastReceiver | None = None async def aclose(self) -> None: await self._from_aio.aclose() @@ -188,7 +187,7 @@ def _run_asyncio_task( cancel_scope = trio.CancelScope() aio_task_complete = trio.Event() - aio_err: Optional[BaseException] = None + aio_err: BaseException | None = None chan = LinkedTaskChannel( aio_q, # asyncio.Queue @@ -263,7 +262,7 @@ def _run_asyncio_task( ''' nonlocal chan aio_err = chan._aio_err - task_err: Optional[BaseException] = None + task_err: BaseException | None = None # only to avoid ``asyncio`` complaining about uncaptured # task exceptions @@ -329,11 +328,11 @@ async def translate_aio_errors( ''' trio_task = trio.lowlevel.current_task() - aio_err: Optional[BaseException] = None + aio_err: BaseException | None = None # TODO: make thisi a channel method? def maybe_raise_aio_err( - err: Optional[Exception] = None + err: Exception | None = None ) -> None: aio_err = chan._aio_err if ( -- 2.34.1 From 1102843087993c8c620c895c87da17079ba5cff5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 18 Aug 2023 10:10:36 -0400 Subject: [PATCH 024/378] Teensie tidy up on actor doc string --- tractor/_runtime.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 244fffb9..c9e4bfe1 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -452,17 +452,18 @@ class Actor: (swappable) network protocols. - Each "actor" is ``trio.run()`` scheduled "runtime" composed of many - concurrent tasks in a single thread. 
The "runtime" tasks conduct - a slew of low(er) level functions to make it possible for message - passing between actors as well as the ability to create new actors - (aka new "runtimes" in new processes which are supervised via - a nursery construct). Each task which sends messages to a task in - a "peer" (not necessarily a parent-child, depth hierarchy)) is able - to do so via an "address", which maps IPC connections across memory - boundaries, and task request id which allows for per-actor - tasks to send and receive messages to specific peer-actor tasks with - which there is an ongoing RPC/IPC dialog. + Each "actor" is ``trio.run()`` scheduled "runtime" composed of + many concurrent tasks in a single thread. The "runtime" tasks + conduct a slew of low(er) level functions to make it possible + for message passing between actors as well as the ability to + create new actors (aka new "runtimes" in new processes which + are supervised via a nursery construct). Each task which sends + messages to a task in a "peer" (not necessarily a parent-child, + depth hierarchy) is able to do so via an "address", which maps + IPC connections across memory boundaries, and a task request id + which allows for per-actor tasks to send and receive messages + to specific peer-actor tasks with which there is an ongoing + RPC/IPC dialog. ''' # ugh, we need to get rid of this and replace with a "registry" sys -- 2.34.1 From 22c14e235e3390674796dfc3c462863395d2bfda Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 18 Aug 2023 10:18:25 -0400 Subject: [PATCH 025/378] Expose `Channel` @ pkg level, drop `_debug.pp()` alias --- tractor/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index 8781943a..c653ec05 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -21,7 +21,6 @@ tractor: structured concurrent ``trio``-"actors". from exceptiongroup import BaseExceptionGroup from ._clustering import open_actor_cluster -from ._ipc import Channel from ._context import ( Context, context, @@ -49,7 +48,6 @@ from ._exceptions import ( from ._debug import ( breakpoint, pause, - pp, pause_from_sync, post_mortem, ) @@ -58,6 +56,7 @@ from ._root import ( run_daemon, open_root_actor, ) +from ._ipc import Channel from ._portal import Portal from ._runtime import Actor @@ -76,6 +75,7 @@ __all__ = [ 'context', 'current_actor', 'find_actor', + 'query_actor', 'get_arbiter', 'is_root_process', 'msg', @@ -84,8 +84,7 @@ __all__ = [ 'open_root_actor', 'pause', 'post_mortem', - 'pp', - 'pause_from_sync' + 'pause_from_sync', 'query_actor', 'run_daemon', 'stream', -- 2.34.1 From ee151b00afeadac96729e772aff90536fb4b481b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 27 Sep 2023 14:05:22 -0400 Subject: [PATCH 026/378] Mk `gather_contexts()` support `@acm`s yielding `None` We were using a `all()` condition which obviously won't work if the batched managers yield any non-truthy value. So instead see the `unwrapped: dict` with the `id(mngrs)` and only unblock once all values have been filled in to be something that is not that value. 
--- tractor/trionics/_mngrs.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tractor/trionics/_mngrs.py b/tractor/trionics/_mngrs.py index 89db895b..801b138b 100644 --- a/tractor/trionics/_mngrs.py +++ b/tractor/trionics/_mngrs.py @@ -70,6 +70,7 @@ async def _enter_and_wait( unwrapped: dict[int, T], all_entered: trio.Event, parent_exit: trio.Event, + seed: int, ) -> None: ''' @@ -80,7 +81,10 @@ async def _enter_and_wait( async with mngr as value: unwrapped[id(mngr)] = value - if all(unwrapped.values()): + if all( + val != seed + for val in unwrapped.values() + ): all_entered.set() await parent_exit.wait() @@ -91,7 +95,13 @@ async def gather_contexts( mngrs: Sequence[AsyncContextManager[T]], -) -> AsyncGenerator[tuple[Optional[T], ...], None]: +) -> AsyncGenerator[ + tuple[ + T | None, + ... + ], + None, +]: ''' Concurrently enter a sequence of async context managers, each in a separate ``trio`` task and deliver the unwrapped values in the @@ -104,7 +114,11 @@ async def gather_contexts( entered and exited, and cancellation just works. ''' - unwrapped: dict[int, Optional[T]] = {}.fromkeys(id(mngr) for mngr in mngrs) + seed: int = id(mngrs) + unwrapped: dict[int, T | None] = {}.fromkeys( + (id(mngr) for mngr in mngrs), + seed, + ) all_entered = trio.Event() parent_exit = trio.Event() @@ -116,8 +130,9 @@ async def gather_contexts( if not mngrs: raise ValueError( - 'input mngrs is empty?\n' - 'Did try to use inline generator syntax?' + '`.trionics.gather_contexts()` input mngrs is empty?\n' + 'Did try to use inline generator syntax?\n' + 'Use a non-lazy iterator or sequence type intead!' ) async with trio.open_nursery() as n: @@ -128,6 +143,7 @@ async def gather_contexts( unwrapped, all_entered, parent_exit, + seed, ) # deliver control once all managers have started up -- 2.34.1 From 3d0e95513c901a8686259dc67fac6d8978e246db Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 27 Sep 2023 15:19:30 -0400 Subject: [PATCH 027/378] Init-support for "multi homed" transports Since we'd like to eventually allow a diverse set of transport (protocol) methods and stacks, and a multi-peer discovery system for distributed actor-tree applications, this reworks all runtime internals to support multi-homing for any given tree on a logical host. In other words any actor can now bind its transport server (currently only unsecured TCP + `msgspec`) to more then one address available in its (linux) network namespace. Further, registry actors (now dubbed "registars" instead of "arbiters") can also similarly bind to multiple network addresses and provide discovery services to remote actors via multiple addresses which can now be provided at runtime startup. Deats: - adjust `._runtime` internals to use a `list[tuple[str, int]]` (and thus pluralized) socket address sequence where applicable for transport server socket binds, now exposed via `Actor.accept_addrs`: - `Actor.__init__()` now takes a `registry_addrs: list`. - `Actor.is_arbiter` -> `.is_registrar`. - `._arb_addr` -> `._reg_addrs: list[tuple]`. - always reg and de-reg from all registrars in `async_main()`. - only set the global runtime var `'_root_mailbox'` to the loopback address since normally all in-tree processes should have access to it, right? - `._serve_forever()` task now takes `listen_sockaddrs: list[tuple]` - make `open_root_actor()` take a `registry_addrs: list[tuple[str, int]]` and defaults when not passed. 
- change `ActorNursery.start_..()` methods take `bind_addrs: list` and pass down through the spawning layer(s) via the parent-seed-msg. - generalize all `._discovery()` APIs to accept `registry_addrs`-like inputs and move all relevant subsystems to adopt the "registry" style naming instead of "arbiter": - make `find_actor()` support batched concurrent portal queries over all provided input addresses using `.trionics.gather_contexts()` Bo - syntax: move to using `async with ` 3.9+ style chained @acms. - a general modernization of the code to a python 3.9+ style. - start deprecation and change to "registry" naming / semantics: - `._discovery.get_arbiter()` -> `.get_registry()` --- tractor/_discovery.py | 188 +++++++++++++++++------ tractor/_entry.py | 14 +- tractor/_ipc.py | 4 +- tractor/_portal.py | 7 +- tractor/_root.py | 141 +++++++++++------ tractor/_runtime.py | 346 ++++++++++++++++++++++++++++-------------- tractor/_spawn.py | 21 ++- tractor/_supervise.py | 23 ++- 8 files changed, 504 insertions(+), 240 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 03775ac2..22ab88d1 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -15,16 +15,19 @@ # along with this program. If not, see . """ -Actor discovery API. +Discovery (protocols) API for automatic addressing and location +management of (service) actors. """ +from __future__ import annotations from typing import ( - Optional, - Union, AsyncGenerator, + TYPE_CHECKING, ) from contextlib import asynccontextmanager as acm +import warnings +from .trionics import gather_contexts from ._ipc import _connect_chan, Channel from ._portal import ( Portal, @@ -34,13 +37,19 @@ from ._portal import ( from ._state import current_actor, _runtime_vars -@acm -async def get_arbiter( +if TYPE_CHECKING: + from ._runtime import Actor + +@acm +async def get_registry( host: str, port: int, -) -> AsyncGenerator[Union[Portal, LocalPortal], None]: +) -> AsyncGenerator[ + Portal | LocalPortal | None, + None, +]: ''' Return a portal instance connected to a local or remote arbiter. @@ -51,16 +60,23 @@ async def get_arbiter( if not actor: raise RuntimeError("No actor instance has been defined yet?") - if actor.is_arbiter: + if actor.is_registrar: # we're already the arbiter # (likely a re-entrant call from the arbiter actor) - yield LocalPortal(actor, Channel((host, port))) + yield LocalPortal( + actor, + Channel((host, port)) + ) else: - async with _connect_chan(host, port) as chan: + async with ( + _connect_chan(host, port) as chan, + open_portal(chan) as regstr_ptl, + ): + yield regstr_ptl - async with open_portal(chan) as arb_portal: - yield arb_portal +# TODO: deprecate and remove _arbiter form +get_arbiter = get_registry @acm @@ -68,51 +84,81 @@ async def get_root( **kwargs, ) -> AsyncGenerator[Portal, None]: + # TODO: rename mailbox to `_root_maddr` when we finally + # add and impl libp2p multi-addrs? 
host, port = _runtime_vars['_root_mailbox'] assert host is not None - async with _connect_chan(host, port) as chan: - async with open_portal(chan, **kwargs) as portal: - yield portal + async with ( + _connect_chan(host, port) as chan, + open_portal(chan, **kwargs) as portal, + ): + yield portal @acm async def query_actor( name: str, - arbiter_sockaddr: Optional[tuple[str, int]] = None, + arbiter_sockaddr: tuple[str, int] | None = None, + regaddr: tuple[str, int] | None = None, -) -> AsyncGenerator[tuple[str, int], None]: +) -> AsyncGenerator[ + tuple[str, int] | None, + None, +]: ''' - Simple address lookup for a given actor name. + Make a transport address lookup for an actor name to a specific + registrar. - Returns the (socket) address or ``None``. + Returns the (socket) address or ``None`` if no entry under that + name exists for the given registrar listening @ `regaddr`. ''' - actor = current_actor() - async with get_arbiter( - *arbiter_sockaddr or actor._arb_addr - ) as arb_portal: + actor: Actor = current_actor() + if ( + name == 'registrar' + and actor.is_registrar + ): + raise RuntimeError( + 'The current actor IS the registry!?' + ) - sockaddr = await arb_portal.run_from_ns( + if arbiter_sockaddr is not None: + warnings.warn( + '`tractor.query_actor(regaddr=)` is deprecated.\n' + 'Use `registry_addrs: list[tuple]` instead!', + DeprecationWarning, + stacklevel=2, + ) + regaddr: list[tuple[str, int]] = arbiter_sockaddr + + regstr: Portal + async with get_registry( + *(regaddr or actor._reg_addrs[0]) + ) as regstr: + + # TODO: return portals to all available actors - for now + # just the last one that registered + sockaddr: tuple[str, int] = await regstr.run_from_ns( 'self', 'find_actor', name=name, ) - - # TODO: return portals to all available actors - for now just - # the last one that registered - if name == 'arbiter' and actor.is_arbiter: - raise RuntimeError("The current actor is the arbiter") - - yield sockaddr if sockaddr else None + yield sockaddr @acm async def find_actor( name: str, - arbiter_sockaddr: tuple[str, int] | None = None + arbiter_sockaddr: tuple[str, int] | None = None, + registry_addrs: list[tuple[str, int]] | None = None, -) -> AsyncGenerator[Optional[Portal], None]: + only_first: bool = True, + +) -> AsyncGenerator[ + Portal | list[Portal] | None, + None, +]: ''' Ask the arbiter to find actor(s) by name. @@ -120,24 +166,54 @@ async def find_actor( known to the arbiter. 
''' - async with query_actor( - name=name, - arbiter_sockaddr=arbiter_sockaddr, - ) as sockaddr: + if arbiter_sockaddr is not None: + warnings.warn( + '`tractor.find_actor(arbiter_sockaddr=)` is deprecated.\n' + 'Use `registry_addrs: list[tuple]` instead!', + DeprecationWarning, + stacklevel=2, + ) + registry_addrs: list[tuple[str, int]] = [arbiter_sockaddr] - if sockaddr: - async with _connect_chan(*sockaddr) as chan: - async with open_portal(chan) as portal: - yield portal - else: + @acm + async def maybe_open_portal_from_reg_addr( + addr: tuple[str, int], + ): + async with query_actor( + name=name, + regaddr=addr, + ) as sockaddr: + if sockaddr: + async with _connect_chan(*sockaddr) as chan: + async with open_portal(chan) as portal: + yield portal + else: + yield None + + async with gather_contexts( + mngrs=list( + maybe_open_portal_from_reg_addr(addr) + for addr in registry_addrs + ) + ) as maybe_portals: + print(f'Portalz: {maybe_portals}') + if not maybe_portals: yield None + return + + portals: list[Portal] = list(maybe_portals) + if only_first: + yield portals[0] + + else: + yield portals @acm async def wait_for_actor( name: str, arbiter_sockaddr: tuple[str, int] | None = None, - # registry_addr: tuple[str, int] | None = None, + registry_addr: tuple[str, int] | None = None, ) -> AsyncGenerator[Portal, None]: ''' @@ -146,17 +222,33 @@ async def wait_for_actor( A portal to the first registered actor is returned. ''' - actor = current_actor() + actor: Actor = current_actor() - async with get_arbiter( - *arbiter_sockaddr or actor._arb_addr, - ) as arb_portal: - sockaddrs = await arb_portal.run_from_ns( + if arbiter_sockaddr is not None: + warnings.warn( + '`tractor.wait_for_actor(arbiter_sockaddr=)` is deprecated.\n' + 'Use `registry_addr: tuple` instead!', + DeprecationWarning, + stacklevel=2, + ) + registry_addr: list[tuple[str, int]] = [ + arbiter_sockaddr, + ] + + # TODO: use `.trionics.gather_contexts()` like + # above in `find_actor()` as well? + async with get_registry( + *(registry_addr or actor._reg_addrs[0]), # first if not passed + ) as reg_portal: + sockaddrs = await reg_portal.run_from_ns( 'self', 'wait_for_actor', name=name, ) - sockaddr = sockaddrs[-1] + + # get latest registered addr by default? + # TODO: offer multi-portal yields in multi-homed case? 
+ sockaddr: tuple[str, int] = sockaddrs[-1] async with _connect_chan(*sockaddr) as chan: async with open_portal(chan) as portal: diff --git a/tractor/_entry.py b/tractor/_entry.py index e8fb56db..b5ab4055 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -47,8 +47,8 @@ log = get_logger(__name__) def _mp_main( - actor: Actor, # type: ignore - accept_addr: tuple[str, int], + actor: Actor, + accept_addrs: list[tuple[str, int]], forkserver_info: tuple[Any, Any, Any, Any, Any], start_method: SpawnMethodKey, parent_addr: tuple[str, int] | None = None, @@ -77,8 +77,8 @@ def _mp_main( log.debug(f"parent_addr is {parent_addr}") trio_main = partial( async_main, - actor, - accept_addr, + actor=actor, + accept_addrs=accept_addrs, parent_addr=parent_addr ) try: @@ -96,7 +96,7 @@ def _mp_main( def _trio_main( - actor: Actor, # type: ignore + actor: Actor, *, parent_addr: tuple[str, int] | None = None, infect_asyncio: bool = False, @@ -132,7 +132,9 @@ def _trio_main( else: trio.run(trio_main) except KeyboardInterrupt: - log.cancel(f"Actor {actor.uid} received KBI") + log.cancel( + f'Actor@{actor.uid} received KBI' + ) finally: log.info(f"Actor {actor.uid} terminated") diff --git a/tractor/_ipc.py b/tractor/_ipc.py index ebfd261c..a022908a 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -467,7 +467,9 @@ class Channel: @asynccontextmanager async def _connect_chan( - host: str, port: int + host: str, + port: int + ) -> typing.AsyncGenerator[Channel, None]: ''' Create and connect a channel with disconnect on context manager diff --git a/tractor/_portal.py b/tractor/_portal.py index 60293716..53684b42 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -586,7 +586,12 @@ class LocalPortal: actor: 'Actor' # type: ignore # noqa channel: Channel - async def run_from_ns(self, ns: str, func_name: str, **kwargs) -> Any: + async def run_from_ns( + self, + ns: str, + func_name: str, + **kwargs, + ) -> Any: ''' Run a requested local function from a namespace path and return it's result. diff --git a/tractor/_root.py b/tractor/_root.py index a19652df..f64aa69e 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -58,10 +58,10 @@ async def open_root_actor( *, # defaults are above - arbiter_addr: tuple[str, int] | None = None, + registry_addrs: list[tuple[str, int]] | None = None, # defaults are above - registry_addr: tuple[str, int] | None = None, + arbiter_addr: tuple[str, int] | None = None, name: str | None = 'root', @@ -115,19 +115,19 @@ async def open_root_actor( if arbiter_addr is not None: warnings.warn( - '`arbiter_addr` is now deprecated and has been renamed to' - '`registry_addr`.\nUse that instead..', + '`arbiter_addr` is now deprecated\n' + 'Use `registry_addrs: list[tuple]` instead..', DeprecationWarning, stacklevel=2, ) + registry_addrs = [arbiter_addr] - registry_addr = (host, port) = ( - registry_addr - or arbiter_addr - or ( + registry_addrs: list[tuple[str, int]] = ( + registry_addrs + or [ # default on localhost _default_arbiter_host, _default_arbiter_port, - ) + ] ) loglevel = (loglevel or log._default_loglevel).upper() @@ -157,60 +157,105 @@ async def open_root_actor( log.get_console_log(loglevel) - try: - # make a temporary connection to see if an arbiter exists, - # if one can't be made quickly we assume none exists. 
- arbiter_found = False + # closed into below ping task-func + ponged_addrs: list[tuple[str, int]] = [] - # TODO: this connect-and-bail forces us to have to carefully - # rewrap TCP 104-connection-reset errors as EOF so as to avoid - # propagating cancel-causing errors to the channel-msg loop - # machinery. Likely it would be better to eventually have - # a "discovery" protocol with basic handshake instead. - with trio.move_on_after(1): - async with _connect_chan(host, port): - arbiter_found = True + async def ping_tpt_socket( + addr: tuple[str, int], + timeout: float = 1, + ) -> None: + ''' + Attempt temporary connection to see if a registry is + listening at the requested address by a tranport layer + ping. - except OSError: - # TODO: make this a "discovery" log level? - logger.warning(f"No actor registry found @ {host}:{port}") + If a connection can't be made quickly we assume none no + server is listening at that addr. - # create a local actor and start up its main routine/task - if arbiter_found: + ''' + try: + # TODO: this connect-and-bail forces us to have to + # carefully rewrap TCP 104-connection-reset errors as + # EOF so as to avoid propagating cancel-causing errors + # to the channel-msg loop machinery. Likely it would + # be better to eventually have a "discovery" protocol + # with basic handshake instead? + with trio.move_on_after(timeout): + async with _connect_chan(*addr): + ponged_addrs.append(addr) + + except OSError: + # TODO: make this a "discovery" log level? + logger.warning(f'No actor registry found @ {addr}') + + async with trio.open_nursery() as tn: + for addr in registry_addrs: + tn.start_soon(ping_tpt_socket, addr) + + trans_bind_addrs: list[tuple[str, int]] = [] + + # Create a new local root-actor instance which IS NOT THE + # REGISTRAR + if ponged_addrs: # we were able to connect to an arbiter - logger.info(f"Arbiter seems to exist @ {host}:{port}") + logger.info( + f'Registry(s) seem(s) to exist @ {ponged_addrs}' + ) actor = Actor( - name or 'anonymous', - arbiter_addr=registry_addr, + name=name or 'anonymous', + registry_addrs=ponged_addrs, loglevel=loglevel, enable_modules=enable_modules, ) - host, port = (host, 0) + # DO NOT use the registry_addrs as the transport server + # addrs for this new non-registar, root-actor. + for host, port in ponged_addrs: + # NOTE: zero triggers dynamic OS port allocation + trans_bind_addrs.append((host, 0)) + # Start this local actor as the "registrar", aka a regular + # actor who manages the local registry of "mailboxes" of + # other process-tree-local sub-actors. else: - # start this local actor as the arbiter (aka a regular actor who - # manages the local registry of "mailboxes") - # Note that if the current actor is the arbiter it is desirable - # for it to stay up indefinitely until a re-election process has - # taken place - which is not implemented yet FYI). + # NOTE that if the current actor IS THE REGISTAR, the + # following init steps are taken: + # - the tranport layer server is bound to each (host, port) + # pair defined in provided registry_addrs, or the default. + trans_bind_addrs = registry_addrs + + # - it is normally desirable for any registrar to stay up + # indefinitely until either all registered (child/sub) + # actors are terminated (via SC supervision) or, + # a re-election process has taken place. 
+ # NOTE: all of ^ which is not implemented yet - see: + # https://github.com/goodboy/tractor/issues/216 + # https://github.com/goodboy/tractor/pull/348 + # https://github.com/goodboy/tractor/issues/296 actor = Arbiter( - name or 'arbiter', - arbiter_addr=registry_addr, + name or 'registrar', + registry_addrs=registry_addrs, loglevel=loglevel, enable_modules=enable_modules, ) + # Start up main task set via core actor-runtime nurseries. try: # assign process-local actor _state._current_actor = actor # start local channel-server and fake the portal API # NOTE: this won't block since we provide the nursery - logger.info(f"Starting local {actor} @ {host}:{port}") + ml_addrs_str: str = '\n'.join( + f'@{addr}' for addr in trans_bind_addrs + ) + logger.info( + f'Starting local {actor.uid} on the following transport addrs:\n' + f'{ml_addrs_str}' + ) # start the actor runtime in a new task async with trio.open_nursery() as nursery: @@ -223,7 +268,7 @@ async def open_root_actor( partial( async_main, actor, - accept_addr=(host, port), + accept_addrs=trans_bind_addrs, parent_addr=None ) ) @@ -235,13 +280,16 @@ async def open_root_actor( BaseExceptionGroup, ) as err: + entered: bool = await _debug._maybe_enter_pm(err) + if ( - not (await _debug._maybe_enter_pm(err)) + not entered and not is_multi_cancelled(err) ): logger.exception("Root actor crashed:") - # always re-raise + # ALWAYS re-raise any error bubbled up from the + # runtime! raise finally: @@ -261,7 +309,7 @@ async def open_root_actor( finally: _state._current_actor = None - # restore breakpoint hook state + # restore built-in `breakpoint()` hook state sys.breakpointhook = builtin_bp_handler if orig_bp_path is not None: os.environ['PYTHONBREAKPOINT'] = orig_bp_path @@ -277,10 +325,9 @@ def run_daemon( # runtime kwargs name: str | None = 'root', - registry_addr: tuple[str, int] = ( - _default_arbiter_host, - _default_arbiter_port, - ), + registry_addrs: list[tuple[str, int]] = [ + (_default_arbiter_host, _default_arbiter_port) + ], start_method: str | None = None, debug_mode: bool = False, @@ -304,7 +351,7 @@ def run_daemon( async def _main(): async with open_root_actor( - registry_addr=registry_addr, + registry_addrs=registry_addrs, name=name, start_method=start_method, debug_mode=debug_mode, diff --git a/tractor/_runtime.py b/tractor/_runtime.py index c9e4bfe1..16f105cf 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -25,12 +25,12 @@ from itertools import chain import importlib import importlib.util import inspect +from pprint import pformat import signal import sys from typing import ( Any, Callable, - Union, TYPE_CHECKING, ) import uuid @@ -59,7 +59,7 @@ from ._exceptions import ( TransportClosed, ) from . import _debug -from ._discovery import get_arbiter +from ._discovery import get_registry from ._portal import Portal from . import _state from . import _mp_fixup_main @@ -82,7 +82,7 @@ async def _invoke( is_rpc: bool = True, task_status: TaskStatus[ - Union[Context, BaseException] + Context | BaseException ] = trio.TASK_STATUS_IGNORED, ): ''' @@ -96,8 +96,14 @@ async def _invoke( failed_resp: bool = False if _state.debug_mode(): - import greenback - await greenback.ensure_portal() + try: + import greenback + await greenback.ensure_portal() + except ModuleNotFoundError: + log.warning( + '`greenback` is not installed.\n' + 'No sync debug support!' + ) # possibly a traceback (not sure what typing is for this..) 
tb = None @@ -416,13 +422,13 @@ async def _invoke( actor._ongoing_rpc_tasks.set() -def _get_mod_abspath(module): +def _get_mod_abspath(module: ModuleType) -> str: return os.path.abspath(module.__file__) async def try_ship_error_to_parent( channel: Channel, - err: Union[Exception, BaseExceptionGroup], + err: Exception | BaseExceptionGroup, ) -> None: with trio.CancelScope(shield=True): @@ -469,6 +475,11 @@ class Actor: # ugh, we need to get rid of this and replace with a "registry" sys # https://github.com/goodboy/tractor/issues/216 is_arbiter: bool = False + + @property + def is_registrar(self) -> bool: + return self.is_arbiter + msg_buffer_size: int = 2**6 # nursery placeholders filled in by `async_main()` after fork @@ -501,8 +512,12 @@ class Actor: enable_modules: list[str] = [], uid: str | None = None, loglevel: str | None = None, + registry_addrs: list[tuple[str, int]] | None = None, + spawn_method: str | None = None, + + # TODO: remove! arbiter_addr: tuple[str, int] | None = None, - spawn_method: str | None = None + ) -> None: ''' This constructor is called in the parent actor **before** the spawning @@ -523,27 +538,36 @@ class Actor: # always include debugging tools module enable_modules.append('tractor._debug') - mods = {} + self.enable_modules: dict[str, str] = {} for name in enable_modules: - mod = importlib.import_module(name) - mods[name] = _get_mod_abspath(mod) + mod: ModuleType = importlib.import_module(name) + self.enable_modules[name] = _get_mod_abspath(mod) - self.enable_modules = mods self._mods: dict[str, ModuleType] = {} - self.loglevel = loglevel + self.loglevel: str = loglevel - self._arb_addr: tuple[str, int] | None = ( - str(arbiter_addr[0]), - int(arbiter_addr[1]) - ) if arbiter_addr else None + if arbiter_addr is not None: + warnings.warn( + '`Actor(arbiter_addr=)` is now deprecated.\n' + 'Use `registry_addrs: list[tuple]` instead.', + DeprecationWarning, + stacklevel=2, + ) + registry_addrs: list[tuple[str, int]] = [arbiter_addr] + + self._reg_addrs: list[tuple[str, int]] = ( + registry_addrs + or + None + ) # marked by the process spawning backend at startup # will be None for the parent most process started manually # by the user (currently called the "arbiter") - self._spawn_method = spawn_method + self._spawn_method: str = spawn_method self._peers: defaultdict = defaultdict(list) - self._peer_connected: dict = {} + self._peer_connected: dict[tuple[str, str], trio.Event] = {} self._no_more_peers = trio.Event() self._no_more_peers.set() self._ongoing_rpc_tasks = trio.Event() @@ -654,13 +678,17 @@ class Actor: self._no_more_peers = trio.Event() # unset chan = Channel.from_stream(stream) - uid: tuple[str, str] | None = chan.uid - log.runtime(f"New connection to us {chan}") + their_uid: tuple[str, str] | None = chan.uid + if their_uid: + log.warning( + f'Re-connection from already known {their_uid}' + ) + else: + log.runtime(f'New connection to us @{chan.raddr}') # send/receive initial handshake response try: uid = await self._do_handshake(chan) - except ( # we need this for ``msgspec`` for some reason? # for now, it's been put in the stream backend. @@ -956,7 +984,11 @@ class Actor: async def _from_parent( self, parent_addr: tuple[str, int] | None, - ) -> tuple[Channel, tuple[str, int] | None]: + + ) -> tuple[ + Channel, + list[tuple[str, int]] | None, + ]: try: # Connect back to the parent actor and conduct initial # handshake. 
From this point on if we error, we @@ -966,11 +998,11 @@ class Actor: ) await chan.connect() + # TODO: move this into a `Channel.handshake()`? # Initial handshake: swap names. await self._do_handshake(chan) - accept_addr: tuple[str, int] | None = None - + accept_addrs: list[tuple[str, int]] | None = None if self._spawn_method == "trio": # Receive runtime state from our parent parent_data: dict[str, Any] @@ -979,10 +1011,7 @@ class Actor: "Received state from parent:\n" f"{parent_data}" ) - accept_addr = ( - parent_data.pop('bind_host'), - parent_data.pop('bind_port'), - ) + accept_addrs: list[tuple[str, int]] = parent_data.pop('bind_addrs') rvs = parent_data.pop('_runtime_vars') log.runtime(f"Runtime vars are: {rvs}") rvs['_is_root'] = False @@ -990,17 +1019,18 @@ class Actor: for attr, value in parent_data.items(): - if attr == '_arb_addr': + if attr == '_reg_addrs': # XXX: ``msgspec`` doesn't support serializing tuples # so just cash manually here since it's what our # internals expect. - value = tuple(value) if value else None - self._arb_addr = value + self._reg_addrs = [ + tuple(val) for val in value + ] if value else None else: setattr(self, attr, value) - return chan, accept_addr + return chan, accept_addrs except OSError: # failed to connect log.warning( @@ -1014,8 +1044,8 @@ class Actor: handler_nursery: trio.Nursery, *, # (host, port) to bind for channel server - accept_host: tuple[str, int] | None = None, - accept_port: int = 0, + listen_sockaddrs: list[tuple[str, int]] | None = None, + task_status: TaskStatus[trio.Nursery] = trio.TASK_STATUS_IGNORED, ) -> None: ''' @@ -1025,28 +1055,39 @@ class Actor: ``cancel_server()`` is called. ''' + if listen_sockaddrs is None: + listen_sockaddrs = [(None, 0)] + self._server_down = trio.Event() try: async with trio.open_nursery() as server_n: - listeners: list[trio.abc.Listener] = await server_n.start( - partial( - trio.serve_tcp, - self._stream_handler, - # new connections will stay alive even if this server - # is cancelled - handler_nursery=handler_nursery, - port=accept_port, - host=accept_host, + + for host, port in listen_sockaddrs: + listeners: list[trio.abc.Listener] = await server_n.start( + partial( + trio.serve_tcp, + + handler=self._stream_handler, + port=port, + host=host, + + # NOTE: configured such that new + # connections will stay alive even if + # this server is cancelled! + handler_nursery=handler_nursery, + ) ) - ) - sockets: list[trio.socket] = [ - getattr(listener, 'socket', 'unknown socket') - for listener in listeners - ] - log.runtime( - f'Started tcp server(s) on {sockets}') - self._listeners.extend(listeners) + sockets: list[trio.socket] = [ + getattr(listener, 'socket', 'unknown socket') + for listener in listeners + ] + log.runtime( + f'Started tcp server(s) on {sockets}' + ) + self._listeners.extend(listeners) + task_status.started(server_n) + finally: # signal the server is down since nursery above terminated self._server_down.set() @@ -1226,13 +1267,26 @@ class Actor: self._server_n.cancel_scope.cancel() @property - def accept_addr(self) -> tuple[str, int] | None: + def accept_addrs(self) -> list[tuple[str, int]]: + ''' + All addresses to which the transport-channel server binds + and listens for new connections. + + ''' + # throws OSError on failure + return [ + listener.socket.getsockname() + for listener in self._listeners + ] # type: ignore + + @property + def accept_addr(self) -> tuple[str, int]: ''' Primary address to which the channel server is bound. 
''' # throws OSError on failure - return self._listeners[0].socket.getsockname() # type: ignore + return self.accept_addrs[0] def get_parent(self) -> Portal: ''' @@ -1249,6 +1303,7 @@ class Actor: ''' return self._peers[uid] + # TODO: move to `Channel.handshake(uid)` async def _do_handshake( self, chan: Channel @@ -1278,7 +1333,7 @@ class Actor: async def async_main( actor: Actor, - accept_addr: tuple[str, int] | None = None, + accept_addrs: tuple[str, int] | None = None, # XXX: currently ``parent_addr`` is only needed for the # ``multiprocessing`` backend (which pickles state sent to @@ -1303,20 +1358,25 @@ async def async_main( # on our debugger lock state. _debug.Lock._trio_handler = signal.getsignal(signal.SIGINT) - registered_with_arbiter = False + is_registered: bool = False try: # establish primary connection with immediate parent - actor._parent_chan = None + actor._parent_chan: Channel | None = None if parent_addr is not None: - actor._parent_chan, accept_addr_rent = await actor._from_parent( - parent_addr) + ( + actor._parent_chan, + set_accept_addr_says_rent, + ) = await actor._from_parent(parent_addr) - # either it's passed in because we're not a child - # or because we're running in mp mode - if accept_addr_rent is not None: - accept_addr = accept_addr_rent + # either it's passed in because we're not a child or + # because we're running in mp mode + if ( + set_accept_addr_says_rent + and set_accept_addr_says_rent is not None + ): + accept_addrs = set_accept_addr_says_rent # load exposed/allowed RPC modules # XXX: do this **after** establishing a channel to the parent @@ -1340,38 +1400,62 @@ async def async_main( actor._service_n = service_nursery assert actor._service_n - # Startup up the channel server with, + # Startup up the transport(-channel) server with, # - subactor: the bind address is sent by our parent # over our established channel # - root actor: the ``accept_addr`` passed to this method - assert accept_addr - host, port = accept_addr + assert accept_addrs actor._server_n = await service_nursery.start( partial( actor._serve_forever, service_nursery, - accept_host=host, - accept_port=port + listen_sockaddrs=accept_addrs, ) ) - accept_addr = actor.accept_addr + accept_addrs: list[tuple[str, int]] = actor.accept_addrs + + # NOTE: only set the loopback addr for the + # process-tree-global "root" mailbox since + # all sub-actors should be able to speak to + # their root actor over that channel. if _state._runtime_vars['_is_root']: - _state._runtime_vars['_root_mailbox'] = accept_addr + for addr in accept_addrs: + host, _ = addr + # TODO: generic 'lo' detector predicate + if '127.0.0.1' in host: + _state._runtime_vars['_root_mailbox'] = addr # Register with the arbiter if we're told its addr - log.runtime(f"Registering {actor} for role `{actor.name}`") - assert isinstance(actor._arb_addr, tuple) + log.runtime( + f'Registering `{actor.name}` ->\n' + f'{pformat(accept_addrs)}' + ) - async with get_arbiter(*actor._arb_addr) as arb_portal: - await arb_portal.run_from_ns( - 'self', - 'register_actor', - uid=actor.uid, - sockaddr=accept_addr, - ) + # TODO: ideally we don't fan out to all registrars + # if addresses point to the same actor.. + # So we need a way to detect that? maybe iterate + # only on unique actor uids? 
+ for addr in actor._reg_addrs: + assert isinstance(addr, tuple) + assert addr[1] # non-zero after bind - registered_with_arbiter = True + async with get_registry(*addr) as reg_portal: + for accept_addr in accept_addrs: + + if not accept_addr[1]: + await _debug.pause() + + assert accept_addr[1] + + await reg_portal.run_from_ns( + 'self', + 'register_actor', + uid=actor.uid, + sockaddr=accept_addr, + ) + + is_registered: bool = True # init steps complete task_status.started() @@ -1401,18 +1485,18 @@ async def async_main( log.runtime("Closing all actor lifetime contexts") actor.lifetime_stack.close() - if not registered_with_arbiter: + if not is_registered: # TODO: I guess we could try to connect back # to the parent through a channel and engage a debugger # once we have that all working with std streams locking? log.exception( f"Actor errored and failed to register with arbiter " - f"@ {actor._arb_addr}?") + f"@ {actor._reg_addrs[0]}?") log.error( "\n\n\t^^^ THIS IS PROBABLY A TRACTOR BUGGGGG!!! ^^^\n" "\tCALMLY CALL THE AUTHORITIES AND HIDE YOUR CHILDREN.\n\n" - "\tYOUR PARENT CODE IS GOING TO KEEP WORKING FINE!!!\n" - "\tTHIS IS HOW RELIABlE SYSTEMS ARE SUPPOSED TO WORK!?!?\n" + "\tIf this is a sub-actor likely its parent will keep running " + "\tcorrectly if this error is caught and ignored.." ) if actor._parent_chan: @@ -1447,29 +1531,35 @@ async def async_main( actor.lifetime_stack.close() - # Unregister actor from the arbiter + # Unregister actor from the registry if ( - registered_with_arbiter - and not actor.is_arbiter + is_registered + and not actor.is_registrar ): - failed = False - assert isinstance(actor._arb_addr, tuple) - with trio.move_on_after(0.5) as cs: - cs.shield = True - try: - async with get_arbiter(*actor._arb_addr) as arb_portal: - await arb_portal.run_from_ns( - 'self', - 'unregister_actor', - uid=actor.uid - ) - except OSError: + failed: bool = False + for addr in actor._reg_addrs: + assert isinstance(addr, tuple) + with trio.move_on_after(0.5) as cs: + cs.shield = True + try: + async with get_registry( + *addr, + ) as reg_portal: + await reg_portal.run_from_ns( + 'self', + 'unregister_actor', + uid=actor.uid + ) + except OSError: + failed = True + if cs.cancelled_caught: failed = True - if cs.cancelled_caught: - failed = True - if failed: - log.warning( - f"Failed to unregister {actor.name} from arbiter") + + if failed: + log.warning( + f'Failed to unregister {actor.name} from ' + f'registar @ {addr}' + ) # Ensure all peers (actors connected to us as clients) are finished if not actor._no_more_peers.is_set(): @@ -1761,18 +1851,36 @@ async def process_messages( class Arbiter(Actor): ''' - A special actor who knows all the other actors and always has - access to a top level nursery. + A special registrar actor who can contact all other actors + within its immediate process tree and possibly keeps a registry + of others meant to be discoverable in a distributed + application. Normally the registrar is also the "root actor" + and thus always has access to the top-most-level actor + (process) nursery. - The arbiter is by default the first actor spawned on each host - and is responsible for keeping track of all other actors for - coordination purposes. If a new main process is launched and an - arbiter is already running that arbiter will be used. + By default, the registrar is always initialized when and if no + other registrar socket addrs have been specified to runtime + init entry-points (such as `open_root_actor()` or + `open_nursery()`). 
Any time a new main process is launched (and + thus thus a new root actor created) and, no existing registrar + can be contacted at the provided `registry_addr`, then a new + one is always created; however, if one can be reached it is + used. + + Normally a distributed app requires at least registrar per + logical host where for that given "host space" (aka localhost + IPC domain of addresses) it is responsible for making all other + host (local address) bound actors *discoverable* to external + actor trees running on remote hosts. ''' is_arbiter = True - def __init__(self, *args, **kwargs) -> None: + def __init__( + self, + *args, + **kwargs, + ) -> None: self._registry: dict[ tuple[str, str], @@ -1814,7 +1922,10 @@ class Arbiter(Actor): # unpacker since we have tuples as keys (not this makes the # arbiter suscetible to hashdos): # https://github.com/msgpack/msgpack-python#major-breaking-changes-in-msgpack-10 - return {'.'.join(key): val for key, val in self._registry.items()} + return { + '.'.join(key): val + for key, val in self._registry.items() + } async def wait_for_actor( self, @@ -1852,8 +1963,15 @@ class Arbiter(Actor): sockaddr: tuple[str, int] ) -> None: - uid = name, _ = (str(uid[0]), str(uid[1])) - self._registry[uid] = (str(sockaddr[0]), int(sockaddr[1])) + uid = name, hash = (str(uid[0]), str(uid[1])) + addr = (host, port) = ( + str(sockaddr[0]), + int(sockaddr[1]), + ) + if port == 0: + await _debug.pause() + assert port # should never be 0-dynamic-os-alloc + self._registry[uid] = addr # pop and signal all waiter events events = self._waiters.pop(name, []) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index db465421..985b8107 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -294,7 +294,7 @@ async def new_proc( errors: dict[tuple[str, str], Exception], # passed through to actor main - bind_addr: tuple[str, int], + bind_addrs: list[tuple[str, int]], parent_addr: tuple[str, int], _runtime_vars: dict[str, Any], # serialized and sent to _child @@ -316,7 +316,7 @@ async def new_proc( actor_nursery, subactor, errors, - bind_addr, + bind_addrs, parent_addr, _runtime_vars, # run time vars infect_asyncio=infect_asyncio, @@ -331,7 +331,7 @@ async def trio_proc( errors: dict[tuple[str, str], Exception], # passed through to actor main - bind_addr: tuple[str, int], + bind_addrs: list[tuple[str, int]], parent_addr: tuple[str, int], _runtime_vars: dict[str, Any], # serialized and sent to _child *, @@ -417,12 +417,11 @@ async def trio_proc( # send additional init params await chan.send({ - "_parent_main_data": subactor._parent_main_data, - "enable_modules": subactor.enable_modules, - "_arb_addr": subactor._arb_addr, - "bind_host": bind_addr[0], - "bind_port": bind_addr[1], - "_runtime_vars": _runtime_vars, + '_parent_main_data': subactor._parent_main_data, + 'enable_modules': subactor.enable_modules, + '_reg_addrs': subactor._reg_addrs, + 'bind_addrs': bind_addrs, + '_runtime_vars': _runtime_vars, }) # track subactor in current nursery @@ -509,7 +508,7 @@ async def mp_proc( subactor: Actor, errors: dict[tuple[str, str], Exception], # passed through to actor main - bind_addr: tuple[str, int], + bind_addrs: list[tuple[str, int]], parent_addr: tuple[str, int], _runtime_vars: dict[str, Any], # serialized and sent to _child *, @@ -567,7 +566,7 @@ async def mp_proc( target=_mp_main, args=( subactor, - bind_addr, + bind_addrs, fs_info, _spawn_method, parent_addr, diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 7f77784b..e8599fd7 100644 --- a/tractor/_supervise.py +++ 
b/tractor/_supervise.py @@ -21,10 +21,7 @@ from contextlib import asynccontextmanager as acm from functools import partial import inspect -from typing import ( - Optional, - TYPE_CHECKING, -) +from typing import TYPE_CHECKING import typing import warnings @@ -94,7 +91,7 @@ class ActorNursery: tuple[ Actor, trio.Process | mp.Process, - Optional[Portal], + Portal | None, ] ] = {} # portals spawned with ``run_in_actor()`` are @@ -110,12 +107,12 @@ class ActorNursery: self, name: str, *, - bind_addr: tuple[str, int] = _default_bind_addr, + bind_addrs: list[tuple[str, int]] = [_default_bind_addr], rpc_module_paths: list[str] | None = None, enable_modules: list[str] | None = None, loglevel: str | None = None, # set log level per subactor nursery: trio.Nursery | None = None, - debug_mode: Optional[bool] | None = None, + debug_mode: bool | None = None, infect_asyncio: bool = False, ) -> Portal: ''' @@ -150,7 +147,9 @@ class ActorNursery: # modules allowed to invoked funcs from enable_modules=enable_modules, loglevel=loglevel, - arbiter_addr=current_actor()._arb_addr, + + # verbatim relay this actor's registrar addresses + registry_addrs=current_actor()._reg_addrs, ) parent_addr = self._actor.accept_addr assert parent_addr @@ -167,7 +166,7 @@ class ActorNursery: self, subactor, self.errors, - bind_addr, + bind_addrs, parent_addr, _rtv, # run time vars infect_asyncio=infect_asyncio, @@ -180,8 +179,8 @@ class ActorNursery: fn: typing.Callable, *, - name: Optional[str] = None, - bind_addr: tuple[str, int] = _default_bind_addr, + name: str | None = None, + bind_addrs: tuple[str, int] = [_default_bind_addr], rpc_module_paths: list[str] | None = None, enable_modules: list[str] | None = None, loglevel: str | None = None, # set log level per subactor @@ -208,7 +207,7 @@ class ActorNursery: enable_modules=[mod_path] + ( enable_modules or rpc_module_paths or [] ), - bind_addr=bind_addr, + bind_addrs=bind_addrs, loglevel=loglevel, # use the run_in_actor nursery nursery=self._ria_nursery, -- 2.34.1 From fa9a9cfb1d724976f7aa72e5b078c4b5ab4ba70c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 28 Sep 2023 14:14:50 -0400 Subject: [PATCH 028/378] Kick off `.devx` subpkg for our dev tools B) Where `.devx` is "developer experience", a hopefully broad enough subpkg name for all the slick stuff planned to augment working on the actor runtime :boom: Move the `._debug` module into the new subpkg and adjust rest of core code base to reflect import path change. Also add a new `.devx._debug.open_crash_handler()` manager for wrapping any sync code outside a `trio.run()` which is handy for eventual CLI addons for popular frameworks like `click`/`typer`. 
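For reference, a minimal usage sketch of the new manager (the
`cli_entrypoint()` name and the `main()` coroutine below are assumptions
for illustration only, not something added by this patch):

    # hypothetical `click`/`typer`-style sync CLI entrypoint;
    # `main()` is an assumed user coroutine.
    import trio
    import tractor
    from tractor.devx import open_crash_handler

    async def main() -> None:
        async with tractor.open_nursery(debug_mode=True) as an:
            ...

    def cli_entrypoint() -> None:
        # any error raised in the sync setup code (or bubbling out of
        # `trio.run()`) first opens a `pdbp` post-mortem REPL and is
        # then re-raised as usual.
        with open_crash_handler():
            trio.run(main)
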
--- tractor/__init__.py | 2 +- tractor/_context.py | 6 ++--- tractor/_portal.py | 4 +-- tractor/_root.py | 6 ++--- tractor/_runtime.py | 5 ++-- tractor/_spawn.py | 2 +- tractor/_supervise.py | 2 +- tractor/devx/__init__.py | 45 +++++++++++++++++++++++++++++++++ tractor/{ => devx}/_debug.py | 49 ++++++++++++++++++++++++++++++------ 9 files changed, 99 insertions(+), 22 deletions(-) create mode 100644 tractor/devx/__init__.py rename tractor/{ => devx}/_debug.py (96%) diff --git a/tractor/__init__.py b/tractor/__init__.py index c653ec05..149d4d2c 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -45,7 +45,7 @@ from ._exceptions import ( ModuleNotExposed, ContextCancelled, ) -from ._debug import ( +from .devx import ( breakpoint, pause, pause_from_sync, diff --git a/tractor/_context.py b/tractor/_context.py index 054f7832..e35188cd 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -222,7 +222,7 @@ class Context: ) if self._cancel_called: - # from ._debug import breakpoint + # from .devx._debug import breakpoint # await breakpoint() # this is an expected cancel request response message @@ -247,7 +247,7 @@ class Context: self._scope.cancel() # NOTE: this usage actually works here B) - # from ._debug import breakpoint + # from .devx._debug import breakpoint # await breakpoint() # XXX: this will break early callee results sending @@ -277,7 +277,7 @@ class Context: log.cancel(f'Cancelling {side} side of context to {self.chan.uid}') self._cancel_called = True - # await _debug.breakpoint() + # await devx._debug.breakpoint() # breakpoint() if side == 'caller': diff --git a/tractor/_portal.py b/tractor/_portal.py index 53684b42..9016eda9 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -482,7 +482,7 @@ class Portal: # were initiated by *this* side's task. if not ctx._cancel_called: # XXX: this should NEVER happen! - # from ._debug import breakpoint + # from .devx._debug import breakpoint # await breakpoint() raise @@ -564,7 +564,7 @@ class Portal: # a "stop" msg for a stream), this can result in a deadlock # where the root is waiting on the lock to clear but the # child has already cleared it and clobbered IPC. - from ._debug import maybe_wait_for_debugger + from .devx._debug import maybe_wait_for_debugger await maybe_wait_for_debugger() # remove the context from runtime tracking diff --git a/tractor/_root.py b/tractor/_root.py index f64aa69e..b117c2c9 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -37,7 +37,7 @@ from ._runtime import ( Arbiter, async_main, ) -from . import _debug +from .devx import _debug from . import _spawn from . import _state from . import log @@ -89,7 +89,7 @@ async def open_root_actor( # https://github.com/python-trio/trio/issues/1155#issuecomment-742964018 builtin_bp_handler = sys.breakpointhook orig_bp_path: str | None = os.environ.get('PYTHONBREAKPOINT', None) - os.environ['PYTHONBREAKPOINT'] = 'tractor._debug.pause_from_sync' + os.environ['PYTHONBREAKPOINT'] = 'tractor.devx._debug.pause_from_sync' # attempt to retreive ``trio``'s sigint handler and stash it # on our debugger lock state. @@ -137,7 +137,7 @@ async def open_root_actor( # expose internal debug module to every actor allowing # for use of ``await tractor.breakpoint()`` - enable_modules.append('tractor._debug') + enable_modules.append('tractor.devx._debug') # if debug mode get's enabled *at least* use that level of # logging for some informative console prompts. 
diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 16f105cf..0f8cc7ae 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -58,7 +58,7 @@ from ._exceptions import ( ContextCancelled, TransportClosed, ) -from . import _debug +from .devx import _debug from ._discovery import get_registry from ._portal import Portal from . import _state @@ -264,7 +264,6 @@ async def _invoke( cs: trio.CancelScope = ctx._scope if cs.cancel_called: canceller = ctx._cancelled_remote - # await _debug.breakpoint() # NOTE / TODO: if we end up having # ``Actor._cancel_task()`` call @@ -536,7 +535,7 @@ class Actor: self._parent_main_data = _mp_fixup_main._mp_figure_out_main() # always include debugging tools module - enable_modules.append('tractor._debug') + enable_modules.append('tractor.devx._debug') self.enable_modules: dict[str, str] = {} for name in enable_modules: diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 985b8107..9c618557 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -35,7 +35,7 @@ from exceptiongroup import BaseExceptionGroup import trio from trio_typing import TaskStatus -from ._debug import ( +from .devx._debug import ( maybe_wait_for_debugger, acquire_debug_lock, ) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index e8599fd7..7851d9fb 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -28,7 +28,7 @@ import warnings from exceptiongroup import BaseExceptionGroup import trio -from ._debug import maybe_wait_for_debugger +from .devx._debug import maybe_wait_for_debugger from ._state import current_actor, is_main_process from .log import get_logger, get_loglevel from ._runtime import Actor diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py new file mode 100644 index 00000000..e24405a0 --- /dev/null +++ b/tractor/devx/__init__.py @@ -0,0 +1,45 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +""" +Runtime "developer experience" utils and addons to aid our +(advanced) users and core devs in building distributed applications +and working with/on the actor runtime. 
+ +""" +from ._debug import ( + maybe_wait_for_debugger, + acquire_debug_lock, + breakpoint, + pause, + pause_from_sync, + shield_sigint_handler, + MultiActorPdb, + open_crash_handler, + post_mortem, +) + +__all__ = [ + 'maybe_wait_for_debugger', + 'acquire_debug_lock', + 'breakpoint', + 'pause', + 'pause_from_sync', + 'shield_sigint_handler', + 'MultiActorPdb', + 'open_crash_handler', + 'post_mortem', +] diff --git a/tractor/_debug.py b/tractor/devx/_debug.py similarity index 96% rename from tractor/_debug.py rename to tractor/devx/_debug.py index d5f5f4f1..6575c223 100644 --- a/tractor/_debug.py +++ b/tractor/devx/_debug.py @@ -28,6 +28,7 @@ from functools import ( cached_property, ) from contextlib import asynccontextmanager as acm +from contextlib import contextmanager as cm from typing import ( Any, Callable, @@ -44,22 +45,25 @@ from trio_typing import ( # Task, ) -from .log import get_logger -from ._discovery import get_root -from ._state import ( +from ..log import get_logger +from .._discovery import get_root +from .._state import ( is_root_process, debug_mode, ) -from ._exceptions import ( +from .._exceptions import ( is_multi_cancelled, ContextCancelled, ) -from ._ipc import Channel +from .._ipc import Channel log = get_logger(__name__) -__all__ = ['breakpoint', 'post_mortem'] +__all__ = [ + 'breakpoint', + 'post_mortem', +] class Lock: @@ -390,7 +394,7 @@ async def wait_for_parent_stdin_hijack( # this syncs to child's ``Context.started()`` call. async with portal.open_context( - tractor._debug.lock_tty_for_child, + lock_tty_for_child, subactor_uid=actor_uid, ) as (ctx, val): @@ -855,7 +859,7 @@ pause = partial( _pause, _set_trace, ) -pp = pause # short-hand for "pause point" +# pp = pause # short-hand for "pause point" async def breakpoint(**kwargs): @@ -1008,3 +1012,32 @@ async def maybe_wait_for_debugger( log.debug( 'Root acquired TTY LOCK' ) + + +# TODO: better naming and what additionals? +# - optional runtime plugging? +# - detection for sync vs. async code? +# - specialized REPL entry when in distributed mode? +@cm +def open_crash_handler( + catch: set[BaseException] = { + Exception, + BaseException, + } +): + ''' + Generic "post mortem" crash handler using `pdbp` REPL debugger. + + We expose this as a CLI framework addon to both `click` and + `typer` users so they can quickly wrap cmd endpoints which get + automatically wrapped to use the runtime's `debug_mode: bool` + AND `pdbp.pm()` around any code that is PRE-runtime entry + - any sync code which runs BEFORE the main call to + `trio.run()`. + + ''' + try: + yield + except tuple(catch): + pdbp.xpm() + raise -- 2.34.1 From 7bed470f5cefa7bc03471de14f971135fb96058f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 28 Sep 2023 15:36:24 -0400 Subject: [PATCH 029/378] Start `.devx.cli` extensions for pop CLI frameworks Starting of with just a `typer` (and thus transitively `click`) `typer.Typer.callback` hook which allows passthrough of the `--ll ` and `--pdb ` flags for use when building CLIs that use the runtime Bo Still needs lotsa refinement and obviously better docs but, the doc string for `load_runtime_vars()` shows how to use the underlying `.devx._debug.open_crash_handler()` via a wrapper that can be passed the `--pdb` flag and then enable debug mode throughout the entire actor system. 
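The docstring example in the diff below passes the collected flags into
a user-defined `my_tractor_main_task_func()`; a sketch of what that
receiving side might look like (the function body and the sub-actor
name are assumptions, not part of this patch):

    # sketch: thread the CLI-collected `--pdb`/`--loglevel` values
    # into the actor runtime so they apply to the whole actor tree.
    import tractor

    async def my_tractor_main_task_func(
        debug_mode: bool = False,
        loglevel: str = 'cancel',
    ) -> None:
        async with tractor.open_nursery(
            debug_mode=debug_mode,
            loglevel=loglevel,
        ) as an:
            portal = await an.start_actor(
                'worker',  # assumed sub-actor name
                enable_modules=[__name__],
            )
            # ... do distributed work ...
            await portal.cancel_actor()
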
--- tractor/devx/_debug.py | 6 +- tractor/devx/cli.py | 149 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 tractor/devx/cli.py diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 6575c223..eef5c843 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -27,8 +27,10 @@ from functools import ( partial, cached_property, ) -from contextlib import asynccontextmanager as acm -from contextlib import contextmanager as cm +from contextlib import ( + asynccontextmanager as acm, + contextmanager as cm, +) from typing import ( Any, Callable, diff --git a/tractor/devx/cli.py b/tractor/devx/cli.py new file mode 100644 index 00000000..353389da --- /dev/null +++ b/tractor/devx/cli.py @@ -0,0 +1,149 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +""" +CLI framework extensions for hacking on the actor runtime. + +Currently popular frameworks supported are: + + - `typer` via the `@callback` API + +""" +from __future__ import annotations +from contextlib import ( + # asynccontextmanager as acm, + nullcontext, + contextmanager as cm, +) +from typing import ( + Any, + Callable, +) +from typing_extensions import Annotated + +import typer + + +from ._debug import open_crash_handler + + +_runtime_vars: dict[str, Any] = {} + + +def load_runtime_vars( + ctx: typer.Context, + callback: Callable, + pdb: bool = False, # --pdb + ll: Annotated[ + str, + typer.Option( + '--loglevel', + '-l', + help='BigD logging level', + ), + ] = 'cancel', # -l info +): + ''' + Maybe engage crash handling with `pdbp` when code inside + a `typer` CLI endpoint cmd raises. + + To use this callback simply take your `app = typer.Typer()` instance + and decorate this function with it like so: + + .. code:: python + + from tractor.devx import cli + + app = typer.Typer() + + # manual decoration to hook into `click`'s context system! + cli.load_runtime_vars = app.callback( + invoke_without_command=True, + ) + + And then you can use the now augmented `click` CLI context as so, + + .. code:: python + + @app.command( + context_settings={ + "allow_extra_args": True, + "ignore_unknown_options": True, + } + ) + def my_cli_cmd( + ctx: typer.Context, + ): + rtvars: dict = ctx.runtime_vars + pdb: bool = rtvars['pdb'] + + with tractor.devx.cli.maybe_open_crash_handler(pdb=pdb): + trio.run( + partial( + my_tractor_main_task_func, + debug_mode=pdb, + loglevel=rtvars['ll'], + ) + ) + + which will enable log level and debug mode globally for the entire + `tractor` + `trio` runtime thereafter! + + Bo + + ''' + global _runtime_vars + _runtime_vars |= { + 'pdb': pdb, + 'll': ll, + } + + ctx.runtime_vars: dict[str, Any] = _runtime_vars + print( + f'`typer` sub-cmd: {ctx.invoked_subcommand}\n' + f'`tractor` runtime vars: {_runtime_vars}' + ) + + # XXX NOTE XXX: hackzone.. 
if no sub-cmd is specified (the + # default if the user just invokes `bigd`) then we simply + # invoke the sole `_bigd()` cmd passing in the "parent" + # typer.Context directly to that call since we're treating it + # as a "non sub-command" or wtv.. + # TODO: ideally typer would have some kinda built-in way to get + # this behaviour without having to construct and manually + # invoke our own cmd.. + if ( + ctx.invoked_subcommand is None + or ctx.invoked_subcommand == callback.__name__ + ): + cmd: typer.core.TyperCommand = typer.core.TyperCommand( + name='bigd', + callback=callback, + ) + ctx.params = {'ctx': ctx} + cmd.invoke(ctx) + + +@cm +def maybe_open_crash_handler(pdb: bool = False): + # if the --pdb flag is passed always engage + # the pdb REPL on any crashes B) + rtctx = nullcontext + if pdb: + rtctx = open_crash_handler + + with rtctx(): + yield -- 2.34.1 From de89e3a9c4123218c64be5f9e1584f22a43801f7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 29 Sep 2023 14:11:31 -0400 Subject: [PATCH 030/378] Add libp2p style "multi-address" parser from `piker` Details are in the module docs; this is a first draft with lotsa room for refinement and extension. --- tractor/_multiaddr.py | 142 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 tractor/_multiaddr.py diff --git a/tractor/_multiaddr.py b/tractor/_multiaddr.py new file mode 100644 index 00000000..f6b37a35 --- /dev/null +++ b/tractor/_multiaddr.py @@ -0,0 +1,142 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Multiaddress parser and utils according the spec(s) defined by +`libp2p` and used in dependent project such as `ipfs`: + +- https://docs.libp2p.io/concepts/fundamentals/addressing/ +- https://github.com/libp2p/specs/blob/master/addressing/README.md + +''' +from typing import Iterator + +from bidict import bidict + +# TODO: see if we can leverage libp2p ecosys projects instead of +# rolling our own (parser) impls of the above addressing specs: +# - https://github.com/libp2p/py-libp2p +# - https://docs.libp2p.io/concepts/nat/circuit-relay/#relay-addresses +# prots: bidict[int, str] = bidict({ +prots: bidict[int, str] = { + 'ipv4': 3, + 'ipv6': 3, + 'wg': 3, + + 'tcp': 4, + 'udp': 4, + + # TODO: support the next-gen shite Bo + # 'quic': 4, + # 'ssh': 7, # via rsyscall bootstrapping +} + +prot_params: dict[str, tuple[str]] = { + 'ipv4': ('addr',), + 'ipv6': ('addr',), + 'wg': ('addr', 'port', 'pubkey'), + + 'tcp': ('port',), + 'udp': ('port',), + + # 'quic': ('port',), + # 'ssh': ('port',), +} + + +def iter_prot_layers( + multiaddr: str, +) -> Iterator[ + tuple[ + int, + list[str] + ] +]: + ''' + Unpack a libp2p style "multiaddress" into multiple "segments" + for each "layer" of the protocoll stack (in OSI terms). 
+ + ''' + tokens: list[str] = multiaddr.split('/') + root, tokens = tokens[0], tokens[1:] + assert not root # there is a root '/' on LHS + itokens = iter(tokens) + + prot: str | None = None + params: list[str] = [] + for token in itokens: + # every prot path should start with a known + # key-str. + if token in prots: + if prot is None: + prot: str = token + else: + yield prot, params + prot = token + + params = [] + + elif token not in prots: + params.append(token) + + else: + yield prot, params + + +def parse_addr( + multiaddr: str, +) -> dict[str, str | int | dict]: + ''' + Parse a libp2p style "multiaddress" into it's distinct protocol + segments where each segment: + + `..////../` + + is loaded into a layers `dict[str, dict[str, Any]` which holds + each prot segment of the path as a separate entry sortable by + it's approx OSI "layer number". + + Any `paramN` in the path must be distinctly defined in order + according to the (global) `prot_params` table in this module. + + ''' + layers: dict[str, str | int | dict] = {} + for ( + prot_key, + params, + ) in iter_prot_layers(multiaddr): + + layer: int = prots[prot_key] # OSI layer used for sorting + ep: dict[str, int | str] = {'layer': layer} + layers[prot_key] = ep + + # TODO; validation and resolving of names: + # - each param via a validator provided as part of the + # prot_params def? (also see `"port"` case below..) + # - do a resolv step that will check addrs against + # any loaded network.resolv: dict[str, str] + rparams: list = list(reversed(params)) + for key in prot_params[prot_key]: + val: str | int = rparams.pop() + + # TODO: UGHH, dunno what we should do for validation + # here, put it in the params spec somehow? + if key == 'port': + val = int(val) + + ep[key] = val + + return layers -- 2.34.1 From 86da79a8547d7037a44880cb713614b837a343a1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 29 Sep 2023 14:49:18 -0400 Subject: [PATCH 031/378] Rename to `parse_maddr()` and fill out doc strings --- tractor/_multiaddr.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tractor/_multiaddr.py b/tractor/_multiaddr.py index f6b37a35..d0f562c0 100644 --- a/tractor/_multiaddr.py +++ b/tractor/_multiaddr.py @@ -96,21 +96,30 @@ def iter_prot_layers( yield prot, params -def parse_addr( +def parse_maddr( multiaddr: str, ) -> dict[str, str | int | dict]: ''' Parse a libp2p style "multiaddress" into it's distinct protocol - segments where each segment: + segments where each segment is of the form: `..////../` - is loaded into a layers `dict[str, dict[str, Any]` which holds - each prot segment of the path as a separate entry sortable by - it's approx OSI "layer number". + and is loaded into a (order preserving) `layers: dict[str, + dict[str, Any]` which holds each protocol-layer-segment of the + original `str` path as a separate entry according to its approx + OSI "layer number". - Any `paramN` in the path must be distinctly defined in order - according to the (global) `prot_params` table in this module. + Any `paramN` in the path must be distinctly defined by a str-token in the + (module global) `prot_params` table. + + For eg. 
for wireguard which requires an address, port number and publickey + the protocol params are specified as the entry: + + 'wg': ('addr', 'port', 'pubkey'), + + and are thus parsed from a maddr in that order: + `'/wg/1.1.1.1/51820/'` ''' layers: dict[str, str | int | dict] = {} -- 2.34.1 From e94f1261b5b28f13f217bb7417f2eb8fbe7f14fb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 2 Oct 2023 18:10:34 -0400 Subject: [PATCH 032/378] Move `maybe_open_crash_handler()` CLI `--pdb`-driven wrapper to debug mod --- tractor/devx/__init__.py | 2 ++ tractor/devx/_debug.py | 18 ++++++++++++++++++ tractor/devx/cli.py | 13 ------------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index e24405a0..89b9a336 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -29,6 +29,7 @@ from ._debug import ( shield_sigint_handler, MultiActorPdb, open_crash_handler, + maybe_open_crash_handler, post_mortem, ) @@ -41,5 +42,6 @@ __all__ = [ 'shield_sigint_handler', 'MultiActorPdb', 'open_crash_handler', + 'maybe_open_crash_handler', 'post_mortem', ] diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index eef5c843..24baba06 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -30,6 +30,7 @@ from functools import ( from contextlib import ( asynccontextmanager as acm, contextmanager as cm, + nullcontext, ) from typing import ( Any, @@ -1043,3 +1044,20 @@ def open_crash_handler( except tuple(catch): pdbp.xpm() raise + + +@cm +def maybe_open_crash_handler(pdb: bool = False): + ''' + Same as `open_crash_handler()` but with bool input flag + to allow conditional handling. + + Normally this is used with CLI endpoints such that if the --pdb + flag is passed the pdb REPL is engaed on any crashes B) + ''' + rtctx = nullcontext + if pdb: + rtctx = open_crash_handler + + with rtctx(): + yield diff --git a/tractor/devx/cli.py b/tractor/devx/cli.py index 353389da..76890669 100644 --- a/tractor/devx/cli.py +++ b/tractor/devx/cli.py @@ -25,7 +25,6 @@ Currently popular frameworks supported are: from __future__ import annotations from contextlib import ( # asynccontextmanager as acm, - nullcontext, contextmanager as cm, ) from typing import ( @@ -135,15 +134,3 @@ def load_runtime_vars( ) ctx.params = {'ctx': ctx} cmd.invoke(ctx) - - -@cm -def maybe_open_crash_handler(pdb: bool = False): - # if the --pdb flag is passed always engage - # the pdb REPL on any crashes B) - rtctx = nullcontext - if pdb: - rtctx = open_crash_handler - - with rtctx(): - yield -- 2.34.1 From 4314a59327ad7425f1d990d3e006dcf8453db9b1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 3 Oct 2023 10:54:46 -0400 Subject: [PATCH 033/378] Add post-mortem catch around failed transport addr binds to aid with runtime debugging --- tractor/_root.py | 12 ++++++++---- tractor/_runtime.py | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index b117c2c9..bf2f883e 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -260,10 +260,14 @@ async def open_root_actor( # start the actor runtime in a new task async with trio.open_nursery() as nursery: - # ``_runtime.async_main()`` creates an internal nursery and - # thus blocks here until the entire underlying actor tree has - # terminated thereby conducting structured concurrency. 
- + # ``_runtime.async_main()`` creates an internal nursery + # and blocks here until any underlying actor(-process) + # tree has terminated thereby conducting so called + # "end-to-end" structured concurrency throughout an + # entire hierarchical python sub-process set; all + # "actor runtime" primitives are SC-compat and thus all + # transitively spawned actors/processes must be as + # well. await nursery.start( partial( async_main, diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 0f8cc7ae..bd626440 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -1405,13 +1405,22 @@ async def async_main( # - root actor: the ``accept_addr`` passed to this method assert accept_addrs - actor._server_n = await service_nursery.start( - partial( - actor._serve_forever, - service_nursery, - listen_sockaddrs=accept_addrs, + try: + actor._server_n = await service_nursery.start( + partial( + actor._serve_forever, + service_nursery, + listen_sockaddrs=accept_addrs, + ) ) - ) + except OSError as oserr: + # NOTE: always allow runtime hackers to debug + # tranport address bind errors - normally it's + # something silly like the wrong socket-address + # passed via a config or CLI Bo + entered_debug = await _debug._maybe_enter_pm(oserr) + raise + accept_addrs: list[tuple[str, int]] = actor.accept_addrs # NOTE: only set the loopback addr for the -- 2.34.1 From 78c0d2b234b45d9c7b20372f36179ffb5db9fbe8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 5 Oct 2023 19:45:46 -0400 Subject: [PATCH 034/378] Start inter-peer cancellation test mod Move over relevant test from the "context semantics" test module which was already verifying peer-caused-`ContextCancelled.canceller: tuple` error info and propagation during an inter-peer cancellation scenario. Also begin a more general set of inter-peer cancellation tests starting with the simplest case where when a peer is cancelled the parent should NOT get an "muted" `trio.Cancelled` and instead a `tractor.ContextCancelled` with a `.canceller: tuple` which points to the sibling actor which requested the peer cancel. --- tests/test_context_stream_semantics.py | 94 +---------- tests/test_inter_peer_cancellation.py | 209 +++++++++++++++++++++++++ 2 files changed, 211 insertions(+), 92 deletions(-) create mode 100644 tests/test_inter_peer_cancellation.py diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 4efc6319..a0d291d7 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -1,8 +1,8 @@ ''' ``async with ():`` inlined context-stream cancellation testing. -Verify the we raise errors when streams are opened prior to sync-opening -a ``tractor.Context`` beforehand. +Verify the we raise errors when streams are opened prior to +sync-opening a ``tractor.Context`` beforehand. ''' from contextlib import asynccontextmanager as acm @@ -922,93 +922,3 @@ def test_maybe_allow_overruns_stream( # if this hits the logic blocks from above are not # exhaustive.. 
pytest.fail('PARAMETRIZED CASE GEN PROBLEM YO') - - -@tractor.context -async def sleep_forever( - ctx: tractor.Context, -) -> None: - await ctx.started() - async with ctx.open_stream(): - await trio.sleep_forever() - - -@acm -async def attach_to_sleep_forever(): - ''' - Cancel a context **before** any underlying error is raised in order - to trigger a local reception of a ``ContextCancelled`` which **should not** - be re-raised in the local surrounding ``Context`` *iff* the cancel was - requested by **this** side of the context. - - ''' - async with tractor.wait_for_actor('sleeper') as p2: - async with ( - p2.open_context(sleep_forever) as (peer_ctx, first), - peer_ctx.open_stream(), - ): - try: - yield - finally: - # XXX: previously this would trigger local - # ``ContextCancelled`` to be received and raised in the - # local context overriding any local error due to logic - # inside ``_invoke()`` which checked for an error set on - # ``Context._error`` and raised it in a cancellation - # scenario. - # ------ - # The problem is you can have a remote cancellation that - # is part of a local error and we shouldn't raise - # ``ContextCancelled`` **iff** we **were not** the side - # of the context to initiate it, i.e. - # ``Context._cancel_called`` should **NOT** have been - # set. The special logic to handle this case is now - # inside ``Context._maybe_raise_from_remote_msg()`` XD - await peer_ctx.cancel() - - -@tractor.context -async def error_before_started( - ctx: tractor.Context, -) -> None: - ''' - This simulates exactly an original bug discovered in: - https://github.com/pikers/piker/issues/244 - - ''' - async with attach_to_sleep_forever(): - # send an unserializable type which should raise a type error - # here and **NOT BE SWALLOWED** by the surrounding acm!!?! - await ctx.started(object()) - - -def test_do_not_swallow_error_before_started_by_remote_contextcancelled(): - ''' - Verify that an error raised in a remote context which itself opens - another remote context, which it cancels, does not ovverride the - original error that caused the cancellation of the secondardy - context. - - ''' - async def main(): - async with tractor.open_nursery() as n: - portal = await n.start_actor( - 'errorer', - enable_modules=[__name__], - ) - await n.start_actor( - 'sleeper', - enable_modules=[__name__], - ) - - async with ( - portal.open_context( - error_before_started - ) as (ctx, sent), - ): - await trio.sleep_forever() - - with pytest.raises(tractor.RemoteActorError) as excinfo: - trio.run(main) - - assert excinfo.value.type == TypeError diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py new file mode 100644 index 00000000..bf93372b --- /dev/null +++ b/tests/test_inter_peer_cancellation.py @@ -0,0 +1,209 @@ +''' +Codify the cancellation request semantics in terms +of one remote actor cancelling another. + +''' +from contextlib import asynccontextmanager as acm + +import pytest +import trio +import tractor +from tractor._exceptions import ( + StreamOverrun, + ContextCancelled, +) + + +def test_self_cancel(): + ''' + 2 cases: + - calls `Actor.cancel()` locally in some task + - calls LocalPortal.cancel_actor()` ? + + ''' + ... + + +@tractor.context +async def sleep_forever( + ctx: tractor.Context, +) -> None: + ''' + Sync the context, open a stream then just sleep. 
+ + ''' + await ctx.started() + async with ctx.open_stream(): + await trio.sleep_forever() + + +@acm +async def attach_to_sleep_forever(): + ''' + Cancel a context **before** any underlying error is raised in order + to trigger a local reception of a ``ContextCancelled`` which **should not** + be re-raised in the local surrounding ``Context`` *iff* the cancel was + requested by **this** side of the context. + + ''' + async with tractor.wait_for_actor('sleeper') as p2: + async with ( + p2.open_context(sleep_forever) as (peer_ctx, first), + peer_ctx.open_stream(), + ): + try: + yield + finally: + # XXX: previously this would trigger local + # ``ContextCancelled`` to be received and raised in the + # local context overriding any local error due to logic + # inside ``_invoke()`` which checked for an error set on + # ``Context._error`` and raised it in a cancellation + # scenario. + # ------ + # The problem is you can have a remote cancellation that + # is part of a local error and we shouldn't raise + # ``ContextCancelled`` **iff** we **were not** the side + # of the context to initiate it, i.e. + # ``Context._cancel_called`` should **NOT** have been + # set. The special logic to handle this case is now + # inside ``Context._maybe_raise_from_remote_msg()`` XD + await peer_ctx.cancel() + + +@tractor.context +async def error_before_started( + ctx: tractor.Context, +) -> None: + ''' + This simulates exactly an original bug discovered in: + https://github.com/pikers/piker/issues/244 + + ''' + async with attach_to_sleep_forever(): + + # XXX NOTE XXX: THIS sends an UNSERIALIZABLE TYPE which + # should raise a `TypeError` and **NOT BE SWALLOWED** by + # the surrounding acm!!?! + await ctx.started(object()) + + +def test_do_not_swallow_error_before_started_by_remote_contextcancelled(): + ''' + Verify that an error raised in a remote context which itself + opens YET ANOTHER remote context, which it then cancels, does not + override the original error that caused the cancellation of the + secondary context. + + ''' + async def main(): + async with tractor.open_nursery() as n: + portal = await n.start_actor( + 'errorer', + enable_modules=[__name__], + ) + await n.start_actor( + 'sleeper', + enable_modules=[__name__], + ) + + async with ( + portal.open_context( + error_before_started + ) as (ctx, sent), + ): + await trio.sleep_forever() + + with pytest.raises(tractor.RemoteActorError) as excinfo: + trio.run(main) + + assert excinfo.value.type == TypeError + + +@tractor.context +async def sleep_a_bit_then_cancel_sleeper( + ctx: tractor.Context, +) -> None: + async with tractor.wait_for_actor('sleeper') as sleeper: + await ctx.started() + # await trio.sleep_forever() + await trio.sleep(3) + # async with tractor.wait_for_actor('sleeper') as sleeper: + await sleeper.cancel_actor() + + +def test_peer_canceller(): + ''' + Verify that a cancellation triggered by a peer (whether in tree + or not) results in a cancelled error with + a `ContextCancelled.errorer` matching the requesting actor. + + cases: + - some arbitrary remote peer cancels via Portal.cancel_actor(). + => all other connected peers should get that cancel requesting peer's + uid in the ctx-cancelled error msg. + + - peer spawned a sub-actor which (also) spawned a failing task + which was unhandled and propagated up to the immediate + parent, the peer to the actor that also spawned a remote task + task in that same peer-parent. + + - peer cancelled itself - so other peers should + get errors reflecting that the peer was itself the .canceller? 
+ + - WE cancelled the peer and thus should not see any raised + `ContextCancelled` as it should be reaped silently? + => pretty sure `test_context_stream_semantics::test_caller_cancels()` + already covers this case? + + ''' + + async def main(): + async with tractor.open_nursery() as n: + canceller: tractor.Portal = await n.start_actor( + 'canceller', + enable_modules=[__name__], + ) + sleeper: tractor.Portal = await n.start_actor( + 'sleeper', + enable_modules=[__name__], + ) + + async with ( + sleeper.open_context( + sleep_forever, + ) as (sleeper_ctx, sent), + + canceller.open_context( + sleep_a_bit_then_cancel_sleeper, + ) as (canceller_ctx, sent), + ): + # await tractor.pause() + try: + print('PRE CONTEXT RESULT') + await sleeper_ctx.result() + + # TODO: not sure why this isn't catching + # but maybe we need an `ExceptionGroup` and + # the whole except *errs: thinger in 3.11? + except ( + ContextCancelled, + ) as berr: + print('CAUGHT REMOTE CONTEXT CANCEL') + + # canceller should not have been remotely + # cancelled. + assert canceller_ctx.cancel_called_remote is None + assert sleeper_ctx.canceller == 'canceller' + await tractor.pause(shield=True) + assert not sleep_ctx.cancelled_caught + + raise + else: + raise RuntimeError('NEVER RXED EXPECTED `ContextCancelled`') + + + with pytest.raises(tractor.ContextCancelled) as excinfo: + trio.run(main) + + assert excinfo.value.type == ContextCancelled -- 2.34.1 From 18a1634025000128e203900ce8a7c23e879e49da Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 6 Oct 2023 15:49:23 -0400 Subject: [PATCH 035/378] Add shielding support to `.pause()` Implement it like you'd expect using simply a wrapping `trio.CancelScope` which is itself shielded by the input `shield: bool` B) There's seemingly still some issues with the frame selection when the REPL engages and not sure how to resolve it yet but at least this does indeed work for practical purposes. Still needs a test obviously! --- tractor/devx/_debug.py | 361 +++++++++++++++++++++-------------------- 1 file changed, 186 insertions(+), 175 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 24baba06..d3ad1bd4 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -382,7 +382,7 @@ async def wait_for_parent_stdin_hijack( This function is used by any sub-actor to acquire mutex access to the ``pdb`` REPL and thus the root's TTY for interactive debugging - (see below inside ``_pause()``). It can be used to ensure that + (see below inside ``pause()``). It can be used to ensure that an intermediate nursery-owning actor does not clobber its children if they are in debug (see below inside ``maybe_wait_for_debugger()``). @@ -448,171 +448,6 @@ def mk_mpdb() -> tuple[MultiActorPdb, Callable]: return pdb, Lock.unshield_sigint -async def _pause( - - debug_func: Callable | None = None, - release_lock_signal: trio.Event | None = None, - - # TODO: - # shield: bool = False - task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED - -) -> None: - ''' - A pause point (more commonly known as a "breakpoint") interrupt - instruction for engaging a blocking debugger instance to - conduct manual console-based-REPL-interaction from within - `tractor`'s async runtime, normally from some single-threaded - and currently executing actor-hosted-`trio`-task in some - (remote) process. 
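For reference, a quick sketch of the new kwarg in use (the surrounding
teardown task is an assumed example, not from this patch); per the new
input, `shield=True` is equivalent to wrapping the call in a
`trio.CancelScope(shield=True)`:

    import trio
    import tractor

    async def sleep_then_debug_on_cancel() -> None:
        try:
            await trio.sleep_forever()
        except trio.Cancelled:
            # without `shield=True` the REPL task would itself be
            # cancelled before the user can interact with it.
            await tractor.pause(shield=True)
            raise
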
- - NOTE: we use the semantics "pause" since it better encompasses - the entirety of the necessary global-runtime-state-mutation any - actor-task must access and lock in order to get full isolated - control over the process tree's root TTY: - https://en.wikipedia.org/wiki/Breakpoint - - ''' - __tracebackhide__ = True - actor = tractor.current_actor() - pdb, undo_sigint = mk_mpdb() - task_name = trio.lowlevel.current_task().name - - # TODO: is it possible to debug a trio.Cancelled except block? - # right now it seems like we can kinda do with by shielding - # around ``tractor.breakpoint()`` but not if we move the shielded - # scope here??? - # with trio.CancelScope(shield=shield): - # await trio.lowlevel.checkpoint() - - if ( - not Lock.local_pdb_complete - or Lock.local_pdb_complete.is_set() - ): - Lock.local_pdb_complete = trio.Event() - - # TODO: need a more robust check for the "root" actor - if ( - not is_root_process() - and actor._parent_chan # a connected child - ): - - if Lock.local_task_in_debug: - - # Recurrence entry case: this task already has the lock and - # is likely recurrently entering a breakpoint - if Lock.local_task_in_debug == task_name: - # noop on recurrent entry case but we want to trigger - # a checkpoint to allow other actors error-propagate and - # potetially avoid infinite re-entries in some subactor. - await trio.lowlevel.checkpoint() - return - - # if **this** actor is already in debug mode block here - # waiting for the control to be released - this allows - # support for recursive entries to `tractor.breakpoint()` - log.warning(f"{actor.uid} already has a debug lock, waiting...") - - await Lock.local_pdb_complete.wait() - await trio.sleep(0.1) - - # mark local actor as "in debug mode" to avoid recurrent - # entries/requests to the root process - Lock.local_task_in_debug = task_name - - # this **must** be awaited by the caller and is done using the - # root nursery so that the debugger can continue to run without - # being restricted by the scope of a new task nursery. - - # TODO: if we want to debug a trio.Cancelled triggered exception - # we have to figure out how to avoid having the service nursery - # cancel on this task start? I *think* this works below: - # ```python - # actor._service_n.cancel_scope.shield = shield - # ``` - # but not entirely sure if that's a sane way to implement it? - try: - with trio.CancelScope(shield=True): - await actor._service_n.start( - wait_for_parent_stdin_hijack, - actor.uid, - ) - Lock.repl = pdb - except RuntimeError: - Lock.release() - - if actor._cancel_called: - # service nursery won't be usable and we - # don't want to lock up the root either way since - # we're in (the midst of) cancellation. - return - - raise - - elif is_root_process(): - - # we also wait in the root-parent for any child that - # may have the tty locked prior - # TODO: wait, what about multiple root tasks acquiring it though? - if Lock.global_actor_in_debug == actor.uid: - # re-entrant root process already has it: noop. - return - - # XXX: since we need to enter pdb synchronously below, - # we have to release the lock manually from pdb completion - # callbacks. Can't think of a nicer way then this atm. 
- if Lock._debug_lock.locked(): - log.warning( - 'Root actor attempting to shield-acquire active tty lock' - f' owned by {Lock.global_actor_in_debug}') - - # must shield here to avoid hitting a ``Cancelled`` and - # a child getting stuck bc we clobbered the tty - with trio.CancelScope(shield=True): - await Lock._debug_lock.acquire() - else: - # may be cancelled - await Lock._debug_lock.acquire() - - Lock.global_actor_in_debug = actor.uid - Lock.local_task_in_debug = task_name - Lock.repl = pdb - - try: - # breakpoint() - if debug_func is None: - # assert release_lock_signal, ( - # 'Must pass `release_lock_signal: trio.Event` if no ' - # 'trace func provided!' - # ) - print(f"{actor.uid} ENTERING WAIT") - task_status.started() - - # with trio.CancelScope(shield=True): - # await release_lock_signal.wait() - - else: - # block here one (at the appropriate frame *up*) where - # ``breakpoint()`` was awaited and begin handling stdio. - log.debug("Entering the synchronous world of pdb") - debug_func(actor, pdb) - - except bdb.BdbQuit: - Lock.release() - raise - - # XXX: apparently we can't do this without showing this frame - # in the backtrace on first entry to the REPL? Seems like an odd - # behaviour that should have been fixed by now. This is also why - # we scrapped all the @cm approaches that were tried previously. - # finally: - # __tracebackhide__ = True - # # frame = sys._getframe() - # # last_f = frame.f_back - # # last_f.f_globals['__tracebackhide__'] = True - # # signal.signal = pdbp.hideframe(signal.signal) - - def shield_sigint_handler( signum: int, frame: 'frame', # type: ignore # noqa @@ -774,6 +609,7 @@ def shield_sigint_handler( def _set_trace( actor: tractor.Actor | None = None, pdb: MultiActorPdb | None = None, + shield: bool = False, ): __tracebackhide__ = True actor: tractor.Actor = actor or tractor.current_actor() @@ -785,14 +621,20 @@ def _set_trace( if ( frame - and pdb - and actor is not None + and ( + pdb + and actor is not None + ) or shield ): + # pdbp.set_trace() log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n") # no f!#$&* idea, but when we're in async land # we need 2x frames up? frame = frame.f_back + # if shield: + # frame = frame.f_back + else: pdb, undo_sigint = mk_mpdb() @@ -804,8 +646,181 @@ def _set_trace( # undo_ -# TODO: allow pausing from sync code, normally by remapping -# python's builtin breakpoint() hook to this runtime aware version. + +async def pause( + + debug_func: Callable = _set_trace, + release_lock_signal: trio.Event | None = None, + + # allow caller to pause despite task cancellation, + # exactly the same as wrapping with: + # with CancelScope(shield=True): + # await pause() + shield: bool = False, + + # TODO: + # shield: bool = False + task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED + +) -> None: + ''' + A pause point (more commonly known as a "breakpoint") interrupt + instruction for engaging a blocking debugger instance to + conduct manual console-based-REPL-interaction from within + `tractor`'s async runtime, normally from some single-threaded + and currently executing actor-hosted-`trio`-task in some + (remote) process. 
+ + NOTE: we use the semantics "pause" since it better encompasses + the entirety of the necessary global-runtime-state-mutation any + actor-task must access and lock in order to get full isolated + control over the process tree's root TTY: + https://en.wikipedia.org/wiki/Breakpoint + + ''' + __tracebackhide__ = True + actor = tractor.current_actor() + pdb, undo_sigint = mk_mpdb() + task_name = trio.lowlevel.current_task().name + + if ( + not Lock.local_pdb_complete + or Lock.local_pdb_complete.is_set() + ): + Lock.local_pdb_complete = trio.Event() + + if shield: + debug_func = partial( + debug_func, + shield=shield, + ) + + with trio.CancelScope(shield=shield): + + # TODO: need a more robust check for the "root" actor + if ( + not is_root_process() + and actor._parent_chan # a connected child + ): + + if Lock.local_task_in_debug: + + # Recurrence entry case: this task already has the lock and + # is likely recurrently entering a breakpoint + if Lock.local_task_in_debug == task_name: + # noop on recurrent entry case but we want to trigger + # a checkpoint to allow other actors error-propagate and + # potetially avoid infinite re-entries in some subactor. + await trio.lowlevel.checkpoint() + return + + # if **this** actor is already in debug mode block here + # waiting for the control to be released - this allows + # support for recursive entries to `tractor.breakpoint()` + log.warning(f"{actor.uid} already has a debug lock, waiting...") + + await Lock.local_pdb_complete.wait() + await trio.sleep(0.1) + + # mark local actor as "in debug mode" to avoid recurrent + # entries/requests to the root process + Lock.local_task_in_debug = task_name + + # this **must** be awaited by the caller and is done using the + # root nursery so that the debugger can continue to run without + # being restricted by the scope of a new task nursery. + + # TODO: if we want to debug a trio.Cancelled triggered exception + # we have to figure out how to avoid having the service nursery + # cancel on this task start? I *think* this works below: + # ```python + # actor._service_n.cancel_scope.shield = shield + # ``` + # but not entirely sure if that's a sane way to implement it? + try: + with trio.CancelScope(shield=True): + await actor._service_n.start( + wait_for_parent_stdin_hijack, + actor.uid, + ) + Lock.repl = pdb + except RuntimeError: + Lock.release() + + if actor._cancel_called: + # service nursery won't be usable and we + # don't want to lock up the root either way since + # we're in (the midst of) cancellation. + return + + raise + + elif is_root_process(): + + # we also wait in the root-parent for any child that + # may have the tty locked prior + # TODO: wait, what about multiple root tasks acquiring it though? + if Lock.global_actor_in_debug == actor.uid: + # re-entrant root process already has it: noop. + return + + # XXX: since we need to enter pdb synchronously below, + # we have to release the lock manually from pdb completion + # callbacks. Can't think of a nicer way then this atm. 
+ if Lock._debug_lock.locked(): + log.warning( + 'Root actor attempting to shield-acquire active tty lock' + f' owned by {Lock.global_actor_in_debug}') + + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + with trio.CancelScope(shield=True): + await Lock._debug_lock.acquire() + else: + # may be cancelled + await Lock._debug_lock.acquire() + + Lock.global_actor_in_debug = actor.uid + Lock.local_task_in_debug = task_name + Lock.repl = pdb + + try: + if debug_func is None: + # assert release_lock_signal, ( + # 'Must pass `release_lock_signal: trio.Event` if no ' + # 'trace func provided!' + # ) + print(f"{actor.uid} ENTERING WAIT") + task_status.started() + + # with trio.CancelScope(shield=True): + # await release_lock_signal.wait() + + else: + # block here one (at the appropriate frame *up*) where + # ``breakpoint()`` was awaited and begin handling stdio. + log.debug("Entering the synchronous world of pdb") + debug_func(actor, pdb) + + except bdb.BdbQuit: + Lock.release() + raise + + # XXX: apparently we can't do this without showing this frame + # in the backtrace on first entry to the REPL? Seems like an odd + # behaviour that should have been fixed by now. This is also why + # we scrapped all the @cm approaches that were tried previously. + # finally: + # __tracebackhide__ = True + # # frame = sys._getframe() + # # last_f = frame.f_back + # # last_f.f_globals['__tracebackhide__'] = True + # # signal.signal = pdbp.hideframe(signal.signal) + + +# TODO: allow pausing from sync code. +# normally by remapping python's builtin breakpoint() hook to this +# runtime aware version which takes care of all . def pause_from_sync() -> None: print("ENTER SYNC PAUSE") import greenback @@ -858,10 +873,6 @@ def pause_from_sync() -> None: # using the "pause" semantics instead since # that better covers actually somewhat "pausing the runtime" # for this particular paralell task to do debugging B) -pause = partial( - _pause, - _set_trace, -) # pp = pause # short-hand for "pause point" @@ -895,7 +906,7 @@ def _post_mortem( post_mortem = partial( - _pause, + pause, _post_mortem, ) -- 2.34.1 From d24a9e158f42369e3f95991e503e4cbb74d100fa Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 7 Oct 2023 18:51:03 -0400 Subject: [PATCH 036/378] Msg-ified `ContextCancelled`s sub-error type should always be just, its type.. 
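For reference, the branch this touches in `unpack_error()` now reads
roughly as in the sketch below (trimmed to the type-resolution step;
the stub classes and the helper name are only stand-ins so the snippet
runs on its own, they are not the real `tractor._exceptions` defs):

    from typing import Type

    # stand-ins for the real `tractor` exception types, just so this
    # sketch is self-contained.
    class RemoteActorError(Exception):
        def __init__(
            self,
            message: str,
            suberror_type: Type[BaseException] | None = None,
            **msgdata,
        ) -> None:
            super().__init__(message)
            self.type = suberror_type
            self.msgdata = msgdata

    class ContextCancelled(RemoteActorError):
        ...

    def resolve_error_types(
        type_name: str,
    ) -> tuple[Type[RemoteActorError], Type[BaseException]]:
        err_type: Type[RemoteActorError] = RemoteActorError
        suberror_type: Type[BaseException] = Exception

        if type_name == 'ContextCancelled':
            err_type = ContextCancelled
            # the fix: the msg-ified sub-error type is now just the
            # cancellation type itself, not the generic parent.
            suberror_type = err_type

        return err_type, suberror_type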
--- tractor/_exceptions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 6da2e657..9de27bdf 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -39,8 +39,11 @@ class ActorFailure(Exception): class RemoteActorError(Exception): + ''' + Remote actor exception bundled locally + + ''' # TODO: local recontruction of remote exception deats - "Remote actor exception bundled locally" def __init__( self, message: str, @@ -149,13 +152,13 @@ def unpack_error( error = msg['error'] tb_str = error.get('tb_str', '') - message = f"{chan.uid}\n" + tb_str + message = f'{chan.uid}\n' + tb_str type_name = error['type_str'] suberror_type: Type[BaseException] = Exception if type_name == 'ContextCancelled': err_type = ContextCancelled - suberror_type = RemoteActorError + suberror_type = err_type else: # try to lookup a suitable local error type for ns in [ -- 2.34.1 From c4cd573b26bc06922d2b80eaa0347feb4bd18830 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 7 Oct 2023 18:51:59 -0400 Subject: [PATCH 037/378] Drop pause line from ctx cancel handler block in test --- tests/test_inter_peer_cancellation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index bf93372b..aeb70e81 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -194,8 +194,12 @@ def test_peer_canceller(): # canceller should not have been remotely # cancelled. assert canceller_ctx.cancel_called_remote is None + + # NOTE: will only enter if you wrap in + # a shielded cs.. + # await tractor.pause() # TODO: shield=True) + assert sleeper_ctx.canceller == 'canceller' - await tractor.pause(shield=True) assert not sleep_ctx.cancelled_caught raise -- 2.34.1 From a09b8560bb70a5a16479c1c2521cd38ff25479be Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 7 Oct 2023 18:52:37 -0400 Subject: [PATCH 038/378] Oof, default reg addrs needs to be in `list[tuple]` form.. --- tractor/_root.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index bf2f883e..0969fe55 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -124,10 +124,10 @@ async def open_root_actor( registry_addrs: list[tuple[str, int]] = ( registry_addrs - or [ # default on localhost + or [( # default on localhost _default_arbiter_host, _default_arbiter_port, - ] + )] ) loglevel = (loglevel or log._default_loglevel).upper() -- 2.34.1 From 919e462f88b5fc558e0f6e9b1fb213e38e64e38b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 8 Oct 2023 15:57:18 -0400 Subject: [PATCH 039/378] Write more comprehensive `Portal.cancel_actor()` doc str --- tractor/_portal.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index 9016eda9..cf13d9be 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -191,7 +191,15 @@ class Portal: ) -> bool: ''' - Cancel the actor on the other end of this portal. + Cancel the actor runtime (and thus process) on the far + end of this portal. + + **NOTE** THIS CANCELS THE ENTIRE RUNTIME AND THE + SUBPROCESS, it DOES NOT just cancel the remote task. If you + want to have a handle to cancel a remote ``tri.Task`` look + at `.open_context()` and the definition of + `._context.Context.cancel()` which CAN be used for this + purpose. 
''' if not self.channel.connected(): -- 2.34.1 From 575a24adf198b28fb57f8d3150dcacef94c7b6e0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 10 Oct 2023 09:45:49 -0400 Subject: [PATCH 040/378] Always raise remote (cancelled) error if set Previously we weren't raising a remote error if the local scope was cancelled during a call to `Context.result()` which is problematic if the caller WAS NOT the requester for said remote cancellation; in that case we still want a `ContextCancelled` raised with the `.canceller: str` set to the cancelling actor uid. Further fix a naming bug where the (seemingly older) `._remote_err` was being set to such an error instead of `._remote_error` XD --- tractor/_context.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index e35188cd..0df1e80a 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -102,10 +102,14 @@ class Context: _remote_error: BaseException | None = None # cancellation state - _cancel_called: bool = False - _cancelled_remote: tuple | None = None + _cancel_called: bool = False # did WE cancel the far end? + _cancelled_remote: tuple[str, str] | None = None _cancel_msg: str | None = None _scope: trio.CancelScope | None = None + + # NOTE: this is set by the `.devx._debug` machinery + # to indicate whether code in `._runtime` should handle + # cancelled context crashes in the pdbp REPL. _enter_debugger_on_cancel: bool = True @property @@ -207,7 +211,7 @@ class Context: # XXX: set the remote side's error so that after we cancel # whatever task is the opener of this context it can raise # that error as the reason. - self._remote_error = error + self._remote_error: BaseException = error # always record the remote actor's uid since its cancellation # state is directly linked to ours (the local one). @@ -488,11 +492,7 @@ class Context: assert self._portal, "Context.result() can not be called from callee!" assert self._recv_chan - # from . import _debug - # await _debug.breakpoint() - - re = self._remote_error - if re: + if re := self._remote_error: self._maybe_raise_remote_err(re) return re @@ -507,7 +507,7 @@ class Context: while True: msg = await self._recv_chan.receive() try: - self._result = msg['return'] + self._result: Any = msg['return'] # NOTE: we don't need to do this right? # XXX: only close the rx mem chan AFTER @@ -516,6 +516,21 @@ class Context: # await self._recv_chan.aclose() break + + # NOTE: we get here if the far end was + # `ContextCancelled` in 2 cases: + # - we requested the cancellation and thus + # SHOULD NOT raise that far end error, + # - WE DID NOT REQUEST that cancel and thus + # SHOULD RAISE HERE! + except trio.Cancelled: + if not self._cancel_called: + raise self._remote_error + else: + # if we DID request the cancel we simply + # continue as normal. + raise + except KeyError: # as msgerr: if 'yield' in msg: @@ -537,7 +552,7 @@ class Context: ) # from msgerr err = self._maybe_raise_remote_err(err) - self._remote_err = err + self._remote_error = err return self._remote_error or self._result -- 2.34.1 From 6d951c526a7ae43e9786cacc53c8169211c755bf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 10 Oct 2023 09:55:11 -0400 Subject: [PATCH 041/378] Comment all `.pause(shield=True)` attempts again, need to solve cancel scope `.__exit__()` frame hiding issue.. 
--- tractor/devx/_debug.py | 244 +++++++++++++++++++++-------------------- 1 file changed, 128 insertions(+), 116 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index d3ad1bd4..561c387c 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -631,6 +631,7 @@ def _set_trace( # no f!#$&* idea, but when we're in async land # we need 2x frames up? frame = frame.f_back + # frame = frame.f_back # if shield: # frame = frame.f_back @@ -646,17 +647,19 @@ def _set_trace( # undo_ - async def pause( debug_func: Callable = _set_trace, release_lock_signal: trio.Event | None = None, - # allow caller to pause despite task cancellation, + # TODO: allow caller to pause despite task cancellation, # exactly the same as wrapping with: # with CancelScope(shield=True): # await pause() - shield: bool = False, + # => the REMAINING ISSUE is that the scope's .__exit__() frame + # is always show in the debugger on entry.. and there seems to + # be no way to override it?.. + # shield: bool = False, # TODO: # shield: bool = False @@ -689,133 +692,142 @@ async def pause( ): Lock.local_pdb_complete = trio.Event() - if shield: - debug_func = partial( - debug_func, - shield=shield, - ) + # if shield: + debug_func = partial( + debug_func, + # shield=shield, + ) - with trio.CancelScope(shield=shield): + # def _exit(self, *args, **kwargs): + # __tracebackhide__: bool = True + # super().__exit__(*args, **kwargs) + + # trio.CancelScope.__exit__.__tracebackhide__ = True + + # import types + # with trio.CancelScope(shield=shield) as cs: + # cs.__exit__ = types.MethodType(_exit, cs) + # cs.__exit__.__tracebackhide__ = True # TODO: need a more robust check for the "root" actor - if ( - not is_root_process() - and actor._parent_chan # a connected child - ): + if ( + not is_root_process() + and actor._parent_chan # a connected child + ): - if Lock.local_task_in_debug: + if Lock.local_task_in_debug: - # Recurrence entry case: this task already has the lock and - # is likely recurrently entering a breakpoint - if Lock.local_task_in_debug == task_name: - # noop on recurrent entry case but we want to trigger - # a checkpoint to allow other actors error-propagate and - # potetially avoid infinite re-entries in some subactor. - await trio.lowlevel.checkpoint() - return - - # if **this** actor is already in debug mode block here - # waiting for the control to be released - this allows - # support for recursive entries to `tractor.breakpoint()` - log.warning(f"{actor.uid} already has a debug lock, waiting...") - - await Lock.local_pdb_complete.wait() - await trio.sleep(0.1) - - # mark local actor as "in debug mode" to avoid recurrent - # entries/requests to the root process - Lock.local_task_in_debug = task_name - - # this **must** be awaited by the caller and is done using the - # root nursery so that the debugger can continue to run without - # being restricted by the scope of a new task nursery. - - # TODO: if we want to debug a trio.Cancelled triggered exception - # we have to figure out how to avoid having the service nursery - # cancel on this task start? I *think* this works below: - # ```python - # actor._service_n.cancel_scope.shield = shield - # ``` - # but not entirely sure if that's a sane way to implement it? 
- try: - with trio.CancelScope(shield=True): - await actor._service_n.start( - wait_for_parent_stdin_hijack, - actor.uid, - ) - Lock.repl = pdb - except RuntimeError: - Lock.release() - - if actor._cancel_called: - # service nursery won't be usable and we - # don't want to lock up the root either way since - # we're in (the midst of) cancellation. - return - - raise - - elif is_root_process(): - - # we also wait in the root-parent for any child that - # may have the tty locked prior - # TODO: wait, what about multiple root tasks acquiring it though? - if Lock.global_actor_in_debug == actor.uid: - # re-entrant root process already has it: noop. + # Recurrence entry case: this task already has the lock and + # is likely recurrently entering a breakpoint + if Lock.local_task_in_debug == task_name: + # noop on recurrent entry case but we want to trigger + # a checkpoint to allow other actors error-propagate and + # potetially avoid infinite re-entries in some subactor. + await trio.lowlevel.checkpoint() return - # XXX: since we need to enter pdb synchronously below, - # we have to release the lock manually from pdb completion - # callbacks. Can't think of a nicer way then this atm. - if Lock._debug_lock.locked(): - log.warning( - 'Root actor attempting to shield-acquire active tty lock' - f' owned by {Lock.global_actor_in_debug}') + # if **this** actor is already in debug mode block here + # waiting for the control to be released - this allows + # support for recursive entries to `tractor.breakpoint()` + log.warning(f"{actor.uid} already has a debug lock, waiting...") - # must shield here to avoid hitting a ``Cancelled`` and - # a child getting stuck bc we clobbered the tty - with trio.CancelScope(shield=True): - await Lock._debug_lock.acquire() - else: - # may be cancelled - await Lock._debug_lock.acquire() + await Lock.local_pdb_complete.wait() + await trio.sleep(0.1) - Lock.global_actor_in_debug = actor.uid - Lock.local_task_in_debug = task_name - Lock.repl = pdb + # mark local actor as "in debug mode" to avoid recurrent + # entries/requests to the root process + Lock.local_task_in_debug = task_name + # this **must** be awaited by the caller and is done using the + # root nursery so that the debugger can continue to run without + # being restricted by the scope of a new task nursery. + + # TODO: if we want to debug a trio.Cancelled triggered exception + # we have to figure out how to avoid having the service nursery + # cancel on this task start? I *think* this works below: + # ```python + # actor._service_n.cancel_scope.shield = shield + # ``` + # but not entirely sure if that's a sane way to implement it? try: - if debug_func is None: - # assert release_lock_signal, ( - # 'Must pass `release_lock_signal: trio.Event` if no ' - # 'trace func provided!' - # ) - print(f"{actor.uid} ENTERING WAIT") - task_status.started() - - # with trio.CancelScope(shield=True): - # await release_lock_signal.wait() - - else: - # block here one (at the appropriate frame *up*) where - # ``breakpoint()`` was awaited and begin handling stdio. - log.debug("Entering the synchronous world of pdb") - debug_func(actor, pdb) - - except bdb.BdbQuit: + with trio.CancelScope(shield=True): + await actor._service_n.start( + wait_for_parent_stdin_hijack, + actor.uid, + ) + Lock.repl = pdb + except RuntimeError: Lock.release() + + if actor._cancel_called: + # service nursery won't be usable and we + # don't want to lock up the root either way since + # we're in (the midst of) cancellation. 
+ return + raise - # XXX: apparently we can't do this without showing this frame - # in the backtrace on first entry to the REPL? Seems like an odd - # behaviour that should have been fixed by now. This is also why - # we scrapped all the @cm approaches that were tried previously. - # finally: - # __tracebackhide__ = True - # # frame = sys._getframe() - # # last_f = frame.f_back - # # last_f.f_globals['__tracebackhide__'] = True - # # signal.signal = pdbp.hideframe(signal.signal) + elif is_root_process(): + + # we also wait in the root-parent for any child that + # may have the tty locked prior + # TODO: wait, what about multiple root tasks acquiring it though? + if Lock.global_actor_in_debug == actor.uid: + # re-entrant root process already has it: noop. + return + + # XXX: since we need to enter pdb synchronously below, + # we have to release the lock manually from pdb completion + # callbacks. Can't think of a nicer way then this atm. + if Lock._debug_lock.locked(): + log.warning( + 'Root actor attempting to shield-acquire active tty lock' + f' owned by {Lock.global_actor_in_debug}') + + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + with trio.CancelScope(shield=True): + await Lock._debug_lock.acquire() + else: + # may be cancelled + await Lock._debug_lock.acquire() + + Lock.global_actor_in_debug = actor.uid + Lock.local_task_in_debug = task_name + Lock.repl = pdb + + try: + if debug_func is None: + # assert release_lock_signal, ( + # 'Must pass `release_lock_signal: trio.Event` if no ' + # 'trace func provided!' + # ) + print(f"{actor.uid} ENTERING WAIT") + task_status.started() + + # with trio.CancelScope(shield=True): + # await release_lock_signal.wait() + + else: + # block here one (at the appropriate frame *up*) where + # ``breakpoint()`` was awaited and begin handling stdio. + log.debug("Entering the synchronous world of pdb") + debug_func(actor, pdb) + + except bdb.BdbQuit: + Lock.release() + raise + + # XXX: apparently we can't do this without showing this frame + # in the backtrace on first entry to the REPL? Seems like an odd + # behaviour that should have been fixed by now. This is also why + # we scrapped all the @cm approaches that were tried previously. + # finally: + # __tracebackhide__ = True + # # frame = sys._getframe() + # # last_f = frame.f_back + # # last_f.f_globals['__tracebackhide__'] = True + # # signal.signal = pdbp.hideframe(signal.signal) # TODO: allow pausing from sync code. -- 2.34.1 From 2fdb8fc25a5de02c61f58f619858668a46a342e9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Oct 2023 15:35:16 -0400 Subject: [PATCH 042/378] Factor non-yield stream msg processing into helper Since both `MsgStream.receive()` and `.receive_nowait()` need the same raising logic when a non-stream msg arrives (so that maybe an appropriate IPC translated error can be raised) move the `KeyError` handler code into a new `._streaming._raise_from_no_yield_msg()` func and call it from both methods to make the error-interface-raising symmetrical across both methods. --- tractor/_streaming.py | 112 +++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 40 deletions(-) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 3045b835..e449fefe 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -54,6 +54,60 @@ log = get_logger(__name__) # messages? class ReceiveChannel(AsyncResource, Generic[ReceiveType]): # - use __slots__ on ``Context``? 
+def _raise_from_no_yield_msg( + stream: MsgStream, + msg: dict, + src_err: KeyError, + +) -> bool: + ''' + Raise an appopriate local error when a `MsgStream` msg arrives + which does not contain the expected (under normal operation) + `'yield'` field. + + ''' + # internal error should never get here + assert msg.get('cid'), ("Received internal error at portal?") + + # TODO: handle 2 cases with 3.10+ match syntax + # - 'stop' + # - 'error' + # possibly just handle msg['stop'] here! + + if stream._closed: + raise trio.ClosedResourceError('This stream was closed') + + if msg.get('stop') or stream._eoc: + log.debug(f"{stream} was stopped at remote end") + + # XXX: important to set so that a new ``.receive()`` + # call (likely by another task using a broadcast receiver) + # doesn't accidentally pull the ``return`` message + # value out of the underlying feed mem chan! + stream._eoc = True + + # # when the send is closed we assume the stream has + # # terminated and signal this local iterator to stop + # await stream.aclose() + + # XXX: this causes ``ReceiveChannel.__anext__()`` to + # raise a ``StopAsyncIteration`` **and** in our catch + # block below it will trigger ``.aclose()``. + raise trio.EndOfChannel from src_err + + # TODO: test that shows stream raising an expected error!!! + elif msg.get('error'): + # raise the error message + raise unpack_error(msg, stream._ctx.chan) + + # always re-raise the source error if no translation error + # case is activated above. + raise src_err + # raise RuntimeError( + # 'Unknown non-yield stream msg?\n' + # f'{msg}' + # ) + class MsgStream(trio.abc.Channel): ''' @@ -91,11 +145,20 @@ class MsgStream(trio.abc.Channel): # delegate directly to underlying mem channel def receive_nowait(self): msg = self._rx_chan.receive_nowait() - return msg['yield'] + try: + return msg['yield'] + except KeyError as kerr: + _raise_from_no_yield_msg( + stream=self, + msg=msg, + src_err=kerr, + ) async def receive(self): - '''Async receive a single msg from the IPC transport, the next - in sequence for this stream. + ''' + Receive a single msg from the IPC transport, the next in + sequence sent by the far end task (possibly in order as + determined by the underlying protocol). ''' # see ``.aclose()`` for notes on the old behaviour prior to @@ -110,43 +173,12 @@ class MsgStream(trio.abc.Channel): msg = await self._rx_chan.receive() return msg['yield'] - except KeyError as err: - # internal error should never get here - assert msg.get('cid'), ("Received internal error at portal?") - - # TODO: handle 2 cases with 3.10 match syntax - # - 'stop' - # - 'error' - # possibly just handle msg['stop'] here! - - if self._closed: - raise trio.ClosedResourceError('This stream was closed') - - if msg.get('stop') or self._eoc: - log.debug(f"{self} was stopped at remote end") - - # XXX: important to set so that a new ``.receive()`` - # call (likely by another task using a broadcast receiver) - # doesn't accidentally pull the ``return`` message - # value out of the underlying feed mem chan! - self._eoc = True - - # # when the send is closed we assume the stream has - # # terminated and signal this local iterator to stop - # await self.aclose() - - # XXX: this causes ``ReceiveChannel.__anext__()`` to - # raise a ``StopAsyncIteration`` **and** in our catch - # block below it will trigger ``.aclose()``. - raise trio.EndOfChannel from err - - # TODO: test that shows stream raising an expected error!!! 
- elif msg.get('error'): - # raise the error message - raise unpack_error(msg, self._ctx.chan) - - else: - raise + except KeyError as kerr: + _raise_from_no_yield_msg( + stream=self, + msg=msg, + src_err=kerr, + ) except ( trio.ClosedResourceError, # by self._rx_chan -- 2.34.1 From 07cec023036b9109abfb9b4589b159b39cdc56e6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Oct 2023 15:45:02 -0400 Subject: [PATCH 043/378] Add comments around diff between `C/context` refs --- tractor/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index 149d4d2c..980c8dc6 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -22,8 +22,8 @@ from exceptiongroup import BaseExceptionGroup from ._clustering import open_actor_cluster from ._context import ( - Context, - context, + Context, # the type + context, # a func-decorator ) from ._streaming import ( MsgStream, -- 2.34.1 From ae326cbb9a3337c46bc5907c2544431b5e04e93e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Oct 2023 15:45:34 -0400 Subject: [PATCH 044/378] Ignore kbis in `open_crash_handler()` by default --- tractor/devx/_debug.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 561c387c..06e6071b 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -1041,15 +1041,19 @@ async def maybe_wait_for_debugger( # TODO: better naming and what additionals? -# - optional runtime plugging? -# - detection for sync vs. async code? -# - specialized REPL entry when in distributed mode? +# - [ ] optional runtime plugging? +# - [ ] detection for sync vs. async code? +# - [ ] specialized REPL entry when in distributed mode? +# - [x] allow ignoring kbi Bo @cm def open_crash_handler( catch: set[BaseException] = { Exception, BaseException, - } + }, + ignore: set[BaseException] = { + KeyboardInterrupt, + }, ): ''' Generic "post mortem" crash handler using `pdbp` REPL debugger. @@ -1064,8 +1068,11 @@ def open_crash_handler( ''' try: yield - except tuple(catch): - pdbp.xpm() + except tuple(catch) as err: + + if type(err) not in ignore: + pdbp.xpm() + raise -- 2.34.1 From ab2664da704c23a3a0c220dfa933320396b1b706 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Oct 2023 15:46:21 -0400 Subject: [PATCH 045/378] Runtime level log on debug REPL exits --- tractor/_runtime.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index bd626440..881fe535 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -74,7 +74,7 @@ log = get_logger('tractor') async def _invoke( - actor: 'Actor', + actor: Actor, cid: str, chan: Channel, func: Callable, @@ -1419,6 +1419,8 @@ async def async_main( # something silly like the wrong socket-address # passed via a config or CLI Bo entered_debug = await _debug._maybe_enter_pm(oserr) + if entered_debug: + log.runtime('Exited debug REPL..') raise accept_addrs: list[tuple[str, int]] = actor.accept_addrs -- 2.34.1 From e4a6223256c3bd03aaa0f326f46970629d4fc1b3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 16 Oct 2023 16:23:30 -0400 Subject: [PATCH 046/378] `._exceptions`: typing and error unpacking updates Bump type annotations to 3.10+ style throughout module as well as fill out doc strings a bit. 
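Concretely that means leaning on PEP 604 `X | None` unions and
tightening loose annotations; mirroring the `pack_error()` hunk below,
the before/after looks roughly like:

    from typing import Any

    # before (as in the hunk below): untyped kwarg, loose return value
    def pack_error(exc: BaseException, tb=None) -> dict[str, Any]:
        ...

    # after: 3.10+ union syntax and a tighter return annotation
    def pack_error(  # noqa: F811 - shadows the "before" def on purpose
        exc: BaseException,
        tb: str | None = None,
    ) -> dict[str, dict]:
        ...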
Inside `unpack_error()` pop any `error_dict: dict` and, - return `None` early if not found, - versus pass directly as `**error_dict` to the error constructor instead of a double field read. --- tractor/_exceptions.py | 53 ++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 9de27bdf..d9e1d17f 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -113,18 +113,24 @@ class AsyncioCancelled(Exception): def pack_error( exc: BaseException, - tb=None, + tb: str | None = None, -) -> dict[str, Any]: - """Create an "error message" for tranmission over - a channel (aka the wire). - """ +) -> dict[str, dict]: + ''' + Create an "error message" encoded for wire transport via an IPC + `Channel`; expected to be unpacked on the receiver side using + `unpack_error()` below. + + ''' if tb: tb_str = ''.join(traceback.format_tb(tb)) else: tb_str = traceback.format_exc() - error_msg = { + error_msg: dict[ + str, + str | tuple[str, str] + ] = { 'tb_str': tb_str, 'type_str': type(exc).__name__, 'src_actor_uid': current_actor().uid, @@ -142,18 +148,28 @@ def unpack_error( chan=None, err_type=RemoteActorError -) -> Exception: +) -> None | Exception: ''' Unpack an 'error' message from the wire - into a local ``RemoteActorError``. + into a local `RemoteActorError` (subtype). + + NOTE: this routine DOES not RAISE the embedded remote error, + which is the responsibilitiy of the caller. ''' - __tracebackhide__ = True - error = msg['error'] + __tracebackhide__: bool = True - tb_str = error.get('tb_str', '') - message = f'{chan.uid}\n' + tb_str - type_name = error['type_str'] + error_dict: dict[str, dict] | None + if ( + error_dict := msg.get('error') + ) is None: + # no error field, nothing to unpack. + return None + + # retrieve the remote error's msg encoded details + tb_str: str = error_dict.get('tb_str', '') + message: str = f'{chan.uid}\n' + tb_str + type_name: str = error_dict['type_str'] suberror_type: Type[BaseException] = Exception if type_name == 'ContextCancelled': @@ -167,18 +183,19 @@ def unpack_error( eg, trio, ]: - try: - suberror_type = getattr(ns, type_name) + if suberror_type := getattr( + ns, + type_name, + False, + ): break - except AttributeError: - continue exc = err_type( message, suberror_type=suberror_type, # unpack other fields into error type init - **msg['error'], + **error_dict, ) return exc -- 2.34.1 From 534e5d150d24fa965319193ca52415a9c2bd505a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 17 Oct 2023 15:30:16 -0400 Subject: [PATCH 047/378] Drop `msg` kwarg from `Context.cancel()` Well first off, turns out it's never used and generally speaking doesn't seem to help much with "runtime hacking/debugging"; why would we need to "fabricate" a msg when `.cancel()` is called to self-cancel? Also (and since `._maybe_cancel_and_set_remote_error()` now takes an `error: BaseException` as input and thus expects error-msg unpacking prior to being called), we now manually set `Context._cancel_msg: dict` just prior to any remote error assignment - so any case where we would have fabbed a "cancel msg" near calling `.cancel()`, just do the manual assign. In this vein some other subtle changes: - obviously don't set `._cancel_msg` in `.cancel()` since it's no longer an input. - generally do walrus-style `error := unpack_error()` before applying and setting remote error-msg state. 
- always raise any `._remote_error` in `.result()` instead of returning the exception instance and check before AND after the underlying mem chan read. - add notes/todos around `raise self._remote_error from None` masking of (runtime) errors in `._maybe_raise_remote_err()` and use it inside `.result()` since we had the inverse duplicate logic there anyway.. Further, this adds and extends a ton of (internal) interface docs and details comments around the `Context` API including many subtleties pertaining to calling `._maybe_cancel_and_set_remote_error()`. --- tractor/_context.py | 288 ++++++++++++++++++++++++++++---------------- 1 file changed, 183 insertions(+), 105 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 0df1e80a..c14f16bf 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -86,30 +86,51 @@ class Context: ''' chan: Channel - cid: str + cid: str # "context id", more or less a unique linked-task-pair id - # these are the "feeder" channels for delivering - # message values to the local task from the runtime - # msg processing loop. + # the "feeder" channels for delivering message values to the + # local task from the runtime's msg processing loop. _recv_chan: trio.MemoryReceiveChannel _send_chan: trio.MemorySendChannel + # the "invocation type" of the far end task-entry-point + # function, normally matching a logic block inside + # `._runtime.invoke()`. _remote_func_type: str | None = None - # only set on the caller side - _portal: Portal | None = None # type: ignore # noqa + # NOTE: (for now) only set (a portal) on the caller side since + # the callee doesn't generally need a ref to one and should + # normally need to explicitly ask for handle to its peer if + # more the the `Context` is needed? + _portal: Portal | None = None + + # NOTE: each side of the context has its own cancel scope + # which is exactly the primitive that allows for + # cross-actor-task-supervision and thus SC. + _scope: trio.CancelScope | None = None _result: Any | int = None _remote_error: BaseException | None = None # cancellation state _cancel_called: bool = False # did WE cancel the far end? _cancelled_remote: tuple[str, str] | None = None - _cancel_msg: str | None = None - _scope: trio.CancelScope | None = None - # NOTE: this is set by the `.devx._debug` machinery - # to indicate whether code in `._runtime` should handle - # cancelled context crashes in the pdbp REPL. + # NOTE: we try to ensure assignment of a "cancel msg" since + # there's always going to be an "underlying reason" that any + # context was closed due to either a remote side error or + # a call to `.cancel()` which triggers `ContextCancelled`. + _cancel_msg: str | dict | None = None + + # NOTE: this state var used by the runtime to determine if the + # `pdbp` REPL is allowed to engage on contexts terminated via + # a `ContextCancelled` due to a call to `.cancel()` triggering + # "graceful closure" on either side: + # - `._runtime._invoke()` will check this flag before engaging + # the crash handler REPL in such cases where the "callee" + # raises the cancellation, + # - `.devx._debug.lock_tty_for_child()` will set it to `False` if + # the global tty-lock has been configured to filter out some + # actors from being able to acquire the debugger lock. 
_enter_debugger_on_cancel: bool = True @property @@ -177,36 +198,71 @@ class Context: async def _maybe_cancel_and_set_remote_error( self, - error_msg: dict[str, Any], + error: BaseException, ) -> None: ''' - (Maybe) unpack and raise a msg error into the local scope - nursery for this context. + (Maybe) cancel this local scope due to a received remote + error (normally via an IPC msg) which the actor runtime + routes to this context. - Acts as a form of "relay" for a remote error raised - in the corresponding remote callee task. + Acts as a form of "relay" for a remote error raised in the + corresponding remote task's `Context` wherein the next time + the local task exectutes a checkpoint, a `trio.Cancelled` + will be raised and depending on the type and source of the + original remote error, and whether or not the local task + called `.cancel()` itself prior, an equivalent + `ContextCancelled` or `RemoteActorError` wrapping the + remote error may be raised here by any of, + + - `Portal.open_context()` + - `Portal.result()` + - `Context.open_stream()` + - `Context.result()` + + when called/closed by actor local task(s). + + NOTEs & TODOs: + - It is expected that the caller has previously unwrapped + the remote error using a call to `unpack_error()` and + provides that output exception value as the input + `error` argument here. + - If this is an error message from a context opened by + `Portal.open_context()` we want to interrupt any + ongoing local tasks operating within that `Context`'s + cancel-scope so as to be notified ASAP of the remote + error and engage any caller handling (eg. for + cross-process task supervision). + - In some cases we may want to raise the remote error + immediately since there is no guarantee the locally + operating task(s) will attempt to execute a checkpoint + any time soon; in such cases there are 2 possible + approaches depending on the current task's work and + wrapping "thread" type: + + - `trio`-native-and-graceful: only ever wait for tasks + to exec a next `trio.lowlevel.checkpoint()` assuming + that any such task must do so to interact with the + actor runtime and IPC interfaces. + + - (NOT IMPLEMENTED) system-level-aggressive: maybe we + could eventually interrupt sync code (invoked using + `trio.to_thread` or some other adapter layer) with + a signal (a custom unix one for example? + https://stackoverflow.com/a/5744185) depending on the + task's wrapping thread-type such that long running + sync code should never cause the delay of actor + supervision tasks such as cancellation and respawn + logic. ''' - # If this is an error message from a context opened by - # ``Portal.open_context()`` we want to interrupt any ongoing - # (child) tasks within that context to be notified of the remote - # error relayed here. - # - # The reason we may want to raise the remote error immediately - # is that there is no guarantee the associated local task(s) - # will attempt to read from any locally opened stream any time - # soon. - # - # NOTE: this only applies when - # ``Portal.open_context()`` has been called since it is assumed - # (currently) that other portal APIs (``Portal.run()``, - # ``.run_in_actor()``) do their own error checking at the point - # of the call and result processing. 
- error = unpack_error( - error_msg, - self.chan, - ) + # XXX: currently this should only be used when + # `Portal.open_context()` has been opened since it's + # assumed that other portal APIs like, + # - `Portal.run()`, + # - `ActorNursery.run_in_actor()` + # do their own error checking at their own call points and + # result processing. # XXX: set the remote side's error so that after we cancel # whatever task is the opener of this context it can raise @@ -236,35 +292,25 @@ class Context: else: log.error( f'Remote context error for {self.chan.uid}:{self.cid}:\n' - f'{error_msg["error"]["tb_str"]}' + f'{error}' ) # TODO: tempted to **not** do this by-reraising in a # nursery and instead cancel a surrounding scope, detect # the cancellation, then lookup the error that was set? # YES! this is way better and simpler! - if ( - self._scope - ): + if self._scope: # from trio.testing import wait_all_tasks_blocked # await wait_all_tasks_blocked() # self._cancelled_remote = self.chan.uid self._scope.cancel() - # NOTE: this usage actually works here B) - # from .devx._debug import breakpoint - # await breakpoint() - - # XXX: this will break early callee results sending - # since when `.result()` is finally called, this - # chan will be closed.. - # if self._recv_chan: - # await self._recv_chan.aclose() + # this REPL usage actually works here BD + # from .devx._debug import pause + # await pause() async def cancel( self, - msg: str | None = None, timeout: float = 0.616, - # timeout: float = 1000, ) -> None: ''' @@ -274,15 +320,12 @@ class Context: Timeout quickly in an attempt to sidestep 2-generals... ''' - side = 'caller' if self._portal else 'callee' - if msg: - assert side == 'callee', 'Only callee side can provide cancel msg' + side: str = 'caller' if self._portal else 'callee' + log.cancel( + f'Cancelling {side} side of context to {self.chan.uid}' + ) - log.cancel(f'Cancelling {side} side of context to {self.chan.uid}') - - self._cancel_called = True - # await devx._debug.breakpoint() - # breakpoint() + self._cancel_called: bool = True if side == 'caller': if not self._portal: @@ -290,12 +333,13 @@ class Context: "No portal found, this is likely a callee side context" ) - cid = self.cid + cid: str = self.cid with trio.move_on_after(timeout) as cs: cs.shield = True log.cancel( - f"Cancelling stream {cid} to " - f"{self._portal.channel.uid}") + f'Cancelling stream {cid} to ' + f'{self._portal.channel.uid}' + ) # NOTE: we're telling the far end actor to cancel a task # corresponding to *this actor*. The far end local channel @@ -314,17 +358,17 @@ class Context: # if not self._portal.channel.connected(): if not self.chan.connected(): log.cancel( - "May have failed to cancel remote task " - f"{cid} for {self._portal.channel.uid}") + 'May have failed to cancel remote task ' + f'{cid} for {self._portal.channel.uid}' + ) else: log.cancel( - "Timed out on cancelling remote task " - f"{cid} for {self._portal.channel.uid}") + 'Timed out on cancel request of remote task ' + f'{cid} for {self._portal.channel.uid}' + ) # callee side remote task else: - self._cancel_msg = msg - # TODO: should we have an explicit cancel message # or is relaying the local `trio.Cancelled` as an # {'error': trio.Cancelled, cid: "blah"} enough? @@ -335,7 +379,6 @@ class Context: @acm async def open_stream( - self, allow_overruns: bool | None = False, msg_buffer_size: int | None = None, @@ -354,10 +397,10 @@ class Context: ``Portal.open_context()``. 
In the future this may change but currently there seems to be no obvious reason to support "re-opening": - - pausing a stream can be done with a message. - - task errors will normally require a restart of the entire - scope of the inter-actor task context due to the nature of - ``trio``'s cancellation system. + - pausing a stream can be done with a message. + - task errors will normally require a restart of the entire + scope of the inter-actor task context due to the nature of + ``trio``'s cancellation system. ''' actor = current_actor() @@ -439,18 +482,19 @@ class Context: self, err: Exception, ) -> None: + ''' + Maybe raise a remote error depending on who (which task from + which actor) requested a cancellation (if any). + + ''' # NOTE: whenever the context's "opener" side (task) **is** # the side which requested the cancellation (likekly via # ``Context.cancel()``), we don't want to re-raise that # cancellation signal locally (would be akin to # a ``trio.Nursery`` nursery raising ``trio.Cancelled`` - # whenever ``CancelScope.cancel()`` was called) and instead - # silently reap the expected cancellation "error"-msg. - # if 'pikerd' in err.msgdata['tb_str']: - # # from . import _debug - # # await _debug.breakpoint() - # breakpoint() - + # whenever ``CancelScope.cancel()`` was called) and + # instead silently reap the expected cancellation + # "error"-msg. if ( isinstance(err, ContextCancelled) and ( @@ -461,7 +505,18 @@ class Context: ): return err - raise err # from None + # NOTE: currently we are masking underlying runtime errors + # which are often superfluous to user handler code. not + # sure if this is still needed / desired for all operation? + # TODO: maybe we can only NOT mask if: + # - [ ] debug mode is enabled or, + # - [ ] a certain log level is set? + # - [ ] consider using `.with_traceback()` to filter out + # runtime frames from the tb explicitly? + # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement + # https://stackoverflow.com/a/24752607 + __tracebackhide__: bool = True + raise err from None async def result(self) -> Any | Exception: ''' @@ -489,12 +544,12 @@ class Context: of the remote cancellation. ''' + __tracebackhide__: bool = True assert self._portal, "Context.result() can not be called from callee!" assert self._recv_chan if re := self._remote_error: - self._maybe_raise_remote_err(re) - return re + return self._maybe_raise_remote_err(re) if ( self._result == id(self) @@ -505,8 +560,8 @@ class Context: # and discarding any bi dir stream msgs still # in transit from the far end. while True: - msg = await self._recv_chan.receive() try: + msg = await self._recv_chan.receive() self._result: Any = msg['return'] # NOTE: we don't need to do this right? @@ -519,17 +574,22 @@ class Context: # NOTE: we get here if the far end was # `ContextCancelled` in 2 cases: - # - we requested the cancellation and thus - # SHOULD NOT raise that far end error, - # - WE DID NOT REQUEST that cancel and thus - # SHOULD RAISE HERE! + # 1. we requested the cancellation and thus + # SHOULD NOT raise that far end error, + # 2. WE DID NOT REQUEST that cancel and thus + # SHOULD RAISE HERE! except trio.Cancelled: - if not self._cancel_called: - raise self._remote_error - else: - # if we DID request the cancel we simply - # continue as normal. - raise + + # CASE 2: mask the local cancelled-error(s) + # only when we are sure the remote error is the + # (likely) source cause of this local runtime + # task's cancellation. 
+ if re := self._remote_error: + self._maybe_raise_remote_err(re) + + # CASE 1: we DID request the cancel we simply + # continue to bubble up as normal. + raise except KeyError: # as msgerr: @@ -544,7 +604,8 @@ class Context: # internal error should never get here assert msg.get('cid'), ( - "Received internal error at portal?") + "Received internal error at portal?" + ) err = unpack_error( msg, @@ -554,7 +615,10 @@ class Context: err = self._maybe_raise_remote_err(err) self._remote_error = err - return self._remote_error or self._result + if re := self._remote_error: + return self._maybe_raise_remote_err(re) + + return self._result async def started( self, @@ -563,7 +627,7 @@ class Context: ) -> None: ''' Indicate to calling actor's task that this linked context - has started and send ``value`` to the other side. + has started and send ``value`` to the other side via IPC. On the calling side ``value`` is the second item delivered in the tuple returned by ``Portal.open_context()``. @@ -571,19 +635,17 @@ class Context: ''' if self._portal: raise RuntimeError( - f"Caller side context {self} can not call started!") + f'Caller side context {self} can not call started!' + ) elif self._started_called: raise RuntimeError( - f"called 'started' twice on context with {self.chan.uid}") + f'called `.started()` twice on context with {self.chan.uid}' + ) await self.chan.send({'started': value, 'cid': self.cid}) self._started_called = True - # TODO: do we need a restart api? - # async def restart(self) -> None: - # pass - async def _drain_overflows( self, ) -> None: @@ -638,10 +700,21 @@ class Context: self, msg: dict, - draining: bool = False, + # draining: bool = False, ) -> bool: + ''' + Deliver an IPC msg received from a transport-channel to + this context's underlying mem chan for handling by + user operating tasks; deliver a bool indicating whether the + msg was immediately sent. + If `._allow_overruns == True` (maybe) append the msg to an + "overflow queue" and start a "drainer task" (inside the + `._scope_nursery: trio.Nursery`) which ensures that such + messages are eventually sent if possible. + + ''' cid = self.cid chan = self.chan uid = chan.uid @@ -652,8 +725,12 @@ class Context: ) error = msg.get('error') - if error: - await self._maybe_cancel_and_set_remote_error(msg) + if error := unpack_error( + msg, + self.chan, + ): + self._cancel_msg = msg + await self._maybe_cancel_and_set_remote_error(error) if ( self._in_overrun @@ -685,6 +762,7 @@ class Context: # the sender; the main motivation is that using bp can block the # msg handling loop which calls into this method! except trio.WouldBlock: + # XXX: always push an error even if the local # receiver is in overrun state. 
# await self._maybe_cancel_and_set_remote_error(msg) -- 2.34.1 From 7eb31f3fea8ade7877fa0a423f57f69936571660 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 17 Oct 2023 16:52:31 -0400 Subject: [PATCH 048/378] Runtime import `.get_root()` in stdin hijacker to avoid import cycle --- tractor/devx/_debug.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 06e6071b..e636e49e 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -49,7 +49,6 @@ from trio_typing import ( ) from ..log import get_logger -from .._discovery import get_root from .._state import ( is_root_process, debug_mode, @@ -331,7 +330,7 @@ async def lock_tty_for_child( f'Actor {subactor_uid} is blocked from acquiring debug lock\n' f"remote task: {task_name}:{subactor_uid}" ) - ctx._enter_debugger_on_cancel = False + ctx._enter_debugger_on_cancel: bool = False await ctx.cancel(f'Debug lock blocked for {subactor_uid}') return 'pdb_lock_blocked' @@ -388,6 +387,8 @@ async def wait_for_parent_stdin_hijack( ``maybe_wait_for_debugger()``). ''' + from .._discovery import get_root + with trio.CancelScope(shield=True) as cs: Lock._debugger_request_cs = cs @@ -611,7 +612,7 @@ def _set_trace( pdb: MultiActorPdb | None = None, shield: bool = False, ): - __tracebackhide__ = True + __tracebackhide__: bool = True actor: tractor.Actor = actor or tractor.current_actor() # start 2 levels up in user code -- 2.34.1 From 63b1488ab6db6712bdfca1bf52f597b899a854e9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 17 Oct 2023 17:22:57 -0400 Subject: [PATCH 049/378] Get mega-pedantic in `Portal.open_context()` Specifically in the `.__aexit__()` phase to ensure remote, runtime-internal, and locally raised error-during-cancelled-handling exceptions are NEVER masked by a local `ContextCancelled` or any exception group of `trio.Cancelled`s. Also adds a ton of details to doc strings including extreme detail surrounding the `ContextCancelled` raising cases and their processing inside `.open_context()`'s exception handler blocks. Details, details: - internal rename `err`/`_err` stuff to just be `scope_err` since it's effectively the error bubbled up from the context's surrounding (and cross-actor) "scope". - always shield `._recv_chan.aclose()` to avoid any `Cancelled` from masking the `scope_err` with a runtime related `trio.Cancelled`. - explicitly catch the specific set of `scope_err: BaseException` that we can reasonably expect to handle instead of the catch-all parent type including exception groups, cancels and KBIs. --- tractor/_portal.py | 241 +++++++++++++++++++++++++++++++++------------ 1 file changed, 178 insertions(+), 63 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index cf13d9be..d53fc6b3 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -15,8 +15,12 @@ # along with this program. If not, see . ''' -Memory boundary "Portals": an API for structured -concurrency linked tasks running in disparate memory domains. +Memory "portal" contruct. + +"Memory portals" are both an API and set of IPC wrapping primitives +for managing structured concurrency "cancel-scope linked" tasks +running in disparate virtual memory domains - at least in different +OS processes, possibly on different (hardware) hosts. 
''' from __future__ import annotations @@ -47,6 +51,7 @@ from ._exceptions import ( ) from ._context import Context from ._streaming import MsgStream +from .devx._debug import maybe_wait_for_debugger log = get_logger(__name__) @@ -66,20 +71,21 @@ def _unwrap_msg( raise unpack_error(msg, channel) from None +# TODO: maybe move this to ._exceptions? class MessagingError(Exception): 'Some kind of unexpected SC messaging dialog issue' class Portal: ''' - A 'portal' to a(n) (remote) ``Actor``. + A 'portal' to a memory-domain-separated `Actor`. A portal is "opened" (and eventually closed) by one side of an inter-actor communication context. The side which opens the portal is equivalent to a "caller" in function parlance and usually is either the called actor's parent (in process tree hierarchy terms) or a client interested in scheduling work to be done remotely in a - far process. + process which has a separate (virtual) memory domain. The portal api allows the "caller" actor to invoke remote routines and receive results through an underlying ``tractor.Channel`` as @@ -89,9 +95,9 @@ class Portal: like having a "portal" between the seperate actor memory spaces. ''' - # the timeout for a remote cancel request sent to - # a(n) (peer) actor. - cancel_timeout = 0.5 + # global timeout for remote cancel requests sent to + # connected (peer) actors. + cancel_timeout: float = 0.5 def __init__(self, channel: Channel) -> None: self.channel = channel @@ -393,12 +399,32 @@ class Portal: ) -> AsyncGenerator[tuple[Context, Any], None]: ''' - Open an inter-actor task context. + Open an inter-actor "task context"; a remote task is + scheduled and cancel-scope-state-linked to a `trio.run()` across + memory boundaries in another actor's runtime. - This is a synchronous API which allows for deterministic - setup/teardown of a remote task. The yielded ``Context`` further - allows for opening bidirectional streams, explicit cancellation - and synchronized final result collection. See ``tractor.Context``. + This is an `@acm` API which allows for deterministic setup + and teardown of a remotely scheduled task in another remote + actor. Once opened, the 2 now "linked" tasks run completely + in parallel in each actor's runtime with their enclosing + `trio.CancelScope`s kept in a synced state wherein if + either side errors or cancels an equivalent error is + relayed to the other side via an SC-compat IPC protocol. + + The yielded `tuple` is a pair delivering a `tractor.Context` + and any first value "sent" by the "callee" task via a call + to `Context.started()`; this side of the + context does not unblock until the "callee" task calls + `.started()` in similar style to `trio.Nursery.start()`. + When the "callee" (side that is "called"/started by a call + to *this* method) returns, the caller side (this) unblocks + and any final value delivered from the other end can be + retrieved using the `Contex.result()` api. + + The yielded ``Context`` instance further allows for opening + bidirectional streams, explicit cancellation and + structurred-concurrency-synchronized final result-msg + collection. See ``tractor.Context`` for more details. ''' # conduct target func method structural checks @@ -431,47 +457,52 @@ class Portal: ) assert ctx._remote_func_type == 'context' - msg = await ctx._recv_chan.receive() + msg: dict = await ctx._recv_chan.receive() try: # the "first" value here is delivered by the callee's # ``Context.started()`` call. 
first = msg['started'] - ctx._started_called = True + ctx._started_called: bool = True except KeyError: - assert msg.get('cid'), ("Received internal error at context?") + if not (cid := msg.get('cid')): + raise MessagingError( + 'Received internal error at context?\n' + 'No call-id (cid) in startup msg?' + ) if msg.get('error'): - # raise kerr from unpack_error(msg, self.channel) + # NOTE: mask the key error with the remote one raise unpack_error(msg, self.channel) from None else: raise MessagingError( - f'Context for {ctx.cid} was expecting a `started` message' - f' but received a non-error msg:\n{pformat(msg)}' + f'Context for {cid} was expecting a `started` message' + ' but received a non-error msg:\n' + f'{pformat(msg)}' ) - _err: BaseException | None = None ctx._portal: Portal = self - uid: tuple = self.channel.uid cid: str = ctx.cid - etype: Type[BaseException] | None = None - # deliver context instance and .started() msg value in enter - # tuple. + # placeholder for any exception raised in the runtime + # or by user tasks which cause this context's closure. + scope_err: BaseException | None = None try: async with trio.open_nursery() as nurse: - ctx._scope_nursery = nurse - ctx._scope = nurse.cancel_scope + ctx._scope_nursery: trio.Nursery = nurse + ctx._scope: trio.CancelScope = nurse.cancel_scope + # deliver context instance and .started() msg value + # in enter tuple. yield ctx, first - # when in allow_ovveruns mode there may be lingering - # overflow sender tasks remaining? + # when in allow_overruns mode there may be + # lingering overflow sender tasks remaining? if nurse.child_tasks: - # ensure we are in overrun state with - # ``._allow_overruns=True`` bc otherwise + # XXX: ensure we are in overrun state + # with ``._allow_overruns=True`` bc otherwise # there should be no tasks in this nursery! if ( not ctx._allow_overruns @@ -479,47 +510,69 @@ class Portal: ): raise RuntimeError( 'Context has sub-tasks but is ' - 'not in `allow_overruns=True` Mode!?' + 'not in `allow_overruns=True` mode!?' ) + + # ensure cancel of all overflow sender tasks + # started in the ctx nursery. ctx._scope.cancel() - except ContextCancelled as err: - _err = err + # XXX: (maybe) shield/mask context-cancellations that were + # initiated by any of the context's 2 tasks. There are + # subsequently 2 operating cases for a "graceful cancel" + # of a `Context`: + # + # 1.*this* side's task called `Context.cancel()`, in + # which case we mask the `ContextCancelled` from bubbling + # to the opener (much like how `trio.Nursery` swallows + # any `trio.Cancelled` bubbled by a call to + # `Nursery.cancel_scope.cancel()`) + # + # 2.*the other* side's (callee/spawned) task cancelled due + # to a self or peer cancellation request in which case we + # DO let the error bubble to the opener. + except ContextCancelled as ctxc: + scope_err = ctxc - # swallow and mask cross-actor task context cancels that - # were initiated by *this* side's task. + # CASE 1: this context was never cancelled + # via a local task's call to `Context.cancel()`. if not ctx._cancel_called: - # XXX: this should NEVER happen! - # from .devx._debug import breakpoint - # await breakpoint() raise - # if the context was cancelled by client code - # then we don't need to raise since user code - # is expecting this and the block should exit. + # CASE 2: context was cancelled by local task calling + # `.cancel()`, we don't raise and the exit block should + # exit silently. 
else: - log.debug(f'Context {ctx} cancelled gracefully') + log.debug( + f'Context {ctx} cancelled gracefully with:\n' + f'{ctxc}' + ) except ( - BaseException, + # - a standard error in the caller/yieldee + Exception, - # more specifically, we need to handle these but not - # sure it's worth being pedantic: - # Exception, - # trio.Cancelled, - # KeyboardInterrupt, + # - a runtime teardown exception-group and/or + # cancellation request from a caller task. + BaseExceptionGroup, + trio.Cancelled, + KeyboardInterrupt, ) as err: - etype = type(err) + scope_err = err - # cancel ourselves on any error. + # XXX: request cancel of this context on any error. + # NOTE: `Context.cancel()` is conversely NOT called in + # the `ContextCancelled` "cancellation requested" case + # above. log.cancel( - 'Context cancelled for task, sending cancel request..\n' + 'Context cancelled for task due to\n' + f'{err}\n' + 'Sending cancel request..\n' f'task:{cid}\n' f'actor:{uid}' ) try: - await ctx.cancel() except trio.BrokenResourceError: log.warning( @@ -528,8 +581,9 @@ class Portal: f'actor:{uid}' ) - raise + raise # duh + # no scope error case else: if ctx.chan.connected(): log.info( @@ -537,10 +591,20 @@ class Portal: f'task: {cid}\n' f'actor: {uid}' ) + # XXX NOTE XXX: the below call to + # `Context.result()` will ALWAYS raise + # a `ContextCancelled` (via an embedded call to + # `Context._maybe_raise_remote_err()`) IFF + # a `Context._remote_error` was set by the runtime + # via a call to + # `Context._maybe_cancel_and_set_remote_error()` + # which IS SET any time the far end fails and + # causes "caller side" cancellation via + # a `ContextCancelled` here. result = await ctx.result() log.runtime( - f'Context {fn_name} returned ' - f'value from callee `{result}`' + f'Context {fn_name} returned value from callee:\n' + f'`{result}`' ) finally: @@ -548,22 +612,73 @@ class Portal: # operating *in* this scope to have survived # we tear down the runtime feeder chan last # to avoid premature stream clobbers. - if ctx._recv_chan is not None: - # should we encapsulate this in the context api? - await ctx._recv_chan.aclose() + rxchan: trio.ReceiveChannel = ctx._recv_chan + if ( + rxchan - if etype: + # maybe TODO: yes i know the below check is + # touching `trio` memchan internals..BUT, there are + # only a couple ways to avoid a `trio.Cancelled` + # bubbling from the `.aclose()` call below: + # + # - catch and mask it via the cancel-scope-shielded call + # as we are rn (manual and frowned upon) OR, + # - specially handle the case where `scope_err` is + # one of {`BaseExceptionGroup`, `trio.Cancelled`} + # and then presume that the `.aclose()` call will + # raise a `trio.Cancelled` and just don't call it + # in those cases.. + # + # that latter approach is more logic, LOC, and more + # convoluted so for now stick with the first + # psuedo-hack-workaround where we just try to avoid + # the shielded call as much as we can detect from + # the memchan's `._closed` state.. + # + # XXX MOTIVATION XXX-> we generally want to raise + # any underlying actor-runtime/internals error that + # surfaces from a bug in tractor itself so it can + # be easily detected/fixed AND, we also want to + # minimize noisy runtime tracebacks (normally due + # to the cross-actor linked task scope machinery + # teardown) displayed to user-code and instead only + # displaying `ContextCancelled` traces where the + # cause of crash/exit IS due to something in + # user/app code on either end of the context. 
+ and not rxchan._closed + ): + # XXX NOTE XXX: and again as per above, we mask any + # `trio.Cancelled` raised here so as to NOT mask + # out any exception group or legit (remote) ctx + # error that sourced from the remote task or its + # runtime. + with trio.CancelScope(shield=True): + await ctx._recv_chan.aclose() + + # XXX: since we always (maybe) re-raise (and thus also + # mask runtime machinery related + # multi-`trio.Cancelled`s) any scope error which was + # the underlying cause of this context's exit, add + # different log msgs for each of the (2) cases. + if scope_err is not None: + etype: Type[BaseException] = type(scope_err) + + # CASE 2 if ctx._cancel_called: log.cancel( - f'Context {fn_name} cancelled by caller with\n{etype}' + f'Context {fn_name} cancelled by caller with\n' + f'{etype}' ) - elif _err is not None: + + # CASE 1 + else: log.cancel( - f'Context for task cancelled by callee with {etype}\n' + f'Context cancelled by callee with {etype}\n' f'target: `{fn_name}`\n' f'task:{cid}\n' f'actor:{uid}' ) + # XXX: (MEGA IMPORTANT) if this is a root opened process we # wait for any immediate child in debug before popping the # context from the runtime msg loop otherwise inside @@ -572,10 +687,10 @@ class Portal: # a "stop" msg for a stream), this can result in a deadlock # where the root is waiting on the lock to clear but the # child has already cleared it and clobbered IPC. - from .devx._debug import maybe_wait_for_debugger await maybe_wait_for_debugger() - # remove the context from runtime tracking + # FINALLY, remove the context from runtime tracking and + # exit Bo self.actor._contexts.pop( (self.channel.uid, ctx.cid), None, -- 2.34.1 From 43b659dbe42a6fe3e2a4154f2d2a5e1088149862 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 13:19:34 -0400 Subject: [PATCH 050/378] Tidy/clarify another `._runtime` comment --- tractor/_runtime.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 881fe535..2e75cff1 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -356,14 +356,13 @@ async def _invoke( and ctx._enter_debugger_on_cancel ) ): - # XXX: is there any case where we'll want to debug IPC - # disconnects as a default? - # - # I can't think of a reason that inspecting - # this type of failure will be useful for respawns or - # recovery logic - the only case is some kind of strange bug - # in our transport layer itself? Going to keep this - # open ended for now. + # XXX QUESTION XXX: is there any case where we'll + # want to debug IPC disconnects as a default? + # => I can't think of a reason that inspecting this + # type of failure will be useful for respawns or + # recovery logic - the only case is some kind of + # strange bug in our transport layer itself? Going + # to keep this open ended for now. entered_debug = await _debug._maybe_enter_pm(err) if not entered_debug: -- 2.34.1 From 87c1113de47095fde9bfbd25005875996a6ab2d1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 13:20:29 -0400 Subject: [PATCH 051/378] Always set default reg addr in `find_actor()` if not defined --- tractor/_discovery.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 22ab88d1..0f9f88e5 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -22,6 +22,7 @@ management of (service) actors. 
from __future__ import annotations from typing import ( AsyncGenerator, + AsyncContextManager, TYPE_CHECKING, ) from contextlib import asynccontextmanager as acm @@ -190,11 +191,19 @@ async def find_actor( else: yield None + if not registry_addrs: + from ._root import _default_lo_addrs + registry_addrs = _default_lo_addrs + + maybe_portals: list[ + AsyncContextManager[tuple[str, int]] + ] = list( + maybe_open_portal_from_reg_addr(addr) + for addr in registry_addrs + ) + async with gather_contexts( - mngrs=list( - maybe_open_portal_from_reg_addr(addr) - for addr in registry_addrs - ) + mngrs=maybe_portals, ) as maybe_portals: print(f'Portalz: {maybe_portals}') if not maybe_portals: @@ -206,6 +215,9 @@ async def find_actor( yield portals[0] else: + # TODO: currently this may return multiple portals + # given there are multi-homed or multiple registrars.. + # SO, we probably need de-duplication logic? yield portals -- 2.34.1 From ca3f7a1b6b26ea8c72beb139da17ef076f5f9d91 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 13:59:08 -0400 Subject: [PATCH 052/378] Add a first serious inter-peer remote cancel suite Tests that appropriate `Context` exit state, the relay of a `ContextCancelled` error and its `.canceller: tuple[str, str]` value are set when an inter-peer cancellation happens via an "out of band" request method (in this case using `Portal.cancel_actor()` and that cancellation is propagated "horizontally" to other peers. Verify that any such cancellation scenario which also experiences an "error during `ContextCancelled` handling" DOES NOT result in that further error being suppressed and that the user's exception bubbles out of the `Context.open_context()` block(s) appropriately! Likely more tests to come as well as some factoring of the teardown state checks where possible. Pertains to serious testing the major work landing in #357 --- tests/test_inter_peer_cancellation.py | 434 ++++++++++++++++++++------ 1 file changed, 336 insertions(+), 98 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index aeb70e81..46ca5758 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -3,30 +3,32 @@ Codify the cancellation request semantics in terms of one remote actor cancelling another. ''' -from contextlib import asynccontextmanager as acm +# from contextlib import asynccontextmanager as acm +import itertools import pytest import trio import tractor -from tractor._exceptions import ( - StreamOverrun, +from tractor import ( # typing + Portal, + Context, ContextCancelled, ) -def test_self_cancel(): - ''' - 2 cases: - - calls `Actor.cancel()` locally in some task - - calls LocalPortal.cancel_actor()` ? +# def test_self_cancel(): +# ''' +# 2 cases: +# - calls `Actor.cancel()` locally in some task +# - calls LocalPortal.cancel_actor()` ? - ''' - ... +# ''' +# ... @tractor.context async def sleep_forever( - ctx: tractor.Context, + ctx: Context, ) -> None: ''' Sync the context, open a stream then just sleep. @@ -37,13 +39,19 @@ async def sleep_forever( await trio.sleep_forever() -@acm -async def attach_to_sleep_forever(): +@tractor.context +async def error_before_started( + ctx: Context, +) -> None: ''' - Cancel a context **before** any underlying error is raised in order - to trigger a local reception of a ``ContextCancelled`` which **should not** - be re-raised in the local surrounding ``Context`` *iff* the cancel was - requested by **this** side of the context. 
+ This simulates exactly an original bug discovered in: + https://github.com/pikers/piker/issues/244 + + Cancel a context **before** any underlying error is raised so + as to trigger a local reception of a ``ContextCancelled`` which + SHOULD NOT be re-raised in the local surrounding ``Context`` + *iff* the cancel was requested by **this** (callee) side of + the context. ''' async with tractor.wait_for_actor('sleeper') as p2: @@ -51,8 +59,16 @@ async def attach_to_sleep_forever(): p2.open_context(sleep_forever) as (peer_ctx, first), peer_ctx.open_stream(), ): + # NOTE: this WAS inside an @acm body but i factored it + # out and just put it inline here since i don't think + # the mngr part really matters, though maybe it could? try: - yield + # XXX NOTE XXX: THIS sends an UNSERIALIZABLE TYPE which + # should raise a `TypeError` and **NOT BE SWALLOWED** by + # the surrounding try/finally (normally inside the + # body of some acm).. + await ctx.started(object()) + # yield finally: # XXX: previously this would trigger local # ``ContextCancelled`` to be received and raised in the @@ -71,23 +87,6 @@ async def attach_to_sleep_forever(): await peer_ctx.cancel() -@tractor.context -async def error_before_started( - ctx: tractor.Context, -) -> None: - ''' - This simulates exactly an original bug discovered in: - https://github.com/pikers/piker/issues/244 - - ''' - async with attach_to_sleep_forever(): - - # XXX NOTE XXX: THIS sends an UNSERIALIZABLE TYPE which - # should raise a `TypeError` and **NOT BE SWALLOWED** by - # the surrounding acm!!?! - await ctx.started(object()) - - def test_do_not_swallow_error_before_started_by_remote_contextcancelled(): ''' Verify that an error raised in a remote context which itself @@ -121,93 +120,332 @@ def test_do_not_swallow_error_before_started_by_remote_contextcancelled(): @tractor.context -async def sleep_a_bit_then_cancel_sleeper( - ctx: tractor.Context, +async def sleep_a_bit_then_cancel_peer( + ctx: Context, + peer_name: str = 'sleeper', + cancel_after: float = .5, + ) -> None: - async with tractor.wait_for_actor('sleeper') as sleeper: - await ctx.started() - # await trio.sleep_forever() - await trio.sleep(3) - # async with tractor.wait_for_actor('sleeper') as sleeper: - await sleeper.cancel_actor() - - -def test_peer_canceller(): ''' - Verify that a cancellation triggered by a peer (whether in tree - or not) results in a cancelled error with - a `ContextCancelled.errorer` matching the requesting actor. + Connect to peer, sleep as per input delay, cancel the peer. - cases: - - some arbitrary remote peer cancels via Portal.cancel_actor(). - => all other connected peers should get that cancel requesting peer's - uid in the ctx-cancelled error msg. + ''' + peer: Portal + async with tractor.wait_for_actor(peer_name) as peer: + await ctx.started() + await trio.sleep(cancel_after) + await peer.cancel_actor() - - peer spawned a sub-actor which (also) spawned a failing task - which was unhandled and propagated up to the immediate - parent, the peer to the actor that also spawned a remote task - task in that same peer-parent. - - peer cancelled itself - so other peers should - get errors reflecting that the peer was itself the .canceller? 
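# (editor's sketch) the `.canceller` inspection pattern this suite
# exercises from an app-code perspective; the helper name is
# illustrative and not part of this patch:
import tractor


async def result_or_reraise(ctx: tractor.Context) -> None:
    try:
        await ctx.result()
    except tractor.ContextCancelled as ctxc:
        # `.canceller` is the uid (name, uuid) of whichever actor
        # requested the cancellation..
        if ctxc.canceller[0] == tractor.current_actor().uid[0]:
            # ..it was us, so absorb silently
            return
        # ..an out-of-band peer cancelled this context, bubble it
        raise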
+@tractor.context +async def stream_ints( + ctx: Context, +): + await ctx.started() + async with ctx.open_stream() as stream: + for i in itertools.count(): + await stream.send(i) + + +@tractor.context +async def stream_from_peer( + ctx: Context, + peer_name: str = 'sleeper', +) -> None: + + peer: Portal + try: + async with ( + tractor.wait_for_actor(peer_name) as peer, + peer.open_context(stream_ints) as (peer_ctx, first), + peer_ctx.open_stream() as stream, + ): + await ctx.started() + # XXX TODO: big set of questions for this + # - should we raise `ContextCancelled` or `Cancelled` (rn + # it does that) here?! + # - test the `ContextCancelled` OUTSIDE the + # `.open_context()` call? + try: + async for msg in stream: + print(msg) + + except trio.Cancelled: + assert not ctx.cancel_called + assert not ctx.cancelled_caught + + assert not peer_ctx.cancel_called + assert not peer_ctx.cancelled_caught + + assert 'root' in ctx.cancel_called_remote + + raise # XXX MUST NEVER MASK IT!! + + with trio.CancelScope(shield=True): + await tractor.pause() + # pass + # pytest.fail( + raise RuntimeError( + 'peer never triggered local `[Context]Cancelled`?!?' + ) + + # NOTE: cancellation of the (sleeper) peer should always + # cause a `ContextCancelled` raise in this streaming + # actor. + except ContextCancelled as ctxerr: + assert ctxerr.canceller == 'canceller' + assert ctxerr._remote_error is ctxerr + + # CASE 1: we were cancelled by our parent, the root actor. + # TODO: there are other cases depending on how the root + # actor and it's caller side task are written: + # - if the root does not req us to cancel then an + # IPC-transport related error should bubble from the async + # for loop and thus cause local cancellation both here + # and in the root (since in that case this task cancels the + # context with the root, not the other way around) + assert ctx.cancel_called_remote[0] == 'root' + raise + + # except BaseException as err: + + # raise + +# cases: +# - some arbitrary remote peer cancels via Portal.cancel_actor(). +# => all other connected peers should get that cancel requesting peer's +# uid in the ctx-cancelled error msg. + +# - peer spawned a sub-actor which (also) spawned a failing task +# which was unhandled and propagated up to the immediate +# parent, the peer to the actor that also spawned a remote task +# task in that same peer-parent. + +# - peer cancelled itself - so other peers should +# get errors reflecting that the peer was itself the .canceller? + +# - WE cancelled the peer and thus should not see any raised +# `ContextCancelled` as it should be reaped silently? +# => pretty sure `test_context_stream_semantics::test_caller_cancels()` +# already covers this case? + +@pytest.mark.parametrize( + 'error_during_ctxerr_handling', + [False, True], +) +def test_peer_canceller( + error_during_ctxerr_handling: bool, +): + ''' + Verify that a cancellation triggered by an in-actor-tree peer + results in a cancelled errors with all other actors which have + opened contexts to that same actor. + + legend: + name> + a "play button" that indicates a new runtime instance, + an individual actor with `name`. + + .subname> + a subactor who's parent should be on some previous + line and be less indented. + + .actor0> ()-> .actor1> + a inter-actor task context opened (by `async with `Portal.open_context()`) + from actor0 *into* actor1. 
+ + .actor0> ()<=> .actor1> + a inter-actor task context opened (as above) + from actor0 *into* actor1 which INCLUDES an additional + stream open using `async with Context.open_stream()`. + + + ------ - ------ + supervision view + ------ - ------ + root> + .sleeper> TODO: SOME SYNTAX SHOWING JUST SLEEPING + .just_caller> ()=> .sleeper> + .canceller> ()-> .sleeper> + TODO: how define calling `Portal.cancel_actor()` + + In this case a `ContextCancelled` with `.errorer` set to the + requesting actor, in this case 'canceller', should be relayed + to all other actors who have also opened a (remote task) + context with that now cancelled actor. + + ------ - ------ + task view + ------ - ------ + So there are 5 context open in total with 3 from the root to + its children and 2 from children to their peers: + 1. root> ()-> .sleeper> + 2. root> ()-> .streamer> + 3. root> ()-> .canceller> + + 4. .streamer> ()<=> .sleep> + 5. .canceller> ()-> .sleeper> + - calls `Portal.cancel_actor()` - - WE cancelled the peer and thus should not see any raised - `ContextCancelled` as it should be reaped silently? - => pretty sure `test_context_stream_semantics::test_caller_cancels()` - already covers this case? ''' async def main(): - async with tractor.open_nursery() as n: - canceller: tractor.Portal = await n.start_actor( + async with tractor.open_nursery() as an: + canceller: Portal = await an.start_actor( 'canceller', enable_modules=[__name__], ) - sleeper: tractor.Portal = await n.start_actor( + sleeper: Portal = await an.start_actor( 'sleeper', enable_modules=[__name__], ) + just_caller: Portal = await an.start_actor( + 'just_caller', # but i just met her? + enable_modules=[__name__], + ) - async with ( - sleeper.open_context( - sleep_forever, - ) as (sleeper_ctx, sent), + try: + async with ( + sleeper.open_context( + sleep_forever, + ) as (sleeper_ctx, sent), - canceller.open_context( - sleep_a_bit_then_cancel_sleeper, - ) as (canceller_ctx, sent), - ): - # await tractor.pause() - try: - print('PRE CONTEXT RESULT') - await sleeper_ctx.result() + just_caller.open_context( + stream_from_peer, + ) as (caller_ctx, sent), - # TODO: not sure why this isn't catching - # but maybe we need an `ExceptionGroup` and - # the whole except *errs: thinger in 3.11? - except ( - ContextCancelled, - ) as berr: - print('CAUGHT REMOTE CONTEXT CANCEL') + canceller.open_context( + sleep_a_bit_then_cancel_peer, + ) as (canceller_ctx, sent), - # canceller should not have been remotely - # cancelled. - assert canceller_ctx.cancel_called_remote is None + ): + ctxs: list[Context] = [ + sleeper_ctx, + caller_ctx, + canceller_ctx, + ] - # NOTE: will only enter if you wrap in - # a shielded cs.. - # await tractor.pause() # TODO: shield=True) + try: + print('PRE CONTEXT RESULT') + await sleeper_ctx.result() - assert sleeper_ctx.canceller == 'canceller' - assert not sleep_ctx.cancelled_caught + # should never get here + pytest.fail( + 'Context.result() did not raise ctx-cancelled?' + ) + + # TODO: not sure why this isn't catching + # but maybe we need an `ExceptionGroup` and + # the whole except *errs: thinger in 3.11? + except ContextCancelled as ctxerr: + print(f'CAUGHT REMOTE CONTEXT CANCEL {ctxerr}') + + # canceller and caller peers should not + # have been remotely cancelled. 
+ assert canceller_ctx.cancel_called_remote is None + assert caller_ctx.cancel_called_remote is None + + assert ctxerr.canceller[0] == 'canceller' + + # XXX NOTE XXX: since THIS `ContextCancelled` + # HAS NOT YET bubbled up to the + # `sleeper.open_context().__aexit__()` this + # value is not yet set, however outside this + # block it should be. + assert not sleeper_ctx.cancelled_caught + + # TODO: a test which ensures this error is + # bubbled and caught (NOT MASKED) by the + # runtime!!! + if error_during_ctxerr_handling: + raise RuntimeError('Simulated error during teardown') + + raise + + # SHOULD NEVER GET HERE! + except BaseException: + pytest.fail('did not rx ctx-cancelled error?') + else: + pytest.fail('did not rx ctx-cancelled error?') + + except ( + ContextCancelled, + RuntimeError, + )as ctxerr: + _err = ctxerr + + if error_during_ctxerr_handling: + assert isinstance(ctxerr, RuntimeError) + + # NOTE: this root actor task should have + # called `Context.cancel()` on the + # `.__aexit__()` to every opened ctx. + for ctx in ctxs: + assert ctx.cancel_called + + # each context should have received + # a silently absorbed context cancellation + # from its peer actor's task. + assert ctx.chan.uid == ctx.cancel_called_remote + + # this root actor task should have + # cancelled all opened contexts except + # the sleeper which is cancelled by its + # peer "canceller" + if ctx is not sleeper_ctx: + assert ctx._remote_error.canceller[0] == 'root' - raise else: - raise RuntimeError('NEVER RXED EXPECTED `ContextCancelled`') + assert ctxerr.canceller[0] == 'canceller' + # the sleeper's remote error is the error bubbled + # out of the context-stack above! + re = sleeper_ctx._remote_error + assert re is ctxerr - with pytest.raises(tractor.ContextCancelled) as excinfo: - trio.run(main) + for ctx in ctxs: - assert excinfo.value.type == ContextCancelled + if ctx is sleeper_ctx: + assert not ctx.cancel_called + assert ctx.cancelled_caught + else: + assert ctx.cancel_called + assert not ctx.cancelled_caught + + # each context should have received + # a silently absorbed context cancellation + # from its peer actor's task. + assert ctx.chan.uid == ctx.cancel_called_remote + + # NOTE: when an inter-peer cancellation + # occurred, we DO NOT expect this + # root-actor-task to have requested a cancel of + # the context since cancellation was caused by + # the "canceller" peer and thus + # `Context.cancel()` SHOULD NOT have been + # called inside + # `Portal.open_context().__aexit__()`. + assert not sleeper_ctx.cancel_called + + # XXX NOTE XXX: and see matching comment above but, + # this flag is set only AFTER the `.open_context()` + # has exited and should be set in both outcomes + # including the case where ctx-cancel handling + # itself errors. 
+ assert sleeper_ctx.cancelled_caught + assert sleeper_ctx.cancel_called_remote[0] == 'sleeper' + + # await tractor.pause() + raise # always to ensure teardown + + if error_during_ctxerr_handling: + with pytest.raises(RuntimeError) as excinfo: + trio.run(main) + else: + + with pytest.raises(ContextCancelled) as excinfo: + trio.run(main) + + assert excinfo.value.type == ContextCancelled + assert excinfo.value.canceller[0] == 'canceller' -- 2.34.1 From fcc8cee9d380eec540bac30edd69d8a041402dac Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 14:12:58 -0400 Subject: [PATCH 053/378] ._root: set a `_default_lo_addrs` and apply it when not provided by caller --- tractor/_root.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 0969fe55..99a5ad85 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -46,8 +46,14 @@ from ._exceptions import is_multi_cancelled # set at startup and after forks -_default_arbiter_host: str = '127.0.0.1' -_default_arbiter_port: int = 1616 +_default_lo_host: str = '127.0.0.1' +_default_port: int = 1616 + +# default registry always on localhost +_default_lo_addrs: list[tuple[str, int]] = [( + _default_lo_host, + _default_port, +)] logger = log.get_logger('tractor') @@ -124,10 +130,8 @@ async def open_root_actor( registry_addrs: list[tuple[str, int]] = ( registry_addrs - or [( # default on localhost - _default_arbiter_host, - _default_arbiter_port, - )] + or + _default_lo_addrs ) loglevel = (loglevel or log._default_loglevel).upper() @@ -329,9 +333,7 @@ def run_daemon( # runtime kwargs name: str | None = 'root', - registry_addrs: list[tuple[str, int]] = [ - (_default_arbiter_host, _default_arbiter_port) - ], + registry_addrs: list[tuple[str, int]] = _default_lo_addrs, start_method: str | None = None, debug_mode: bool = False, -- 2.34.1 From 215fec1d4114df3bf1211f40b09fac4ab5492791 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 15:01:04 -0400 Subject: [PATCH 054/378] Change old `._debug._pause()` name, cherry to #362 re `greenback` --- tractor/devx/_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index e636e49e..1b225052 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -847,7 +847,7 @@ def pause_from_sync() -> None: # waiting.. not the most ideal but works for now ;) greenback.await_( actor._service_n.start(partial( - _pause, + pause, debug_func=None, # release_lock_signal=task_can_release_tty_lock, )) -- 2.34.1 From 0c74b04c83260b01a67bf21dbfea94c669b4890a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 15:22:54 -0400 Subject: [PATCH 055/378] Facepalm, `wait_for_actor()` dun take an addr `list`.. --- tractor/_discovery.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 0f9f88e5..1fa2a885 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -243,9 +243,7 @@ async def wait_for_actor( DeprecationWarning, stacklevel=2, ) - registry_addr: list[tuple[str, int]] = [ - arbiter_sockaddr, - ] + registry_addr: tuple[str, int] = arbiter_sockaddr # TODO: use `.trionics.gather_contexts()` like # above in `find_actor()` as well? 
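# (editor's sketch) the signature difference this patch codifies,
# using the default localhost registrar addr for illustration:
import tractor


async def locate(name: str) -> None:
    # `find_actor()` accepts a *list* of registrar addrs (and may
    # yield `None` when no matching actor is found)..
    async with tractor.find_actor(
        name,
        registry_addrs=[('127.0.0.1', 1616)],
    ) as maybe_portal:
        print(f'found: {maybe_portal}')

    # ..whereas `wait_for_actor()` takes a single (host, port) tuple.
    async with tractor.wait_for_actor(
        name,
        registry_addr=('127.0.0.1', 1616),
    ) as portal:
        assert portal is not None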
-- 2.34.1 From 190845ce1d0fd12e6027c505922e33168cd0c88c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 15:29:43 -0400 Subject: [PATCH 056/378] Add masked super timeout line to `do_hard_kill()` for would-be runtime hackers --- tractor/_spawn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 9c618557..aede3b75 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -199,6 +199,10 @@ async def do_hard_kill( proc: trio.Process, terminate_after: int = 3, + # NOTE: for mucking with `.pause()`-ing inside the runtime + # whilst also hacking on it XD + # terminate_after: int = 99999, + ) -> None: # NOTE: this timeout used to do nothing since we were shielding # the ``.wait()`` inside ``new_proc()`` which will pretty much -- 2.34.1 From 1e689ee7012831951a44c6142911fc8e0d54ed91 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 15:35:35 -0400 Subject: [PATCH 057/378] Rename fixture `arb_addr` -> `reg_addr` and set the session value globally as `._root._default_lo_addrs` --- tests/conftest.py | 60 ++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3363cf56..8e9a67c4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,7 +29,7 @@ def tractor_test(fn): If fixtures: - - ``arb_addr`` (a socket addr tuple where arbiter is listening) + - ``reg_addr`` (a socket addr tuple where arbiter is listening) - ``loglevel`` (logging level passed to tractor internals) - ``start_method`` (subprocess spawning backend) @@ -40,16 +40,16 @@ def tractor_test(fn): def wrapper( *args, loglevel=None, - arb_addr=None, + reg_addr=None, start_method=None, **kwargs ): # __tracebackhide__ = True - if 'arb_addr' in inspect.signature(fn).parameters: + if 'reg_addr' in inspect.signature(fn).parameters: # injects test suite fixture value to test as well # as `run()` - kwargs['arb_addr'] = arb_addr + kwargs['reg_addr'] = reg_addr if 'loglevel' in inspect.signature(fn).parameters: # allows test suites to define a 'loglevel' fixture @@ -71,7 +71,7 @@ def tractor_test(fn): async def _main(): async with tractor.open_root_actor( # **kwargs, - arbiter_addr=arb_addr, + registry_addrs=[reg_addr] if reg_addr else None, loglevel=loglevel, start_method=start_method, @@ -92,9 +92,6 @@ def tractor_test(fn): return wrapper -_arb_addr = '127.0.0.1', random.randint(1000, 9999) - - # Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives if platform.system() == 'Windows': _KILL_SIGNAL = signal.CTRL_BREAK_EVENT @@ -173,9 +170,23 @@ def ci_env() -> bool: return _ci_env +# choose randomly at import time +_reg_addr: tuple[str, int] = ( + '127.0.0.1', + random.randint(1000, 9999), +) + + @pytest.fixture(scope='session') -def arb_addr(): - return _arb_addr +def reg_addr() -> tuple[str, int]: + + # globally override the runtime to the per-test-session-dynamic + # addr so that all tests never conflict with any other actor + # tree using the default. + from tractor import _root + _root._default_lo_addrs = [_reg_addr] + + return _reg_addr def pytest_generate_tests(metafunc): @@ -216,30 +227,35 @@ def sig_prog(proc, sig): def daemon( loglevel: str, testdir, - arb_addr: tuple[str, int], + reg_addr: tuple[str, int], ): ''' - Run a daemon actor as a "remote arbiter". + Run a daemon root actor as a separate actor-process tree and + "remote registrar" for discovery-protocol related tests. 
''' if loglevel in ('trace', 'debug'): - # too much logging will lock up the subproc (smh) - loglevel = 'info' + # XXX: too much logging will lock up the subproc (smh) + loglevel: str = 'info' - cmdargs = [ - sys.executable, '-c', - "import tractor; tractor.run_daemon([], registry_addr={}, loglevel={})" - .format( - arb_addr, - "'{}'".format(loglevel) if loglevel else None) + code: str = ( + "import tractor; " + "tractor.run_daemon([], registry_addrs={reg_addrs}, loglevel={ll})" + ).format( + reg_addrs=str([reg_addr]), + ll="'{}'".format(loglevel) if loglevel else None, + ) + cmd: list[str] = [ + sys.executable, + '-c', code, ] - kwargs = dict() + kwargs = {} if platform.system() == 'Windows': # without this, tests hang on windows forever kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP proc = testdir.popen( - cmdargs, + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs, -- 2.34.1 From 6b1ceee19f9e5b9d76e332b264b578b26b3d6932 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 15:36:00 -0400 Subject: [PATCH 058/378] Type out the full-fledged streaming ex. --- examples/full_fledged_streaming_service.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/full_fledged_streaming_service.py b/examples/full_fledged_streaming_service.py index 1650b583..c93df242 100644 --- a/examples/full_fledged_streaming_service.py +++ b/examples/full_fledged_streaming_service.py @@ -65,21 +65,28 @@ async def aggregate(seed): print("AGGREGATOR COMPLETE!") -# this is the main actor and *arbiter* -async def main(): - # a nursery which spawns "actors" - async with tractor.open_nursery( - arbiter_addr=('127.0.0.1', 1616) - ) as nursery: +async def main() -> list[int]: + ''' + This is the "root" actor's main task's entrypoint. + + By default (and if not otherwise specified) that root process + also acts as a "registry actor" / "registrar" on the localhost + for the purposes of multi-actor "service discovery". 
+ + ''' + # yes, a nursery which spawns `trio`-"actors" B) + nursery: tractor.ActorNursery + async with tractor.open_nursery() as nursery: seed = int(1e3) pre_start = time.time() - portal = await nursery.start_actor( + portal: tractor.Portal = await nursery.start_actor( name='aggregator', enable_modules=[__name__], ) + stream: tractor.MsgStream async with portal.open_stream_from( aggregate, seed=seed, -- 2.34.1 From 0e9457299c45097b71712625b44a9177aa83fd61 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 15:39:20 -0400 Subject: [PATCH 059/378] Port all tests to new `reg_addr` fixture name --- tests/test_cancellation.py | 29 ++++---- tests/test_child_manages_service_nursery.py | 2 +- tests/test_context_stream_semantics.py | 2 +- tests/test_debugger.py | 2 +- tests/test_discovery.py | 76 +++++++++++++-------- tests/test_infected_asyncio.py | 26 +++---- tests/test_legacy_one_way_streaming.py | 22 +++--- tests/test_local.py | 12 ++-- tests/test_multi_program.py | 10 +-- tests/test_pubsub.py | 8 +-- tests/test_rpc.py | 4 +- tests/test_spawning.py | 14 ++-- tests/test_task_broadcasting.py | 18 ++--- 13 files changed, 126 insertions(+), 99 deletions(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 657ab8e4..ce396ace 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -47,7 +47,7 @@ async def do_nuthin(): ], ids=['no_args', 'unexpected_args'], ) -def test_remote_error(arb_addr, args_err): +def test_remote_error(reg_addr, args_err): """Verify an error raised in a subactor that is propagated to the parent nursery, contains the underlying boxed builtin error type info and causes cancellation and reraising all the @@ -57,7 +57,7 @@ def test_remote_error(arb_addr, args_err): async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ) as nursery: # on a remote type error caused by bad input args @@ -97,7 +97,7 @@ def test_remote_error(arb_addr, args_err): assert exc.type == errtype -def test_multierror(arb_addr): +def test_multierror(reg_addr): ''' Verify we raise a ``BaseExceptionGroup`` out of a nursery where more then one actor errors. @@ -105,7 +105,7 @@ def test_multierror(arb_addr): ''' async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ) as nursery: await nursery.run_in_actor(assert_err, name='errorer1') @@ -130,14 +130,14 @@ def test_multierror(arb_addr): @pytest.mark.parametrize( 'num_subactors', range(25, 26), ) -def test_multierror_fast_nursery(arb_addr, start_method, num_subactors, delay): +def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay): """Verify we raise a ``BaseExceptionGroup`` out of a nursery where more then one actor errors and also with a delay before failure to test failure during an ongoing spawning. """ async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ) as nursery: for i in range(num_subactors): @@ -175,15 +175,20 @@ async def do_nothing(): @pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt]) -def test_cancel_single_subactor(arb_addr, mechanism): - """Ensure a ``ActorNursery.start_actor()`` spawned subactor +def test_cancel_single_subactor(reg_addr, mechanism): + ''' + Ensure a ``ActorNursery.start_actor()`` spawned subactor cancels when the nursery is cancelled. - """ + + ''' async def spawn_actor(): - """Spawn an actor that blocks indefinitely. 
- """ + ''' + Spawn an actor that blocks indefinitely then cancel via + either `ActorNursery.cancel()` or an exception raise. + + ''' async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ) as nursery: portal = await nursery.start_actor( diff --git a/tests/test_child_manages_service_nursery.py b/tests/test_child_manages_service_nursery.py index 806e6d7e..fd1ceb80 100644 --- a/tests/test_child_manages_service_nursery.py +++ b/tests/test_child_manages_service_nursery.py @@ -141,7 +141,7 @@ async def open_actor_local_nursery( ) def test_actor_managed_trio_nursery_task_error_cancels_aio( asyncio_mode: bool, - arb_addr + reg_addr: tuple, ): ''' Verify that a ``trio`` nursery created managed in a child actor diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index a0d291d7..29d50e84 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -5,7 +5,7 @@ Verify the we raise errors when streams are opened prior to sync-opening a ``tractor.Context`` beforehand. ''' -from contextlib import asynccontextmanager as acm +# from contextlib import asynccontextmanager as acm from itertools import count import platform from typing import Optional diff --git a/tests/test_debugger.py b/tests/test_debugger.py index a44a3138..e7bb0d73 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -78,7 +78,7 @@ has_nested_actors = pytest.mark.has_nested_actors def spawn( start_method, testdir, - arb_addr, + reg_addr, ) -> 'pexpect.spawn': if start_method != 'trio': diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 8ba4ebee..8b47700c 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -15,19 +15,19 @@ from conftest import tractor_test @tractor_test -async def test_reg_then_unreg(arb_addr): +async def test_reg_then_unreg(reg_addr): actor = tractor.current_actor() assert actor.is_arbiter assert len(actor._registry) == 1 # only self is registered async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ) as n: portal = await n.start_actor('actor', enable_modules=[__name__]) uid = portal.channel.uid - async with tractor.get_arbiter(*arb_addr) as aportal: + async with tractor.get_arbiter(*reg_addr) as aportal: # this local actor should be the arbiter assert actor is aportal.actor @@ -53,15 +53,27 @@ async def hi(): return the_line.format(tractor.current_actor().name) -async def say_hello(other_actor): +async def say_hello( + other_actor: str, + reg_addr: tuple[str, int], +): await trio.sleep(1) # wait for other actor to spawn - async with tractor.find_actor(other_actor) as portal: + async with tractor.find_actor( + other_actor, + registry_addrs=[reg_addr], + ) as portal: assert portal is not None return await portal.run(__name__, 'hi') -async def say_hello_use_wait(other_actor): - async with tractor.wait_for_actor(other_actor) as portal: +async def say_hello_use_wait( + other_actor: str, + reg_addr: tuple[str, int], +): + async with tractor.wait_for_actor( + other_actor, + registry_addr=reg_addr, + ) as portal: assert portal is not None result = await portal.run(__name__, 'hi') return result @@ -69,21 +81,29 @@ async def say_hello_use_wait(other_actor): @tractor_test @pytest.mark.parametrize('func', [say_hello, say_hello_use_wait]) -async def test_trynamic_trio(func, start_method, arb_addr): - """Main tractor entry point, the "master" process (for now - acts as the "director"). 
- """ +async def test_trynamic_trio( + func, + start_method, + reg_addr, +): + ''' + Root actor acting as the "director" and running one-shot-task-actors + for the directed subs. + + ''' async with tractor.open_nursery() as n: print("Alright... Action!") donny = await n.run_in_actor( func, other_actor='gretchen', + reg_addr=reg_addr, name='donny', ) gretchen = await n.run_in_actor( func, other_actor='donny', + reg_addr=reg_addr, name='gretchen', ) print(await gretchen.result()) @@ -131,7 +151,7 @@ async def unpack_reg(actor_or_portal): async def spawn_and_check_registry( - arb_addr: tuple, + reg_addr: tuple, use_signal: bool, remote_arbiter: bool = False, with_streaming: bool = False, @@ -139,9 +159,9 @@ async def spawn_and_check_registry( ) -> None: async with tractor.open_root_actor( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ): - async with tractor.get_arbiter(*arb_addr) as portal: + async with tractor.get_arbiter(*reg_addr) as portal: # runtime needs to be up to call this actor = tractor.current_actor() @@ -213,17 +233,19 @@ async def spawn_and_check_registry( def test_subactors_unregister_on_cancel( start_method, use_signal, - arb_addr, + reg_addr, with_streaming, ): - """Verify that cancelling a nursery results in all subactors + ''' + Verify that cancelling a nursery results in all subactors deregistering themselves with the arbiter. - """ + + ''' with pytest.raises(KeyboardInterrupt): trio.run( partial( spawn_and_check_registry, - arb_addr, + reg_addr, use_signal, remote_arbiter=False, with_streaming=with_streaming, @@ -237,7 +259,7 @@ def test_subactors_unregister_on_cancel_remote_daemon( daemon, start_method, use_signal, - arb_addr, + reg_addr, with_streaming, ): """Verify that cancelling a nursery results in all subactors @@ -248,7 +270,7 @@ def test_subactors_unregister_on_cancel_remote_daemon( trio.run( partial( spawn_and_check_registry, - arb_addr, + reg_addr, use_signal, remote_arbiter=True, with_streaming=with_streaming, @@ -262,7 +284,7 @@ async def streamer(agen): async def close_chans_before_nursery( - arb_addr: tuple, + reg_addr: tuple, use_signal: bool, remote_arbiter: bool = False, ) -> None: @@ -275,9 +297,9 @@ async def close_chans_before_nursery( entries_at_end = 1 async with tractor.open_root_actor( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ): - async with tractor.get_arbiter(*arb_addr) as aportal: + async with tractor.get_arbiter(*reg_addr) as aportal: try: get_reg = partial(unpack_reg, aportal) @@ -329,7 +351,7 @@ async def close_chans_before_nursery( def test_close_channel_explicit( start_method, use_signal, - arb_addr, + reg_addr, ): """Verify that closing a stream explicitly and killing the actor's "root nursery" **before** the containing nursery tears down also @@ -339,7 +361,7 @@ def test_close_channel_explicit( trio.run( partial( close_chans_before_nursery, - arb_addr, + reg_addr, use_signal, remote_arbiter=False, ), @@ -351,7 +373,7 @@ def test_close_channel_explicit_remote_arbiter( daemon, start_method, use_signal, - arb_addr, + reg_addr, ): """Verify that closing a stream explicitly and killing the actor's "root nursery" **before** the containing nursery tears down also @@ -361,7 +383,7 @@ def test_close_channel_explicit_remote_arbiter( trio.run( partial( close_chans_before_nursery, - arb_addr, + reg_addr, use_signal, remote_arbiter=True, ), diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index dd9d681a..76744198 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -47,7 
+47,7 @@ async def trio_cancels_single_aio_task(): await tractor.to_asyncio.run_task(sleep_forever) -def test_trio_cancels_aio_on_actor_side(arb_addr): +def test_trio_cancels_aio_on_actor_side(reg_addr): ''' Spawn an infected actor that is cancelled by the ``trio`` side task using std cancel scope apis. @@ -55,7 +55,7 @@ def test_trio_cancels_aio_on_actor_side(arb_addr): ''' async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr + registry_addrs=[reg_addr] ) as n: await n.run_in_actor( trio_cancels_single_aio_task, @@ -94,7 +94,7 @@ async def asyncio_actor( raise -def test_aio_simple_error(arb_addr): +def test_aio_simple_error(reg_addr): ''' Verify a simple remote asyncio error propagates back through trio to the parent actor. @@ -103,7 +103,7 @@ def test_aio_simple_error(arb_addr): ''' async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr + registry_addrs=[reg_addr] ) as n: await n.run_in_actor( asyncio_actor, @@ -120,7 +120,7 @@ def test_aio_simple_error(arb_addr): assert err.type == AssertionError -def test_tractor_cancels_aio(arb_addr): +def test_tractor_cancels_aio(reg_addr): ''' Verify we can cancel a spawned asyncio task gracefully. @@ -139,7 +139,7 @@ def test_tractor_cancels_aio(arb_addr): trio.run(main) -def test_trio_cancels_aio(arb_addr): +def test_trio_cancels_aio(reg_addr): ''' Much like the above test with ``tractor.Portal.cancel_actor()`` except we just use a standard ``trio`` cancellation api. @@ -194,7 +194,7 @@ async def trio_ctx( ids='parent_actor_cancels_child={}'.format ) def test_context_spawns_aio_task_that_errors( - arb_addr, + reg_addr, parent_cancels: bool, ): ''' @@ -258,7 +258,7 @@ async def aio_cancel(): await sleep_forever() -def test_aio_cancelled_from_aio_causes_trio_cancelled(arb_addr): +def test_aio_cancelled_from_aio_causes_trio_cancelled(reg_addr): async def main(): async with tractor.open_nursery() as n: @@ -395,7 +395,7 @@ async def stream_from_aio( 'fan_out', [False, True], ids='fan_out_w_chan_subscribe={}'.format ) -def test_basic_interloop_channel_stream(arb_addr, fan_out): +def test_basic_interloop_channel_stream(reg_addr, fan_out): async def main(): async with tractor.open_nursery() as n: portal = await n.run_in_actor( @@ -409,7 +409,7 @@ def test_basic_interloop_channel_stream(arb_addr, fan_out): # TODO: parametrize the above test and avoid the duplication here? 
-def test_trio_error_cancels_intertask_chan(arb_addr): +def test_trio_error_cancels_intertask_chan(reg_addr): async def main(): async with tractor.open_nursery() as n: portal = await n.run_in_actor( @@ -428,7 +428,7 @@ def test_trio_error_cancels_intertask_chan(arb_addr): assert exc.type == Exception -def test_trio_closes_early_and_channel_exits(arb_addr): +def test_trio_closes_early_and_channel_exits(reg_addr): async def main(): async with tractor.open_nursery() as n: portal = await n.run_in_actor( @@ -443,7 +443,7 @@ def test_trio_closes_early_and_channel_exits(arb_addr): trio.run(main) -def test_aio_errors_and_channel_propagates_and_closes(arb_addr): +def test_aio_errors_and_channel_propagates_and_closes(reg_addr): async def main(): async with tractor.open_nursery() as n: portal = await n.run_in_actor( @@ -520,7 +520,7 @@ async def trio_to_aio_echo_server( ids='raise_error={}'.format, ) def test_echoserver_detailed_mechanics( - arb_addr, + reg_addr, raise_error_mid_stream, ): diff --git a/tests/test_legacy_one_way_streaming.py b/tests/test_legacy_one_way_streaming.py index 17e94ba3..0cbda4d8 100644 --- a/tests/test_legacy_one_way_streaming.py +++ b/tests/test_legacy_one_way_streaming.py @@ -55,7 +55,7 @@ async def context_stream( async def stream_from_single_subactor( - arb_addr, + reg_addr, start_method, stream_func, ): @@ -64,7 +64,7 @@ async def stream_from_single_subactor( # only one per host address, spawns an actor if None async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], start_method=start_method, ) as nursery: @@ -115,13 +115,13 @@ async def stream_from_single_subactor( @pytest.mark.parametrize( 'stream_func', [async_gen_stream, context_stream] ) -def test_stream_from_single_subactor(arb_addr, start_method, stream_func): +def test_stream_from_single_subactor(reg_addr, start_method, stream_func): """Verify streaming from a spawned async generator. """ trio.run( partial( stream_from_single_subactor, - arb_addr, + reg_addr, start_method, stream_func=stream_func, ), @@ -225,14 +225,14 @@ async def a_quadruple_example(): return result_stream -async def cancel_after(wait, arb_addr): - async with tractor.open_root_actor(arbiter_addr=arb_addr): +async def cancel_after(wait, reg_addr): + async with tractor.open_root_actor(registry_addrs=[reg_addr]): with trio.move_on_after(wait): return await a_quadruple_example() @pytest.fixture(scope='module') -def time_quad_ex(arb_addr, ci_env, spawn_backend): +def time_quad_ex(reg_addr, ci_env, spawn_backend): if spawn_backend == 'mp': """no idea but the mp *nix runs are flaking out here often... """ @@ -240,7 +240,7 @@ def time_quad_ex(arb_addr, ci_env, spawn_backend): timeout = 7 if platform.system() in ('Windows', 'Darwin') else 4 start = time.time() - results = trio.run(cancel_after, timeout, arb_addr) + results = trio.run(cancel_after, timeout, reg_addr) diff = time.time() - start assert results return results, diff @@ -260,14 +260,14 @@ def test_a_quadruple_example(time_quad_ex, ci_env, spawn_backend): list(map(lambda i: i/10, range(3, 9))) ) def test_not_fast_enough_quad( - arb_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend + reg_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend ): """Verify we can cancel midway through the quad example and all actors cancel gracefully. 
""" results, diff = time_quad_ex delay = max(diff - cancel_delay, 0) - results = trio.run(cancel_after, delay, arb_addr) + results = trio.run(cancel_after, delay, reg_addr) system = platform.system() if system in ('Windows', 'Darwin') and results is not None: # In CI envoirments it seems later runs are quicker then the first @@ -280,7 +280,7 @@ def test_not_fast_enough_quad( @tractor_test async def test_respawn_consumer_task( - arb_addr, + reg_addr, spawn_backend, loglevel, ): diff --git a/tests/test_local.py b/tests/test_local.py index 97a83285..009d0d71 100644 --- a/tests/test_local.py +++ b/tests/test_local.py @@ -24,7 +24,7 @@ async def test_no_runtime(): @tractor_test -async def test_self_is_registered(arb_addr): +async def test_self_is_registered(reg_addr): "Verify waiting on the arbiter to register itself using the standard api." actor = tractor.current_actor() assert actor.is_arbiter @@ -34,20 +34,20 @@ async def test_self_is_registered(arb_addr): @tractor_test -async def test_self_is_registered_localportal(arb_addr): +async def test_self_is_registered_localportal(reg_addr): "Verify waiting on the arbiter to register itself using a local portal." actor = tractor.current_actor() assert actor.is_arbiter - async with tractor.get_arbiter(*arb_addr) as portal: + async with tractor.get_arbiter(*reg_addr) as portal: assert isinstance(portal, tractor._portal.LocalPortal) with trio.fail_after(0.2): sockaddr = await portal.run_from_ns( 'self', 'wait_for_actor', name='root') - assert sockaddr[0] == arb_addr + assert sockaddr[0] == reg_addr -def test_local_actor_async_func(arb_addr): +def test_local_actor_async_func(reg_addr): """Verify a simple async function in-process. """ nums = [] @@ -55,7 +55,7 @@ def test_local_actor_async_func(arb_addr): async def print_loop(): async with tractor.open_root_actor( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ): # arbiter is started in-proc if dne assert tractor.current_actor().is_arbiter diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py index e7a3ac5c..d1ee0f5e 100644 --- a/tests/test_multi_program.py +++ b/tests/test_multi_program.py @@ -28,9 +28,9 @@ def test_abort_on_sigint(daemon): @tractor_test -async def test_cancel_remote_arbiter(daemon, arb_addr): +async def test_cancel_remote_arbiter(daemon, reg_addr): assert not tractor.current_actor().is_arbiter - async with tractor.get_arbiter(*arb_addr) as portal: + async with tractor.get_arbiter(*reg_addr) as portal: await portal.cancel_actor() time.sleep(0.1) @@ -39,16 +39,16 @@ async def test_cancel_remote_arbiter(daemon, arb_addr): # no arbiter socket should exist with pytest.raises(OSError): - async with tractor.get_arbiter(*arb_addr) as portal: + async with tractor.get_arbiter(*reg_addr) as portal: pass -def test_register_duplicate_name(daemon, arb_addr): +def test_register_duplicate_name(daemon, reg_addr): async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], ) as n: assert not tractor.current_actor().is_arbiter diff --git a/tests/test_pubsub.py b/tests/test_pubsub.py index ababcb51..20554fa5 100644 --- a/tests/test_pubsub.py +++ b/tests/test_pubsub.py @@ -160,7 +160,7 @@ async def test_required_args(callwith_expecterror): ) def test_multi_actor_subs_arbiter_pub( loglevel, - arb_addr, + reg_addr, pub_actor, ): """Try out the neato @pub decorator system. 
@@ -170,7 +170,7 @@ def test_multi_actor_subs_arbiter_pub( async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], enable_modules=[__name__], ) as n: @@ -255,12 +255,12 @@ def test_multi_actor_subs_arbiter_pub( def test_single_subactor_pub_multitask_subs( loglevel, - arb_addr, + reg_addr, ): async def main(): async with tractor.open_nursery( - arbiter_addr=arb_addr, + registry_addrs=[reg_addr], enable_modules=[__name__], ) as n: diff --git a/tests/test_rpc.py b/tests/test_rpc.py index 6d158961..7ede231b 100644 --- a/tests/test_rpc.py +++ b/tests/test_rpc.py @@ -45,7 +45,7 @@ async def short_sleep(): ids=['no_mods', 'this_mod', 'this_mod_bad_func', 'fail_to_import', 'fail_on_syntax'], ) -def test_rpc_errors(arb_addr, to_call, testdir): +def test_rpc_errors(reg_addr, to_call, testdir): """Test errors when making various RPC requests to an actor that either doesn't have the requested module exposed or doesn't define the named function. @@ -77,7 +77,7 @@ def test_rpc_errors(arb_addr, to_call, testdir): # spawn a subactor which calls us back async with tractor.open_nursery( - arbiter_addr=arb_addr, + arbiter_addr=reg_addr, enable_modules=exposed_mods.copy(), ) as n: diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 17798c09..0f6a8cfe 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -16,14 +16,14 @@ data_to_pass_down = {'doggy': 10, 'kitty': 4} async def spawn( is_arbiter: bool, data: dict, - arb_addr: tuple[str, int], + reg_addr: tuple[str, int], ): namespaces = [__name__] await trio.sleep(0.1) async with tractor.open_root_actor( - arbiter_addr=arb_addr, + arbiter_addr=reg_addr, ): actor = tractor.current_actor() @@ -41,7 +41,7 @@ async def spawn( is_arbiter=False, name='sub-actor', data=data, - arb_addr=arb_addr, + reg_addr=reg_addr, enable_modules=namespaces, ) @@ -55,12 +55,12 @@ async def spawn( return 10 -def test_local_arbiter_subactor_global_state(arb_addr): +def test_local_arbiter_subactor_global_state(reg_addr): result = trio.run( spawn, True, data_to_pass_down, - arb_addr, + reg_addr, ) assert result == 10 @@ -140,7 +140,7 @@ async def check_loglevel(level): def test_loglevel_propagated_to_subactor( start_method, capfd, - arb_addr, + reg_addr, ): if start_method == 'mp_forkserver': pytest.skip( @@ -152,7 +152,7 @@ def test_loglevel_propagated_to_subactor( async with tractor.open_nursery( name='arbiter', start_method=start_method, - arbiter_addr=arb_addr, + arbiter_addr=reg_addr, ) as tn: await tn.run_in_actor( diff --git a/tests/test_task_broadcasting.py b/tests/test_task_broadcasting.py index 5e18e10a..d7a29134 100644 --- a/tests/test_task_broadcasting.py +++ b/tests/test_task_broadcasting.py @@ -66,13 +66,13 @@ async def ensure_sequence( async def open_sequence_streamer( sequence: list[int], - arb_addr: tuple[str, int], + reg_addr: tuple[str, int], start_method: str, ) -> tractor.MsgStream: async with tractor.open_nursery( - arbiter_addr=arb_addr, + arbiter_addr=reg_addr, start_method=start_method, ) as tn: @@ -93,7 +93,7 @@ async def open_sequence_streamer( def test_stream_fan_out_to_local_subscriptions( - arb_addr, + reg_addr, start_method, ): @@ -103,7 +103,7 @@ def test_stream_fan_out_to_local_subscriptions( async with open_sequence_streamer( sequence, - arb_addr, + reg_addr, start_method, ) as stream: @@ -138,7 +138,7 @@ def test_stream_fan_out_to_local_subscriptions( ] ) def test_consumer_and_parent_maybe_lag( - arb_addr, + reg_addr, start_method, task_delays, ): @@ -150,7 +150,7 @@ def 
test_consumer_and_parent_maybe_lag( async with open_sequence_streamer( sequence, - arb_addr, + reg_addr, start_method, ) as stream: @@ -211,7 +211,7 @@ def test_consumer_and_parent_maybe_lag( def test_faster_task_to_recv_is_cancelled_by_slower( - arb_addr, + reg_addr, start_method, ): ''' @@ -225,7 +225,7 @@ def test_faster_task_to_recv_is_cancelled_by_slower( async with open_sequence_streamer( sequence, - arb_addr, + reg_addr, start_method, ) as stream: @@ -302,7 +302,7 @@ def test_subscribe_errors_after_close(): def test_ensure_slow_consumers_lag_out( - arb_addr, + reg_addr, start_method, ): '''This is a pure local task test; no tractor -- 2.34.1 From 022bf8ce75916f0446a26c6f13f7442067b97440 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 19:08:35 -0400 Subject: [PATCH 060/378] Ensure `registry_addrs` is always set to something --- tractor/_root.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 99a5ad85..a36a7e70 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -46,12 +46,12 @@ from ._exceptions import is_multi_cancelled # set at startup and after forks -_default_lo_host: str = '127.0.0.1' +_default_host: str = '127.0.0.1' _default_port: int = 1616 # default registry always on localhost _default_lo_addrs: list[tuple[str, int]] = [( - _default_lo_host, + _default_host, _default_port, )] @@ -133,6 +133,7 @@ async def open_root_actor( or _default_lo_addrs ) + assert registry_addrs loglevel = (loglevel or log._default_loglevel).upper() -- 2.34.1 From 2e81ccf5b4fdbf893f9033eb1cb1f8ae9a683e4c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 19:09:07 -0400 Subject: [PATCH 061/378] Dump `.msgdata` in `RemoteActorError.__repr__()` --- tractor/_exceptions.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index d9e1d17f..0bb4552b 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -20,6 +20,7 @@ Our classy exception set. """ import builtins import importlib +from pprint import pformat from typing import ( Any, Type, @@ -38,12 +39,17 @@ class ActorFailure(Exception): "General actor failure" +# TODO: rename to just `RemoteError`? class RemoteActorError(Exception): ''' - Remote actor exception bundled locally + A box(ing) type which bundles a remote actor `BaseException` for + (near identical, and only if possible,) local object/instance + re-construction in the local process memory domain. + + Normally each instance is expected to be constructed from + a special "error" IPC msg sent by some remote actor-runtime. ''' - # TODO: local recontruction of remote exception deats def __init__( self, message: str, @@ -53,13 +59,36 @@ class RemoteActorError(Exception): ) -> None: super().__init__(message) - self.type = suberror_type - self.msgdata = msgdata + # TODO: maybe a better name? 
+ # - .errtype + # - .retype + # - .boxed_errtype + # - .boxed_type + # - .remote_type + # also pertains to our long long oustanding issue XD + # https://github.com/goodboy/tractor/issues/5 + self.type: str = suberror_type + self.msgdata: dict[str, Any] = msgdata @property def src_actor_uid(self) -> tuple[str, str] | None: return self.msgdata.get('src_actor_uid') + def __repr__(self) -> str: + if remote_tb := self.msgdata.get('tb_str'): + pformat(remote_tb) + return ( + f'{type(self).__name__}(\n' + f'msgdata={pformat(self.msgdata)}\n' + ')' + ) + + return super().__repr__(self) + + # TODO: local recontruction of remote exception deats + # def unbox(self) -> BaseException: + # ... + class InternalActorError(RemoteActorError): ''' -- 2.34.1 From 42d621bba7b1e133fa917f12d4c6fe791e90dbab Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 18 Oct 2023 19:10:04 -0400 Subject: [PATCH 062/378] Always dynamically re-read the `._root._default_lo_addrs` value in `find_actor()` --- tractor/_discovery.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 1fa2a885..070321b6 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -192,8 +192,11 @@ async def find_actor( yield None if not registry_addrs: - from ._root import _default_lo_addrs - registry_addrs = _default_lo_addrs + # XXX NOTE: make sure to dynamically read the value on + # every call since something may change it globally (eg. + # like in our discovery test suite)! + from . import _root + registry_addrs = _root._default_lo_addrs maybe_portals: list[ AsyncContextManager[tuple[str, int]] -- 2.34.1 From a3ed30e62ba0d700d3b3d84e780f0fe40e041863 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 19 Oct 2023 11:17:07 -0400 Subject: [PATCH 063/378] Get remaining suites passing.. ..by ensuring `reg_addr` fixture value passthrough to subactor eps --- tests/test_debugger.py | 4 ++-- tests/test_docs_examples.py | 2 +- tests/test_rpc.py | 32 +++++++++++++++++++++++++++----- tests/test_spawning.py | 7 ++++--- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index e7bb0d73..3bd26b61 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -166,7 +166,7 @@ def ctlc( # XXX: disable pygments highlighting for auto-tests # since some envs (like actions CI) will struggle # the the added color-char encoding.. 
- from tractor._debug import TractorConfig + from tractor.devx._debug import TractorConfig TractorConfig.use_pygements = False yield use_ctlc @@ -607,7 +607,7 @@ def test_multi_daemon_subactors( # now the root actor won't clobber the bp_forever child # during it's first access to the debug lock, but will instead # wait for the lock to release, by the edge triggered - # ``_debug.Lock.no_remote_has_tty`` event before sending cancel messages + # ``devx._debug.Lock.no_remote_has_tty`` event before sending cancel messages # (via portals) to its underlings B) # at some point here there should have been some warning msg from diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py index f134c71b..1eefdb40 100644 --- a/tests/test_docs_examples.py +++ b/tests/test_docs_examples.py @@ -21,7 +21,7 @@ from conftest import ( def run_example_in_subproc( loglevel: str, testdir, - arb_addr: tuple[str, int], + reg_addr: tuple[str, int], ): @contextmanager diff --git a/tests/test_rpc.py b/tests/test_rpc.py index 7ede231b..3404c602 100644 --- a/tests/test_rpc.py +++ b/tests/test_rpc.py @@ -13,9 +13,19 @@ async def sleep_back_actor( func_name, func_defined, exposed_mods, + *, + reg_addr: tuple, ): if actor_name: - async with tractor.find_actor(actor_name) as portal: + async with tractor.find_actor( + actor_name, + # NOTE: must be set manually since + # the subactor doesn't have the reg_addr + # fixture code run in it! + # TODO: maybe we should just set this once in the + # _state mod and derive to all children? + registry_addrs=[reg_addr], + ) as portal: try: await portal.run(__name__, func_name) except tractor.RemoteActorError as err: @@ -45,11 +55,17 @@ async def short_sleep(): ids=['no_mods', 'this_mod', 'this_mod_bad_func', 'fail_to_import', 'fail_on_syntax'], ) -def test_rpc_errors(reg_addr, to_call, testdir): - """Test errors when making various RPC requests to an actor +def test_rpc_errors( + reg_addr, + to_call, + testdir, +): + ''' + Test errors when making various RPC requests to an actor that either doesn't have the requested module exposed or doesn't define the named function. - """ + + ''' exposed_mods, funcname, inside_err = to_call subactor_exposed_mods = [] func_defined = globals().get(funcname, False) @@ -77,8 +93,13 @@ def test_rpc_errors(reg_addr, to_call, testdir): # spawn a subactor which calls us back async with tractor.open_nursery( - arbiter_addr=reg_addr, + registry_addrs=[reg_addr], enable_modules=exposed_mods.copy(), + + # NOTE: will halt test in REPL if uncommented, so only + # do that if actually debugging subactor but keep it + # disabled for the test. 
+ # debug_mode=True, ) as n: actor = tractor.current_actor() @@ -95,6 +116,7 @@ def test_rpc_errors(reg_addr, to_call, testdir): exposed_mods=exposed_mods, func_defined=True if func_defined else False, enable_modules=subactor_exposed_mods, + reg_addr=reg_addr, ) def run(): diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 0f6a8cfe..1a07610a 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -32,8 +32,7 @@ async def spawn( if actor.is_arbiter: - async with tractor.open_nursery( - ) as nursery: + async with tractor.open_nursery() as nursery: # forks here portal = await nursery.run_in_actor( @@ -55,7 +54,9 @@ async def spawn( return 10 -def test_local_arbiter_subactor_global_state(reg_addr): +def test_local_arbiter_subactor_global_state( + reg_addr, +): result = trio.run( spawn, True, -- 2.34.1 From 1d6f55543d827473a48da69ae98fd72ee66328e9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 19 Oct 2023 12:05:44 -0400 Subject: [PATCH 064/378] Expose per-actor registry addrs via `.reg_addrs` Since it's handy to be able to debug the *writing* of this instance var (particularly when checking state passed down to a child in `Actor._from_parent()`), rename and wrap the underlying `Actor._reg_addrs` as a settable `@property` and add validation to the `.setter` for sanity - actor discovery is a critical functionality. Other tweaks: - fix `.cancel_soon()` to pass expected argument.. - update internal runtime error message to be simpler and link to GH issues. - use new `Actor.reg_addrs` throughout core. --- tractor/_runtime.py | 107 ++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 29 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 2e75cff1..5f4da96a 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -553,12 +553,6 @@ class Actor: ) registry_addrs: list[tuple[str, int]] = [arbiter_addr] - self._reg_addrs: list[tuple[str, int]] = ( - registry_addrs - or - None - ) - # marked by the process spawning backend at startup # will be None for the parent most process started manually # by the user (currently called the "arbiter") @@ -591,6 +585,44 @@ class Actor: ActorNursery | None, ] = {} # type: ignore # noqa + # when provided, init the registry addresses property from + # input via the validator. + self._reg_addrs: list[tuple[str, int]] = [] + if registry_addrs: + self.reg_addrs: list[tuple[str, int]] = registry_addrs + + @property + def reg_addrs(self) -> list[tuple[str, int]]: + ''' + List of (socket) addresses for all known (and contactable) + registry actors. + + ''' + return self._reg_addrs + + @reg_addrs.setter + def reg_addrs( + self, + addrs: list[tuple[str, int]], + ) -> None: + if not addrs: + log.warning( + 'Empty registry address list is invalid:\n' + f'{addrs}' + ) + return + + # always sanity check the input list since it's critical + # that addrs are correct for discovery sys operation. + for addr in addrs: + if not isinstance(addr, tuple): + raise ValueError( + 'Expected `Actor.reg_addrs: list[tuple[str, int]]`\n' + f'Got {addrs}' + ) + + self._reg_addrs = addrs + async def wait_for_peer( self, uid: tuple[str, str] ) -> tuple[trio.Event, Channel]: @@ -670,9 +702,10 @@ class Actor: stream: trio.SocketStream, ) -> None: - """Entry point for new inbound connections to the channel server. + ''' + Entry point for new inbound connections to the channel server. 
- """ + ''' self._no_more_peers = trio.Event() # unset chan = Channel.from_stream(stream) @@ -792,17 +825,21 @@ class Actor: if disconnected: # if the transport died and this actor is still - # registered within a local nursery, we report that the - # IPC layer may have failed unexpectedly since it may be - # the cause of other downstream errors. + # registered within a local nursery, we report + # that the IPC layer may have failed + # unexpectedly since it may be the cause of + # other downstream errors. entry = local_nursery._children.get(uid) if entry: _, proc, _ = entry - poll = getattr(proc, 'poll', None) - if poll and poll() is None: + if ( + (poll := getattr(proc, 'poll', None)) + and poll() is None + ): log.cancel( - f'Actor {uid} IPC broke but proc is alive?' + f'Actor {uid} IPC broke but proc is alive?\n' + 'Attempting to self cancel..' ) # ``Channel`` teardown and closure sequence @@ -1016,14 +1053,18 @@ class Actor: _state._runtime_vars.update(rvs) for attr, value in parent_data.items(): - - if attr == '_reg_addrs': + if ( + attr == 'reg_addrs' + and value + ): # XXX: ``msgspec`` doesn't support serializing tuples # so just cash manually here since it's what our # internals expect. - self._reg_addrs = [ - tuple(val) for val in value - ] if value else None + # TODO: we don't really NEED these as + # tuples so we can probably drop this + # casting since apparently in python lists + # are "more efficient"? + self.reg_addrs = [tuple(val) for val in value] else: setattr(self, attr, value) @@ -1099,7 +1140,10 @@ class Actor: ''' assert self._service_n - self._service_n.start_soon(self.cancel) + self._service_n.start_soon( + self.cancel, + self.uid, + ) async def cancel( self, @@ -1445,9 +1489,12 @@ async def async_main( # if addresses point to the same actor.. # So we need a way to detect that? maybe iterate # only on unique actor uids? - for addr in actor._reg_addrs: - assert isinstance(addr, tuple) - assert addr[1] # non-zero after bind + for addr in actor.reg_addrs: + try: + assert isinstance(addr, tuple) + assert addr[1] # non-zero after bind + except AssertionError: + await _debug.pause() async with get_registry(*addr) as reg_portal: for accept_addr in accept_addrs: @@ -1500,12 +1547,14 @@ async def async_main( # once we have that all working with std streams locking? log.exception( f"Actor errored and failed to register with arbiter " - f"@ {actor._reg_addrs[0]}?") + f"@ {actor.reg_addrs[0]}?") log.error( - "\n\n\t^^^ THIS IS PROBABLY A TRACTOR BUGGGGG!!! ^^^\n" - "\tCALMLY CALL THE AUTHORITIES AND HIDE YOUR CHILDREN.\n\n" - "\tIf this is a sub-actor likely its parent will keep running " - "\tcorrectly if this error is caught and ignored.." + "\n\n\t^^^ THIS IS PROBABLY AN INTERNAL `tractor` BUG! 
^^^\n\n" + "\t>> CALMLY CALL THE AUTHORITIES AND HIDE YOUR CHILDREN <<\n\n" + "\tIf this is a sub-actor hopefully its parent will keep running " + "correctly presuming this error was safely ignored..\n\n" + "\tPLEASE REPORT THIS TRACEBACK IN A BUG REPORT: " + "https://github.com/goodboy/tractor/issues\n" ) if actor._parent_chan: @@ -1546,7 +1595,7 @@ async def async_main( and not actor.is_registrar ): failed: bool = False - for addr in actor._reg_addrs: + for addr in actor.reg_addrs: assert isinstance(addr, tuple) with trio.move_on_after(0.5) as cs: cs.shield = True -- 2.34.1 From 9da3b63644bd5a5bed8d8fd195d017fc6d240d1b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 19 Oct 2023 12:40:37 -0400 Subject: [PATCH 065/378] Change remaining internals to use `Actor.reg_addrs` --- tractor/_discovery.py | 32 +++++++++++++++++++------------- tractor/_root.py | 9 ++++++--- tractor/_spawn.py | 2 +- tractor/_supervise.py | 2 +- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 070321b6..b5f47165 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -133,14 +133,12 @@ async def query_actor( ) regaddr: list[tuple[str, int]] = arbiter_sockaddr - regstr: Portal - async with get_registry( - *(regaddr or actor._reg_addrs[0]) - ) as regstr: - + reg_portal: Portal + regaddr: tuple[str, int] = regaddr or actor.reg_addrs[0] + async with get_registry(*regaddr) as reg_portal: # TODO: return portals to all available actors - for now # just the last one that registered - sockaddr: tuple[str, int] = await regstr.run_from_ns( + sockaddr: tuple[str, int] = await reg_portal.run_from_ns( 'self', 'find_actor', name=name, @@ -155,6 +153,7 @@ async def find_actor( registry_addrs: list[tuple[str, int]] | None = None, only_first: bool = True, + raise_on_none: bool = False, ) -> AsyncGenerator[ Portal | list[Portal] | None, @@ -207,13 +206,20 @@ async def find_actor( async with gather_contexts( mngrs=maybe_portals, - ) as maybe_portals: - print(f'Portalz: {maybe_portals}') - if not maybe_portals: + ) as portals: + # log.runtime( + # 'Gathered portals:\n' + # f'{portals}' + # ) + if not portals: + if raise_on_none: + raise RuntimeError( + f'No {name} found registered @ {registry_addrs}' + ) yield None return - portals: list[Portal] = list(maybe_portals) + portals: list[Portal] = list(portals) if only_first: yield portals[0] @@ -250,9 +256,9 @@ async def wait_for_actor( # TODO: use `.trionics.gather_contexts()` like # above in `find_actor()` as well? - async with get_registry( - *(registry_addr or actor._reg_addrs[0]), # first if not passed - ) as reg_portal: + reg_portal: Portal + regaddr: tuple[str, int] = registry_addr or actor.reg_addrs[0] + async with get_registry(*regaddr) as reg_portal: sockaddrs = await reg_portal.run_from_ns( 'self', 'wait_for_actor', diff --git a/tractor/_root.py b/tractor/_root.py index a36a7e70..5615bb65 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -85,7 +85,7 @@ async def open_root_actor( enable_modules: list | None = None, rpc_module_paths: list | None = None, -) -> typing.Any: +) -> Actor: ''' Runtime init entry point for ``tractor``. 
@@ -130,7 +130,7 @@ async def open_root_actor( registry_addrs: list[tuple[str, int]] = ( registry_addrs - or + or _default_lo_addrs ) assert registry_addrs @@ -195,7 +195,10 @@ async def open_root_actor( async with trio.open_nursery() as tn: for addr in registry_addrs: - tn.start_soon(ping_tpt_socket, addr) + tn.start_soon( + ping_tpt_socket, + tuple(addr), # TODO: just drop this requirement? + ) trans_bind_addrs: list[tuple[str, int]] = [] diff --git a/tractor/_spawn.py b/tractor/_spawn.py index aede3b75..e55e59f8 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -423,7 +423,7 @@ async def trio_proc( await chan.send({ '_parent_main_data': subactor._parent_main_data, 'enable_modules': subactor.enable_modules, - '_reg_addrs': subactor._reg_addrs, + 'reg_addrs': subactor.reg_addrs, 'bind_addrs': bind_addrs, '_runtime_vars': _runtime_vars, }) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 7851d9fb..364d79c3 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -149,7 +149,7 @@ class ActorNursery: loglevel=loglevel, # verbatim relay this actor's registrar addresses - registry_addrs=current_actor()._reg_addrs, + registry_addrs=current_actor().reg_addrs, ) parent_addr = self._actor.accept_addr assert parent_addr -- 2.34.1 From 2f0bed3018f75e38f899ae2db953cfd3bc443a9f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 19 Oct 2023 12:41:15 -0400 Subject: [PATCH 066/378] Ignore `greenback` import error if not installed --- tractor/devx/_debug.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 1b225052..3bef7bd6 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -836,22 +836,25 @@ async def pause( # runtime aware version which takes care of all . def pause_from_sync() -> None: print("ENTER SYNC PAUSE") - import greenback - __tracebackhide__ = True + try: + import greenback + __tracebackhide__ = True - actor: tractor.Actor = tractor.current_actor() - # task_can_release_tty_lock = trio.Event() + actor: tractor.Actor = tractor.current_actor() + # task_can_release_tty_lock = trio.Event() - # spawn bg task which will lock out the TTY, we poll - # just below until the release event is reporting that task as - # waiting.. not the most ideal but works for now ;) - greenback.await_( - actor._service_n.start(partial( - pause, - debug_func=None, - # release_lock_signal=task_can_release_tty_lock, - )) - ) + # spawn bg task which will lock out the TTY, we poll + # just below until the release event is reporting that task as + # waiting.. 
not the most ideal but works for now ;) + greenback.await_( + actor._service_n.start(partial( + pause, + debug_func=None, + # release_lock_signal=task_can_release_tty_lock, + )) + ) + except ModuleNotFoundError: + log.warning('NO GREENBACK FOUND') db, undo_sigint = mk_mpdb() Lock.local_task_in_debug = 'sync' -- 2.34.1 From 0518b3ab04583bfda09dd3e32eb5d3817c82cc93 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 14:17:36 -0400 Subject: [PATCH 067/378] Move `MessagingError` into `._exceptions` set --- tractor/_exceptions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 0bb4552b..214dc88a 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -83,7 +83,7 @@ class RemoteActorError(Exception): ')' ) - return super().__repr__(self) + return super().__repr__() # TODO: local recontruction of remote exception deats # def unbox(self) -> BaseException: @@ -139,6 +139,9 @@ class AsyncioCancelled(Exception): ''' +class MessagingError(Exception): + 'Some kind of unexpected SC messaging dialog issue' + def pack_error( exc: BaseException, -- 2.34.1 From 5a94e8fb5bf937f4304b6647fac12db2ee608d41 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 14:34:12 -0400 Subject: [PATCH 068/378] Raise a `MessagingError` from the src error on msging edge cases --- tractor/_streaming.py | 44 ++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index e449fefe..f02197b8 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -23,6 +23,7 @@ The machinery and types behind ``Context.open_stream()`` from __future__ import annotations import inspect from contextlib import asynccontextmanager as acm +from pprint import pformat from typing import ( Any, Callable, @@ -35,6 +36,7 @@ import trio from ._exceptions import ( unpack_error, + MessagingError, ) from .log import get_logger from .trionics import ( @@ -66,6 +68,8 @@ def _raise_from_no_yield_msg( `'yield'` field. ''' + __tracebackhide__: bool = True + # internal error should never get here assert msg.get('cid'), ("Received internal error at portal?") @@ -73,18 +77,22 @@ def _raise_from_no_yield_msg( # - 'stop' # - 'error' # possibly just handle msg['stop'] here! + # breakpoint() if stream._closed: raise trio.ClosedResourceError('This stream was closed') - if msg.get('stop') or stream._eoc: - log.debug(f"{stream} was stopped at remote end") + if ( + msg.get('stop') + or stream._eoc + ): + log.debug(f'{stream} was stopped at remote end') # XXX: important to set so that a new ``.receive()`` # call (likely by another task using a broadcast receiver) # doesn't accidentally pull the ``return`` message # value out of the underlying feed mem chan! - stream._eoc = True + stream._eoc: bool = True # # when the send is closed we assume the stream has # # terminated and signal this local iterator to stop @@ -93,20 +101,24 @@ def _raise_from_no_yield_msg( # XXX: this causes ``ReceiveChannel.__anext__()`` to # raise a ``StopAsyncIteration`` **and** in our catch # block below it will trigger ``.aclose()``. - raise trio.EndOfChannel from src_err + raise trio.EndOfChannel( + 'Stream ended due to msg:\n' + f'{pformat(msg)}' + ) from src_err # TODO: test that shows stream raising an expected error!!! 
elif msg.get('error'): # raise the error message raise unpack_error(msg, stream._ctx.chan) - # always re-raise the source error if no translation error - # case is activated above. - raise src_err - # raise RuntimeError( - # 'Unknown non-yield stream msg?\n' - # f'{msg}' - # ) + # always re-raise the source error if no translation error case + # is activated above. + raise MessagingError( + f'Context received unexpected non-error msg!?\n' + f'cid: {cid}\n' + 'received msg:\n' + f'{pformat(msg)}' + ) from src_err class MsgStream(trio.abc.Channel): @@ -161,6 +173,16 @@ class MsgStream(trio.abc.Channel): determined by the underlying protocol). ''' + # NOTE: `trio.ReceiveChannel` implements + # EOC handling as follows (aka uses it + # to gracefully exit async for loops): + # + # async def __anext__(self) -> ReceiveType: + # try: + # return await self.receive() + # except trio.EndOfChannel: + # raise StopAsyncIteration + # see ``.aclose()`` for notes on the old behaviour prior to # introducing this if self._eoc: -- 2.34.1 From 131674eabd76a5b73d45259c9af4bd3d03832133 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 14:35:36 -0400 Subject: [PATCH 069/378] Be mega-pedantic with `ContextCancelled` semantics As part of extremely detailed inter-peer-actor testing, add much more granular `Context` cancellation state tracking via the following (new) fields: - `.canceller: tuple[str, str]` the uuid of the actor responsible for the cancellation condition - always set by `Context._maybe_cancel_and_set_remote_error()` and replaces `._cancelled_remote` and `.cancel_called_remote`. If set, this value should normally always match a value from some `ContextCancelled` raised or caught by one side of the context. - `._local_error` which is always set to the locally raised (and caller or callee task's scope-internal) error which caused any eventual cancellation/error condition and thus any closure of the context's per-task-side-`trio.Nursery`. - `.cancelled_caught: bool` is now always `True` whenever the local task catches (or "silently absorbs") a `ContextCancelled` (a `ctxc`) that indeed originated from one of the context's linked tasks or any other context which raised its own `ctxc` in the current `.open_context()` scope. => whenever there is a case that no `ContextCancelled` was raised **in** the `.open_context().__aexit__()` (eg. `ctx.result()` called after a call `ctx.cancel()`), we still consider the context's as having "caught a cancellation" since the `ctxc` was indeed silently handled by the cancel requester; all other error cases are already represented by mirroring the state of the `._scope: trio.CancelScope` => IOW there should be **no case** where an error is **not raised** in the context's scope and `.cancelled_caught: bool == False`, i.e. no case where `._scope.cancelled_caught == False and ._local_error is not None`! - always raise any `ctxc` from `.open_stream()` if `._cancel_called == True` - if the cancellation request has not already resulted in a `._remote_error: ContextCancelled` we raise a `RuntimeError` to indicate improper usage to the guilty side's task code. - make `._maybe_raise_remote_err()` a sync func and don't raise any `ctxc` which is matched against a `.canceller` determined to be the current actor, aka a "self cancel", and always set the `._local_error` to any such `ctxc`. - `.side: str` taken from inside `.cancel()` and unused as of now since it might be better re-written as a similar `.is_opener() -> bool`? - drop unused `._started_received: bool`.. 
- TONS and TONS of detailed comments/docs to attempt to explain all the possible cancellation/exit cases and how they should exhibit as either silent closes or raises from the `Context` API! Adjust the `._runtime._invoke()` code to match: - use `ctx._maybe_raise_remote_err()` in `._invoke()`. - adjust to new `.canceller` property. - more type hints. - better `log.cancel()` msging around self-cancels vs. peer-cancels. - always set the `._local_error: BaseException` for the "callee" task just like `Portal.open_context()` now will do B) Prior we were raising any `Context._remote_error` directly and doing (more or less) the same `ContextCancelled` "absorbing" logic (well kinda) in block; instead delegate to the method --- tractor/_context.py | 232 +++++++++++++++++++++++++++++++++----------- tractor/_runtime.py | 119 ++++++++++++++--------- 2 files changed, 248 insertions(+), 103 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index c14f16bf..117092ac 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -56,6 +56,7 @@ from ._state import current_actor if TYPE_CHECKING: from ._portal import Portal + from ._runtime import Actor log = get_logger(__name__) @@ -64,20 +65,26 @@ log = get_logger(__name__) @dataclass class Context: ''' - An inter-actor, ``trio``-task communication context. + An inter-actor, SC transitive, `trio.Task` communication context. - NB: This class should never be instatiated directly, it is delivered - by either, - - runtime machinery to a remotely started task or, - - by entering ``Portal.open_context()``. + NB: This class should **never be instatiated directly**, it is allocated + by the runtime in 2 ways: + - by entering ``Portal.open_context()`` which is the primary + public API for any "caller" task or, + - by the RPC machinery's `._runtime._invoke()` as a `ctx` arg + to a remotely scheduled "callee" function. - and is always constructed using ``mkt_context()``. + AND is always constructed using the below ``mk_context()``. Allows maintaining task or protocol specific state between - 2 communicating, parallel executing actor tasks. A unique context is - allocated on each side of any task RPC-linked msg dialog, for - every request to a remote actor from a portal. On the "callee" - side a context is always allocated inside ``._runtime._invoke()``. + 2 cancel-scope-linked, communicating and parallel executing + `trio.Task`s. Contexts are allocated on each side of any task + RPC-linked msg dialog, i.e. for every request to a remote + actor from a `Portal`. On the "callee" side a context is + always allocated inside ``._runtime._invoke()``. + + # TODO: more detailed writeup on cancellation, error and + # streaming semantics.. A context can be cancelled and (possibly eventually restarted) from either side of the underlying IPC channel, it can also open task @@ -108,12 +115,31 @@ class Context: # which is exactly the primitive that allows for # cross-actor-task-supervision and thus SC. _scope: trio.CancelScope | None = None + + # on a clean exit there should be a final value + # delivered from the far end "callee" task, so + # this value is only set on one side. _result: Any | int = None + + # if the local "caller" task errors this + # value is always set to the error that was + # captured in the `Portal.open_context().__aexit__()` + # teardown. + _local_error: BaseException | None = None + + # if the either side gets an error from the other + # this value is set to that error unpacked from an + # IPC msg. 
_remote_error: BaseException | None = None - # cancellation state + # only set if the local task called `.cancel()` _cancel_called: bool = False # did WE cancel the far end? - _cancelled_remote: tuple[str, str] | None = None + + # TODO: do we even need this? we can assume that if we're + # cancelled that the other side is as well, so maybe we should + # instead just have a `.canceller` pulled from the + # `ContextCancelled`? + _canceller: tuple[str, str] | None = None # NOTE: we try to ensure assignment of a "cancel msg" since # there's always going to be an "underlying reason" that any @@ -145,23 +171,47 @@ class Context: return self._cancel_called @property - def cancel_called_remote(self) -> tuple[str, str] | None: + def canceller(self) -> tuple[str, str] | None: ''' - ``Actor.uid`` of the remote actor who's task was cancelled - causing this side of the context to also be cancelled. + ``Actor.uid: tuple[str, str]`` of the (remote) + actor-process who's task was cancelled thus causing this + (side of the) context to also be cancelled. ''' - remote_uid = self._cancelled_remote - if remote_uid: - return tuple(remote_uid) + return self._canceller @property def cancelled_caught(self) -> bool: - return self._scope.cancelled_caught + return ( + # the local scope was cancelled either by + # remote error or self-request + self._scope.cancelled_caught + + # the local scope was never cancelled + # and instead likely we received a remote side + # cancellation that was raised inside `.result()` + or ( + (se := self._local_error) + and + isinstance(se, ContextCancelled) + and ( + se.canceller == self.canceller + or + se is self._remote_error + ) + ) + ) + + @property + def side(self) -> str: + ''' + Return string indicating which task this instance is wrapping. + + ''' + return 'caller' if self._portal else 'callee' # init and streaming state _started_called: bool = False - _started_received: bool = False _stream_opened: bool = False # overrun handling machinery @@ -196,7 +246,7 @@ class Context: async def send_stop(self) -> None: await self.chan.send({'stop': True, 'cid': self.cid}) - async def _maybe_cancel_and_set_remote_error( + def _maybe_cancel_and_set_remote_error( self, error: BaseException, @@ -269,16 +319,19 @@ class Context: # that error as the reason. self._remote_error: BaseException = error - # always record the remote actor's uid since its cancellation - # state is directly linked to ours (the local one). - self._cancelled_remote = self.chan.uid - if ( isinstance(error, ContextCancelled) ): + # always record the cancelling actor's uid since its cancellation + # state is linked and we want to know which process was + # the cause / requester of the cancellation. + self._canceller = error.canceller + log.cancel( - 'Remote task-context sucessfully cancelled for ' - f'{self.chan.uid}:{self.cid}' + 'Remote task-context was cancelled for ' + f'actor: {self.chan.uid}\n' + f'task: {self.cid}\n' + f'canceller: {error.canceller}\n' ) if self._cancel_called: @@ -289,22 +342,37 @@ class Context: # and we **don't need to raise it** in local cancel # scope since it will potentially override a real error. return + else: log.error( - f'Remote context error for {self.chan.uid}:{self.cid}:\n' + f'Remote context error,\n' + f'remote actor: {self.chan.uid}\n' + f'task: {self.cid}\n' f'{error}' ) + self._canceller = self.chan.uid + # TODO: tempted to **not** do this by-reraising in a # nursery and instead cancel a surrounding scope, detect # the cancellation, then lookup the error that was set? # YES! 
this is way better and simpler! - if self._scope: + cs: trio.CancelScope = self._scope + if ( + cs + and not cs.cancel_called + and not cs.cancelled_caught + ): + + # TODO: we can for sure drop this right? # from trio.testing import wait_all_tasks_blocked # await wait_all_tasks_blocked() - # self._cancelled_remote = self.chan.uid + + # TODO: it'd sure be handy to inject our own + # `trio.Cancelled` subtype here ;) + # https://github.com/goodboy/tractor/issues/368 self._scope.cancel() - # this REPL usage actually works here BD + # NOTE: this REPL usage actually works here dawg! Bo # from .devx._debug import pause # await pause() @@ -320,13 +388,19 @@ class Context: Timeout quickly in an attempt to sidestep 2-generals... ''' - side: str = 'caller' if self._portal else 'callee' + side: str = self.side log.cancel( f'Cancelling {side} side of context to {self.chan.uid}' ) - self._cancel_called: bool = True + # caller side who entered `Portal.open_context()` + # NOTE: on the call side we never manually call + # `._scope.cancel()` since we expect the eventual + # `ContextCancelled` from the other side to trigger this + # when the runtime finally receives it during teardown + # (normally in `.result()` called from + # `Portal.open_context().__aexit__()`) if side == 'caller': if not self._portal: raise RuntimeError( @@ -349,7 +423,6 @@ class Context: '_cancel_task', cid=cid, ) - # print("EXITING CANCEL CALL") if cs.cancelled_caught: # XXX: there's no way to know if the remote task was indeed @@ -368,6 +441,9 @@ class Context: ) # callee side remote task + # NOTE: on this side we ALWAYS cancel the local scope since + # the caller expects a `ContextCancelled` to be sent from + # `._runtime._invoke()` back to the other side. else: # TODO: should we have an explicit cancel message # or is relaying the local `trio.Cancelled` as an @@ -403,7 +479,7 @@ class Context: ``trio``'s cancellation system. ''' - actor = current_actor() + actor: Actor = current_actor() # here we create a mem chan that corresponds to the # far end caller / callee. @@ -413,12 +489,34 @@ class Context: # killed if self._cancel_called: - task = trio.lowlevel.current_task().name - raise ContextCancelled( - f'Context around {actor.uid[0]}:{task} was already cancelled!' + + # XXX NOTE: ALWAYS RAISE any remote error here even if + # it's an expected `ContextCancelled` (after some local + # task having called `.cancel()` ! + # + # WHY: we expect the error to always bubble up to the + # surrounding `Portal.open_context()` call and be + # absorbed there (silently) and we DO NOT want to + # actually try to stream - a cancel msg was already + # sent to the other side! + if re := self._remote_error: + raise self._remote_error + + # XXX NOTE: if no `ContextCancelled` has been responded + # back from the other side (yet), we raise a different + # runtime error indicating that this task's usage of + # `Context.cancel()` and then `.open_stream()` is WRONG! 
+ task: str = trio.lowlevel.current_task().name + raise RuntimeError( + 'Stream opened after `Context.cancel()` called..?\n' + f'task: {actor.uid[0]}:{task}\n' + f'{self}' ) - if not self._portal and not self._started_called: + if ( + not self._portal + and not self._started_called + ): raise RuntimeError( 'Context.started()` must be called before opening a stream' ) @@ -434,7 +532,7 @@ class Context: msg_buffer_size=msg_buffer_size, allow_overruns=allow_overruns, ) - ctx._allow_overruns = allow_overruns + ctx._allow_overruns: bool = allow_overruns assert ctx is self # XXX: If the underlying channel feeder receive mem chan has @@ -444,27 +542,32 @@ class Context: if ctx._recv_chan._closed: raise trio.ClosedResourceError( - 'The underlying channel for this stream was already closed!?') + 'The underlying channel for this stream was already closed!?' + ) async with MsgStream( ctx=self, rx_chan=ctx._recv_chan, ) as stream: + # NOTE: we track all existing streams per portal for + # the purposes of attempting graceful closes on runtime + # cancel requests. if self._portal: self._portal._streams.add(stream) try: - self._stream_opened = True + self._stream_opened: bool = True # XXX: do we need this? # ensure we aren't cancelled before yielding the stream # await trio.lowlevel.checkpoint() yield stream - # NOTE: Make the stream "one-shot use". On exit, signal - # ``trio.EndOfChannel``/``StopAsyncIteration`` to the - # far end. + # NOTE: Make the stream "one-shot use". On exit, + # signal + # ``trio.EndOfChannel``/``StopAsyncIteration`` to + # the far end. await stream.aclose() finally: @@ -495,14 +598,22 @@ class Context: # whenever ``CancelScope.cancel()`` was called) and # instead silently reap the expected cancellation # "error"-msg. + our_uid: tuple[str, str] = current_actor().uid if ( isinstance(err, ContextCancelled) and ( self._cancel_called or self.chan._cancel_called - or tuple(err.canceller) == current_actor().uid + or self.canceller == our_uid + or tuple(err.canceller) == our_uid ) ): + # NOTE: we set the local scope error to any "self + # cancellation" error-response thus "absorbing" + # the error silently B) + if self._local_error is None: + self._local_error = err + return err # NOTE: currently we are masking underlying runtime errors @@ -515,7 +626,7 @@ class Context: # runtime frames from the tb explicitly? # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement # https://stackoverflow.com/a/24752607 - __tracebackhide__: bool = True + # __tracebackhide__: bool = True raise err from None async def result(self) -> Any | Exception: @@ -544,7 +655,6 @@ class Context: of the remote cancellation. ''' - __tracebackhide__: bool = True assert self._portal, "Context.result() can not be called from callee!" assert self._recv_chan @@ -607,13 +717,15 @@ class Context: "Received internal error at portal?" 
) - err = unpack_error( + if err:= unpack_error( msg, self._portal.channel - ) # from msgerr + ): # from msgerr + self._maybe_cancel_and_set_remote_error(err) + self._maybe_raise_remote_err(err) - err = self._maybe_raise_remote_err(err) - self._remote_error = err + else: + raise if re := self._remote_error: return self._maybe_raise_remote_err(re) @@ -724,13 +836,17 @@ class Context: f"Delivering {msg} from {uid} to caller {cid}" ) - error = msg.get('error') - if error := unpack_error( - msg, - self.chan, + if ( + msg.get('error') # check for field + and ( + error := unpack_error( + msg, + self.chan, + ) + ) ): self._cancel_msg = msg - await self._maybe_cancel_and_set_remote_error(error) + self._maybe_cancel_and_set_remote_error(error) if ( self._in_overrun @@ -765,7 +881,7 @@ class Context: # XXX: always push an error even if the local # receiver is in overrun state. - # await self._maybe_cancel_and_set_remote_error(msg) + # self._maybe_cancel_and_set_remote_error(msg) local_uid = current_actor().uid lines = [ diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 5f4da96a..fee14c4d 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -86,12 +86,14 @@ async def _invoke( ] = trio.TASK_STATUS_IGNORED, ): ''' - Invoke local func and deliver result(s) over provided channel. + Schedule a `trio` task-as-func and deliver result(s) over + connected IPC channel. - This is the core "RPC task" starting machinery. + This is the core "RPC" `trio.Task` scheduling machinery used to start every + remotely invoked function, normally in `Actor._service_n: trio.Nursery`. ''' - __tracebackhide__ = True + __tracebackhide__: bool = True treat_as_gen: bool = False failed_resp: bool = False @@ -209,6 +211,8 @@ async def _invoke( # far end async gen to tear down await chan.send({'stop': True, 'cid': cid}) + # TODO: every other "func type" should be implemented from + # a special case of a context eventually! elif context: # context func with support for bi-dir streaming await chan.send({'functype': 'context', 'cid': cid}) @@ -219,21 +223,30 @@ async def _invoke( ctx._scope = nurse.cancel_scope task_status.started(ctx) res = await coro - await chan.send({'return': res, 'cid': cid}) + await chan.send({ + 'return': res, + 'cid': cid + }) # XXX: do we ever trigger this block any more? except ( BaseExceptionGroup, trio.Cancelled, - ): - # if a context error was set then likely - # thei multierror was raised due to that - if ctx._remote_error is not None: - raise ctx._remote_error + ) as scope_error: - # maybe TODO: pack in ``trio.Cancelled.__traceback__`` here - # so they can be unwrapped and displayed on the caller - # side? + # always set this (callee) side's exception as the + # local error on the context + ctx._local_error: BaseException = scope_error + + # if a remote error was set then likely the + # exception group was raised due to that, so + # and we instead raise that error immediately! + if re := ctx._remote_error: + ctx._maybe_raise_remote_err(re) + + # maybe TODO: pack in + # ``trio.Cancelled.__traceback__`` here so they can + # be unwrapped and displayed on the caller side? 
raise finally: @@ -244,11 +257,11 @@ async def _invoke( # don't pop the local context until we know the # associated child isn't in debug any more await _debug.maybe_wait_for_debugger() - ctx = actor._contexts.pop((chan.uid, cid)) - if ctx: - log.runtime( - f'Context entrypoint {func} was terminated:\n{ctx}' - ) + ctx: Context = actor._contexts.pop((chan.uid, cid)) + log.runtime( + f'Context entrypoint {func} was terminated:\n' + f'{ctx}' + ) if ctx.cancelled_caught: @@ -256,43 +269,43 @@ async def _invoke( # before raising any context cancelled case # so that real remote errors don't get masked as # ``ContextCancelled``s. - re = ctx._remote_error - if re: + if re := ctx._remote_error: ctx._maybe_raise_remote_err(re) - fname = func.__name__ + fname: str = func.__name__ cs: trio.CancelScope = ctx._scope if cs.cancel_called: - canceller = ctx._cancelled_remote + canceller: tuple = ctx.canceller + msg: str = ( + f'`{fname}()`@{actor.uid} cancelled by ' + ) # NOTE / TODO: if we end up having # ``Actor._cancel_task()`` call # ``Context.cancel()`` directly, we're going to - # need to change this logic branch since it will - # always enter.. + # need to change this logic branch since it + # will always enter.. if ctx._cancel_called: - msg = f'`{fname}()`@{actor.uid} cancelled itself' - - else: - msg = ( - f'`{fname}()`@{actor.uid} ' - 'was remotely cancelled by ' - ) + msg += 'itself ' # if the channel which spawned the ctx is the # one that cancelled it then we report that, vs. # it being some other random actor that for ex. # some actor who calls `Portal.cancel_actor()` # and by side-effect cancels this ctx. - if canceller == ctx.chan.uid: - msg += f'its caller {canceller}' + elif canceller == ctx.chan.uid: + msg += f'its caller {canceller} ' + else: msg += f'remote actor {canceller}' # TODO: does this ever get set any more or can # we remove it? if ctx._cancel_msg: - msg += f' with msg:\n{ctx._cancel_msg}' + msg += ( + ' with msg:\n' + f'{ctx._cancel_msg}' + ) # task-contex was either cancelled by request using # ``Portal.cancel_actor()`` or ``Context.cancel()`` @@ -305,10 +318,13 @@ async def _invoke( canceller=canceller, ) + # regular async function else: - # regular async function try: - await chan.send({'functype': 'asyncfunc', 'cid': cid}) + await chan.send({ + 'functype': 'asyncfunc', + 'cid': cid + }) except trio.BrokenResourceError: failed_resp = True if is_rpc: @@ -322,7 +338,7 @@ async def _invoke( ctx._scope = cs task_status.started(ctx) result = await coro - fname = func.__name__ + fname: str = func.__name__ log.runtime(f'{fname}() result: {result}') if not failed_resp: # only send result if we know IPC isn't down @@ -1162,7 +1178,12 @@ class Actor: - return control the parent channel message loop ''' - log.cancel(f"{self.uid} is trying to cancel") + log.cancel( + f'{self.uid} requested to cancel by:\n' + f'{requesting_uid}' + ) + + # TODO: what happens here when we self-cancel tho? self._cancel_called_by_remote: tuple = requesting_uid self._cancel_called = True @@ -1177,7 +1198,9 @@ class Actor: dbcs.cancel() # kill all ongoing tasks - await self.cancel_rpc_tasks(requesting_uid=requesting_uid) + await self.cancel_rpc_tasks( + requesting_uid=requesting_uid, + ) # stop channel server self.cancel_server() @@ -1207,8 +1230,8 @@ class Actor: self, cid: str, chan: Channel, - requesting_uid: tuple[str, str] | None = None, + ) -> bool: ''' Cancel a local task by call-id / channel. 
@@ -1225,7 +1248,7 @@ class Actor: # this ctx based lookup ensures the requested task to # be cancelled was indeed spawned by a request from this channel ctx, func, is_complete = self._rpc_tasks[(chan, cid)] - scope = ctx._scope + scope: trio.CancelScope = ctx._scope except KeyError: log.cancel(f"{cid} has already completed/terminated?") return True @@ -1235,10 +1258,10 @@ class Actor: f"peer: {chan.uid}\n") if ( - ctx._cancelled_remote is None + ctx._canceller is None and requesting_uid ): - ctx._cancelled_remote: tuple = requesting_uid + ctx._canceller: tuple = requesting_uid # don't allow cancelling this function mid-execution # (is this necessary?) @@ -1248,6 +1271,7 @@ class Actor: # TODO: shouldn't we eventually be calling ``Context.cancel()`` # directly here instead (since that method can handle both # side's calls into it? + # await ctx.cancel() scope.cancel() # wait for _invoke to mark the task complete @@ -1275,9 +1299,12 @@ class Actor: registered for each. ''' - tasks = self._rpc_tasks + tasks: dict = self._rpc_tasks if tasks: - log.cancel(f"Cancelling all {len(tasks)} rpc tasks:\n{tasks} ") + log.cancel( + f'Cancelling all {len(tasks)} rpc tasks:\n' + f'{tasks}' + ) for ( (chan, cid), (ctx, func, is_complete), @@ -1295,7 +1322,9 @@ class Actor: ) log.cancel( - f"Waiting for remaining rpc tasks to complete {tasks}") + 'Waiting for remaining rpc tasks to complete:\n' + f'{tasks}' + ) await self._ongoing_rpc_tasks.wait() def cancel_server(self) -> None: -- 2.34.1 From df31047ecb971b7c3236ac0156ba366635a08401 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 17:34:28 -0400 Subject: [PATCH 070/378] Be ultra-correct in `Portal.open_context()` This took way too long to get right but hopefully will give us grok-able and correct context exit semantics going forward B) The main fixes were: - always shielding the `MsgStream.aclose()` call on teardown to avoid bubbling a `Cancelled`. - properly absorbing any `ContextCancelled` in cases due to "self cancellation" using the new `Context.canceller` in the logic. - capturing any error raised by the `Context.result()` call in the "normal exit, result received" case and setting it as the `Context._local_error` so that self-cancels can be easily measured via `Context.cancelled_caught` in same way as remote-error caused cancellations. - extremely detailed comments around all of the cancellation-error cases to avoid ever getting confused about the control flow in the future XD --- tractor/_portal.py | 170 +++++++++++++++++++++++++++++++-------------- 1 file changed, 118 insertions(+), 52 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index d53fc6b3..4c0587af 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -48,6 +48,7 @@ from ._exceptions import ( unpack_error, NoResult, ContextCancelled, + MessagingError, ) from ._context import Context from ._streaming import MsgStream @@ -71,11 +72,6 @@ def _unwrap_msg( raise unpack_error(msg, channel) from None -# TODO: maybe move this to ._exceptions? -class MessagingError(Exception): - 'Some kind of unexpected SC messaging dialog issue' - - class Portal: ''' A 'portal' to a memory-domain-separated `Actor`. 
@@ -220,14 +216,18 @@ class Portal: try: # send cancel cmd - might not get response - # XXX: sure would be nice to make this work with a proper shield + # XXX: sure would be nice to make this work with + # a proper shield with trio.move_on_after( timeout or self.cancel_timeout ) as cs: cs.shield = True - await self.run_from_ns('self', 'cancel') + await self.run_from_ns( + 'self', + 'cancel', + ) return True if cs.cancelled_caught: @@ -462,10 +462,14 @@ class Portal: try: # the "first" value here is delivered by the callee's # ``Context.started()`` call. - first = msg['started'] + first: Any = msg['started'] ctx._started_called: bool = True except KeyError: + + # TODO: can we maybe factor this into the new raiser + # `_streaming._raise_from_no_yield_msg()` and make that + # helper more generic, say with a `_no__msg()`? if not (cid := msg.get('cid')): raise MessagingError( 'Received internal error at context?\n' @@ -517,54 +521,102 @@ class Portal: # started in the ctx nursery. ctx._scope.cancel() - # XXX: (maybe) shield/mask context-cancellations that were - # initiated by any of the context's 2 tasks. There are - # subsequently 2 operating cases for a "graceful cancel" - # of a `Context`: - # - # 1.*this* side's task called `Context.cancel()`, in - # which case we mask the `ContextCancelled` from bubbling - # to the opener (much like how `trio.Nursery` swallows - # any `trio.Cancelled` bubbled by a call to - # `Nursery.cancel_scope.cancel()`) + # XXX NOTE XXX: maybe shield against + # self-context-cancellation (which raises a local + # `ContextCancelled`) when requested (via + # `Context.cancel()`) by the same task (tree) which entered + # THIS `.open_context()`. # - # 2.*the other* side's (callee/spawned) task cancelled due - # to a self or peer cancellation request in which case we - # DO let the error bubble to the opener. + # NOTE: There are 2 operating cases for a "graceful cancel" + # of a `Context`. In both cases any `ContextCancelled` + # raised in this scope-block came from a transport msg + # relayed from some remote-actor-task which our runtime set + # as a `Context._remote_error` + # + # the CASES: + # + # - if that context IS THE SAME ONE that called + # `Context.cancel()`, we want to absorb the error + # silently and let this `.open_context()` block to exit + # without raising. + # + # - if it is from some OTHER context (we did NOT call + # `.cancel()`), we want to re-RAISE IT whilst also + # setting our own ctx's "reason for cancel" to be that + # other context's cancellation condition; we set our + # `.canceller: tuple[str, str]` to be same value as + # caught here in a `ContextCancelled.canceller`. + # + # Again, there are 2 cases: + # + # 1-some other context opened in this `.open_context()` + # block cancelled due to a self or peer cancellation + # request in which case we DO let the error bubble to the + # opener. + # + # 2-THIS "caller" task somewhere invoked `Context.cancel()` + # and received a `ContextCanclled` from the "callee" + # task, in which case we mask the `ContextCancelled` from + # bubbling to this "caller" (much like how `trio.Nursery` + # swallows any `trio.Cancelled` bubbled by a call to + # `Nursery.cancel_scope.cancel()`) except ContextCancelled as ctxc: scope_err = ctxc - # CASE 1: this context was never cancelled - # via a local task's call to `Context.cancel()`. - if not ctx._cancel_called: - raise - # CASE 2: context was cancelled by local task calling # `.cancel()`, we don't raise and the exit block should # exit silently. 
- else: + if ( + ctx._cancel_called + and ( + ctxc is ctx._remote_error + or + ctxc.canceller is self.canceller + ) + ): log.debug( f'Context {ctx} cancelled gracefully with:\n' f'{ctxc}' ) + # CASE 1: this context was never cancelled via a local + # task (tree) having called `Context.cancel()`, raise + # the error since it was caused by someone else! + else: + raise + # the above `._scope` can be cancelled due to: + # 1. an explicit self cancel via `Context.cancel()` or + # `Actor.cancel()`, + # 2. any "callee"-side remote error, possibly also a cancellation + # request by some peer, + # 3. any "caller" (aka THIS scope's) local error raised in the above `yield` except ( - # - a standard error in the caller/yieldee + # CASE 3: standard local error in this caller/yieldee Exception, - # - a runtime teardown exception-group and/or - # cancellation request from a caller task. - BaseExceptionGroup, - trio.Cancelled, + # CASES 1 & 2: normally manifested as + # a `Context._scope_nursery` raised + # exception-group of, + # 1.-`trio.Cancelled`s, since + # `._scope.cancel()` will have been called and any + # `ContextCancelled` absorbed and thus NOT RAISED in + # any `Context._maybe_raise_remote_err()`, + # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]` + # from any error raised in the "callee" side with + # a group only raised if there was any more then one + # task started here in the "caller" in the + # `yield`-ed to task. + BaseExceptionGroup, # since overrun handler tasks may have been spawned + trio.Cancelled, # NOTE: NOT from inside the ctx._scope KeyboardInterrupt, ) as err: scope_err = err - # XXX: request cancel of this context on any error. - # NOTE: `Context.cancel()` is conversely NOT called in - # the `ContextCancelled` "cancellation requested" case - # above. + # XXX: ALWAYS request the context to CANCEL ON any ERROR. + # NOTE: `Context.cancel()` is conversely NEVER CALLED in + # the `ContextCancelled` "self cancellation absorbed" case + # handled in the block above! log.cancel( 'Context cancelled for task due to\n' f'{err}\n' @@ -583,7 +635,7 @@ class Portal: raise # duh - # no scope error case + # no local scope error, the "clean exit with a result" case. else: if ctx.chan.connected(): log.info( @@ -597,15 +649,27 @@ class Portal: # `Context._maybe_raise_remote_err()`) IFF # a `Context._remote_error` was set by the runtime # via a call to - # `Context._maybe_cancel_and_set_remote_error()` - # which IS SET any time the far end fails and - # causes "caller side" cancellation via - # a `ContextCancelled` here. - result = await ctx.result() - log.runtime( - f'Context {fn_name} returned value from callee:\n' - f'`{result}`' - ) + # `Context._maybe_cancel_and_set_remote_error()`. + # As per `Context._deliver_msg()`, that error IS + # ALWAYS SET any time "callee" side fails and causes "caller + # side" cancellation via a `ContextCancelled` here. + # result = await ctx.result() + try: + result = await ctx.result() + log.runtime( + f'Context {fn_name} returned value from callee:\n' + f'`{result}`' + ) + except BaseException as berr: + # on normal teardown, if we get some error + # raised in `Context.result()` we still want to + # save that error on the ctx's state to + # determine things like `.cancelled_caught` for + # cases where there was remote cancellation but + # this task didn't know until final teardown + # / value collection. 
+ scope_err = berr + raise finally: # though it should be impossible for any tasks @@ -655,12 +719,14 @@ class Portal: with trio.CancelScope(shield=True): await ctx._recv_chan.aclose() - # XXX: since we always (maybe) re-raise (and thus also - # mask runtime machinery related - # multi-`trio.Cancelled`s) any scope error which was - # the underlying cause of this context's exit, add - # different log msgs for each of the (2) cases. + # XXX: we always raise remote errors locally and + # generally speaking mask runtime-machinery related + # multi-`trio.Cancelled`s. As such, any `scope_error` + # which was the underlying cause of this context's exit + # should be stored as the `Context._local_error` and + # used in determining `Context.cancelled_caught: bool`. if scope_err is not None: + ctx._local_error: BaseException = scope_err etype: Type[BaseException] = type(scope_err) # CASE 2 @@ -690,7 +756,7 @@ class Portal: await maybe_wait_for_debugger() # FINALLY, remove the context from runtime tracking and - # exit Bo + # exit! self.actor._contexts.pop( (self.channel.uid, ctx.cid), None, -- 2.34.1 From f4e63465deac6351a20448a5a90947d421f7ee6d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 17:47:55 -0400 Subject: [PATCH 071/378] Tweak `Channel._cancel_called` comment --- tractor/_ipc.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index a022908a..7c99467a 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -294,9 +294,11 @@ class Channel: self._agen = self._aiter_recv() self._exc: Optional[Exception] = None # set if far end actor errors self._closed: bool = False - # flag set on ``Portal.cancel_actor()`` indicating - # remote (peer) cancellation of the far end actor runtime. - self._cancel_called: bool = False # set on ``Portal.cancel_actor()`` + + # flag set by ``Portal.cancel_actor()`` indicating remote + # (possibly peer) cancellation of the far end actor + # runtime. + self._cancel_called: bool = False @classmethod def from_stream( -- 2.34.1 From b77d123edd1635718b62c3bb6c170526c1dd2ad8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 17:48:34 -0400 Subject: [PATCH 072/378] Fix `Context.result()` call to be in runtime scope --- tests/test_infected_asyncio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 76744198..56b5fde5 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -225,7 +225,7 @@ def test_context_spawns_aio_task_that_errors( await trio.sleep_forever() - return await ctx.result() + return await ctx.result() if parent_cancels: # bc the parent made the cancel request, -- 2.34.1 From ecb525a2bcf9eb73a2b9c1ea6647e0a2b5ff954a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 17:49:02 -0400 Subject: [PATCH 073/378] Adjust test details where `Context.cancel()` is called We can now make asserts on `.cancelled_caught` and `_remote_error` vs. `_local_error`. Expect a runtime error when `Context.open_stream()` is called AFTER `.cancel()` and the remote `ContextCancelled` hasn't arrived (yet). Adjust to `'itself'` string in self-cancel case. 
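Roughly the end-to-end behaviour these test changes lock in, as
a standalone sketch (NB: `cancels_itself()` and `main()` below are
illustrative names only, not code from this diff):

    import trio
    import tractor


    @tractor.context
    async def cancels_itself(
        ctx: tractor.Context,
    ) -> None:
        # request self-cancellation, even before `.started()`
        await ctx.cancel()

        try:
            # no remote `ContextCancelled` response can have arrived
            # yet, so this should be flagged as improper usage instead
            # of being absorbed as a normal cancel condition.
            async with ctx.open_stream():
                pass
        except RuntimeError:
            print('Got expected runtime error for stream-after-cancel')

        # the local `._scope` was cancelled by the request above so
        # any further checkpoint should raise `trio.Cancelled`.
        await trio.sleep_forever()


    async def main() -> None:
        async with tractor.open_nursery() as an:
            portal = await an.start_actor(
                'cancels_self',
                enable_modules=[__name__],
            )
            try:
                async with portal.open_context(
                    cancels_itself,
                ) as (ctx, sent):
                    async with ctx.open_stream():
                        await trio.sleep_forever()

            except tractor.ContextCancelled as ce:
                # the boxed remote error should mention the self-cancel
                assert 'itself' in ce.msgdata['tb_str']

            await portal.cancel_actor()


    if __name__ == '__main__':
        trio.run(main)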
--- tests/test_context_stream_semantics.py | 67 ++++++++++++++++++++------ 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 29d50e84..dda096ce 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -13,6 +13,11 @@ from typing import Optional import pytest import trio import tractor +from tractor import ( + Actor, + Context, + current_actor, +) from tractor._exceptions import ( StreamOverrun, ContextCancelled, @@ -193,9 +198,6 @@ def test_simple_context( else: assert await ctx.result() == 'yo' - if not error_parent: - await ctx.cancel() - if pointlessly_open_stream: async with ctx.open_stream(): if error_parent: @@ -208,10 +210,15 @@ def test_simple_context( # 'stop' msg to the far end which needs # to be ignored pass + else: if error_parent: raise error_parent + # cancel AFTER we open a stream + # to avoid a cancel raised inside + # `.open_stream()` + await ctx.cancel() finally: # after cancellation @@ -276,7 +283,7 @@ def test_caller_cancels( assert ( tuple(err.canceller) == - tractor.current_actor().uid + current_actor().uid ) async def main(): @@ -430,9 +437,11 @@ async def test_caller_closes_ctx_after_callee_opens_stream( ): 'caller context closes without using stream' - async with tractor.open_nursery() as n: + async with tractor.open_nursery() as an: - portal = await n.start_actor( + root: Actor = current_actor() + + portal = await an.start_actor( 'ctx_cancelled', enable_modules=[__name__], ) @@ -440,10 +449,10 @@ async def test_caller_closes_ctx_after_callee_opens_stream( async with portal.open_context( expect_cancelled, ) as (ctx, sent): - await portal.run(assert_state, value=True) - assert sent is None + await portal.run(assert_state, value=True) + # call cancel explicitly if use_ctx_cancel_method: @@ -454,8 +463,21 @@ async def test_caller_closes_ctx_after_callee_opens_stream( async for msg in stream: pass - except tractor.ContextCancelled: - raise # XXX: must be propagated to __aexit__ + except tractor.ContextCancelled as ctxc: + # XXX: the cause is US since we call + # `Context.cancel()` just above! + assert ( + ctxc.canceller + == + current_actor().uid + == + root.uid + ) + + # XXX: must be propagated to __aexit__ + # and should be silently absorbed there + # since we called `.cancel()` just above ;) + raise else: assert 0, "Should have context cancelled?" @@ -472,7 +494,13 @@ async def test_caller_closes_ctx_after_callee_opens_stream( await ctx.result() assert 0, "Callee should have blocked!?" except trio.TooSlowError: + # NO-OP -> since already called above await ctx.cancel() + + # local scope should have absorbed the cancellation + assert ctx.cancelled_caught + assert ctx._remote_error is ctx._local_error + try: async with ctx.open_stream() as stream: async for msg in stream: @@ -551,19 +579,25 @@ async def cancel_self( global _state _state = True + # since we call this the below `.open_stream()` should always + # error! 
await ctx.cancel() # should inline raise immediately try: async with ctx.open_stream(): pass - except tractor.ContextCancelled: + # except tractor.ContextCancelled: + except RuntimeError: # suppress for now so we can do checkpoint tests below - pass + print('Got expected runtime error for stream-after-cancel') + else: raise RuntimeError('Context didnt cancel itself?!') - # check a real ``trio.Cancelled`` is raised on a checkpoint + # check that``trio.Cancelled`` is now raised on any further + # checkpoints since the self cancel above will have cancelled + # the `Context._scope.cancel_scope: trio.CancelScope` try: with trio.fail_after(0.1): await trio.sleep_forever() @@ -574,6 +608,7 @@ async def cancel_self( # should never get here assert 0 + raise RuntimeError('Context didnt cancel itself?!') @tractor_test async def test_callee_cancels_before_started(): @@ -601,7 +636,7 @@ async def test_callee_cancels_before_started(): ce.type == trio.Cancelled # the traceback should be informative - assert 'cancelled itself' in ce.msgdata['tb_str'] + assert 'itself' in ce.msgdata['tb_str'] # teardown the actor await portal.cancel_actor() @@ -773,7 +808,7 @@ async def echo_back_sequence( print( 'EXITING CALLEEE:\n' - f'{ctx.cancel_called_remote}' + f'{ctx.canceller}' ) return 'yo' @@ -871,7 +906,7 @@ def test_maybe_allow_overruns_stream( if cancel_ctx: assert isinstance(res, ContextCancelled) - assert tuple(res.canceller) == tractor.current_actor().uid + assert tuple(res.canceller) == current_actor().uid else: print(f'RX ROOT SIDE RESULT {res}') -- 2.34.1 From ef0cfc4b209d064bf58c1ecce0ce3c735a444d17 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 23 Oct 2023 18:24:20 -0400 Subject: [PATCH 074/378] Get inter-peer suite passing with all `Context` state checks! Definitely needs some cleaning and refinement but this gets us to stage 1 of being pretty frickin correct i'd say :dancer: --- tests/test_inter_peer_cancellation.py | 283 ++++++++++++++++++-------- 1 file changed, 199 insertions(+), 84 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 46ca5758..09f11b87 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -15,6 +15,26 @@ from tractor import ( # typing ContextCancelled, ) +# XXX TODO cases: +# - [ ] peer cancelled itself - so other peers should +# get errors reflecting that the peer was itself the .canceller? + +# - [x] WE cancelled the peer and thus should not see any raised +# `ContextCancelled` as it should be reaped silently? +# => pretty sure `test_context_stream_semantics::test_caller_cancels()` +# already covers this case? + +# - [x] INTER-PEER: some arbitrary remote peer cancels via +# Portal.cancel_actor(). +# => all other connected peers should get that cancel requesting peer's +# uid in the ctx-cancelled error msg raised in all open ctxs +# with that peer. + +# - [ ] PEER-FAILS-BY-CHILD-ERROR: peer spawned a sub-actor which +# (also) spawned a failing task which was unhandled and +# propagated up to the immediate parent - the peer to the actor +# that also spawned a remote task task in that same peer-parent. + # def test_self_cancel(): # ''' @@ -29,14 +49,30 @@ from tractor import ( # typing @tractor.context async def sleep_forever( ctx: Context, + expect_ctxc: bool = False, ) -> None: ''' Sync the context, open a stream then just sleep. + Allow checking for (context) cancellation locally. 
+ ''' - await ctx.started() - async with ctx.open_stream(): - await trio.sleep_forever() + try: + await ctx.started() + async with ctx.open_stream(): + await trio.sleep_forever() + + except BaseException as berr: + + # TODO: it'd sure be nice to be able to inject our own + # `ContextCancelled` here instead of of `trio.Cancelled` + # so that our runtime can expect it and this "user code" + # would be able to tell the diff between a generic trio + # cancel and a tractor runtime-IPC cancel. + if expect_ctxc: + assert isinstance(berr, trio.Cancelled) + + raise @tractor.context @@ -145,6 +181,7 @@ async def stream_ints( async with ctx.open_stream() as stream: for i in itertools.count(): await stream.send(i) + await trio.sleep(0.01) @tractor.context @@ -157,77 +194,111 @@ async def stream_from_peer( try: async with ( tractor.wait_for_actor(peer_name) as peer, - peer.open_context(stream_ints) as (peer_ctx, first), - peer_ctx.open_stream() as stream, + # peer.open_context(stream_ints) as (peer_ctx, first), + # peer_ctx.open_stream() as stream, ): - await ctx.started() - # XXX TODO: big set of questions for this - # - should we raise `ContextCancelled` or `Cancelled` (rn - # it does that) here?! - # - test the `ContextCancelled` OUTSIDE the - # `.open_context()` call? - try: - async for msg in stream: - print(msg) + async with ( + peer.open_context(stream_ints) as (peer_ctx, first), + # peer_ctx.open_stream() as stream, + ): + # # try: + async with ( + peer_ctx.open_stream() as stream, + ): - except trio.Cancelled: - assert not ctx.cancel_called - assert not ctx.cancelled_caught + await ctx.started() + # XXX QUESTIONS & TODO: for further details around this + # in the longer run.. + # https://github.com/goodboy/tractor/issues/368 + # - should we raise `ContextCancelled` or `Cancelled` (rn + # it does latter) and should/could it be implemented + # as a general injection override for `trio` such + # that ANY next checkpoint would raise the "cancel + # error type" of choice? + # - should the `ContextCancelled` bubble from + # all `Context` and `MsgStream` apis wherein it + # prolly makes the most sense to make it + # a `trio.Cancelled` subtype? + # - what about IPC-transport specific errors, should + # they bubble from the async for and trigger + # other special cases? + # try: + # NOTE: current ctl flow: + # - stream raises `trio.EndOfChannel` and + # exits the loop + # - `.open_context()` will raise the ctxcanc + # received from the sleeper. + async for msg in stream: + assert msg is not None + print(msg) + # finally: + # await trio.sleep(0.1) + # from tractor import pause + # await pause() - assert not peer_ctx.cancel_called - assert not peer_ctx.cancelled_caught + # except BaseException as berr: + # with trio.CancelScope(shield=True): + # await tractor.pause() + # raise - assert 'root' in ctx.cancel_called_remote - - raise # XXX MUST NEVER MASK IT!! - - with trio.CancelScope(shield=True): - await tractor.pause() - # pass - # pytest.fail( - raise RuntimeError( - 'peer never triggered local `[Context]Cancelled`?!?' - ) + # except trio.Cancelled: + # with trio.CancelScope(shield=True): + # await tractor.pause() + # raise # XXX NEVER MASK IT + # from tractor import pause + # await pause() # NOTE: cancellation of the (sleeper) peer should always # cause a `ContextCancelled` raise in this streaming # actor. 
except ContextCancelled as ctxerr: - assert ctxerr.canceller == 'canceller' - assert ctxerr._remote_error is ctxerr + err = ctxerr + assert peer_ctx._remote_error is ctxerr + assert peer_ctx.canceller == ctxerr.canceller - # CASE 1: we were cancelled by our parent, the root actor. - # TODO: there are other cases depending on how the root - # actor and it's caller side task are written: - # - if the root does not req us to cancel then an - # IPC-transport related error should bubble from the async - # for loop and thus cause local cancellation both here - # and in the root (since in that case this task cancels the - # context with the root, not the other way around) - assert ctx.cancel_called_remote[0] == 'root' + # caller peer should not be the cancel requester + assert not ctx.cancel_called + # XXX can never be true since `._invoke` only + # sets this AFTER the nursery block this task + # was started in, exits. + assert not ctx.cancelled_caught + + # we never requested cancellation + assert not peer_ctx.cancel_called + # the `.open_context()` exit definitely + # caught a cancellation in the internal `Context._scope` + # since likely the runtime called `_deliver_msg()` + # after receiving the remote error from the streaming + # task. + assert peer_ctx.cancelled_caught + + # TODO / NOTE `.canceller` won't have been set yet + # here because that machinery is inside + # `.open_context().__aexit__()` BUT, if we had + # a way to know immediately (from the last + # checkpoint) that cancellation was due to + # a remote, we COULD assert this here..see, + # https://github.com/goodboy/tractor/issues/368 + + # root/parent actor task should NEVER HAVE cancelled us! + assert not ctx.canceller + assert 'canceller' in peer_ctx.canceller + + # TODO: IN THEORY we could have other cases depending on + # who cancels first, the root actor or the canceller peer. + # + # 1- when the peer request is first then the `.canceller` + # field should obvi be set to the 'canceller' uid, + # + # 2-if the root DOES req cancel then we should see the same + # `trio.Cancelled` implicitly raised + # assert ctx.canceller[0] == 'root' + # assert peer_ctx.canceller[0] == 'sleeper' raise - # except BaseException as err: - - # raise - -# cases: -# - some arbitrary remote peer cancels via Portal.cancel_actor(). -# => all other connected peers should get that cancel requesting peer's -# uid in the ctx-cancelled error msg. - -# - peer spawned a sub-actor which (also) spawned a failing task -# which was unhandled and propagated up to the immediate -# parent, the peer to the actor that also spawned a remote task -# task in that same peer-parent. - -# - peer cancelled itself - so other peers should -# get errors reflecting that the peer was itself the .canceller? - -# - WE cancelled the peer and thus should not see any raised -# `ContextCancelled` as it should be reaped silently? -# => pretty sure `test_context_stream_semantics::test_caller_cancels()` -# already covers this case? + raise RuntimeError( + 'peer never triggered local `ContextCancelled`?' + ) @pytest.mark.parametrize( 'error_during_ctxerr_handling', @@ -251,8 +322,8 @@ def test_peer_canceller( line and be less indented. .actor0> ()-> .actor1> - a inter-actor task context opened (by `async with `Portal.open_context()`) - from actor0 *into* actor1. + a inter-actor task context opened (by `async with + `Portal.open_context()`) from actor0 *into* actor1. .actor0> ()<=> .actor1> a inter-actor task context opened (as above) @@ -287,11 +358,11 @@ def test_peer_canceller( 5. 
.canceller> ()-> .sleeper> - calls `Portal.cancel_actor()` - ''' - async def main(): - async with tractor.open_nursery() as an: + async with tractor.open_nursery( + # debug_mode=True + ) as an: canceller: Portal = await an.start_actor( 'canceller', enable_modules=[__name__], @@ -305,10 +376,13 @@ def test_peer_canceller( enable_modules=[__name__], ) + root = tractor.current_actor() + try: async with ( sleeper.open_context( sleep_forever, + expect_ctxc=True, ) as (sleeper_ctx, sent), just_caller.open_context( @@ -328,6 +402,7 @@ def test_peer_canceller( try: print('PRE CONTEXT RESULT') + # await tractor.pause() await sleeper_ctx.result() # should never get here @@ -343,8 +418,8 @@ def test_peer_canceller( # canceller and caller peers should not # have been remotely cancelled. - assert canceller_ctx.cancel_called_remote is None - assert caller_ctx.cancel_called_remote is None + assert canceller_ctx.canceller is None + assert caller_ctx.canceller is None assert ctxerr.canceller[0] == 'canceller' @@ -363,8 +438,9 @@ def test_peer_canceller( raise - # SHOULD NEVER GET HERE! - except BaseException: + # XXX SHOULD NEVER EVER GET HERE XXX + except BaseException as berr: + err = berr pytest.fail('did not rx ctx-cancelled error?') else: pytest.fail('did not rx ctx-cancelled error?') @@ -375,6 +451,19 @@ def test_peer_canceller( )as ctxerr: _err = ctxerr + # NOTE: the main state to check on `Context` is: + # - `.cancelled_caught` (maps to nursery cs) + # - `.cancel_called` (bool of whether this side + # requested) + # - `.canceller` (uid of cancel-causing actor-task) + # - `._remote_error` (any `RemoteActorError` + # instance from other side of context) + # - `._cancel_msg` (any msg that caused the + # cancel) + + # CASE: error raised during handling of + # `ContextCancelled` inside `.open_context()` + # block if error_during_ctxerr_handling: assert isinstance(ctxerr, RuntimeError) @@ -384,20 +473,30 @@ def test_peer_canceller( for ctx in ctxs: assert ctx.cancel_called + # this root actor task should have + # cancelled all opened contexts except the + # sleeper which is obvi by the "canceller" + # peer. + re = ctx._remote_error + if ( + ctx is sleeper_ctx + or ctx is caller_ctx + ): + assert re.canceller == canceller.channel.uid + + else: + assert re.canceller == root.uid + # each context should have received # a silently absorbed context cancellation # from its peer actor's task. - assert ctx.chan.uid == ctx.cancel_called_remote - - # this root actor task should have - # cancelled all opened contexts except - # the sleeper which is cancelled by its - # peer "canceller" - if ctx is not sleeper_ctx: - assert ctx._remote_error.canceller[0] == 'root' + # assert ctx.chan.uid == ctx.canceller + # CASE: standard teardown inside in `.open_context()` block else: - assert ctxerr.canceller[0] == 'canceller' + assert ctxerr.canceller == sleeper_ctx.canceller + # assert ctxerr.canceller[0] == 'canceller' + # assert sleeper_ctx.canceller[0] == 'canceller' # the sleeper's remote error is the error bubbled # out of the context-stack above! @@ -405,18 +504,35 @@ def test_peer_canceller( assert re is ctxerr for ctx in ctxs: + re: BaseException | None = ctx._remote_error + assert re + # root doesn't cancel sleeper since it's + # cancelled by its peer. + # match ctx: + # case sleeper_ctx: if ctx is sleeper_ctx: assert not ctx.cancel_called + # wait WHY? 
assert ctx.cancelled_caught + + elif ctx is caller_ctx: + # since its context was remotely + # cancelled, we never needed to + # call `Context.cancel()` bc our + # context was already remotely + # cancelled by the time we'd do it. + assert ctx.cancel_called + else: assert ctx.cancel_called assert not ctx.cancelled_caught - # each context should have received + # TODO: do we even need this flag? + # -> each context should have received # a silently absorbed context cancellation - # from its peer actor's task. - assert ctx.chan.uid == ctx.cancel_called_remote + # in its remote nursery scope. + # assert ctx.chan.uid == ctx.canceller # NOTE: when an inter-peer cancellation # occurred, we DO NOT expect this @@ -434,7 +550,6 @@ def test_peer_canceller( # including the case where ctx-cancel handling # itself errors. assert sleeper_ctx.cancelled_caught - assert sleeper_ctx.cancel_called_remote[0] == 'sleeper' # await tractor.pause() raise # always to ensure teardown -- 2.34.1 From d651f3d8e9bf3cecb6e3a8d7831a039895310c4b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 25 Oct 2023 15:21:41 -0400 Subject: [PATCH 075/378] Tons of interpeer test cleanup Drop all the nested `@acm` blocks and defunct comments from initial validations. Add some todos for cases that are still unclear such as whether the caller / streamer should have `.cancelled_caught == True` in it's teardown. --- tests/test_inter_peer_cancellation.py | 154 ++++++++++++-------------- 1 file changed, 72 insertions(+), 82 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 09f11b87..5e1a4cad 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -194,59 +194,33 @@ async def stream_from_peer( try: async with ( tractor.wait_for_actor(peer_name) as peer, - # peer.open_context(stream_ints) as (peer_ctx, first), - # peer_ctx.open_stream() as stream, + peer.open_context(stream_ints) as (peer_ctx, first), + peer_ctx.open_stream() as stream, ): - async with ( - peer.open_context(stream_ints) as (peer_ctx, first), - # peer_ctx.open_stream() as stream, - ): - # # try: - async with ( - peer_ctx.open_stream() as stream, - ): - - await ctx.started() - # XXX QUESTIONS & TODO: for further details around this - # in the longer run.. - # https://github.com/goodboy/tractor/issues/368 - # - should we raise `ContextCancelled` or `Cancelled` (rn - # it does latter) and should/could it be implemented - # as a general injection override for `trio` such - # that ANY next checkpoint would raise the "cancel - # error type" of choice? - # - should the `ContextCancelled` bubble from - # all `Context` and `MsgStream` apis wherein it - # prolly makes the most sense to make it - # a `trio.Cancelled` subtype? - # - what about IPC-transport specific errors, should - # they bubble from the async for and trigger - # other special cases? - # try: - # NOTE: current ctl flow: - # - stream raises `trio.EndOfChannel` and - # exits the loop - # - `.open_context()` will raise the ctxcanc - # received from the sleeper. 
- async for msg in stream: - assert msg is not None - print(msg) - # finally: - # await trio.sleep(0.1) - # from tractor import pause - # await pause() - - # except BaseException as berr: - # with trio.CancelScope(shield=True): - # await tractor.pause() - # raise - - # except trio.Cancelled: - # with trio.CancelScope(shield=True): - # await tractor.pause() - # raise # XXX NEVER MASK IT - # from tractor import pause - # await pause() + await ctx.started() + # XXX QUESTIONS & TODO: for further details around this + # in the longer run.. + # https://github.com/goodboy/tractor/issues/368 + # - should we raise `ContextCancelled` or `Cancelled` (rn + # it does latter) and should/could it be implemented + # as a general injection override for `trio` such + # that ANY next checkpoint would raise the "cancel + # error type" of choice? + # - should the `ContextCancelled` bubble from + # all `Context` and `MsgStream` apis wherein it + # prolly makes the most sense to make it + # a `trio.Cancelled` subtype? + # - what about IPC-transport specific errors, should + # they bubble from the async for and trigger + # other special cases? + # NOTE: current ctl flow: + # - stream raises `trio.EndOfChannel` and + # exits the loop + # - `.open_context()` will raise the ctxcanc + # received from the sleeper. + async for msg in stream: + assert msg is not None + print(msg) # NOTE: cancellation of the (sleeper) peer should always # cause a `ContextCancelled` raise in this streaming @@ -265,11 +239,10 @@ async def stream_from_peer( # we never requested cancellation assert not peer_ctx.cancel_called - # the `.open_context()` exit definitely - # caught a cancellation in the internal `Context._scope` - # since likely the runtime called `_deliver_msg()` - # after receiving the remote error from the streaming - # task. + # the `.open_context()` exit definitely caught + # a cancellation in the internal `Context._scope` since + # likely the runtime called `_deliver_msg()` after + # receiving the remote error from the streaming task. assert peer_ctx.cancelled_caught # TODO / NOTE `.canceller` won't have been set yet @@ -284,8 +257,9 @@ async def stream_from_peer( assert not ctx.canceller assert 'canceller' in peer_ctx.canceller + raise # TODO: IN THEORY we could have other cases depending on - # who cancels first, the root actor or the canceller peer. + # who cancels first, the root actor or the canceller peer?. # # 1- when the peer request is first then the `.canceller` # field should obvi be set to the 'canceller' uid, @@ -294,12 +268,12 @@ async def stream_from_peer( # `trio.Cancelled` implicitly raised # assert ctx.canceller[0] == 'root' # assert peer_ctx.canceller[0] == 'sleeper' - raise raise RuntimeError( 'peer never triggered local `ContextCancelled`?' ) + @pytest.mark.parametrize( 'error_during_ctxerr_handling', [False, True], @@ -361,6 +335,7 @@ def test_peer_canceller( ''' async def main(): async with tractor.open_nursery( + # NOTE: to halt the peer tasks on ctxc, uncomment this. # debug_mode=True ) as an: canceller: Portal = await an.start_actor( @@ -402,7 +377,6 @@ def test_peer_canceller( try: print('PRE CONTEXT RESULT') - # await tractor.pause() await sleeper_ctx.result() # should never get here @@ -410,9 +384,8 @@ def test_peer_canceller( 'Context.result() did not raise ctx-cancelled?' ) - # TODO: not sure why this isn't catching - # but maybe we need an `ExceptionGroup` and - # the whole except *errs: thinger in 3.11? 
+ # should always raise since this root task does + # not request the sleeper cancellation ;) except ContextCancelled as ctxerr: print(f'CAUGHT REMOTE CONTEXT CANCEL {ctxerr}') @@ -430,9 +403,6 @@ def test_peer_canceller( # block it should be. assert not sleeper_ctx.cancelled_caught - # TODO: a test which ensures this error is - # bubbled and caught (NOT MASKED) by the - # runtime!!! if error_during_ctxerr_handling: raise RuntimeError('Simulated error during teardown') @@ -458,6 +428,7 @@ def test_peer_canceller( # - `.canceller` (uid of cancel-causing actor-task) # - `._remote_error` (any `RemoteActorError` # instance from other side of context) + # TODO: are we really planning to use this tho? # - `._cancel_msg` (any msg that caused the # cancel) @@ -482,21 +453,33 @@ def test_peer_canceller( ctx is sleeper_ctx or ctx is caller_ctx ): - assert re.canceller == canceller.channel.uid + assert ( + re.canceller + == + ctx.canceller + == + canceller.channel.uid + ) else: - assert re.canceller == root.uid - - # each context should have received - # a silently absorbed context cancellation - # from its peer actor's task. - # assert ctx.chan.uid == ctx.canceller + assert ( + re.canceller + == + ctx.canceller + == + root.uid + ) # CASE: standard teardown inside in `.open_context()` block else: assert ctxerr.canceller == sleeper_ctx.canceller - # assert ctxerr.canceller[0] == 'canceller' - # assert sleeper_ctx.canceller[0] == 'canceller' + assert ( + ctxerr.canceller[0] + == + sleeper_ctx.canceller[0] + == + 'canceller' + ) # the sleeper's remote error is the error bubbled # out of the context-stack above! @@ -509,21 +492,29 @@ def test_peer_canceller( # root doesn't cancel sleeper since it's # cancelled by its peer. - # match ctx: - # case sleeper_ctx: if ctx is sleeper_ctx: assert not ctx.cancel_called - # wait WHY? + # since sleeper_ctx.result() IS called + # above we should have (silently) + # absorbed the corresponding + # `ContextCancelled` for it and thus + # the logic inside `.cancelled_caught` + # should trigger! assert ctx.cancelled_caught elif ctx is caller_ctx: # since its context was remotely # cancelled, we never needed to - # call `Context.cancel()` bc our - # context was already remotely - # cancelled by the time we'd do it. + # call `Context.cancel()` bc it was + # done by the peer and also we never assert ctx.cancel_called + # TODO: figure out the details of + # this.. + # if you look the `._local_error` here + # is a multi of ctxc + 2 Cancelleds? + # assert not ctx.cancelled_caught + else: assert ctx.cancel_called assert not ctx.cancelled_caught @@ -551,7 +542,6 @@ def test_peer_canceller( # itself errors. assert sleeper_ctx.cancelled_caught - # await tractor.pause() raise # always to ensure teardown if error_during_ctxerr_handling: -- 2.34.1 From 227c9ea1736fb363a92da99400e51207752cbaba Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 Nov 2023 15:43:43 -0500 Subject: [PATCH 076/378] Test with `any(portals)` since `gather_contexts()` will return `list[None | tuple]` --- tractor/_discovery.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index b5f47165..e5bc8dbe 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -211,10 +211,14 @@ async def find_actor( # 'Gathered portals:\n' # f'{portals}' # ) - if not portals: + # NOTE: `gather_contexts()` will return a + # `tuple[None, None, ..., None]` if no contact + # can be made with any regstrar at any of the + # N provided addrs! 
+ if not any(portals): if raise_on_none: raise RuntimeError( - f'No {name} found registered @ {registry_addrs}' + f'No actor "{name}" found registered @ {registry_addrs}' ) yield None return -- 2.34.1 From 48accbd28fc3636b8a36faedc4fc9be867103ee8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 Nov 2023 15:44:21 -0500 Subject: [PATCH 077/378] Fix doc string "its" typo.. --- tractor/_multiaddr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_multiaddr.py b/tractor/_multiaddr.py index d0f562c0..e8713b40 100644 --- a/tractor/_multiaddr.py +++ b/tractor/_multiaddr.py @@ -100,7 +100,7 @@ def parse_maddr( multiaddr: str, ) -> dict[str, str | int | dict]: ''' - Parse a libp2p style "multiaddress" into it's distinct protocol + Parse a libp2p style "multiaddress" into its distinct protocol segments where each segment is of the form: `..////../` -- 2.34.1 From 87cd725adb6f3f56579e7fa88a030040d9658aac Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 7 Nov 2023 16:45:22 -0500 Subject: [PATCH 078/378] Add `open_root_actor(ensure_registry: bool)` Allows forcing the opened actor to either obtain the passed registry addrs or raise a runtime error. --- tractor/_root.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tractor/_root.py b/tractor/_root.py index 5615bb65..608773a4 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -85,6 +85,10 @@ async def open_root_actor( enable_modules: list | None = None, rpc_module_paths: list | None = None, + # NOTE: allow caller to ensure that only one registry exists + # and that this call creates it. + ensure_registry: bool = False, + ) -> Actor: ''' Runtime init entry point for ``tractor``. @@ -206,6 +210,12 @@ async def open_root_actor( # REGISTRAR if ponged_addrs: + if ensure_registry: + raise RuntimeError( + f'Failed to open `{name}`@{ponged_addrs}: ' + 'registry socket(s) already bound' + ) + # we were able to connect to an arbiter logger.info( f'Registry(s) seem(s) to exist @ {ponged_addrs}' -- 2.34.1 From 3f159235374b960d120c55a1be844cd1434ca48b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 11 Dec 2023 18:17:42 -0500 Subject: [PATCH 079/378] More thurough hard kill doc strings --- tractor/_spawn.py | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index e55e59f8..2936220c 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -204,6 +204,21 @@ async def do_hard_kill( # terminate_after: int = 99999, ) -> None: + ''' + Un-gracefully terminate an OS level `trio.Process` after timeout. + + Used in 2 main cases: + + - "unknown remote runtime state": a hanging/stalled actor that + isn't responding after sending a (graceful) runtime cancel + request via an IPC msg. + - "cancelled during spawn": a process who's actor runtime was + cancelled before full startup completed (such that + cancel-request-handling machinery was never fully + initialized) and thus a "cancel request msg" is never going + to be handled. + + ''' # NOTE: this timeout used to do nothing since we were shielding # the ``.wait()`` inside ``new_proc()`` which will pretty much # never release until the process exits, now it acts as @@ -219,6 +234,9 @@ async def do_hard_kill( # and wait for it to exit. If cancelled, kills the process and # waits for it to finish exiting before propagating the # cancellation. + # + # This code was originally triggred by ``proc.__aexit__()`` + # but now must be called manually. 
with trio.CancelScope(shield=True): if proc.stdin is not None: await proc.stdin.aclose() @@ -234,10 +252,14 @@ async def do_hard_kill( with trio.CancelScope(shield=True): await proc.wait() + # XXX NOTE XXX: zombie squad dispatch: + # (should ideally never, but) If we do get here it means + # graceful termination of a process failed and we need to + # resort to OS level signalling to interrupt and cancel the + # (presumably stalled or hung) actor. Since we never allow + # zombies (as a feature) we ask the OS to do send in the + # removal swad as the last resort. if cs.cancelled_caught: - # XXX: should pretty much never get here unless we have - # to move the bits from ``proc.__aexit__()`` out and - # into here. log.critical(f"#ZOMBIE_LORD_IS_HERE: {proc}") proc.kill() @@ -252,10 +274,13 @@ async def soft_wait( portal: Portal, ) -> None: - # Wait for proc termination but **dont' yet** call - # ``trio.Process.__aexit__()`` (it tears down stdio - # which will kill any waiting remote pdb trace). - # This is a "soft" (cancellable) join/reap. + ''' + Wait for proc termination but **dont' yet** teardown + std-streams (since it will clobber any ongoing pdb REPL + session). This is our "soft" (and thus itself cancellable) + join/reap on an actor-runtime-in-process. + + ''' uid = portal.channel.uid try: log.cancel(f'Soft waiting on actor:\n{uid}') @@ -278,7 +303,13 @@ async def soft_wait( await wait_func(proc) n.cancel_scope.cancel() + # start a task to wait on the termination of the + # process by itself waiting on a (caller provided) wait + # function which should unblock when the target process + # has terminated. n.start_soon(cancel_on_proc_deth) + + # send the actor-runtime a cancel request. await portal.cancel_actor() if proc.poll() is None: # type: ignore -- 2.34.1 From f415fc43ce0ae0110990fe0fba9169d0fa75c472 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 11 Dec 2023 19:37:45 -0500 Subject: [PATCH 080/378] `.discovery.get_arbiter()`: add warning around this now deprecated usage --- tractor/_discovery.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index e5bc8dbe..8cccc505 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -76,8 +76,18 @@ async def get_registry( yield regstr_ptl -# TODO: deprecate and remove _arbiter form -get_arbiter = get_registry + +# TODO: deprecate and this remove _arbiter form! +@acm +async def get_arbiter(*args, **kwargs): + warnings.warn( + '`tractor.get_arbiter()` is now deprecated!\n' + 'Use `.get_registry()` instead!', + DeprecationWarning, + stacklevel=2, + ) + async with get_registry(*args, **kwargs) as to_yield: + yield to_yield @acm -- 2.34.1 From 250275d98d52e8076d2797b8c3fabf5f8098d124 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 09:08:39 -0500 Subject: [PATCH 081/378] Guarding for IPC failures in `._runtime._invoke()` Took me longer then i wanted to figure out the source of a failed-response to a remote-cancellation (in this case in `modden` where a client was cancelling a workspace layer.. but disconnects before receiving the ack msg) that was triggering an IPC error when sending the error msg for the cancellation of a `Actor._cancel_task()`, but since this (non-rpc) `._invoke()` task was trying to send to a now disconnected canceller it was resulting in a `BrokenPipeError` (or similar) error. Now, we except for such IPC errors and only raise them when, 1. 
the transport `Channel` is for sure up (bc ow what's the point of trying to send an error on the thing that caused it..) 2. it's definitely for handling an RPC task Similarly if the entire main invoke `try:` excepts, - we only hide the call-stack frame from the debugger (with `__tracebackhide__: bool`) if it's an RPC task that has a connected channel since we always want to see the frame when debugging internal task or IPC failures. - we don't bother trying to send errors to the context caller (actor) when it's a non-RPC request since failures on actor-runtime-internal tasks shouldn't really ever be reported remotely, only maybe raised locally. Also some other tidying, - this properly corrects for the self-cancel case where an RPC context is cancelled due to a local (runtime) task calling a method like `Actor.cancel_soon()`. We now set our own `.uid` as the `ContextCancelled.canceller` value so that other-end tasks know that the cancellation was due to a self-cancellation by the actor itself. We still need to properly test for this though! - add a more detailed module doc-str. - more explicit imports for `trio` core types throughout. --- tractor/_runtime.py | 186 +++++++++++++++++++++++++++++++------------- 1 file changed, 130 insertions(+), 56 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index fee14c4d..58080654 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -15,7 +15,10 @@ # along with this program. If not, see . """ -Actor primitives and helpers +The fundamental core machinery implementing every "actor" including +the process-local (python-interpreter global) `Actor` state-type +primitive(s), RPC-in-task scheduling, and IPC connectivity and +low-level transport msg handling. """ from __future__ import annotations @@ -41,8 +44,14 @@ import warnings from async_generator import aclosing from exceptiongroup import BaseExceptionGroup -import trio # type: ignore -from trio_typing import TaskStatus +import trio +from trio import ( + CancelScope, +) +from trio_typing import ( + Nursery, + TaskStatus, +) from ._ipc import Channel from ._context import ( @@ -90,10 +99,9 @@ async def _invoke( connected IPC channel. This is the core "RPC" `trio.Task` scheduling machinery used to start every - remotely invoked function, normally in `Actor._service_n: trio.Nursery`. + remotely invoked function, normally in `Actor._service_n: Nursery`. ''' - __tracebackhide__: bool = True treat_as_gen: bool = False failed_resp: bool = False @@ -110,9 +118,9 @@ async def _invoke( # possibly a traceback (not sure what typing is for this..) tb = None - cancel_scope = trio.CancelScope() + cancel_scope = CancelScope() # activated cancel scope ref - cs: trio.CancelScope | None = None + cs: CancelScope | None = None ctx = actor.get_context( chan, @@ -124,6 +132,7 @@ async def _invoke( ) context: bool = False + # TODO: deprecate this style.. if getattr(func, '_tractor_stream_function', False): # handle decorated ``@tractor.stream`` async functions sig = inspect.signature(func) @@ -165,6 +174,7 @@ async def _invoke( except TypeError: raise + # TODO: can we unify this with the `context=True` impl below? if inspect.isasyncgen(coro): await chan.send({'functype': 'asyncgen', 'cid': cid}) # XXX: massive gotcha! If the containing scope @@ -195,6 +205,7 @@ async def _invoke( await chan.send({'stop': True, 'cid': cid}) # one way @stream func that gets treated like an async gen + # TODO: can we unify this with the `context=True` impl below? 
elif treat_as_gen: await chan.send({'functype': 'asyncgen', 'cid': cid}) # XXX: the async-func may spawn further tasks which push @@ -211,8 +222,20 @@ async def _invoke( # far end async gen to tear down await chan.send({'stop': True, 'cid': cid}) + # our most general case: a remote SC-transitive, + # IPC-linked, cross-actor-task "context" + # ------ - ------ # TODO: every other "func type" should be implemented from - # a special case of a context eventually! + # a special case of this impl eventually! + # -[ ] streaming funcs should instead of being async-for + # handled directly here wrapped in + # a async-with-open_stream() closure that does the + # normal thing you'd expect a far end streaming context + # to (if written by the app-dev). + # -[ ] one off async funcs can literally just be called + # here and awaited directly, possibly just with a small + # wrapper that calls `Context.started()` and then does + # the `await coro()`? elif context: # context func with support for bi-dir streaming await chan.send({'functype': 'context', 'cid': cid}) @@ -273,11 +296,12 @@ async def _invoke( ctx._maybe_raise_remote_err(re) fname: str = func.__name__ - cs: trio.CancelScope = ctx._scope + cs: CancelScope = ctx._scope if cs.cancel_called: + our_uid: tuple = actor.uid canceller: tuple = ctx.canceller msg: str = ( - f'`{fname}()`@{actor.uid} cancelled by ' + f'`{fname}()`@{our_uid} cancelled by ' ) # NOTE / TODO: if we end up having @@ -286,6 +310,8 @@ async def _invoke( # need to change this logic branch since it # will always enter.. if ctx._cancel_called: + # TODO: test for this!!!!! + canceller: tuple = our_uid msg += 'itself ' # if the channel which spawned the ctx is the @@ -318,40 +344,76 @@ async def _invoke( canceller=canceller, ) - # regular async function + # regular async function/method + # XXX: possibly just a scheduled `Actor._cancel_task()` + # from a remote request to cancel some `Context`. + # ------ - ------ + # TODO: ideally we unify this with the above `context=True` + # block such that for any remote invocation ftype, we + # always invoke the far end RPC task scheduling the same + # way: using the linked IPC context machinery. else: try: await chan.send({ 'functype': 'asyncfunc', 'cid': cid }) - except trio.BrokenResourceError: + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ) as ipc_err: failed_resp = True if is_rpc: raise else: + # TODO: should this be an `.exception()` call? log.warning( - f'Failed to respond to non-rpc request: {func}' + f'Failed to respond to non-rpc request: {func}\n' + f'{ipc_err}' ) with cancel_scope as cs: - ctx._scope = cs + ctx._scope: CancelScope = cs task_status.started(ctx) result = await coro fname: str = func.__name__ log.runtime(f'{fname}() result: {result}') - if not failed_resp: - # only send result if we know IPC isn't down - await chan.send( - {'return': result, - 'cid': cid} - ) + + # NOTE: only send result if we know IPC isn't down + if ( + not failed_resp + and chan.connected() + ): + try: + await chan.send( + {'return': result, + 'cid': cid} + ) + except ( + BrokenPipeError, + trio.BrokenResourceError, + ): + log.warning( + 'Failed to return result:\n' + f'{func}@{actor.uid}\n' + f'remote chan: {chan.uid}' + ) except ( Exception, BaseExceptionGroup, ) as err: + # always hide this frame from debug REPL if the crash + # originated from an rpc task and we DID NOT fail + # due to an IPC transport error! 
+ if ( + is_rpc + and chan.connected() + ): + __tracebackhide__: bool = True + if not is_multi_cancelled(err): # TODO: maybe we'll want different "levels" of debugging @@ -385,24 +447,31 @@ async def _invoke( log.exception("Actor crashed:") # always ship errors back to caller - err_msg = pack_error(err, tb=tb) + err_msg: dict[str, dict] = pack_error( + err, + tb=tb, + ) err_msg['cid'] = cid - try: - await chan.send(err_msg) + if is_rpc: + try: + await chan.send(err_msg) - # TODO: tests for this scenario: - # - RPC caller closes connection before getting a response - # should **not** crash this actor.. - except ( - trio.ClosedResourceError, - trio.BrokenResourceError, - BrokenPipeError, - ): - # if we can't propagate the error that's a big boo boo - log.exception( - f"Failed to ship error to caller @ {chan.uid} !?" - ) + # TODO: tests for this scenario: + # - RPC caller closes connection before getting a response + # should **not** crash this actor.. + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ) as ipc_err: + + # if we can't propagate the error that's a big boo boo + log.exception( + f"Failed to ship error to caller @ {chan.uid} !?\n" + f'{ipc_err}' + + ) # error is probably from above coro running code *not from the # underlyingn rpc invocation* since a scope was never allocated @@ -428,7 +497,11 @@ async def _invoke( log.warning( f"Task {func} likely errored or cancelled before start") else: - log.cancel(f'{func.__name__}({kwargs}) failed?') + log.cancel( + 'Failed to de-alloc internal task!?\n' + f'cid: {cid}\n' + f'{func.__name__}({kwargs})' + ) finally: if not actor._rpc_tasks: @@ -445,7 +518,7 @@ async def try_ship_error_to_parent( err: Exception | BaseExceptionGroup, ) -> None: - with trio.CancelScope(shield=True): + with CancelScope(shield=True): try: # internal error so ship to parent without cid await channel.send(pack_error(err)) @@ -497,13 +570,13 @@ class Actor: msg_buffer_size: int = 2**6 # nursery placeholders filled in by `async_main()` after fork - _root_n: trio.Nursery | None = None - _service_n: trio.Nursery | None = None - _server_n: trio.Nursery | None = None + _root_n: Nursery | None = None + _service_n: Nursery | None = None + _server_n: Nursery | None = None # Information about `__main__` from parent _parent_main_data: dict[str, str] - _parent_chan_cs: trio.CancelScope | None = None + _parent_chan_cs: CancelScope | None = None # syncs for setup/teardown sequences _server_down: trio.Event | None = None @@ -1096,12 +1169,12 @@ class Actor: async def _serve_forever( self, - handler_nursery: trio.Nursery, + handler_nursery: Nursery, *, # (host, port) to bind for channel server listen_sockaddrs: list[tuple[str, int]] | None = None, - task_status: TaskStatus[trio.Nursery] = trio.TASK_STATUS_IGNORED, + task_status: TaskStatus[Nursery] = trio.TASK_STATUS_IGNORED, ) -> None: ''' Start the channel server, begin listening for new connections. 
@@ -1188,7 +1261,7 @@ class Actor: self._cancel_called = True # cancel all ongoing rpc tasks - with trio.CancelScope(shield=True): + with CancelScope(shield=True): # kill any debugger request task to avoid deadlock # with the root actor in this tree @@ -1248,7 +1321,7 @@ class Actor: # this ctx based lookup ensures the requested task to # be cancelled was indeed spawned by a request from this channel ctx, func, is_complete = self._rpc_tasks[(chan, cid)] - scope: trio.CancelScope = ctx._scope + scope: CancelScope = ctx._scope except KeyError: log.cancel(f"{cid} has already completed/terminated?") return True @@ -1613,7 +1686,7 @@ async def async_main( # block it might be actually possible to debug THIS # machinery in the same way as user task code? # if actor.name == 'brokerd.ib': - # with trio.CancelScope(shield=True): + # with CancelScope(shield=True): # await _debug.breakpoint() actor.lifetime_stack.close() @@ -1655,7 +1728,7 @@ async def async_main( ): log.runtime( f"Waiting for remaining peers {actor._peers} to clear") - with trio.CancelScope(shield=True): + with CancelScope(shield=True): await actor._no_more_peers.wait() log.runtime("All peer channels are complete") @@ -1666,7 +1739,7 @@ async def process_messages( actor: Actor, chan: Channel, shield: bool = False, - task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED, + task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED, ) -> bool: ''' @@ -1684,7 +1757,7 @@ async def process_messages( log.runtime(f"Entering msg loop for {chan} from {chan.uid}") try: - with trio.CancelScope(shield=shield) as loop_cs: + with CancelScope(shield=shield) as loop_cs: # this internal scope allows for keeping this message # loop running despite the current task having been # cancelled (eg. `open_portal()` may call this method from @@ -1746,18 +1819,18 @@ async def process_messages( if ns == 'self': if funcname == 'cancel': - func = actor.cancel + func: Callable = actor.cancel kwargs['requesting_uid'] = chan.uid # don't start entire actor runtime cancellation # if this actor is currently in debug mode! - pdb_complete = _debug.Lock.local_pdb_complete + pdb_complete: trio.Event | None = _debug.Lock.local_pdb_complete if pdb_complete: await pdb_complete.wait() # we immediately start the runtime machinery # shutdown - with trio.CancelScope(shield=True): + with CancelScope(shield=True): # actor.cancel() was called so kill this # msg loop and break out into # ``async_main()`` @@ -1785,7 +1858,7 @@ async def process_messages( # we immediately start the runtime machinery # shutdown - # with trio.CancelScope(shield=True): + # with CancelScope(shield=True): kwargs['chan'] = chan target_cid = kwargs['cid'] kwargs['requesting_uid'] = chan.uid @@ -1810,7 +1883,7 @@ async def process_messages( else: # normally registry methods, eg. # ``.register_actor()`` etc. - func = getattr(actor, funcname) + func: Callable = getattr(actor, funcname) else: # complain to client about restricted modules @@ -1900,9 +1973,10 @@ async def process_messages( Exception, BaseExceptionGroup, ) as err: + if nursery_cancelled_before_task: - sn = actor._service_n - assert sn and sn.cancel_scope.cancel_called + sn: Nursery = actor._service_n + assert sn and sn.cancel_scope.cancel_called # sanity log.cancel( f'Service nursery cancelled before it handled {funcname}' ) -- 2.34.1 From bea31f6d19e8c7dd3c3506e5b17645b59424d989 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 10:24:39 -0500 Subject: [PATCH 082/378] ._child: remove some unused imports.. 
--- tractor/_child.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tractor/_child.py b/tractor/_child.py index 91aaec4f..bd1e830e 100644 --- a/tractor/_child.py +++ b/tractor/_child.py @@ -18,8 +18,6 @@ This is the "bootloader" for actors started using the native trio backend. """ -import sys -import trio import argparse from ast import literal_eval @@ -37,8 +35,6 @@ def parse_ipaddr(arg): return (str(host), int(port)) -from ._entry import _trio_main - if __name__ == "__main__": parser = argparse.ArgumentParser() -- 2.34.1 From 814384848d3b399257991c05806bd8a9d5ef7b83 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 10:25:17 -0500 Subject: [PATCH 083/378] Use `import as ,` style over `__all__` in pkg mod --- tractor/__init__.py | 89 ++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 58 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index 980c8dc6..01d00ec9 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -18,76 +18,49 @@ tractor: structured concurrent ``trio``-"actors". """ -from exceptiongroup import BaseExceptionGroup +from exceptiongroup import BaseExceptionGroup as BaseExceptionGroup -from ._clustering import open_actor_cluster +from ._clustering import ( + open_actor_cluster as open_actor_cluster, +) from ._context import ( - Context, # the type - context, # a func-decorator + Context as Context, # the type + context as context, # a func-decorator ) from ._streaming import ( - MsgStream, - stream, + MsgStream as MsgStream, + stream as stream, ) from ._discovery import ( - get_arbiter, - find_actor, - wait_for_actor, - query_actor, + get_arbiter as get_arbiter, + find_actor as find_actor, + wait_for_actor as wait_for_actor, + query_actor as query_actor, +) +from ._supervise import ( + open_nursery as open_nursery, + ActorNursery as ActorNursery, ) -from ._supervise import open_nursery from ._state import ( - current_actor, - is_root_process, + current_actor as current_actor, + is_root_process as is_root_process, ) from ._exceptions import ( - RemoteActorError, - ModuleNotExposed, - ContextCancelled, + RemoteActorError as RemoteActorError, + ModuleNotExposed as ModuleNotExposed, + ContextCancelled as ContextCancelled, ) from .devx import ( - breakpoint, - pause, - pause_from_sync, - post_mortem, + breakpoint as breakpoint, + pause as pause, + pause_from_sync as pause_from_sync, + post_mortem as post_mortem, ) -from . import msg +from . import msg as msg from ._root import ( - run_daemon, - open_root_actor, + run_daemon as run_daemon, + open_root_actor as open_root_actor, ) -from ._ipc import Channel -from ._portal import Portal -from ._runtime import Actor - - -__all__ = [ - 'Actor', - 'BaseExceptionGroup', - 'Channel', - 'Context', - 'ContextCancelled', - 'ModuleNotExposed', - 'MsgStream', - 'Portal', - 'RemoteActorError', - 'breakpoint', - 'context', - 'current_actor', - 'find_actor', - 'query_actor', - 'get_arbiter', - 'is_root_process', - 'msg', - 'open_actor_cluster', - 'open_nursery', - 'open_root_actor', - 'pause', - 'post_mortem', - 'pause_from_sync', - 'query_actor', - 'run_daemon', - 'stream', - 'to_asyncio', - 'wait_for_actor', -] +from ._ipc import Channel as Channel +from ._portal import Portal as Portal +from ._runtime import Actor as Actor -- 2.34.1 From 00024181cd530b2bd867460a54c7a5d33c07059d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 10:38:04 -0500 Subject: [PATCH 084/378] `StackLevelAdapter._log(stacklevel: int)` for custom levels.. 
Apparently (and i don't know if this was always broken [i feel like no?] or is a recent change to stdlib's `logging` stuff) we need increment the `stacklevel` input by one for our custom level methods now? Without this you're going to see the path to the method's-callstack-frame on every emission instead of to the caller's. I first noticed this when debugging the workspace layer spawning in `modden.bigd` and then verified it in other depended projects.. I guess we should add some tests for this as well XD --- tractor/log.py | 53 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 5710e83e..590779a5 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -48,12 +48,15 @@ LOG_FORMAT = ( DATE_FORMAT = '%b %d %H:%M:%S' -LEVELS = { +LEVELS: dict[str, int] = { 'TRANSPORT': 5, 'RUNTIME': 15, 'CANCEL': 16, 'PDB': 500, } +# _custom_levels: set[str] = { +# lvlname.lower for lvlname in LEVELS.keys() +# } STD_PALETTE = { 'CRITICAL': 'red', @@ -102,7 +105,11 @@ class StackLevelAdapter(logging.LoggerAdapter): Cancellation logging, mostly for runtime reporting. ''' - return self.log(16, msg) + return self.log( + level=16, + msg=msg, + # stacklevel=4, + ) def pdb( self, @@ -114,14 +121,37 @@ class StackLevelAdapter(logging.LoggerAdapter): ''' return self.log(500, msg) - def log(self, level, msg, *args, **kwargs): - """ + def log( + self, + level, + msg, + *args, + **kwargs, + ): + ''' Delegate a log call to the underlying logger, after adding contextual information from this adapter instance. - """ + + ''' if self.isEnabledFor(level): + stacklevel: int = 3 + if ( + level in LEVELS.values() + # or level in _custom_levels + ): + stacklevel: int = 4 + # msg, kwargs = self.process(msg, kwargs) - self._log(level, msg, args, **kwargs) + self._log( + level=level, + msg=msg, + args=args, + # NOTE: not sure how this worked before but, it + # seems with our custom level methods defined above + # we do indeed (now) require another stack level?? + stacklevel=stacklevel, + **kwargs, + ) # LOL, the stdlib doesn't allow passing through ``stacklevel``.. def _log( @@ -134,12 +164,15 @@ class StackLevelAdapter(logging.LoggerAdapter): stack_info=False, # XXX: bit we added to show fileinfo from actual caller. - # this level then ``.log()`` then finally the caller's level.. - stacklevel=3, + # - this level + # - then ``.log()`` + # - then finally the caller's level.. + stacklevel=4, ): - """ + ''' Low-level log implementation, proxied to allow nested logger adapters. - """ + + ''' return self.logger._log( level, msg, -- 2.34.1 From ce7b8a5e18f29ab9285a138dcaadebbc2fd2596d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 11:21:20 -0500 Subject: [PATCH 085/378] Drop unused walrus assign of `re` --- tractor/_context.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 117092ac..9e19f2a1 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -62,6 +62,7 @@ if TYPE_CHECKING: log = get_logger(__name__) +# TODO: make this a msgspec.Struct! @dataclass class Context: ''' @@ -491,15 +492,15 @@ class Context: if self._cancel_called: # XXX NOTE: ALWAYS RAISE any remote error here even if - # it's an expected `ContextCancelled` (after some local - # task having called `.cancel()` ! + # it's an expected `ContextCancelled` due to a local + # task having called `.cancel()`! 
# # WHY: we expect the error to always bubble up to the # surrounding `Portal.open_context()` call and be # absorbed there (silently) and we DO NOT want to # actually try to stream - a cancel msg was already # sent to the other side! - if re := self._remote_error: + if self._remote_error: raise self._remote_error # XXX NOTE: if no `ContextCancelled` has been responded -- 2.34.1 From fdf3a1b01b67bb5b37b60141c21b6225b2becb68 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 11:28:02 -0500 Subject: [PATCH 086/378] Only use `greenback` if actor-runtime is up.. --- tractor/devx/_debug.py | 43 ++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 3bef7bd6..f3550ba6 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -682,7 +682,7 @@ async def pause( https://en.wikipedia.org/wiki/Breakpoint ''' - __tracebackhide__ = True + # __tracebackhide__ = True actor = tractor.current_actor() pdb, undo_sigint = mk_mpdb() task_name = trio.lowlevel.current_task().name @@ -836,25 +836,32 @@ async def pause( # runtime aware version which takes care of all . def pause_from_sync() -> None: print("ENTER SYNC PAUSE") - try: - import greenback - __tracebackhide__ = True + actor: tractor.Actor = tractor.current_actor( + err_on_no_runtime=False, + ) + if actor: + try: + import greenback + # __tracebackhide__ = True - actor: tractor.Actor = tractor.current_actor() - # task_can_release_tty_lock = trio.Event() - # spawn bg task which will lock out the TTY, we poll - # just below until the release event is reporting that task as - # waiting.. not the most ideal but works for now ;) - greenback.await_( - actor._service_n.start(partial( - pause, - debug_func=None, - # release_lock_signal=task_can_release_tty_lock, - )) - ) - except ModuleNotFoundError: - log.warning('NO GREENBACK FOUND') + # task_can_release_tty_lock = trio.Event() + + # spawn bg task which will lock out the TTY, we poll + # just below until the release event is reporting that task as + # waiting.. 
not the most ideal but works for now ;) + greenback.await_( + actor._service_n.start(partial( + pause, + debug_func=None, + # release_lock_signal=task_can_release_tty_lock, + )) + ) + + except ModuleNotFoundError: + log.warning('NO GREENBACK FOUND') + else: + log.warning('Not inside actor-runtime') db, undo_sigint = mk_mpdb() Lock.local_task_in_debug = 'sync' -- 2.34.1 From 0bcdea28a08a0ab0742c5668fd2c414139c63101 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 11:28:55 -0500 Subject: [PATCH 087/378] Fmt repr as multi-line style call --- tractor/_ipc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 7c99467a..39f62224 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -329,8 +329,11 @@ class Channel: def __repr__(self) -> str: if self.msgstream: return repr( - self.msgstream.stream.socket._sock).replace( # type: ignore - "socket.socket", "Channel") + self.msgstream.stream.socket._sock + ).replace( # type: ignore + "socket.socket", + "Channel", + ) return object.__repr__(self) @property -- 2.34.1 From 734bc09b67ef8345be0cc41d6d11ab13c662b748 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 18:34:15 -0500 Subject: [PATCH 088/378] Move missing-key-in-msg raiser to `._exceptions` Since we use basically the exact same set of logic in `Portal.open_context()` when expecting the first `'started'` msg factor and generalize `._streaming._raise_from_no_yield_msg()` into a new `._exceptions._raise_from_no_key_in_msg()` (as per the lingering todo) which obvi requires a more generalized / optional signature including a caller specific `log` obj. Obvi call the new func from all the other modules X) --- tractor/_context.py | 2 + tractor/_exceptions.py | 96 +++++++++++++++++++++++++++++++++++++++++- tractor/_portal.py | 38 +++++++---------- tractor/_streaming.py | 83 +++++------------------------------- 4 files changed, 122 insertions(+), 97 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 9e19f2a1..4d56fb3c 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -44,9 +44,11 @@ import warnings import trio from ._exceptions import ( + # _raise_from_no_key_in_msg, unpack_error, pack_error, ContextCancelled, + # MessagingError, StreamOverrun, ) from .log import get_logger diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 214dc88a..7e148586 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -14,16 +14,18 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -""" +''' Our classy exception set. -""" +''' +from __future__ import annotations import builtins import importlib from pprint import pformat from typing import ( Any, Type, + TYPE_CHECKING, ) import traceback @@ -32,6 +34,11 @@ import trio from ._state import current_actor +if TYPE_CHECKING: + from ._context import Context + from ._stream import MsgStream + from .log import StackLevelAdapter + _this_mod = importlib.import_module(__name__) @@ -246,3 +253,88 @@ def is_multi_cancelled(exc: BaseException) -> bool: ) is not None return False + + +def _raise_from_no_key_in_msg( + ctx: Context, + msg: dict, + src_err: KeyError, + log: StackLevelAdapter, # caller specific `log` obj + expect_key: str = 'yield', + stream: MsgStream | None = None, + +) -> bool: + ''' + Raise an appopriate local error when a `MsgStream` msg arrives + which does not contain the expected (under normal operation) + `'yield'` field. 
+ + ''' + __tracebackhide__: bool = True + + # internal error should never get here + try: + cid: str = msg['cid'] + except KeyError as src_err: + raise MessagingError( + f'IPC `Context` rx-ed msg without a ctx-id (cid)!?\n' + f'cid: {cid}\n' + 'received msg:\n' + f'{pformat(msg)}\n' + ) from src_err + + # TODO: test that shows stream raising an expected error!!! + if msg.get('error'): + # raise the error message + raise unpack_error( + msg, + ctx.chan, + ) from None + + elif ( + msg.get('stop') + or ( + stream + and stream._eoc + ) + ): + log.debug( + f'Context[{cid}] stream was stopped by remote side\n' + f'cid: {cid}\n' + ) + + # XXX: important to set so that a new ``.receive()`` + # call (likely by another task using a broadcast receiver) + # doesn't accidentally pull the ``return`` message + # value out of the underlying feed mem chan! + stream._eoc: bool = True + + # # when the send is closed we assume the stream has + # # terminated and signal this local iterator to stop + # await stream.aclose() + + # XXX: this causes ``ReceiveChannel.__anext__()`` to + # raise a ``StopAsyncIteration`` **and** in our catch + # block below it will trigger ``.aclose()``. + raise trio.EndOfChannel( + 'Context[{cid}] stream ended due to msg:\n' + f'{pformat(msg)}' + ) from src_err + + + if ( + stream + and stream._closed + ): + raise trio.ClosedResourceError('This stream was closed') + + + # always re-raise the source error if no translation error case + # is activated above. + _type: str = 'Stream' if stream else 'Context' + raise MessagingError( + f'{_type} was expecting a `{expect_key}` message' + ' BUT received a non-`error` msg:\n' + f'cid: {cid}\n' + '{pformat(msg)}' + ) from src_err diff --git a/tractor/_portal.py b/tractor/_portal.py index 4c0587af..f3106431 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -33,7 +33,6 @@ from typing import ( ) from functools import partial from dataclasses import dataclass -from pprint import pformat import warnings import trio @@ -45,13 +44,17 @@ from ._ipc import Channel from .log import get_logger from .msg import NamespacePath from ._exceptions import ( + _raise_from_no_key_in_msg, unpack_error, NoResult, ContextCancelled, - MessagingError, ) -from ._context import Context -from ._streaming import MsgStream +from ._context import ( + Context, +) +from ._streaming import ( + MsgStream, +) from .devx._debug import maybe_wait_for_debugger @@ -465,26 +468,15 @@ class Portal: first: Any = msg['started'] ctx._started_called: bool = True - except KeyError: + except KeyError as src_error: - # TODO: can we maybe factor this into the new raiser - # `_streaming._raise_from_no_yield_msg()` and make that - # helper more generic, say with a `_no__msg()`? - if not (cid := msg.get('cid')): - raise MessagingError( - 'Received internal error at context?\n' - 'No call-id (cid) in startup msg?' 
- ) - - if msg.get('error'): - # NOTE: mask the key error with the remote one - raise unpack_error(msg, self.channel) from None - else: - raise MessagingError( - f'Context for {cid} was expecting a `started` message' - ' but received a non-error msg:\n' - f'{pformat(msg)}' - ) + _raise_from_no_key_in_msg( + ctx=ctx, + msg=msg, + src_err=src_error, + log=log, + expect_key='started', + ) ctx._portal: Portal = self uid: tuple = self.channel.uid diff --git a/tractor/_streaming.py b/tractor/_streaming.py index f02197b8..4530e144 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -23,7 +23,6 @@ The machinery and types behind ``Context.open_stream()`` from __future__ import annotations import inspect from contextlib import asynccontextmanager as acm -from pprint import pformat from typing import ( Any, Callable, @@ -35,8 +34,7 @@ import warnings import trio from ._exceptions import ( - unpack_error, - MessagingError, + _raise_from_no_key_in_msg, ) from .log import get_logger from .trionics import ( @@ -56,71 +54,6 @@ log = get_logger(__name__) # messages? class ReceiveChannel(AsyncResource, Generic[ReceiveType]): # - use __slots__ on ``Context``? -def _raise_from_no_yield_msg( - stream: MsgStream, - msg: dict, - src_err: KeyError, - -) -> bool: - ''' - Raise an appopriate local error when a `MsgStream` msg arrives - which does not contain the expected (under normal operation) - `'yield'` field. - - ''' - __tracebackhide__: bool = True - - # internal error should never get here - assert msg.get('cid'), ("Received internal error at portal?") - - # TODO: handle 2 cases with 3.10+ match syntax - # - 'stop' - # - 'error' - # possibly just handle msg['stop'] here! - # breakpoint() - - if stream._closed: - raise trio.ClosedResourceError('This stream was closed') - - if ( - msg.get('stop') - or stream._eoc - ): - log.debug(f'{stream} was stopped at remote end') - - # XXX: important to set so that a new ``.receive()`` - # call (likely by another task using a broadcast receiver) - # doesn't accidentally pull the ``return`` message - # value out of the underlying feed mem chan! - stream._eoc: bool = True - - # # when the send is closed we assume the stream has - # # terminated and signal this local iterator to stop - # await stream.aclose() - - # XXX: this causes ``ReceiveChannel.__anext__()`` to - # raise a ``StopAsyncIteration`` **and** in our catch - # block below it will trigger ``.aclose()``. - raise trio.EndOfChannel( - 'Stream ended due to msg:\n' - f'{pformat(msg)}' - ) from src_err - - # TODO: test that shows stream raising an expected error!!! - elif msg.get('error'): - # raise the error message - raise unpack_error(msg, stream._ctx.chan) - - # always re-raise the source error if no translation error case - # is activated above. 
- raise MessagingError( - f'Context received unexpected non-error msg!?\n' - f'cid: {cid}\n' - 'received msg:\n' - f'{pformat(msg)}' - ) from src_err - - class MsgStream(trio.abc.Channel): ''' A bidirectional message stream for receiving logically sequenced @@ -160,10 +93,13 @@ class MsgStream(trio.abc.Channel): try: return msg['yield'] except KeyError as kerr: - _raise_from_no_yield_msg( - stream=self, + _raise_from_no_key_in_msg( + ctx=self._ctx, msg=msg, src_err=kerr, + log=log, + expect_key='yield', + stream=self, ) async def receive(self): @@ -196,10 +132,13 @@ class MsgStream(trio.abc.Channel): return msg['yield'] except KeyError as kerr: - _raise_from_no_yield_msg( - stream=self, + _raise_from_no_key_in_msg( + ctx=self._ctx, msg=msg, src_err=kerr, + log=log, + expect_key='yield', + stream=self, ) except ( -- 2.34.1 From 0294455c5e35a0f0da5fc9605290b293c318f0fd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Jan 2024 18:43:43 -0500 Subject: [PATCH 089/378] `_root`: drop unused `typing` import --- tractor/_root.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tractor/_root.py b/tractor/_root.py index 608773a4..c79e1d98 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -25,7 +25,6 @@ import logging import signal import sys import os -import typing import warnings -- 2.34.1 From 28ea8e787aef8ab7ff99b34d6ab7cf19306b6b98 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 3 Jan 2024 22:27:05 -0500 Subject: [PATCH 090/378] Bump timeout on resource cache test a bitty bit. --- tests/test_resource_cache.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_resource_cache.py b/tests/test_resource_cache.py index 0e7ad74d..d3859814 100644 --- a/tests/test_resource_cache.py +++ b/tests/test_resource_cache.py @@ -34,7 +34,6 @@ def test_resource_only_entered_once(key_on): global _resource _resource = 0 - kwargs = {} key = None if key_on == 'key_value': key = 'some_common_key' @@ -139,7 +138,7 @@ def test_open_local_sub_to_stream(): N local tasks using ``trionics.maybe_open_context():``. ''' - timeout = 3 if platform.system() != "Windows" else 10 + timeout: float = 3.6 if platform.system() != "Windows" else 10 async def main(): -- 2.34.1 From 7e0e62792103193524c871966c3974e4a72ac39d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 23 Jan 2024 11:09:38 -0500 Subject: [PATCH 091/378] Use `import as blah` over `__all__` in `.trionics` --- tractor/trionics/__init__.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/tractor/trionics/__init__.py b/tractor/trionics/__init__.py index 31e49a9a..c51b7c51 100644 --- a/tractor/trionics/__init__.py +++ b/tractor/trionics/__init__.py @@ -19,22 +19,13 @@ Sugary patterns for trio + tractor designs. 
''' from ._mngrs import ( - gather_contexts, - maybe_open_context, - maybe_open_nursery, + gather_contexts as gather_contexts, + maybe_open_context as maybe_open_context, + maybe_open_nursery as maybe_open_nursery, ) from ._broadcast import ( - broadcast_receiver, - BroadcastReceiver, - Lagged, + AsyncReceiver as AsyncReceiver, + broadcast_receiver as broadcast_receiver, + BroadcastReceiver as BroadcastReceiver, + Lagged as Lagged, ) - - -__all__ = [ - 'gather_contexts', - 'broadcast_receiver', - 'BroadcastReceiver', - 'Lagged', - 'maybe_open_context', - 'maybe_open_nursery', -] -- 2.34.1 From c4496f21fc83ab208100f8bd7f7624c9884edcda Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 23 Jan 2024 11:13:07 -0500 Subject: [PATCH 092/378] Try allowing multi-pops of `_Cache.locks` for now? --- tractor/trionics/_mngrs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tractor/trionics/_mngrs.py b/tractor/trionics/_mngrs.py index 801b138b..1c079cdb 100644 --- a/tractor/trionics/_mngrs.py +++ b/tractor/trionics/_mngrs.py @@ -225,6 +225,7 @@ async def maybe_open_context( # yielded output yielded: Any = None + lock_registered: bool = False # Lock resource acquisition around task racing / ``trio``'s # scheduler protocol. @@ -232,6 +233,7 @@ async def maybe_open_context( # to allow re-entrant use cases where one `maybe_open_context()` # wrapped factor may want to call into another. lock = _Cache.locks.setdefault(fid, trio.Lock()) + lock_registered: bool = True await lock.acquire() # XXX: one singleton nursery per actor and we want to @@ -291,4 +293,9 @@ async def maybe_open_context( _, no_more_users = entry no_more_users.set() - _Cache.locks.pop(fid) + if lock_registered: + maybe_lock = _Cache.locks.pop(fid, None) + if maybe_lock is None: + log.error( + f'Resource lock for {fid} ALREADY POPPED?' + ) -- 2.34.1 From 35b0c4bef0c9088fc7adfa4536c8e32cab3aec10 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 23 Jan 2024 11:14:10 -0500 Subject: [PATCH 093/378] Never mask original `KeyError` in portal-error unwrapper, for now? --- tractor/_portal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index f3106431..378f6a23 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -69,10 +69,10 @@ def _unwrap_msg( __tracebackhide__ = True try: return msg['return'] - except KeyError: + except KeyError as ke: # internal error should never get here assert msg.get('cid'), "Received internal error at portal?" - raise unpack_error(msg, channel) from None + raise unpack_error(msg, channel) from ke class Portal: -- 2.34.1 From df641d9d316abc0647e5693bf702ef9437b6a706 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 28 Jan 2024 16:33:10 -0500 Subject: [PATCH 094/378] Bring in pretty-ified `msgspec.Struct` extension Originally designed and used throughout `piker`, the subtype adds some handy pprinting and field diffing extras often handy when viewing struct types in logging or REPL console interfaces B) Obvi this rejigs the `tractor.msg` mod into a sub-pkg and moves the existing namespace obj-pointer stuff into a new `.msg.ptr` sub mod. 
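A quick usage sketch of the repl-friendly extras (the `Point` subtype and
values below are made up purely for illustration, assuming this patch is
applied as-is):

    from tractor.msg import Struct

    class Point(Struct):
        x: int
        y: int

    p1 = Point(x=1, y=2)
    p2 = Point(x=1, y=3)

    # multi-line, indented rendering via the overridden
    # `__repr__`/`.pformat()`
    print(p1)

    # field-wise comparison returns a `DiffDump` of
    # (key, ours, theirs) tuples which also reprs in a readable,
    # multi-line form.
    print(p1 - p2)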
--- tractor/msg/__init__.py | 26 ++++ tractor/{msg.py => msg/ptr.py} | 2 +- tractor/msg/types.py | 251 +++++++++++++++++++++++++++++++++ 3 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 tractor/msg/__init__.py rename tractor/{msg.py => msg/ptr.py} (98%) create mode 100644 tractor/msg/types.py diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py new file mode 100644 index 00000000..906627cf --- /dev/null +++ b/tractor/msg/__init__.py @@ -0,0 +1,26 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Built-in messaging patterns, types, APIs and helpers. + +''' +from .ptr import ( + NamespacePath as NamespacePath, +) +from .types import ( + Struct as Struct, +) diff --git a/tractor/msg.py b/tractor/msg/ptr.py similarity index 98% rename from tractor/msg.py rename to tractor/msg/ptr.py index ca34dba8..550626a1 100644 --- a/tractor/msg.py +++ b/tractor/msg/ptr.py @@ -15,7 +15,7 @@ # along with this program. If not, see . ''' -Built-in messaging patterns, types, APIs and helpers. +IPC-compat cross-mem-boundary object pointer. ''' diff --git a/tractor/msg/types.py b/tractor/msg/types.py new file mode 100644 index 00000000..25e7b39b --- /dev/null +++ b/tractor/msg/types.py @@ -0,0 +1,251 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Extensions to built-in or (heavily used but 3rd party) friend-lib +types. + +''' +from __future__ import annotations +from collections import UserList +from pprint import ( + saferepr, +) +from typing import ( + Any, + Iterator, +) + +from msgspec import ( + msgpack, + Struct as _Struct, + structs, +) + + +class DiffDump(UserList): + ''' + Very simple list delegator that repr() dumps (presumed) tuple + elements of the form `tuple[str, Any, Any]` in a nice + multi-line readable form for analyzing `Struct` diffs. + + ''' + def __repr__(self) -> str: + if not len(self): + return super().__repr__() + + # format by displaying item pair's ``repr()`` on multiple, + # indented lines such that they are more easily visually + # comparable when printed to console when printed to + # console. 
+ repstr: str = '[\n' + for k, left, right in self: + repstr += ( + f'({k},\n' + f'\t{repr(left)},\n' + f'\t{repr(right)},\n' + ')\n' + ) + repstr += ']\n' + return repstr + + +class Struct( + _Struct, + + # https://jcristharif.com/msgspec/structs.html#tagged-unions + # tag='pikerstruct', + # tag=True, +): + ''' + A "human friendlier" (aka repl buddy) struct subtype. + + ''' + def _sin_props(self) -> Iterator[ + tuple[ + structs.FieldIinfo, + str, + Any, + ] + ]: + ''' + Iterate over all non-@property fields of this struct. + + ''' + fi: structs.FieldInfo + for fi in structs.fields(self): + key: str = fi.name + val: Any = getattr(self, key) + yield fi, key, val + + def to_dict( + self, + include_non_members: bool = True, + + ) -> dict: + ''' + Like it sounds.. direct delegation to: + https://jcristharif.com/msgspec/api.html#msgspec.structs.asdict + + BUT, by default we pop all non-member (aka not defined as + struct fields) fields by default. + + ''' + asdict: dict = structs.asdict(self) + if include_non_members: + return asdict + + # only return a dict of the struct members + # which were provided as input, NOT anything + # added as type-defined `@property` methods! + sin_props: dict = {} + fi: structs.FieldInfo + for fi, k, v in self._sin_props(): + sin_props[k] = asdict[k] + + return sin_props + + def pformat( + self, + field_indent: int = 2, + indent: int = 0, + + ) -> str: + ''' + Recursion-safe `pprint.pformat()` style formatting of + a `msgspec.Struct` for sane reading by a human using a REPL. + + ''' + # global whitespace indent + ws: str = ' '*indent + + # field whitespace indent + field_ws: str = ' '*(field_indent + indent) + + # qtn: str = ws + self.__class__.__qualname__ + qtn: str = self.__class__.__qualname__ + + obj_str: str = '' # accumulator + fi: structs.FieldInfo + k: str + v: Any + for fi, k, v in self._sin_props(): + + # TODO: how can we prefer `Literal['option1', 'option2, + # ..]` over .__name__ == `Literal` but still get only the + # latter for simple types like `str | int | None` etc..? + ft: type = fi.type + typ_name: str = getattr(ft, '__name__', str(ft)) + + # recurse to get sub-struct's `.pformat()` output Bo + if isinstance(v, Struct): + val_str: str = v.pformat( + indent=field_indent + indent, + field_indent=indent + field_indent, + ) + + else: # the `pprint` recursion-safe format: + # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr + val_str: str = saferepr(v) + + obj_str += (field_ws + f'{k}: {typ_name} = {val_str},\n') + + return ( + f'{qtn}(\n' + f'{obj_str}' + f'{ws})' + ) + + # TODO: use a pprint.PrettyPrinter instance around ONLY rendering + # inside a known tty? + # def __repr__(self) -> str: + # ... + + # __str__ = __repr__ = pformat + __repr__ = pformat + + def copy( + self, + update: dict | None = None, + + ) -> Struct: + ''' + Validate-typecast all self defined fields, return a copy of + us with all such fields. + + NOTE: This is kinda like the default behaviour in + `pydantic.BaseModel` except a copy of the object is + returned making it compat with `frozen=True`. + + ''' + if update: + for k, v in update.items(): + setattr(self, k, v) + + # NOTE: roundtrip serialize to validate + # - enode to msgpack binary format, + # - decode that back to a struct. + return msgpack.Decoder(type=type(self)).decode( + msgpack.Encoder().encode(self) + ) + + def typecast( + self, + + # TODO: allow only casting a named subset? 
+ # fields: set[str] | None = None, + + ) -> None: + ''' + Cast all fields using their declared type annotations + (kinda like what `pydantic` does by default). + + NOTE: this of course won't work on frozen types, use + ``.copy()`` above in such cases. + + ''' + # https://jcristharif.com/msgspec/api.html#msgspec.structs.fields + fi: structs.FieldInfo + for fi in structs.fields(self): + setattr( + self, + fi.name, + fi.type(getattr(self, fi.name)), + ) + + def __sub__( + self, + other: Struct, + + ) -> DiffDump[tuple[str, Any, Any]]: + ''' + Compare fields/items key-wise and return a ``DiffDump`` + for easy visual REPL comparison B) + + ''' + diffs: DiffDump[tuple[str, Any, Any]] = DiffDump() + for fi in structs.fields(self): + attr_name: str = fi.name + ours: Any = getattr(self, attr_name) + theirs: Any = getattr(other, attr_name) + if ours != theirs: + diffs.append(( + attr_name, + ours, + theirs, + )) + + return diffs -- 2.34.1 From 286e75d342acecfdb2e0b808d0cd9878974b595a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 14 Feb 2024 16:13:32 -0500 Subject: [PATCH 095/378] Offer `unpack_error(hid_tb: bool)` for `pdbp` REPL config --- tractor/_exceptions.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 7e148586..c75b7855 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -185,7 +185,8 @@ def unpack_error( msg: dict[str, Any], chan=None, - err_type=RemoteActorError + err_type=RemoteActorError, + hide_tb: bool = True, ) -> None | Exception: ''' @@ -196,7 +197,7 @@ def unpack_error( which is the responsibilitiy of the caller. ''' - __tracebackhide__: bool = True + __tracebackhide__: bool = hide_tb error_dict: dict[str, dict] | None if ( @@ -309,6 +310,11 @@ def _raise_from_no_key_in_msg( # value out of the underlying feed mem chan! stream._eoc: bool = True + # TODO: if the a local task is already blocking on + # a `Context.result()` and thus a `.receive()` on the + # rx-chan, we close the chan and set state ensuring that + # an eoc is raised! + # # when the send is closed we assume the stream has # # terminated and signal this local iterator to stop # await stream.aclose() @@ -317,8 +323,8 @@ def _raise_from_no_key_in_msg( # raise a ``StopAsyncIteration`` **and** in our catch # block below it will trigger ``.aclose()``. raise trio.EndOfChannel( - 'Context[{cid}] stream ended due to msg:\n' - f'{pformat(msg)}' + f'Context stream ended due to msg:\n' + f'{pformat(msg)}' ) from src_err -- 2.34.1 From 7fbada8a15a9aa8f42885ed2f52c5885cdab0961 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 16 Feb 2024 15:23:00 -0500 Subject: [PATCH 096/378] Add `StreamOverrun.sender: tuple` for better handling Since it's generally useful to know who is the cause of an overrun (say bc you want your system to then adjust the writer side to slow tf down) might as well pack an extra `.sender: tuple[str, str]` actor uid field which can be relayed through `RemoteActorError` boxing. 
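A rough receiver-side sketch of how the new field might get used
(assuming the boxed error is re-hydrated as a `StreamOverrun` on this
side; the `print()` handling is only illustrative):

    from tractor._exceptions import StreamOverrun

    try:
        ...  # some stream-consuming or -sending code
    except StreamOverrun as overrun:
        # `.sender` is the offending writer's actor uid when the
        # relaying runtime packed it into `.msgdata`, else `None`.
        if overrun.sender:
            print(f'Overrun caused by peer {overrun.sender}')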
Add an extra case for the exc-type to `unpack_error()` to match B) --- tractor/_exceptions.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index c75b7855..dcabf402 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -134,8 +134,19 @@ class NoRuntime(RuntimeError): "The root actor has not been initialized yet" -class StreamOverrun(trio.TooSlowError): - "This stream was overrun by sender" +class StreamOverrun( + RemoteActorError, + trio.TooSlowError, +): + ''' + This stream was overrun by sender + + ''' + @property + def sender(self) -> tuple[str, str] | None: + value = self.msgdata.get('sender') + if value: + return tuple(value) class AsyncioCancelled(Exception): @@ -175,7 +186,15 @@ def pack_error( 'src_actor_uid': current_actor().uid, } - if isinstance(exc, ContextCancelled): + # TODO: ?just wholesale proxy `.msgdata: dict`? + # XXX WARNING, when i swapped these ctx-semantics + # tests started hanging..???!!!??? + # if msgdata := exc.getattr('msgdata', {}): + # error_msg.update(msgdata) + if ( + isinstance(exc, ContextCancelled) + or isinstance(exc, StreamOverrun) + ): error_msg.update(exc.msgdata) return {'error': error_msg} -- 2.34.1 From 7f29fd8dcfde9084debb716ce105cf06f13281fd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 18 Feb 2024 17:17:31 -0500 Subject: [PATCH 097/378] Let `pack_error()` take a msg injected `cid: str|None` --- tractor/_exceptions.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index dcabf402..d63cf6d4 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -163,13 +163,15 @@ class MessagingError(Exception): def pack_error( exc: BaseException, - tb: str | None = None, + tb: str|None = None, + cid: str|None = None, ) -> dict[str, dict]: ''' - Create an "error message" encoded for wire transport via an IPC - `Channel`; expected to be unpacked on the receiver side using - `unpack_error()` below. + Create an "error message" which boxes a locally caught + exception's meta-data and encodes it for wire transport via an + IPC `Channel`; expected to be unpacked (and thus unboxed) on + the receiver side using `unpack_error()` below. ''' if tb: @@ -197,7 +199,12 @@ def pack_error( ): error_msg.update(exc.msgdata) - return {'error': error_msg} + + pkt: dict = {'error': error_msg} + if cid: + pkt['cid'] = cid + + return pkt def unpack_error( @@ -207,7 +214,7 @@ def unpack_error( err_type=RemoteActorError, hide_tb: bool = True, -) -> None | Exception: +) -> None|Exception: ''' Unpack an 'error' message from the wire into a local `RemoteActorError` (subtype). @@ -358,8 +365,7 @@ def _raise_from_no_key_in_msg( # is activated above. _type: str = 'Stream' if stream else 'Context' raise MessagingError( - f'{_type} was expecting a `{expect_key}` message' - ' BUT received a non-`error` msg:\n' - f'cid: {cid}\n' - '{pformat(msg)}' + f"{_type} was expecting a '{expect_key}' message" + " BUT received a non-error msg:\n" + f'{pformat(msg)}' ) from src_err -- 2.34.1 From 8ce26d692fbcf0ecd37c86ed9f38ff9a5abd6134 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 19 Feb 2024 12:25:08 -0500 Subject: [PATCH 098/378] Improved log msg formatting in core As part of solving some final edge cases todo with inter-peer remote cancellation (particularly a remote cancel from a separate actor tree-client hanging on the request side in `modden`..) 
I needed less dense, more line-delimited log msg formats when understanding ipc channel and context cancels from console logging; this adds a ton of that to: - `._invoke()` which now does, - better formatting of `Context`-task info as multi-line `': \n'` messages, - use of `trio.Task` (from `.lowlevel.current_task()` for full rpc-func namespace-path info, - better "msg flow annotations" with `<=` for understanding `ContextCancelled` flow. - `Actor._stream_handler()` where in we break down IPC peers reporting better as multi-line `|_` log msgs instead of all jammed on one line.. - `._ipc.Channel.send()` use `pformat()` for repr of packet. Also tweak some optional deps imports for debug mode: - add `maybe_import_gb()` for attempting to import `greenback`. - maybe enable `stackscope` tree pprinter on `SIGUSR1` if installed. Add a further stale-debugger-lock guard before removal: - read the `._debug.Lock.global_actor_in_debug: tuple` uid and possibly `maybe_wait_for_debugger()` when the child-user is known to have a live process in our tree. - only cancel `Lock._root_local_task_cs_in_debug: CancelScope` when the disconnected channel maps to the `Lock.global_actor_in_debug`, though not sure this is correct yet? Started adding missing type annots in sections that were modified. --- tractor/_ipc.py | 10 +- tractor/_runtime.py | 359 +++++++++++++++++++++++++++++++++----------- 2 files changed, 275 insertions(+), 94 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 39f62224..5e286c1d 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -19,13 +19,14 @@ Inter-process comms abstractions """ from __future__ import annotations -import platform import struct -import typing +import platform +from pprint import pformat from collections.abc import ( AsyncGenerator, AsyncIterator, ) +import typing from typing import ( Any, runtime_checkable, @@ -370,7 +371,10 @@ class Channel: async def send(self, item: Any) -> None: - log.transport(f"send `{item}`") # type: ignore + log.transport( + '=> send IPC msg:\n\n' + f'{pformat(item)}\n' + ) # type: ignore assert self.msgstream await self.msgstream.send(item) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 58080654..f25d3e57 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -48,6 +48,10 @@ import trio from trio import ( CancelScope, ) +from trio.lowlevel import ( + current_task, + Task, +) from trio_typing import ( Nursery, TaskStatus, @@ -67,7 +71,11 @@ from ._exceptions import ( ContextCancelled, TransportClosed, ) -from .devx import _debug +from .devx import ( + # pause, + maybe_wait_for_debugger, + _debug, +) from ._discovery import get_registry from ._portal import Portal from . import _state @@ -80,6 +88,26 @@ if TYPE_CHECKING: log = get_logger('tractor') +_gb_mod: ModuleType|None|False = None + + +async def maybe_import_gb(): + global _gb_mod + if _gb_mod is False: + return + + try: + import greenback + _gb_mod = greenback + await greenback.ensure_portal() + + except ModuleNotFoundError: + log.warning( + '`greenback` is not installed.\n' + 'No sync debug support!' + ) + _gb_mod = False + async def _invoke( @@ -106,17 +134,11 @@ async def _invoke( failed_resp: bool = False if _state.debug_mode(): - try: - import greenback - await greenback.ensure_portal() - except ModuleNotFoundError: - log.warning( - '`greenback` is not installed.\n' - 'No sync debug support!' - ) + await maybe_import_gb() - # possibly a traceback (not sure what typing is for this..) 
- tb = None + # TODO: possibly a specially formatted traceback + # (not sure what typing is for this..)? + # tb = None cancel_scope = CancelScope() # activated cancel scope ref @@ -237,15 +259,27 @@ async def _invoke( # wrapper that calls `Context.started()` and then does # the `await coro()`? elif context: - # context func with support for bi-dir streaming - await chan.send({'functype': 'context', 'cid': cid}) + + # a "context" endpoint type is the most general and + # "least sugary" type of RPC ep with support for + # bi-dir streaming B) + await chan.send({ + 'functype': 'context', + 'cid': cid + }) try: async with trio.open_nursery() as nurse: ctx._scope_nursery = nurse ctx._scope = nurse.cancel_scope task_status.started(ctx) + + # TODO: should would be nice to have our + # `TaskMngr` nursery here! + # res: Any = await coro res = await coro + + # deliver final result to caller side. await chan.send({ 'return': res, 'cid': cid @@ -279,11 +313,12 @@ async def _invoke( # don't pop the local context until we know the # associated child isn't in debug any more - await _debug.maybe_wait_for_debugger() + await maybe_wait_for_debugger() ctx: Context = actor._contexts.pop((chan.uid, cid)) - log.runtime( - f'Context entrypoint {func} was terminated:\n' - f'{ctx}' + log.cancel( + f'Context task was terminated:\n' + f'func: {func}\n' + f'ctx: {pformat(ctx)}' ) if ctx.cancelled_caught: @@ -295,13 +330,14 @@ async def _invoke( if re := ctx._remote_error: ctx._maybe_raise_remote_err(re) - fname: str = func.__name__ + # fname: str = func.__name__ + task: Task = current_task() cs: CancelScope = ctx._scope if cs.cancel_called: our_uid: tuple = actor.uid canceller: tuple = ctx.canceller msg: str = ( - f'`{fname}()`@{our_uid} cancelled by ' + 'actor was cancelled by ' ) # NOTE / TODO: if we end up having @@ -320,16 +356,37 @@ async def _invoke( # some actor who calls `Portal.cancel_actor()` # and by side-effect cancels this ctx. elif canceller == ctx.chan.uid: - msg += f'its caller {canceller} ' + msg += 'its caller' else: - msg += f'remote actor {canceller}' + msg += 'a remote peer' + + div_chars: str = '------ - ------' + div_offset: int = ( + round(len(msg)/2)+1 + + + round(len(div_chars)/2)+1 + ) + div_str: str = ( + '\n' + + + ' '*div_offset + + + f'{div_chars}\n' + ) + msg += ( + div_str + + f'<= canceller: {canceller}\n' + f'=> uid: {our_uid}\n' + f' |_ task: `{task.name}()`' + ) # TODO: does this ever get set any more or can # we remove it? if ctx._cancel_msg: msg += ( - ' with msg:\n' + '------ - ------\n' + 'IPC msg:\n' f'{ctx._cancel_msg}' ) @@ -449,9 +506,9 @@ async def _invoke( # always ship errors back to caller err_msg: dict[str, dict] = pack_error( err, - tb=tb, + # tb=tb, # TODO: special tb fmting? + cid=cid, ) - err_msg['cid'] = cid if is_rpc: try: @@ -518,19 +575,28 @@ async def try_ship_error_to_parent( err: Exception | BaseExceptionGroup, ) -> None: + ''' + Box, pack and encode a local runtime(-internal) exception for + an IPC channel `.send()` with transport/network failures and + local cancellation ignored but logged as critical(ly bad). + + ''' with CancelScope(shield=True): try: - # internal error so ship to parent without cid - await channel.send(pack_error(err)) + await channel.send( + # NOTE: normally only used for internal runtime errors + # so ship to peer actor without a cid. + pack_error(err) + ) except ( trio.ClosedResourceError, trio.BrokenResourceError, ): # in SC terms this is one of the worst things that can - # happen and creates the 2-general's dilemma. 
+ # happen and provides for a 2-general's dilemma.. log.critical( - f"Failed to ship error to parent " - f"{channel.uid}, channel was closed" + f'Failed to ship error to parent ' + f'{channel.uid}, IPC transport failure!' ) @@ -588,6 +654,11 @@ class Actor: # if started on ``asycio`` running ``trio`` in guest mode _infected_aio: bool = False + # _ans: dict[ + # tuple[str, str], + # list[ActorNursery], + # ] = {} + # Process-global stack closed at end on actor runtime teardown. # NOTE: this is currently an undocumented public api. lifetime_stack: ExitStack = ExitStack() @@ -612,7 +683,10 @@ class Actor: ''' self.name = name - self.uid = (name, uid or str(uuid.uuid4())) + self.uid = ( + name, + uid or str(uuid.uuid4()) + ) self._cancel_complete = trio.Event() self._cancel_called_by_remote: tuple[str, tuple] | None = None @@ -827,7 +901,10 @@ class Actor: return # channel tracking - event = self._peer_connected.pop(uid, None) + event: trio.Event|None = self._peer_connected.pop( + uid, + None, + ) if event: # Instructing connection: this is likely a new channel to # a recently spawned actor which we'd like to control via @@ -836,46 +913,43 @@ class Actor: # Alert any task waiting on this connection to come up event.set() - chans = self._peers[uid] - - # TODO: re-use channels for new connections instead - # of always new ones; will require changing all the - # discovery funcs + chans: list[Channel] = self._peers[uid] if chans: + # TODO: re-use channels for new connections instead + # of always new ones? + # => will require changing all the discovery funcs.. log.runtime( f"already have channel(s) for {uid}:{chans}?" ) - log.runtime(f"Registered {chan} for {uid}") # type: ignore # append new channel + log.runtime(f"Registered {chan} for {uid}") # type: ignore + # TODO: can we just use list-ref directly? + # chans.append(chan) self._peers[uid].append(chan) - local_nursery: ActorNursery | None = None # noqa - disconnected: bool = False - # Begin channel management - respond to remote requests and # process received reponses. + disconnected: bool = False try: - disconnected = await process_messages(self, chan) - - except ( - trio.Cancelled, - ): - log.cancel(f"Msg loop was cancelled for {chan}") + disconnected: bool = await process_messages(self, chan) + except trio.Cancelled: + log.cancel(f'Msg loop was cancelled for {chan}') raise finally: - local_nursery = self._actoruid2nursery.get(uid, local_nursery) + local_nursery: ( + ActorNursery|None + ) = self._actoruid2nursery.get(uid) # This is set in ``Portal.cancel_actor()``. So if # the peer was cancelled we try to wait for them # to tear down their side of the connection before # moving on with closing our own side. - if ( - local_nursery - ): + if local_nursery: if chan._cancel_called: - log.cancel(f"Waiting on cancel request to peer {chan.uid}") + log.cancel(f'Waiting on cancel request to peer {chan.uid}') + # XXX: this is a soft wait on the channel (and its # underlying transport protocol) to close from the # remote peer side since we presume that any channel @@ -920,6 +994,7 @@ class Actor: # other downstream errors. entry = local_nursery._children.get(uid) if entry: + proc: trio.Process _, proc, _ = entry if ( @@ -927,22 +1002,42 @@ class Actor: and poll() is None ): log.cancel( - f'Actor {uid} IPC broke but proc is alive?\n' - 'Attempting to self cancel..' 
+ f'Peer actor IPC broke but proc is alive?\n' + f'uid: {uid}\n' + f'|_{proc}\n' ) # ``Channel`` teardown and closure sequence # Drop ref to channel so it can be gc-ed and disconnected - log.runtime(f"Releasing channel {chan} from {chan.uid}") + log.runtime( + f'Disconnected IPC channel:\n' + f'uid: {chan.uid}\n' + f'|_{pformat(chan)}\n' + ) chans = self._peers.get(chan.uid) chans.remove(chan) if not chans: - log.runtime(f"No more channels for {chan.uid}") + log.runtime( + f'No more channels with {chan.uid}' + ) self._peers.pop(uid, None) - log.runtime(f"Peers is {self._peers}") + peers_str: str = '' + for uid, chans in self._peers.items(): + peers_str += ( + f'- uid: {uid}\n' + ) + for i, chan in enumerate(chans): + peers_str += ( + f' |_[{i}] {pformat(chan)}\n' + ) + + log.runtime( + f'Remaining IPC {len(self._peers)} peers:\n' + + peers_str + ) # No more channels to other actors (at all) registered # as connected. @@ -958,15 +1053,58 @@ class Actor: if _state.is_root_process(): pdb_lock = _debug.Lock pdb_lock._blocked.add(uid) - log.runtime(f"{uid} blocked from pdb locking") + # TODO: NEEEDS TO BE TESTED! + # actually, no idea if this ever even enters.. XD + pdb_user_uid: tuple = pdb_lock.global_actor_in_debug + if ( + pdb_user_uid + and local_nursery + ): + entry: tuple|None = local_nursery._children.get(pdb_user_uid) + if entry: + proc: trio.Process + _, proc, _ = entry + + if ( + (poll := getattr(proc, 'poll', None)) + and poll() is None + ): + log.cancel( + 'Root actor reports no-more-peers, BUT ' + 'a DISCONNECTED child still has the debug ' + 'lock!\n' + f'root uid: {self.uid}\n' + f'last disconnected child uid: {uid}\n' + f'locking child uid: {pdb_user_uid}\n' + ) + await maybe_wait_for_debugger( + child_in_debug=True + ) + + # TODO: just bc a child's transport dropped + # doesn't mean it's not still using the pdb + # REPL! so, + # -[ ] ideally we can check out child proc + # tree to ensure that its alive (and + # actually using the REPL) before we cancel + # it's lock acquire by doing the below! + # -[ ] create a way to read the tree of each actor's + # grandchildren such that when an + # intermediary parent is cancelled but their + # child has locked the tty, the grandparent + # will not allow the parent to cancel or + # zombie reap the child! see open issue: + # - https://github.com/goodboy/tractor/issues/320 + # ------ - ------ # if a now stale local task has the TTY lock still # we cancel it to allow servicing other requests for # the lock. - db_cs = pdb_lock._root_local_task_cs_in_debug + db_cs: trio.CancelScope|None = pdb_lock._root_local_task_cs_in_debug if ( db_cs and not db_cs.cancel_called + and uid == pdb_user_uid ): log.critical( f'STALE DEBUG LOCK DETECTED FOR {uid}' @@ -998,15 +1136,16 @@ class Actor: chan: Channel, cid: str, msg: dict[str, Any], - ) -> None: + + ) -> None|bool: ''' Push an RPC result to the local consumer's queue. ''' - uid = chan.uid + uid: tuple[str, str] = chan.uid assert uid, f"`chan.uid` can't be {uid}" try: - ctx = self._contexts[(uid, cid)] + ctx: Context = self._contexts[(uid, cid)] except KeyError: log.warning( f'Ignoring msg from [no-longer/un]known context {uid}:' @@ -1137,6 +1276,16 @@ class Actor: ) accept_addrs: list[tuple[str, int]] = parent_data.pop('bind_addrs') rvs = parent_data.pop('_runtime_vars') + + if rvs['_debug_mode']: + try: + from .devx import enable_stack_on_sig + enable_stack_on_sig() + except ImportError: + log.warning( + '`stackscope` not installed for use in debug mode!' 
+ ) + log.runtime(f"Runtime vars are: {rvs}") rvs['_is_root'] = False _state._runtime_vars.update(rvs) @@ -1374,9 +1523,15 @@ class Actor: ''' tasks: dict = self._rpc_tasks if tasks: + tasks_str: str = '' + for (ctx, func, _) in tasks.values(): + tasks_str += ( + f' |_{func.__name__}() [cid={ctx.cid[-6:]}..]\n' + ) + log.cancel( f'Cancelling all {len(tasks)} rpc tasks:\n' - f'{tasks}' + f'{tasks_str}' ) for ( (chan, cid), @@ -1660,7 +1815,10 @@ async def async_main( ) if actor._parent_chan: - await try_ship_error_to_parent(actor._parent_chan, err) + await try_ship_error_to_parent( + actor._parent_chan, + err, + ) # always! match err: @@ -1750,43 +1908,53 @@ async def process_messages( or boxed errors back to the remote caller (task). ''' - # TODO: once https://github.com/python-trio/trio/issues/467 gets - # worked out we'll likely want to use that! - msg: dict | None = None + # TODO: once `trio` get's an "obvious way" for req/resp we + # should use it? + # https://github.com/python-trio/trio/issues/467 + log.runtime( + 'Entering IPC msg loop:\n' + f'peer: {chan.uid}\n' + f'|_{chan}' + ) nursery_cancelled_before_task: bool = False - - log.runtime(f"Entering msg loop for {chan} from {chan.uid}") + msg: dict | None = None try: + # NOTE: this internal scope allows for keeping this + # message loop running despite the current task having + # been cancelled (eg. `open_portal()` may call this method + # from a locally spawned task) and recieve this scope + # using ``scope = Nursery.start()`` with CancelScope(shield=shield) as loop_cs: - # this internal scope allows for keeping this message - # loop running despite the current task having been - # cancelled (eg. `open_portal()` may call this method from - # a locally spawned task) and recieve this scope using - # ``scope = Nursery.start()`` task_status.started(loop_cs) async for msg in chan: - if msg is None: # loop terminate sentinel + # dedicated loop terminate sentinel + if msg is None: + tasks: dict[ + tuple[Channel, str], + tuple[Context, Callable, trio.Event] + ] = actor._rpc_tasks.copy() log.cancel( - f"Channel to {chan.uid} terminated?\n" - "Cancelling all associated tasks..") - - for (channel, cid) in actor._rpc_tasks.copy(): + f'Peer IPC channel terminated via `None` setinel msg?\n' + f'=> Cancelling all {len(tasks)} local RPC tasks..\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n' + ) + for (channel, cid) in tasks: if channel is chan: await actor._cancel_task( cid, channel, ) - - log.runtime( - f"Msg loop signalled to terminate for" - f" {chan} from {chan.uid}") - break log.transport( # type: ignore - f"Received msg {msg} from {chan.uid}") + f'<= IPC msg from peer: {chan.uid}\n\n' + # TODO: conditionally avoid fmting depending + # on log level (for perf)? + f'{pformat(msg)}\n' + ) cid = msg.get('cid') if cid: @@ -1795,7 +1963,10 @@ async def process_messages( await actor._push_result(chan, cid, msg) log.runtime( - f"Waiting on next msg for {chan} from {chan.uid}") + f'Waiting on next IPC msg from {chan.uid}:\n' + # f'last msg: {msg}\n' + f'|_{chan}' + ) continue # TODO: implement with ``match:`` syntax? 
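# (A hypothetical, self-contained sketch of the `match:` style dispatch
#  hinted at by the TODO above; the real loop handles these branches
#  inline and with more cases, this only shows the rough shape.)
from typing import Any

def classify_msg(msg: dict[str, Any] | None) -> str:
    match msg:
        case None:
            return 'terminate'     # loop-terminate sentinel
        case {'cid': _}:
            return 'ctx-delivery'  # routed via `Actor._push_result()`
        case _:
            return 'rpc-request'   # schedule a new RPC endpoint task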
@@ -1848,7 +2019,7 @@ async def process_messages( ) log.cancel( - f'Cancelling msg loop for {chan.uid}' + f'Cancelling IPC msg-loop with {chan.uid}' ) loop_cs.cancel() break @@ -1890,8 +2061,10 @@ async def process_messages( try: func = actor._get_rpc_func(ns, funcname) except (ModuleNotExposed, AttributeError) as err: - err_msg = pack_error(err) - err_msg['cid'] = cid + err_msg: dict[str, dict] = pack_error( + err, + cid=cid, + ) await chan.send(err_msg) continue @@ -1993,7 +2166,10 @@ async def process_messages( log.exception("Actor errored:") if actor._parent_chan: - await try_ship_error_to_parent(actor._parent_chan, err) + await try_ship_error_to_parent( + actor._parent_chan, + err, + ) # if this is the `MainProcess` we expect the error broadcasting # above to trigger an error at consuming portal "checkpoints" @@ -2002,8 +2178,9 @@ async def process_messages( finally: # msg debugging for when he machinery is brokey log.runtime( - f"Exiting msg loop for {chan} from {chan.uid} " - f"with last msg:\n{msg}" + f'Exiting IPC msg loop with {chan.uid} ' + f'final msg: {msg}\n' + f'|_{chan}' ) # transport **was not** disconnected -- 2.34.1 From c35576e196b38bedd725edd7eb73b137dad9806b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 19 Feb 2024 14:41:03 -0500 Subject: [PATCH 099/378] Baboso! fix `chan.send(None)` indent.. --- tractor/_runtime.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index f25d3e57..4e7f9fac 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -1113,23 +1113,32 @@ class Actor: db_cs.cancel() # XXX: is this necessary (GC should do it)? - if chan.connected(): - # if the channel is still connected it may mean the far - # end has not closed and we may have gotten here due to - # an error and so we should at least try to terminate - # the channel from this end gracefully. + # XXX WARNING XXX + # Be AWARE OF THE INDENT LEVEL HERE + # -> ONLY ENTER THIS BLOCK WHEN ._peers IS + # EMPTY!!!! + if ( + not self._peers + and chan.connected() + ): + # if the channel is still connected it may mean the far + # end has not closed and we may have gotten here due to + # an error and so we should at least try to terminate + # the channel from this end gracefully. + log.runtime( + 'Terminating channel with `None` setinel msg\n' + f'|_{chan}\n' + ) + try: + # send a msg loop terminate sentinel + await chan.send(None) - log.runtime(f"Disconnecting channel {chan}") - try: - # send a msg loop terminate sentinel - await chan.send(None) + # XXX: do we want this? + # causes "[104] connection reset by peer" on other end + # await chan.aclose() - # XXX: do we want this? - # causes "[104] connection reset by peer" on other end - # await chan.aclose() - - except trio.BrokenResourceError: - log.runtime(f"Channel {chan.uid} was already closed") + except trio.BrokenResourceError: + log.runtime(f"Channel {chan.uid} was already closed") async def _push_result( self, -- 2.34.1 From 3e1d033708bdc101d331c809366d5db06b37818c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 19 Feb 2024 17:00:46 -0500 Subject: [PATCH 100/378] WIP: solved the modden client hang.. 
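The crux of the fix (as landed in `_context.py` below) is a pre-result
"drain" step: before `Context.result()` can return, any bi-dir stream
msgs still in transit from the far end are consumed and discarded until
either the final `'return'` msg or a remote error arrives. A much
simplified sketch of that loop (not the real `_drain_to_final_msg()`
signature, which also handles cancellation, ctxc-ack and overrun edge
cases):

    from typing import Any

    async def drain_to_final(ctx) -> Any:
        # pull IPC msgs off the ctx's receive channel until the far
        # end's final result (or an error) shows up.
        while not ctx._remote_error:
            msg: dict = await ctx._recv_chan.receive()
            if 'return' in msg:
                return msg['return']  # final result, we're done
            if 'yield' in msg or 'stop' in msg:
                continue              # late stream traffic, discard
        # otherwise the set remote error gets (re)raised by the caller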
--- tractor/_context.py | 602 ++++++++++++++++++++++++++++++++++-------- tractor/_portal.py | 178 +++++++++++-- tractor/_streaming.py | 188 ++++++++++--- 3 files changed, 793 insertions(+), 175 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 4d56fb3c..54e309e1 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -43,12 +43,17 @@ import warnings import trio +# from .devx import ( +# maybe_wait_for_debugger, +# pause, +# ) from ._exceptions import ( # _raise_from_no_key_in_msg, unpack_error, pack_error, ContextCancelled, # MessagingError, + RemoteActorError, StreamOverrun, ) from .log import get_logger @@ -64,6 +69,164 @@ if TYPE_CHECKING: log = get_logger(__name__) +async def _drain_to_final_msg( + ctx: Context, +) -> list[dict]: + +# ) -> tuple[ +# Any|Exception, +# list[dict], +# ]: + raise_overrun: bool = not ctx._allow_overruns + + # wait for a final context result by collecting (but + # basically ignoring) any bi-dir-stream msgs still in transit + # from the far end. + pre_result_drained: list[dict] = [] + while not ctx._remote_error: + try: + # NOTE: this REPL usage actually works here dawg! Bo + # from .devx._debug import pause + # await pause() + # if re := ctx._remote_error: + # ctx._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # raise_overrun_from_self=raise_overrun, + # ) + + # TODO: bad idea? + # with trio.CancelScope() as res_cs: + # ctx._res_scope = res_cs + # msg: dict = await ctx._recv_chan.receive() + # if res_cs.cancelled_caught: + + # from .devx._debug import pause + # await pause() + msg: dict = await ctx._recv_chan.receive() + ctx._result: Any = msg['return'] + log.runtime( + 'Context delivered final result msg:\n' + f'{pformat(msg)}' + ) + pre_result_drained.append(msg) + # NOTE: we don't need to do this right? + # XXX: only close the rx mem chan AFTER + # a final result is retreived. + # if ctx._recv_chan: + # await ctx._recv_chan.aclose() + break + + # NOTE: we get here if the far end was + # `ContextCancelled` in 2 cases: + # 1. we requested the cancellation and thus + # SHOULD NOT raise that far end error, + # 2. WE DID NOT REQUEST that cancel and thus + # SHOULD RAISE HERE! + except trio.Cancelled: + + # CASE 2: mask the local cancelled-error(s) + # only when we are sure the remote error is + # the source cause of this local task's + # cancellation. + if re := ctx._remote_error: + ctx._maybe_raise_remote_err(re) + + # CASE 1: we DID request the cancel we simply + # continue to bubble up as normal. + raise + + except KeyError: + + if 'yield' in msg: + # far end task is still streaming to us so discard + log.warning(f'Discarding std "yield"\n{msg}') + pre_result_drained.append(msg) + continue + + # TODO: work out edge cases here where + # a stream is open but the task also calls + # this? + # -[ ] should be a runtime error if a stream is open + # right? + elif 'stop' in msg: + log.cancel( + 'Remote stream terminated due to "stop" msg:\n' + f'{msg}' + ) + pre_result_drained.append(msg) + continue + + # internal error should never get here + assert msg.get('cid'), ( + "Received internal error at portal?" 
+ ) + + # XXX fallthrough to handle expected error XXX + re: Exception|None = ctx._remote_error + if re: + log.critical( + 'Remote ctx terminated due to "error" msg:\n' + f'{re}' + ) + assert msg is ctx._cancel_msg + # NOTE: this solved a super dupe edge case XD + # this was THE super duper edge case of: + # - local task opens a remote task, + # - requests remote cancellation of far end + # ctx/tasks, + # - needs to wait for the cancel ack msg + # (ctxc) or some result in the race case + # where the other side's task returns + # before the cancel request msg is ever + # rxed and processed, + # - here this surrounding drain loop (which + # iterates all ipc msgs until the ack or + # an early result arrives) was NOT exiting + # since we are the edge case: local task + # does not re-raise any ctxc it receives + # IFF **it** was the cancellation + # requester.. + # will raise if necessary, ow break from + # loop presuming any error terminates the + # context! + ctx._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). + # raise_overrun_from_self=False, + raise_overrun_from_self=raise_overrun, + ) + + break # OOOOOF, yeah obvi we need this.. + + # XXX we should never really get here + # right! since `._deliver_msg()` should + # always have detected an {'error': ..} + # msg and already called this right!?! + elif error := unpack_error( + msg=msg, + chan=ctx._portal.channel, + hide_tb=False, + ): + log.critical('SHOULD NEVER GET HERE!?') + assert msg is ctx._cancel_msg + assert error.msgdata == ctx._remote_error.msgdata + from .devx._debug import pause + await pause() + ctx._maybe_cancel_and_set_remote_error(error) + ctx._maybe_raise_remote_err(error) + + else: + # bubble the original src key error + raise + + return pre_result_drained + + # TODO: make this a msgspec.Struct! @dataclass class Context: @@ -118,6 +281,7 @@ class Context: # which is exactly the primitive that allows for # cross-actor-task-supervision and thus SC. _scope: trio.CancelScope | None = None + # _res_scope: trio.CancelScope|None = None # on a clean exit there should be a final value # delivered from the far end "callee" task, so @@ -205,6 +369,10 @@ class Context: ) ) + # @property + # def is_waiting_result(self) -> bool: + # return bool(self._res_scope) + @property def side(self) -> str: ''' @@ -247,7 +415,11 @@ class Context: await self.chan.send({'yield': data, 'cid': self.cid}) async def send_stop(self) -> None: - await self.chan.send({'stop': True, 'cid': self.cid}) + # await pause() + await self.chan.send({ + 'stop': True, + 'cid': self.cid + }) def _maybe_cancel_and_set_remote_error( self, @@ -320,27 +492,37 @@ class Context: # XXX: set the remote side's error so that after we cancel # whatever task is the opener of this context it can raise # that error as the reason. + # if self._remote_error: + # return + + # breakpoint() + log.cancel( + 'Setting remote error for ctx \n' + f'<= remote ctx uid: {self.chan.uid}\n' + f'=>\n{error}' + ) self._remote_error: BaseException = error if ( isinstance(error, ContextCancelled) ): - # always record the cancelling actor's uid since its cancellation - # state is linked and we want to know which process was - # the cause / requester of the cancellation. 
- self._canceller = error.canceller - log.cancel( 'Remote task-context was cancelled for ' f'actor: {self.chan.uid}\n' f'task: {self.cid}\n' f'canceller: {error.canceller}\n' ) + # always record the cancelling actor's uid since its cancellation + # state is linked and we want to know which process was + # the cause / requester of the cancellation. + # if error.canceller is None: + # import pdbp; pdbp.set_trace() + + # breakpoint() + self._canceller = error.canceller + if self._cancel_called: - # from .devx._debug import breakpoint - # await breakpoint() - # this is an expected cancel request response message # and we **don't need to raise it** in local cancel # scope since it will potentially override a real error. @@ -348,10 +530,11 @@ class Context: else: log.error( - f'Remote context error,\n' - f'remote actor: {self.chan.uid}\n' - f'task: {self.cid}\n' - f'{error}' + f'Remote context error:\n' + f'{error}\n' + f'{pformat(self)}\n' + # f'remote actor: {self.chan.uid}\n' + # f'cid: {self.cid}\n' ) self._canceller = self.chan.uid @@ -376,9 +559,11 @@ class Context: self._scope.cancel() # NOTE: this REPL usage actually works here dawg! Bo - # from .devx._debug import pause # await pause() + # TODO: maybe we have to use `._res_scope.cancel()` if it + # exists? + async def cancel( self, timeout: float = 0.616, @@ -395,6 +580,8 @@ class Context: log.cancel( f'Cancelling {side} side of context to {self.chan.uid}' ) + + # await pause() self._cancel_called: bool = True # caller side who entered `Portal.open_context()` @@ -484,13 +671,11 @@ class Context: ''' actor: Actor = current_actor() - # here we create a mem chan that corresponds to the - # far end caller / callee. - - # Likewise if the surrounding context has been cancelled we error here - # since it likely means the surrounding block was exited or - # killed - + # If the surrounding context has been cancelled by some + # task with a handle to THIS, we error here immediately + # since it likely means the surrounding lexical-scope has + # errored, been `trio.Cancelled` or at the least + # `Context.cancel()` was called by some task. if self._cancel_called: # XXX NOTE: ALWAYS RAISE any remote error here even if @@ -503,6 +688,11 @@ class Context: # actually try to stream - a cancel msg was already # sent to the other side! if self._remote_error: + # NOTE: this is diff then calling + # `._maybe_raise_from_remote_msg()` specifically + # because any task entering this `.open_stream()` + # AFTER cancellation has already been requested, + # we DO NOT want to absorb any ctxc ACK silently! raise self._remote_error # XXX NOTE: if no `ContextCancelled` has been responded @@ -529,7 +719,7 @@ class Context: # to send a stop from the caller to the callee in the # single-direction-stream case you'll get a lookup error # currently. - ctx = actor.get_context( + ctx: Context = actor.get_context( self.chan, self.cid, msg_buffer_size=msg_buffer_size, @@ -548,6 +738,19 @@ class Context: 'The underlying channel for this stream was already closed!?' ) + # NOTE: implicitly this will call `MsgStream.aclose()` on + # `.__aexit__()` due to stream's parent `Channel` type! + # + # XXX NOTE XXX: ensures the stream is "one-shot use", + # which specifically means that on exit, + # - signal ``trio.EndOfChannel``/``StopAsyncIteration`` to + # the far end indicating that the caller exited + # the streaming context purposefully by letting + # the exit block exec. 
+ # - this is diff from the cancel/error case where + # a cancel request from this side or an error + # should be sent to the far end indicating the + # stream WAS NOT just closed normally/gracefully. async with MsgStream( ctx=self, rx_chan=ctx._recv_chan, @@ -567,11 +770,37 @@ class Context: # await trio.lowlevel.checkpoint() yield stream - # NOTE: Make the stream "one-shot use". On exit, - # signal - # ``trio.EndOfChannel``/``StopAsyncIteration`` to - # the far end. - await stream.aclose() + + # XXX: (MEGA IMPORTANT) if this is a root opened process we + # wait for any immediate child in debug before popping the + # context from the runtime msg loop otherwise inside + # ``Actor._push_result()`` the msg will be discarded and in + # the case where that msg is global debugger unlock (via + # a "stop" msg for a stream), this can result in a deadlock + # where the root is waiting on the lock to clear but the + # child has already cleared it and clobbered IPC. + # + # await maybe_wait_for_debugger() + + # XXX TODO: pretty sure this isn't needed (see + # note above this block) AND will result in + # a double `.send_stop()` call. The only reason to + # put it here would be to due with "order" in + # terms of raising any remote error (as per + # directly below) or bc the stream's + # `.__aexit__()` block might not get run + # (doubtful)? Either way if we did put this back + # in we also need a state var to avoid the double + # stop-msg send.. + # + # await stream.aclose() + + # if re := ctx._remote_error: + # ctx._maybe_raise_remote_err( + # re, + # raise_ctxc_from_self_call=True, + # ) + # await trio.lowlevel.checkpoint() finally: if self._portal: @@ -587,7 +816,10 @@ class Context: def _maybe_raise_remote_err( self, err: Exception, - ) -> None: + raise_ctxc_from_self_call: bool = False, + raise_overrun_from_self: bool = True, + + ) -> ContextCancelled|None: ''' Maybe raise a remote error depending on who (which task from which actor) requested a cancellation (if any). @@ -603,13 +835,21 @@ class Context: # "error"-msg. our_uid: tuple[str, str] = current_actor().uid if ( - isinstance(err, ContextCancelled) - and ( + (not raise_ctxc_from_self_call + and isinstance(err, ContextCancelled) + and ( self._cancel_called or self.chan._cancel_called or self.canceller == our_uid - or tuple(err.canceller) == our_uid + or tuple(err.canceller) == our_uid) ) + or + (not raise_overrun_from_self + and isinstance(err, RemoteActorError) + and err.msgdata['type_str'] == 'StreamOverrun' + and tuple(err.msgdata['sender']) == our_uid + ) + ): # NOTE: we set the local scope error to any "self # cancellation" error-response thus "absorbing" @@ -661,77 +901,196 @@ class Context: assert self._portal, "Context.result() can not be called from callee!" assert self._recv_chan - if re := self._remote_error: - return self._maybe_raise_remote_err(re) + raise_overrun: bool = not self._allow_overruns + # if re := self._remote_error: + # return self._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # raise_overrun_from_self=raise_overrun, + # ) + res_placeholder: int = id(self) if ( - self._result == id(self) + self._result == res_placeholder and not self._remote_error and not self._recv_chan._closed # type: ignore ): - # wait for a final context result consuming - # and discarding any bi dir stream msgs still - # in transit from the far end. 
- while True: - try: - msg = await self._recv_chan.receive() - self._result: Any = msg['return'] - # NOTE: we don't need to do this right? - # XXX: only close the rx mem chan AFTER - # a final result is retreived. - # if self._recv_chan: - # await self._recv_chan.aclose() + # wait for a final context result by collecting (but + # basically ignoring) any bi-dir-stream msgs still in transit + # from the far end. + drained_msgs: list[dict] = await _drain_to_final_msg(ctx=self) + log.runtime( + 'Ctx drained pre-result msgs:\n' + f'{drained_msgs}' + ) - break + # TODO: implement via helper func ^^^^ + # pre_result_drained: list[dict] = [] + # while not self._remote_error: + # try: + # # NOTE: this REPL usage actually works here dawg! Bo + # # from .devx._debug import pause + # # await pause() + # # if re := self._remote_error: + # # self._maybe_raise_remote_err( + # # re, + # # # NOTE: obvi we don't care if we + # # # overran the far end if we're already + # # # waiting on a final result (msg). + # # raise_overrun_from_self=raise_overrun, + # # ) - # NOTE: we get here if the far end was - # `ContextCancelled` in 2 cases: - # 1. we requested the cancellation and thus - # SHOULD NOT raise that far end error, - # 2. WE DID NOT REQUEST that cancel and thus - # SHOULD RAISE HERE! - except trio.Cancelled: + # # TODO: bad idea? + # # with trio.CancelScope() as res_cs: + # # self._res_scope = res_cs + # # msg: dict = await self._recv_chan.receive() + # # if res_cs.cancelled_caught: - # CASE 2: mask the local cancelled-error(s) - # only when we are sure the remote error is the - # (likely) source cause of this local runtime - # task's cancellation. - if re := self._remote_error: - self._maybe_raise_remote_err(re) + # # from .devx._debug import pause + # # await pause() + # msg: dict = await self._recv_chan.receive() + # self._result: Any = msg['return'] + # log.runtime( + # 'Context delivered final result msg:\n' + # f'{pformat(msg)}' + # ) + # # NOTE: we don't need to do this right? + # # XXX: only close the rx mem chan AFTER + # # a final result is retreived. + # # if self._recv_chan: + # # await self._recv_chan.aclose() + # break - # CASE 1: we DID request the cancel we simply - # continue to bubble up as normal. - raise + # # NOTE: we get here if the far end was + # # `ContextCancelled` in 2 cases: + # # 1. we requested the cancellation and thus + # # SHOULD NOT raise that far end error, + # # 2. WE DID NOT REQUEST that cancel and thus + # # SHOULD RAISE HERE! + # except trio.Cancelled: - except KeyError: # as msgerr: + # # CASE 2: mask the local cancelled-error(s) + # # only when we are sure the remote error is + # # the source cause of this local task's + # # cancellation. + # if re := self._remote_error: + # self._maybe_raise_remote_err(re) - if 'yield' in msg: - # far end task is still streaming to us so discard - log.warning(f'Discarding stream delivered {msg}') - continue + # # CASE 1: we DID request the cancel we simply + # # continue to bubble up as normal. + # raise - elif 'stop' in msg: - log.debug('Remote stream terminated') - continue + # except KeyError: - # internal error should never get here - assert msg.get('cid'), ( - "Received internal error at portal?" 
- ) + # if 'yield' in msg: + # # far end task is still streaming to us so discard + # log.warning(f'Discarding std "yield"\n{msg}') + # pre_result_drained.append(msg) + # continue - if err:= unpack_error( - msg, - self._portal.channel - ): # from msgerr - self._maybe_cancel_and_set_remote_error(err) - self._maybe_raise_remote_err(err) + # # TODO: work out edge cases here where + # # a stream is open but the task also calls + # # this? + # # -[ ] should be a runtime error if a stream is open + # # right? + # elif 'stop' in msg: + # log.cancel( + # 'Remote stream terminated due to "stop" msg:\n' + # f'{msg}' + # ) + # pre_result_drained.append(msg) + # continue - else: - raise + # # internal error should never get here + # assert msg.get('cid'), ( + # "Received internal error at portal?" + # ) - if re := self._remote_error: - return self._maybe_raise_remote_err(re) + # # XXX fallthrough to handle expected error XXX + # re: Exception|None = self._remote_error + # if re: + # log.critical( + # 'Remote ctx terminated due to "error" msg:\n' + # f'{re}' + # ) + # assert msg is self._cancel_msg + # # NOTE: this solved a super dupe edge case XD + # # this was THE super duper edge case of: + # # - local task opens a remote task, + # # - requests remote cancellation of far end + # # ctx/tasks, + # # - needs to wait for the cancel ack msg + # # (ctxc) or some result in the race case + # # where the other side's task returns + # # before the cancel request msg is ever + # # rxed and processed, + # # - here this surrounding drain loop (which + # # iterates all ipc msgs until the ack or + # # an early result arrives) was NOT exiting + # # since we are the edge case: local task + # # does not re-raise any ctxc it receives + # # IFF **it** was the cancellation + # # requester.. + # # will raise if necessary, ow break from + # # loop presuming any error terminates the + # # context! + # self._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # # raise_overrun_from_self=False, + # raise_overrun_from_self=raise_overrun, + # ) + + # break # OOOOOF, yeah obvi we need this.. + + # # XXX we should never really get here + # # right! since `._deliver_msg()` should + # # always have detected an {'error': ..} + # # msg and already called this right!?! + # elif error := unpack_error( + # msg=msg, + # chan=self._portal.channel, + # hide_tb=False, + # ): + # log.critical('SHOULD NEVER GET HERE!?') + # assert msg is self._cancel_msg + # assert error.msgdata == self._remote_error.msgdata + # from .devx._debug import pause + # await pause() + # self._maybe_cancel_and_set_remote_error(error) + # self._maybe_raise_remote_err(error) + + # else: + # # bubble the original src key error + # raise + + if ( + (re := self._remote_error) + and self._result == res_placeholder + ): + maybe_err: Exception|None = self._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). 
+ # raise_overrun_from_self=False, + raise_overrun_from_self=( + raise_overrun + and + # only when we ARE NOT the canceller + # should we raise overruns, bc ow we're + # raising something we know might happen + # during cancellation ;) + (not self._cancel_called) + ), + ) + if maybe_err: + self._result = maybe_err return self._result @@ -779,7 +1138,7 @@ class Context: while self._overflow_q: # NOTE: these msgs should never be errors since we always do # the check prior to checking if we're in an overrun state - # inside ``.deliver_msg()``. + # inside ``._deliver_msg()``. msg = self._overflow_q.popleft() try: await self._send_chan.send(msg) @@ -830,34 +1189,50 @@ class Context: messages are eventually sent if possible. ''' - cid = self.cid - chan = self.chan - uid = chan.uid + cid: str = self.cid + chan: Channel = self.chan + from_uid: tuple[str, str] = chan.uid send_chan: trio.MemorySendChannel = self._send_chan - log.runtime( - f"Delivering {msg} from {uid} to caller {cid}" - ) - - if ( - msg.get('error') # check for field - and ( - error := unpack_error( - msg, - self.chan, - ) + if re := unpack_error( + msg, + self.chan, + ): + log.error( + f'Delivering error-msg from {from_uid} to caller {cid}' + f'{re}' ) - ): self._cancel_msg = msg - self._maybe_cancel_and_set_remote_error(error) + self._maybe_cancel_and_set_remote_error(re) - if ( - self._in_overrun - ): + # XXX NEVER do this XXX..!! + # bc if the error is a ctxc and there is a task + # waiting on `.result()` we need the msg to be sent + # over the `send_chan`/`._recv_chan` so that the error + # is relayed to that waiter task.. + # return True + # + # XXX ALSO NO!! XXX + # if self._remote_error: + # self._maybe_raise_remote_err(error) + + if self._in_overrun: + log.warning( + f'Capturing overrun-msg from {from_uid} to caller {cid}' + f'{msg}' + ) self._overflow_q.append(msg) return False try: + log.runtime( + f'Delivering IPC `Context` msg:\n' + f'<= {from_uid}\n' + f'=> caller: {cid}\n' + f'{msg}' + ) + # from .devx._debug import pause + # await pause() send_chan.send_nowait(msg) return True # if an error is deteced we should always @@ -890,7 +1265,8 @@ class Context: lines = [ f'OVERRUN on actor-task context {cid}@{local_uid}!\n' # TODO: put remote task name here if possible? - f'remote sender actor: {uid}', + f'sender: {from_uid}', + f'msg: {msg}', # TODO: put task func name here and maybe an arrow # from sender to overrunner? # f'local task {self.func_name}' @@ -926,11 +1302,19 @@ class Context: # anything different. return False else: + # raise local overrun and immediately pack as IPC + # msg for far end. 
try: - raise StreamOverrun(text) + raise StreamOverrun( + text, + sender=from_uid, + ) except StreamOverrun as err: - err_msg = pack_error(err) - err_msg['cid'] = cid + err_msg: dict[str, dict] = pack_error( + err, + cid=cid, + ) + # err_msg['cid']: str = cid try: await chan.send(err_msg) except trio.BrokenResourceError: diff --git a/tractor/_portal.py b/tractor/_portal.py index 378f6a23..14f6fbf2 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -39,7 +39,15 @@ import trio from async_generator import asynccontextmanager from .trionics import maybe_open_nursery -from ._state import current_actor +from .devx import ( + # acquire_debug_lock, + # pause, + maybe_wait_for_debugger, +) +from ._state import ( + current_actor, + debug_mode, +) from ._ipc import Channel from .log import get_logger from .msg import NamespacePath @@ -48,6 +56,7 @@ from ._exceptions import ( unpack_error, NoResult, ContextCancelled, + RemoteActorError, ) from ._context import ( Context, @@ -55,7 +64,6 @@ from ._context import ( from ._streaming import ( MsgStream, ) -from .devx._debug import maybe_wait_for_debugger log = get_logger(__name__) @@ -469,7 +477,6 @@ class Portal: ctx._started_called: bool = True except KeyError as src_error: - _raise_from_no_key_in_msg( ctx=ctx, msg=msg, @@ -494,6 +501,33 @@ class Portal: # in enter tuple. yield ctx, first + # between the caller exiting and arriving here the + # far end may have sent a ctxc-msg or other error, + # so check for it here immediately and maybe raise + # so as to engage the ctxc handling block below! + # if re := ctx._remote_error: + # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( + # re, + + # # TODO: do we want this to always raise? + # # - means that on self-ctxc, if/when the + # # block is exited before the msg arrives + # # but then the msg during __exit__ + # # calling we may not activate the + # # ctxc-handler block below? should we + # # be? + # # - if there's a remote error that arrives + # # after the child has exited, we won't + # # handle until the `finally:` block + # # where `.result()` is always called, + # # again in which case we handle it + # # differently then in the handler block + # # that would normally engage from THIS + # # block? + # raise_ctxc_from_self_call=True, + # ) + # assert maybe_ctxc + # when in allow_overruns mode there may be # lingering overflow sender tasks remaining? if nurse.child_tasks: @@ -539,7 +573,7 @@ class Portal: # `.canceller: tuple[str, str]` to be same value as # caught here in a `ContextCancelled.canceller`. # - # Again, there are 2 cases: + # AGAIN to restate the above, there are 2 cases: # # 1-some other context opened in this `.open_context()` # block cancelled due to a self or peer cancellation @@ -555,6 +589,16 @@ class Portal: except ContextCancelled as ctxc: scope_err = ctxc + # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!! + # using this code and then resuming the REPL will + # cause a SIGINT-ignoring HANG! + # -> prolly due to a stale debug lock entry.. + # -[ ] USE `.stackscope` to demonstrate that (possibly + # documenting it as a definittive example of + # debugging the tractor-runtime itself using it's + # own `.devx.` tooling! + # await pause() + # CASE 2: context was cancelled by local task calling # `.cancel()`, we don't raise and the exit block should # exit silently. 
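# NOTE: a minimal usage sketch of the "CASE 2" caller-side flow
# described above: the local task opens a ctx, requests the
# cancellation itself, and the ctxc ACK sent back by the callee is
# absorbed silently on exit. The `sleep_forever` ep and the actor
# name are hypothetical, picked only for illustration.

import trio
import tractor


@tractor.context
async def sleep_forever(ctx: tractor.Context) -> None:
    # signal the ctx is up then block until cancelled remotely.
    await ctx.started()
    await trio.sleep_forever()


async def main() -> None:
    async with tractor.open_nursery() as an:
        portal = await an.start_actor(
            'sleeper',
            enable_modules=[__name__],
        )
        async with portal.open_context(sleep_forever) as (ctx, first):
            assert first is None
            # self-cancel: sets `ctx._cancel_called` so the resulting
            # ctxc from the far end is NOT re-raised on block exit.
            await ctx.cancel()

        await portal.cancel_actor()


if __name__ == '__main__':
    trio.run(main)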
@@ -562,18 +606,23 @@ class Portal: ctx._cancel_called and ( ctxc is ctx._remote_error - or - ctxc.canceller is self.canceller + # ctxc.msgdata == ctx._remote_error.msgdata + + # TODO: uhh `Portal.canceller` ain't a thangg + # dawg? (was `self.canceller` before?!?) + and + ctxc.canceller == self.actor.uid ) ): - log.debug( - f'Context {ctx} cancelled gracefully with:\n' + log.cancel( + f'Context (cid=[{ctx.cid[-6:]}..] cancelled gracefully with:\n' f'{ctxc}' ) # CASE 1: this context was never cancelled via a local # task (tree) having called `Context.cancel()`, raise # the error since it was caused by someone else! else: + # await pause() raise # the above `._scope` can be cancelled due to: @@ -602,8 +651,8 @@ class Portal: trio.Cancelled, # NOTE: NOT from inside the ctx._scope KeyboardInterrupt, - ) as err: - scope_err = err + ) as caller_err: + scope_err = caller_err # XXX: ALWAYS request the context to CANCEL ON any ERROR. # NOTE: `Context.cancel()` is conversely NEVER CALLED in @@ -611,11 +660,26 @@ class Portal: # handled in the block above! log.cancel( 'Context cancelled for task due to\n' - f'{err}\n' + f'{caller_err}\n' 'Sending cancel request..\n' f'task:{cid}\n' f'actor:{uid}' ) + + if debug_mode(): + log.pdb( + 'Delaying `ctx.cancel()` until debug lock ' + 'acquired..' + ) + # async with acquire_debug_lock(self.actor.uid): + # pass + # TODO: factor ^ into below for non-root cases? + await maybe_wait_for_debugger() + log.pdb( + 'Acquired debug lock! ' + 'Calling `ctx.cancel()`!' + ) + try: await ctx.cancel() except trio.BrokenResourceError: @@ -629,6 +693,33 @@ class Portal: # no local scope error, the "clean exit with a result" case. else: + # between the caller exiting and arriving here the + # far end may have sent a ctxc-msg or other error, + # so check for it here immediately and maybe raise + # so as to engage the ctxc handling block below! + # if re := ctx._remote_error: + # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( + # re, + + # # TODO: do we want this to always raise? + # # - means that on self-ctxc, if/when the + # # block is exited before the msg arrives + # # but then the msg during __exit__ + # # calling we may not activate the + # # ctxc-handler block below? should we + # # be? + # # - if there's a remote error that arrives + # # after the child has exited, we won't + # # handle until the `finally:` block + # # where `.result()` is always called, + # # again in which case we handle it + # # differently then in the handler block + # # that would normally engage from THIS + # # block? + # raise_ctxc_from_self_call=True, + # ) + # assert maybe_ctxc + if ctx.chan.connected(): log.info( 'Waiting on final context-task result for\n' @@ -645,13 +736,8 @@ class Portal: # As per `Context._deliver_msg()`, that error IS # ALWAYS SET any time "callee" side fails and causes "caller # side" cancellation via a `ContextCancelled` here. - # result = await ctx.result() try: - result = await ctx.result() - log.runtime( - f'Context {fn_name} returned value from callee:\n' - f'`{result}`' - ) + result_or_err: Exception|Any = await ctx.result() except BaseException as berr: # on normal teardown, if we get some error # raised in `Context.result()` we still want to @@ -663,7 +749,48 @@ class Portal: scope_err = berr raise + # an exception type boxed in a `RemoteActorError` + # is returned (meaning it was obvi not raised). + msgdata: str|None = getattr( + result_or_err, + 'msgdata', + None + ) + # yes! 
this worx Bp + # from .devx import _debug + # await _debug.pause() + match (msgdata, result_or_err): + case ( + {'tb_str': tbstr}, + ContextCancelled(), + ): + log.cancel(tbstr) + + case ( + {'tb_str': tbstr}, + RemoteActorError(), + ): + log.exception( + f'Context `{fn_name}` remotely errored:\n' + f'`{tbstr}`' + ) + case (None, _): + log.runtime( + f'Context {fn_name} returned value from callee:\n' + f'`{result_or_err}`' + ) + finally: + # XXX: (MEGA IMPORTANT) if this is a root opened process we + # wait for any immediate child in debug before popping the + # context from the runtime msg loop otherwise inside + # ``Actor._push_result()`` the msg will be discarded and in + # the case where that msg is global debugger unlock (via + # a "stop" msg for a stream), this can result in a deadlock + # where the root is waiting on the lock to clear but the + # child has already cleared it and clobbered IPC. + await maybe_wait_for_debugger() + # though it should be impossible for any tasks # operating *in* this scope to have survived # we tear down the runtime feeder chan last @@ -708,6 +835,10 @@ class Portal: # out any exception group or legit (remote) ctx # error that sourced from the remote task or its # runtime. + # + # NOTE: further, this should be the only place the + # underlying feeder channel is + # once-and-only-CLOSED! with trio.CancelScope(shield=True): await ctx._recv_chan.aclose() @@ -737,18 +868,11 @@ class Portal: f'actor:{uid}' ) - # XXX: (MEGA IMPORTANT) if this is a root opened process we - # wait for any immediate child in debug before popping the - # context from the runtime msg loop otherwise inside - # ``Actor._push_result()`` the msg will be discarded and in - # the case where that msg is global debugger unlock (via - # a "stop" msg for a stream), this can result in a deadlock - # where the root is waiting on the lock to clear but the - # child has already cleared it and clobbered IPC. - await maybe_wait_for_debugger() - # FINALLY, remove the context from runtime tracking and # exit! 
+ log.runtime( + f'Exiting context opened with {ctx.chan.uid}' + ) self.actor._contexts.pop( (self.channel.uid, ctx.cid), None, diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 4530e144..e8f735ec 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -21,8 +21,9 @@ The machinery and types behind ``Context.open_stream()`` ''' from __future__ import annotations -import inspect from contextlib import asynccontextmanager as acm +import inspect +from pprint import pformat from typing import ( Any, Callable, @@ -35,6 +36,7 @@ import trio from ._exceptions import ( _raise_from_no_key_in_msg, + ContextCancelled, ) from .log import get_logger from .trionics import ( @@ -84,8 +86,8 @@ class MsgStream(trio.abc.Channel): self._broadcaster = _broadcaster # flag to denote end of stream - self._eoc: bool = False - self._closed: bool = False + self._eoc: bool|trio.EndOfChannel = False + self._closed: bool|trio.ClosedResourceError = False # delegate directly to underlying mem channel def receive_nowait(self): @@ -93,6 +95,9 @@ class MsgStream(trio.abc.Channel): try: return msg['yield'] except KeyError as kerr: + # if 'return' in msg: + # return msg + _raise_from_no_key_in_msg( ctx=self._ctx, msg=msg, @@ -122,30 +127,43 @@ class MsgStream(trio.abc.Channel): # see ``.aclose()`` for notes on the old behaviour prior to # introducing this if self._eoc: - raise trio.EndOfChannel + raise self._eoc + # raise trio.EndOfChannel if self._closed: - raise trio.ClosedResourceError('This stream was closed') + raise self._closed + # raise trio.ClosedResourceError( + # 'This stream was already closed' + # ) + src_err: Exception|None = None try: - msg = await self._rx_chan.receive() - return msg['yield'] + try: + msg = await self._rx_chan.receive() + return msg['yield'] - except KeyError as kerr: - _raise_from_no_key_in_msg( - ctx=self._ctx, - msg=msg, - src_err=kerr, - log=log, - expect_key='yield', - stream=self, - ) + except KeyError as kerr: + src_err = kerr + # NOTE: may raise any of the below error types + # includg EoC when a 'stop' msg is found. + _raise_from_no_key_in_msg( + ctx=self._ctx, + msg=msg, + src_err=kerr, + log=log, + expect_key='yield', + stream=self, + ) + + # XXX: we close the stream on any of these error conditions: except ( - trio.ClosedResourceError, # by self._rx_chan + # trio.ClosedResourceError, # by self._rx_chan trio.EndOfChannel, # by self._rx_chan or `stop` msg from far end - ): - # XXX: we close the stream on any of these error conditions: + ) as eoc: + src_err = eoc + self._eoc = eoc + # await trio.sleep(1) # a ``ClosedResourceError`` indicates that the internal # feeder memory receive channel was closed likely by the @@ -168,14 +186,53 @@ class MsgStream(trio.abc.Channel): # closing this stream and not flushing a final value to # remaining (clone) consumers who may not have been # scheduled to receive it yet. + # try: + # maybe_err_msg_or_res: dict = self._rx_chan.receive_nowait() + # if maybe_err_msg_or_res: + # log.warning( + # 'Discarding un-processed msg:\n' + # f'{maybe_err_msg_or_res}' + # ) + # except trio.WouldBlock: + # # no queued msgs that might be another remote + # # error, so just raise the original EoC + # pass - # when the send is closed we assume the stream has - # terminated and signal this local iterator to stop - await self.aclose() + # raise eoc - raise # propagate + except trio.ClosedResourceError as cre: # by self._rx_chan + src_err = cre + log.warning( + '`Context._rx_chan` was already closed?' 
+ ) + self._closed = cre - async def aclose(self): + # when the send is closed we assume the stream has + # terminated and signal this local iterator to stop + drained: list[Exception|dict] = await self.aclose() + if drained: + log.warning( + 'Drained context msgs during closure:\n' + f'{drained}' + ) + # TODO: pass these to the `._ctx._drained_msgs: deque` + # and then iterate them as part of any `.result()` call? + + # NOTE XXX: if the context was cancelled or remote-errored + # but we received the stream close msg first, we + # probably want to instead raise the remote error + # over the end-of-stream connection error since likely + # the remote error was the source cause? + ctx: Context = self._ctx + if re := ctx._remote_error: + ctx._maybe_raise_remote_err( + re, + raise_ctxc_from_self_call=True, + ) + + raise src_err # propagate + + async def aclose(self) -> list[Exception|dict]: ''' Cancel associated remote actor task and local memory channel on close. @@ -185,15 +242,55 @@ class MsgStream(trio.abc.Channel): # https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose rx_chan = self._rx_chan - if rx_chan._closed: - log.cancel(f"{self} is already closed") + if ( + rx_chan._closed + or + self._closed + ): + log.cancel( + f'`MsgStream` is already closed\n' + f'.cid: {self._ctx.cid}\n' + f'._rx_chan`: {rx_chan}\n' + f'._eoc: {self._eoc}\n' + f'._closed: {self._eoc}\n' + ) # this stream has already been closed so silently succeed as # per ``trio.AsyncResource`` semantics. # https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose - return + return [] - self._eoc = True + ctx: Context = self._ctx + # caught_eoc: bool = False + drained: list[Exception|dict] = [] + while not drained: + try: + maybe_final_msg = self.receive_nowait() + if maybe_final_msg: + log.cancel( + 'Drained un-processed stream msg:\n' + f'{pformat(maybe_final_msg)}' + ) + # TODO: inject into parent `Context` buf? + drained.append(maybe_final_msg) + + except trio.WouldBlock as be: + drained.append(be) + break + + except trio.EndOfChannel as eoc: + drained.append(eoc) + # caught_eoc = True + self._eoc: bool = eoc + break + + except ContextCancelled as ctxc: + log.cancel( + 'Context was cancelled during stream closure:\n' + f'canceller: {ctxc.canceller}\n' + f'{pformat(ctxc.msgdata)}' + ) + break # NOTE: this is super subtle IPC messaging stuff: # Relay stop iteration to far end **iff** we're @@ -224,26 +321,33 @@ class MsgStream(trio.abc.Channel): except ( trio.BrokenResourceError, trio.ClosedResourceError - ): + ) as re: # the underlying channel may already have been pulled # in which case our stop message is meaningless since # it can't traverse the transport. - ctx = self._ctx log.warning( f'Stream was already destroyed?\n' f'actor: {ctx.chan.uid}\n' f'ctx id: {ctx.cid}' ) + drained.append(re) + self._closed = re - self._closed = True + # if caught_eoc: + # # from .devx import _debug + # # await _debug.pause() + # with trio.CancelScope(shield=True): + # await rx_chan.aclose() - # Do we close the local mem chan ``self._rx_chan`` ??!? + # self._eoc: bool = caught_eoc - # NO, DEFINITELY NOT if we're a bi-dir ``MsgStream``! - # BECAUSE this same core-msg-loop mem recv-chan is used to deliver - # the potential final result from the surrounding inter-actor - # `Context` so we don't want to close it until that context has - # run to completion. + # ?XXX WAIT, why do we not close the local mem chan `._rx_chan` XXX? + # => NO, DEFINITELY NOT! 
<= + # if we're a bi-dir ``MsgStream`` BECAUSE this same + # core-msg-loop mem recv-chan is used to deliver the + # potential final result from the surrounding inter-actor + # `Context` so we don't want to close it until that + # context has run to completion. # XXX: Notes on old behaviour: # await rx_chan.aclose() @@ -272,6 +376,8 @@ class MsgStream(trio.abc.Channel): # runtime's closure of ``rx_chan`` in the case where we may # still need to consume msgs that are "in transit" from the far # end (eg. for ``Context.result()``). + # self._closed = True + return drained @acm async def subscribe( @@ -337,9 +443,13 @@ class MsgStream(trio.abc.Channel): raise self._ctx._remote_error # from None if self._closed: - raise trio.ClosedResourceError('This stream was already closed') + raise self._closed + # raise trio.ClosedResourceError('This stream was already closed') - await self._ctx.chan.send({'yield': data, 'cid': self._ctx.cid}) + await self._ctx.chan.send({ + 'yield': data, + 'cid': self._ctx.cid, + }) def stream(func: Callable) -> Callable: -- 2.34.1 From 5fe3f58ea9757f5453d4f0b28cf35cb0742d79f5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 08:53:37 -0500 Subject: [PATCH 101/378] Add a `debug_mode: bool` fixture via `--tpdb` flag Allows tests (including any `@tractor_test`s) to subscribe to a CLI flag `--tpdb` (for "tractor python debugger") which the session can provide to tests which can then proxy the value to `open_root_actor()` (via `open_nursery()`) when booting the runtime - thus enabling our debug mode globally to any subscribers B) This is real handy if you have some failures but can't determine the root issue without jumping into a `pdbp` REPL inside a (sub-)actor's spawned-task. --- tests/conftest.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 8e9a67c4..c9159f0d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,11 +41,14 @@ def tractor_test(fn): *args, loglevel=None, reg_addr=None, - start_method=None, + start_method: str|None = None, + debug_mode: bool = False, **kwargs ): # __tracebackhide__ = True + # NOTE: inject ant test func declared fixture + # names by manually checking! 
if 'reg_addr' in inspect.signature(fn).parameters: # injects test suite fixture value to test as well # as `run()` @@ -64,10 +67,14 @@ def tractor_test(fn): # set of subprocess spawning backends kwargs['start_method'] = start_method + if 'debug_mode' in inspect.signature(fn).parameters: + # set of subprocess spawning backends + kwargs['debug_mode'] = debug_mode + + if kwargs: # use explicit root actor start - async def _main(): async with tractor.open_root_actor( # **kwargs, @@ -76,7 +83,7 @@ def tractor_test(fn): start_method=start_method, # TODO: only enable when pytest is passed --pdb - # debug_mode=True, + debug_mode=debug_mode, ): await fn(*args, **kwargs) @@ -130,22 +137,43 @@ def examples_dir() -> pathlib.Path: def pytest_addoption(parser): parser.addoption( - "--ll", action="store", dest='loglevel', + "--ll", + action="store", + dest='loglevel', default='ERROR', help="logging level to set when testing" ) parser.addoption( - "--spawn-backend", action="store", dest='spawn_backend', + "--spawn-backend", + action="store", + dest='spawn_backend', default='trio', help="Processing spawning backend to use for test run", ) + parser.addoption( + "--tpdb", "--debug-mode", + action="store_true", + dest='tractor_debug_mode', + # default=False, + help=( + 'Enable a flag that can be used by tests to to set the ' + '`debug_mode: bool` for engaging the internal ' + 'multi-proc debugger sys.' + ), + ) + def pytest_configure(config): backend = config.option.spawn_backend tractor._spawn.try_set_start_method(backend) +@pytest.fixture(scope='session') +def debug_mode(request): + return request.config.option.tractor_debug_mode + + @pytest.fixture(scope='session', autouse=True) def loglevel(request): orig = tractor.log._default_loglevel -- 2.34.1 From bf0739c19446adeb982a167b0509da29f9860104 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 08:59:21 -0500 Subject: [PATCH 102/378] Add `stackscope` tree pprinter triggered by SIGUSR1 Can be optionally enabled via a new `enable_stack_on_sig()` which will swap in the SIGUSR1 handler. Much thanks to @oremanj for writing this amazing project, it's thus far helped me fix some very subtle hangs inside our new IPC-context cancellation machinery that would have otherwise taken much more manual pdb-ing and hair pulling XD Full credit for `dump_task_tree()` goes to the original project author with some minor tweaks as was handed to me via the trio-general matrix room B) Slight changes from orig version: - use a `log.pdb()` emission to pprint to console - toss in an ex sh CLI cmd to trigger the dump from another terminal using `kill` + `pgrep`. --- tractor/devx/__init__.py | 3 ++ tractor/devx/_stackscope.py | 84 +++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tractor/devx/_stackscope.py diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index 89b9a336..5f832615 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -32,6 +32,9 @@ from ._debug import ( maybe_open_crash_handler, post_mortem, ) +from ._stackscope import ( + enable_stack_on_sig as enable_stack_on_sig, +) __all__ = [ 'maybe_wait_for_debugger', diff --git a/tractor/devx/_stackscope.py b/tractor/devx/_stackscope.py new file mode 100644 index 00000000..706b85d3 --- /dev/null +++ b/tractor/devx/_stackscope.py @@ -0,0 +1,84 @@ +# tractor: structured concurrent "actors". +# Copyright eternity Tyler Goodlet. 
+ +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +The fundamental cross process SC abstraction: an inter-actor, +cancel-scope linked task "context". + +A ``Context`` is very similar to the ``trio.Nursery.cancel_scope`` built +into each ``trio.Nursery`` except it links the lifetimes of memory space +disjoint, parallel executing tasks in separate actors. + +''' +from signal import ( + signal, + SIGUSR1, +) + +import trio + +@trio.lowlevel.disable_ki_protection +def dump_task_tree() -> None: + import stackscope + from tractor.log import get_console_log + + tree_str: str = str( + stackscope.extract( + trio.lowlevel.current_root_task(), + recurse_child_tasks=True + ) + ) + log = get_console_log('cancel') + log.pdb( + f'Dumping `stackscope` tree:\n\n' + f'{tree_str}\n' + ) + # import logging + # try: + # with open("/dev/tty", "w") as tty: + # tty.write(tree_str) + # except BaseException: + # logging.getLogger( + # "task_tree" + # ).exception("Error printing task tree") + + +def signal_handler(sig: int, frame: object) -> None: + import traceback + try: + trio.lowlevel.current_trio_token( + ).run_sync_soon(dump_task_tree) + except RuntimeError: + # not in async context -- print a normal traceback + traceback.print_stack() + + + +def enable_stack_on_sig( + sig: int = SIGUSR1 +) -> None: + ''' + Enable `stackscope` tracing on reception of a signal; by + default this is SIGUSR1. + + ''' + signal( + sig, + signal_handler, + ) + # NOTE: not the above can be triggered from + # a (xonsh) shell using: + # kill -SIGUSR1 @$(pgrep -f '') -- 2.34.1 From 81f8e2d4ac86b10b4f19af03f3c4c395d3b0d023 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 09:18:22 -0500 Subject: [PATCH 103/378] _supervise: iter nice expanded multi-line `._children` tups with typing --- tractor/_supervise.py | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 364d79c3..7319b15b 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -156,7 +156,7 @@ class ActorNursery: # start a task to spawn a process # blocks until process has been started and a portal setup - nursery = nursery or self._da_nursery + nursery: trio.Nursery = nursery or self._da_nursery # XXX: the type ignore is actually due to a `mypy` bug return await nursery.start( # type: ignore @@ -232,12 +232,14 @@ class ActorNursery: return portal async def cancel(self, hard_kill: bool = False) -> None: - """Cancel this nursery by instructing each subactor to cancel + ''' + Cancel this nursery by instructing each subactor to cancel itself and wait for all subactors to terminate. If ``hard_killl`` is set to ``True`` then kill the processes directly without any far end graceful ``trio`` cancellation. 
- """ + + ''' self.cancelled = True log.cancel(f"Cancelling nursery in {self._actor.uid}") @@ -245,7 +247,14 @@ class ActorNursery: async with trio.open_nursery() as nursery: - for subactor, proc, portal in self._children.values(): + subactor: Actor + proc: trio.Process + portal: Portal + for ( + subactor, + proc, + portal, + ) in self._children.values(): # TODO: are we ever even going to use this or # is the spawning backend responsible for such @@ -285,8 +294,16 @@ class ActorNursery: # then hard kill all sub-processes if cs.cancelled_caught: log.error( - f"Failed to cancel {self}\nHard killing process tree!") - for subactor, proc, portal in self._children.values(): + f'Failed to cancel {self}\nHard killing process tree!' + ) + subactor: Actor + proc: trio.Process + portal: Portal + for ( + subactor, + proc, + portal, + ) in self._children.values(): log.warning(f"Hard killing process {proc}") proc.terminate() @@ -383,7 +400,17 @@ async def _open_and_supervise_one_cancels_all_nursery( else: log.exception( f"Nursery for {current_actor().uid} " - f"errored with") + "errored with\n" + + # TODO: same thing as in + # `._invoke()` to compute how to + # place this div-line in the + # middle of the above msg + # content.. + # -[ ] prolly helper-func it too + # in our `.log` module.. + # '------ - ------' + ) # cancel all subactors await anursery.cancel() -- 2.34.1 From 0268b2ce9141b2be4588530b3a46fec814fa8f48 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 13:12:51 -0500 Subject: [PATCH 104/378] Better subproc supervisor logging, todo for #320 Given i just similarly revamped a buncha `._runtime` log msg formatting, might as well do something similar inside the spawning machinery such that groking teardown sequences of each supervising task is much more sane XD Mostly this includes doing similar `': \n'` multi-line formatting when reporting various subproc supervision steps as well as showing a detailed `trio.Process.__repr__()` as appropriate. Also adds a detailed #TODO according to the needs of #320 for which we're going to need some internal mechanism for intermediary parent actors to determine if a given debug tty locker (sub-actor) is one of *their* (transitive) children and thus stall the normal cancellation/teardown sequence until that locker is complete. --- tractor/_spawn.py | 118 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 35 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 2936220c..141d7c80 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -35,7 +35,7 @@ from exceptiongroup import BaseExceptionGroup import trio from trio_typing import TaskStatus -from .devx._debug import ( +from .devx import ( maybe_wait_for_debugger, acquire_debug_lock, ) @@ -144,7 +144,7 @@ async def exhaust_portal( # XXX: streams should never be reaped here since they should # always be established and shutdown using a context manager api - final = await portal.result() + final: Any = await portal.result() except ( Exception, @@ -152,13 +152,23 @@ async def exhaust_portal( ) as err: # we reraise in the parent task via a ``BaseExceptionGroup`` return err + except trio.Cancelled as err: # lol, of course we need this too ;P # TODO: merge with above? 
- log.warning(f"Cancelled result waiter for {portal.actor.uid}") + log.warning( + 'Cancelled portal result waiter task:\n' + f'uid: {portal.channel.uid}\n' + f'error: {err}\n' + ) return err + else: - log.debug(f"Returning final result: {final}") + log.debug( + f'Returning final result from portal:\n' + f'uid: {portal.channel.uid}\n' + f'result: {final}\n' + ) return final @@ -170,26 +180,34 @@ async def cancel_on_completion( ) -> None: ''' - Cancel actor gracefully once it's "main" portal's + Cancel actor gracefully once its "main" portal's result arrives. - Should only be called for actors spawned with `run_in_actor()`. + Should only be called for actors spawned via the + `Portal.run_in_actor()` API. + + => and really this API will be deprecated and should be + re-implemented as a `.hilevel.one_shot_task_nursery()`..) ''' # if this call errors we store the exception for later # in ``errors`` which will be reraised inside # an exception group and we still send out a cancel request - result = await exhaust_portal(portal, actor) + result: Any|Exception = await exhaust_portal(portal, actor) if isinstance(result, Exception): - errors[actor.uid] = result + errors[actor.uid]: Exception = result log.warning( - f"Cancelling {portal.channel.uid} after error {result}" + 'Cancelling subactor due to error:\n' + f'uid: {portal.channel.uid}\n' + f'error: {result}\n' ) else: log.runtime( - f"Cancelling {portal.channel.uid} gracefully " - f"after result {result}") + 'Cancelling subactor gracefully:\n' + f'uid: {portal.channel.uid}\n' + f'result: {result}\n' + ) # cancel the process now that we have a final result await portal.cancel_actor() @@ -219,11 +237,14 @@ async def do_hard_kill( to be handled. ''' + log.cancel( + 'Terminating sub-proc:\n' + f'|_{proc}\n' + ) # NOTE: this timeout used to do nothing since we were shielding # the ``.wait()`` inside ``new_proc()`` which will pretty much # never release until the process exits, now it acts as # a hard-kill time ultimatum. - log.debug(f"Terminating {proc}") with trio.move_on_after(terminate_after) as cs: # NOTE: code below was copied verbatim from the now deprecated @@ -260,7 +281,10 @@ async def do_hard_kill( # zombies (as a feature) we ask the OS to do send in the # removal swad as the last resort. if cs.cancelled_caught: - log.critical(f"#ZOMBIE_LORD_IS_HERE: {proc}") + log.critical( + 'Well, the #ZOMBIE_LORD_IS_HERE# to collect\n' + f'|_{proc}\n' + ) proc.kill() @@ -281,10 +305,16 @@ async def soft_wait( join/reap on an actor-runtime-in-process. ''' - uid = portal.channel.uid + uid: tuple[str, str] = portal.channel.uid try: - log.cancel(f'Soft waiting on actor:\n{uid}') + log.cancel( + 'Soft waiting on sub-actor proc:\n' + f'uid: {uid}\n' + f'|_{proc}\n' + ) + # wait on sub-proc to signal termination await wait_func(proc) + except trio.Cancelled: # if cancelled during a soft wait, cancel the child # actor before entering the hard reap sequence @@ -296,8 +326,8 @@ async def soft_wait( async def cancel_on_proc_deth(): ''' - Cancel the actor cancel request if we detect that - that the process terminated. + "Cancel the (actor) cancel" request if we detect + that that the underlying sub-process terminated. 
''' await wait_func(proc) @@ -314,10 +344,10 @@ async def soft_wait( if proc.poll() is None: # type: ignore log.warning( - 'Actor still alive after cancel request:\n' - f'{uid}' + 'Subactor still alive after cancel request?\n\n' + f'uid: {uid}\n' + f'|_{proc}\n' ) - n.cancel_scope.cancel() raise @@ -341,7 +371,7 @@ async def new_proc( ) -> None: # lookup backend spawning target - target = _methods[_spawn_method] + target: Callable = _methods[_spawn_method] # mark the new actor with the global spawn method subactor._spawn_method = _spawn_method @@ -491,8 +521,9 @@ async def trio_proc( # cancel result waiter that may have been spawned in # tandem if not done already log.cancel( - "Cancelling existing result waiter task for " - f"{subactor.uid}") + 'Cancelling existing result waiter task for ' + f'{subactor.uid}' + ) nursery.cancel_scope.cancel() finally: @@ -510,18 +541,35 @@ async def trio_proc( with trio.move_on_after(0.5): await proc.wait() - if is_root_process(): - # TODO: solve the following issue where we need - # to do a similar wait like this but in an - # "intermediary" parent actor that itself isn't - # in debug but has a child that is, and we need - # to hold off on relaying SIGINT until that child - # is complete. - # https://github.com/goodboy/tractor/issues/320 - await maybe_wait_for_debugger( - child_in_debug=_runtime_vars.get( - '_debug_mode', False), - ) + log.pdb( + 'Delaying subproc reaper while debugger locked..' + ) + await maybe_wait_for_debugger( + child_in_debug=_runtime_vars.get( + '_debug_mode', False + ), + # TODO: need a diff value then default? + # poll_steps=9999999, + ) + # TODO: solve the following issue where we need + # to do a similar wait like this but in an + # "intermediary" parent actor that itself isn't + # in debug but has a child that is, and we need + # to hold off on relaying SIGINT until that child + # is complete. + # https://github.com/goodboy/tractor/issues/320 + # -[ ] we need to handle non-root parent-actors specially + # by somehow determining if a child is in debug and then + # avoiding cancel/kill of said child by this + # (intermediary) parent until such a time as the root says + # the pdb lock is released and we are good to tear down + # (our children).. + # + # -[ ] so maybe something like this where we try to + # acquire the lock and get notified of who has it, + # check that uid against our known children? 
+ # this_uid: tuple[str, str] = current_actor().uid + # await acquire_debug_lock(this_uid) if proc.poll() is None: log.cancel(f"Attempting to hard kill {proc}") -- 2.34.1 From 54a0a0000d15210590fab1ba79ce18d85e940e74 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 13:22:44 -0500 Subject: [PATCH 105/378] .log: more multi-line styling --- tractor/log.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 590779a5..6c040209 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -289,11 +289,19 @@ def get_console_log( if not level: return log - log.setLevel(level.upper() if not isinstance(level, int) else level) + log.setLevel( + level.upper() + if not isinstance(level, int) + else level + ) if not any( handler.stream == sys.stderr # type: ignore - for handler in logger.handlers if getattr(handler, 'stream', None) + for handler in logger.handlers if getattr( + handler, + 'stream', + None, + ) ): handler = logging.StreamHandler() formatter = colorlog.ColoredFormatter( -- 2.34.1 From 1d7cf7d1dd13f4a0a5867dafae73e39005106dbf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 13:23:16 -0500 Subject: [PATCH 106/378] Enable `stackscope` render via root in debug mode If `stackscope` is importable and debug_mode is enabled then we by default call and report `.devx.enable_stack_on_sig()` is set B) This makes debugging unexpected (SIGINT ignoring) hangs a cinch! --- tractor/_root.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index c79e1d98..1d147dd5 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -138,13 +138,19 @@ async def open_root_actor( ) assert registry_addrs - loglevel = (loglevel or log._default_loglevel).upper() + loglevel = ( + loglevel + or log._default_loglevel + ).upper() - if debug_mode and _spawn._spawn_method == 'trio': + if ( + debug_mode + and _spawn._spawn_method == 'trio' + ): _state._runtime_vars['_debug_mode'] = True - # expose internal debug module to every actor allowing - # for use of ``await tractor.breakpoint()`` + # expose internal debug module to every actor allowing for + # use of ``await tractor.pause()`` enable_modules.append('tractor.devx._debug') # if debug mode get's enabled *at least* use that level of @@ -163,7 +169,20 @@ async def open_root_actor( "Debug mode is only supported for the `trio` backend!" ) - log.get_console_log(loglevel) + assert loglevel + _log = log.get_console_log(loglevel) + assert _log + + # TODO: factor this into `.devx._stackscope`!! + if debug_mode: + try: + logger.info('Enabling `stackscope` traces on SIGUSR1') + from .devx import enable_stack_on_sig + enable_stack_on_sig() + except ImportError: + logger.warning( + '`stackscope` not installed for use in debug mode!' 
+ ) # closed into below ping task-func ponged_addrs: list[tuple[str, int]] = [] -- 2.34.1 From 6c9bc627d87fc312ca310e64a441504dc862a871 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:14:58 -0500 Subject: [PATCH 107/378] Make ctx tests support `debug_mode: bool` fixture Such that with `--tpdb` passed (sub)actors will engage the `pdbp` REPL automatically and so that we can use the new `stackscope` support when complex cases hang Bo Also, - simplified some type-annots (ns paths), - doc-ed an inter-peer test func with some ascii msg flows, - added a bottom #TODO for replicating the scenario i hit in `modden` where a separate client actor-tree was hanging on cancelling a `bigd` sub-workspace.. --- tests/test_context_stream_semantics.py | 142 ++++++++++++++++--------- tests/test_inter_peer_cancellation.py | 75 +++++++++---- 2 files changed, 150 insertions(+), 67 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index dda096ce..4eb06e8a 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -8,7 +8,9 @@ sync-opening a ``tractor.Context`` beforehand. # from contextlib import asynccontextmanager as acm from itertools import count import platform -from typing import Optional +from typing import ( + Callable, +) import pytest import trio @@ -69,7 +71,7 @@ _state: bool = False @tractor.context async def too_many_starteds( - ctx: tractor.Context, + ctx: Context, ) -> None: ''' Call ``Context.started()`` more then once (an error). @@ -84,7 +86,7 @@ async def too_many_starteds( @tractor.context async def not_started_but_stream_opened( - ctx: tractor.Context, + ctx: Context, ) -> None: ''' Enter ``Context.open_stream()`` without calling ``.started()``. @@ -105,11 +107,15 @@ async def not_started_but_stream_opened( ], ids='misuse_type={}'.format, ) -def test_started_misuse(target): - +def test_started_misuse( + target: Callable, + debug_mode: bool, +): async def main(): - async with tractor.open_nursery() as n: - portal = await n.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( target.__name__, enable_modules=[__name__], ) @@ -124,7 +130,7 @@ def test_started_misuse(target): @tractor.context async def simple_setup_teardown( - ctx: tractor.Context, + ctx: Context, data: int, block_forever: bool = False, @@ -170,6 +176,7 @@ def test_simple_context( error_parent, callee_blocks_forever, pointlessly_open_stream, + debug_mode: bool, ): timeout = 1.5 if not platform.system() == 'Windows' else 4 @@ -177,9 +184,10 @@ def test_simple_context( async def main(): with trio.fail_after(timeout): - async with tractor.open_nursery() as nursery: - - portal = await nursery.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( 'simple_context', enable_modules=[__name__], ) @@ -260,6 +268,7 @@ def test_caller_cancels( cancel_method: str, chk_ctx_result_before_exit: bool, callee_returns_early: bool, + debug_mode: bool, ): ''' Verify that when the opening side of a context (aka the caller) @@ -268,7 +277,7 @@ def test_caller_cancels( ''' async def check_canceller( - ctx: tractor.Context, + ctx: Context, ) -> None: # should not raise yet return the remote # context cancelled error. 
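# NOTE: the pattern applied throughout this patch, as a standalone
# sketch: a test declares the session-scoped `debug_mode` fixture
# (toggled via `--tpdb`) and forwards it to the runtime. The test
# name below is hypothetical and only for illustration.

import trio
import tractor


def test_engages_pdbp_repl_on_crash(
    debug_mode: bool,  # injected by the `--tpdb` session fixture
):
    async def main():
        async with tractor.open_nursery(
            # when the suite is run with `--tpdb` this engages the
            # multi-proc `pdbp` REPL on any (sub)actor crash.
            debug_mode=debug_mode,
        ) as an:
            assert an

    trio.run(main)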
@@ -287,8 +296,10 @@ def test_caller_cancels( ) async def main(): - async with tractor.open_nursery() as nursery: - portal = await nursery.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( 'simple_context', enable_modules=[__name__], ) @@ -338,7 +349,7 @@ def test_caller_cancels( @tractor.context async def close_ctx_immediately( - ctx: tractor.Context, + ctx: Context, ) -> None: @@ -350,17 +361,33 @@ async def close_ctx_immediately( @tractor_test -async def test_callee_closes_ctx_after_stream_open(): - 'callee context closes without using stream' +async def test_callee_closes_ctx_after_stream_open( + debug_mode: bool, +): + ''' + callee context closes without using stream. - async with tractor.open_nursery() as n: + This should result in a msg sequence + |__ + |_ - portal = await n.start_actor( + <= {'started': , 'cid': } + <= {'stop': True, 'cid': } + <= {'result': Any, ..} + + (ignored by child) + => {'stop': True, 'cid': } + + ''' + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( 'fast_stream_closer', enable_modules=[__name__], ) - with trio.fail_after(2): + with trio.fail_after(0.5): async with portal.open_context( close_ctx_immediately, @@ -368,10 +395,9 @@ async def test_callee_closes_ctx_after_stream_open(): # cancel_on_exit=True, ) as (ctx, sent): - assert sent is None - with trio.fail_after(0.5): + with trio.fail_after(0.4): async with ctx.open_stream() as stream: # should fall through since ``StopAsyncIteration`` @@ -379,12 +405,15 @@ async def test_callee_closes_ctx_after_stream_open(): # a ``trio.EndOfChannel`` by # ``trio.abc.ReceiveChannel.__anext__()`` async for _ in stream: + # trigger failure if we DO NOT + # get an EOC! 
assert 0 else: # verify stream is now closed try: - await stream.receive() + with trio.fail_after(0.3): + await stream.receive() except trio.EndOfChannel: pass @@ -405,7 +434,7 @@ async def test_callee_closes_ctx_after_stream_open(): @tractor.context async def expect_cancelled( - ctx: tractor.Context, + ctx: Context, ) -> None: global _state @@ -434,11 +463,15 @@ async def expect_cancelled( @tractor_test async def test_caller_closes_ctx_after_callee_opens_stream( use_ctx_cancel_method: bool, + debug_mode: bool, ): - 'caller context closes without using stream' - - async with tractor.open_nursery() as an: + ''' + caller context closes without using/opening stream + ''' + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: root: Actor = current_actor() portal = await an.start_actor( @@ -522,11 +555,13 @@ async def test_caller_closes_ctx_after_callee_opens_stream( @tractor_test -async def test_multitask_caller_cancels_from_nonroot_task(): - - async with tractor.open_nursery() as n: - - portal = await n.start_actor( +async def test_multitask_caller_cancels_from_nonroot_task( + debug_mode: bool, +): + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( 'ctx_cancelled', enable_modules=[__name__], ) @@ -573,7 +608,7 @@ async def test_multitask_caller_cancels_from_nonroot_task(): @tractor.context async def cancel_self( - ctx: tractor.Context, + ctx: Context, ) -> None: global _state @@ -610,16 +645,20 @@ async def cancel_self( raise RuntimeError('Context didnt cancel itself?!') + @tractor_test -async def test_callee_cancels_before_started(): +async def test_callee_cancels_before_started( + debug_mode: bool, +): ''' Callee calls `Context.cancel()` while streaming and caller sees stream terminated in `ContextCancelled`. 
''' - async with tractor.open_nursery() as n: - - portal = await n.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( 'cancels_self', enable_modules=[__name__], ) @@ -645,7 +684,7 @@ async def test_callee_cancels_before_started(): @tractor.context async def never_open_stream( - ctx: tractor.Context, + ctx: Context, ) -> None: ''' @@ -659,8 +698,8 @@ async def never_open_stream( @tractor.context async def keep_sending_from_callee( - ctx: tractor.Context, - msg_buffer_size: Optional[int] = None, + ctx: Context, + msg_buffer_size: int|None = None, ) -> None: ''' @@ -685,7 +724,10 @@ async def keep_sending_from_callee( ], ids='overrun_condition={}'.format, ) -def test_one_end_stream_not_opened(overrun_by): +def test_one_end_stream_not_opened( + overrun_by: tuple[str, int, Callable], + debug_mode: bool, +): ''' This should exemplify the bug from: https://github.com/goodboy/tractor/issues/265 @@ -696,8 +738,10 @@ def test_one_end_stream_not_opened(overrun_by): buf_size = buf_size_increase + Actor.msg_buffer_size async def main(): - async with tractor.open_nursery() as n: - portal = await n.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( entrypoint.__name__, enable_modules=[__name__], ) @@ -754,7 +798,7 @@ def test_one_end_stream_not_opened(overrun_by): @tractor.context async def echo_back_sequence( - ctx: tractor.Context, + ctx: Context, seq: list[int], wait_for_cancel: bool, allow_overruns_side: str, @@ -837,6 +881,7 @@ def test_maybe_allow_overruns_stream( slow_side: str, allow_overruns_side: str, loglevel: str, + debug_mode: bool, ): ''' Demonstrate small overruns of each task back and forth @@ -855,13 +900,14 @@ def test_maybe_allow_overruns_stream( ''' async def main(): - async with tractor.open_nursery() as n: - portal = await n.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: + portal = await an.start_actor( 'callee_sends_forever', enable_modules=[__name__], loglevel=loglevel, - - # debug_mode=True, + debug_mode=debug_mode, ) seq = list(range(10)) async with portal.open_context( diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 5e1a4cad..1ead6172 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -123,7 +123,9 @@ async def error_before_started( await peer_ctx.cancel() -def test_do_not_swallow_error_before_started_by_remote_contextcancelled(): +def test_do_not_swallow_error_before_started_by_remote_contextcancelled( + debug_mode: bool, +): ''' Verify that an error raised in a remote context which itself opens YET ANOTHER remote context, which it then cancels, does not @@ -132,7 +134,9 @@ def test_do_not_swallow_error_before_started_by_remote_contextcancelled(): ''' async def main(): - async with tractor.open_nursery() as n: + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as n: portal = await n.start_actor( 'errorer', enable_modules=[__name__], @@ -225,13 +229,16 @@ async def stream_from_peer( # NOTE: cancellation of the (sleeper) peer should always # cause a `ContextCancelled` raise in this streaming # actor. 
- except ContextCancelled as ctxerr: - err = ctxerr + except ContextCancelled as ctxc: + ctxerr = ctxc + assert peer_ctx._remote_error is ctxerr + assert peer_ctx._remote_error.msgdata == ctxerr.msgdata assert peer_ctx.canceller == ctxerr.canceller # caller peer should not be the cancel requester assert not ctx.cancel_called + # XXX can never be true since `._invoke` only # sets this AFTER the nursery block this task # was started in, exits. @@ -269,9 +276,7 @@ async def stream_from_peer( # assert ctx.canceller[0] == 'root' # assert peer_ctx.canceller[0] == 'sleeper' - raise RuntimeError( - 'peer never triggered local `ContextCancelled`?' - ) + raise RuntimeError('Never triggered local `ContextCancelled` ?!?') @pytest.mark.parametrize( @@ -280,6 +285,7 @@ async def stream_from_peer( ) def test_peer_canceller( error_during_ctxerr_handling: bool, + debug_mode: bool, ): ''' Verify that a cancellation triggered by an in-actor-tree peer @@ -336,7 +342,7 @@ def test_peer_canceller( async def main(): async with tractor.open_nursery( # NOTE: to halt the peer tasks on ctxc, uncomment this. - # debug_mode=True + debug_mode=debug_mode, ) as an: canceller: Portal = await an.start_actor( 'canceller', @@ -377,7 +383,8 @@ def test_peer_canceller( try: print('PRE CONTEXT RESULT') - await sleeper_ctx.result() + res = await sleeper_ctx.result() + assert res # should never get here pytest.fail( @@ -387,7 +394,10 @@ def test_peer_canceller( # should always raise since this root task does # not request the sleeper cancellation ;) except ContextCancelled as ctxerr: - print(f'CAUGHT REMOTE CONTEXT CANCEL {ctxerr}') + print( + 'CAUGHT REMOTE CONTEXT CANCEL FOM\n' + f'{ctxerr}' + ) # canceller and caller peers should not # have been remotely cancelled. @@ -410,16 +420,31 @@ def test_peer_canceller( # XXX SHOULD NEVER EVER GET HERE XXX except BaseException as berr: - err = berr - pytest.fail('did not rx ctx-cancelled error?') + raise + + # XXX if needed to debug failure + # _err = berr + # await tractor.pause() + # await trio.sleep_forever() + + pytest.fail( + 'did not rx ctxc ?!?\n\n' + + f'{berr}\n' + ) + else: - pytest.fail('did not rx ctx-cancelled error?') + pytest.fail( + 'did not rx ctxc ?!?\n\n' + + f'{ctxs}\n' + ) except ( ContextCancelled, RuntimeError, - )as ctxerr: - _err = ctxerr + )as loc_err: + _loc_err = loc_err # NOTE: the main state to check on `Context` is: # - `.cancelled_caught` (maps to nursery cs) @@ -436,7 +461,7 @@ def test_peer_canceller( # `ContextCancelled` inside `.open_context()` # block if error_during_ctxerr_handling: - assert isinstance(ctxerr, RuntimeError) + assert isinstance(loc_err, RuntimeError) # NOTE: this root actor task should have # called `Context.cancel()` on the @@ -472,9 +497,10 @@ def test_peer_canceller( # CASE: standard teardown inside in `.open_context()` block else: - assert ctxerr.canceller == sleeper_ctx.canceller + assert isinstance(loc_err, ContextCancelled) + assert loc_err.canceller == sleeper_ctx.canceller assert ( - ctxerr.canceller[0] + loc_err.canceller[0] == sleeper_ctx.canceller[0] == @@ -484,7 +510,7 @@ def test_peer_canceller( # the sleeper's remote error is the error bubbled # out of the context-stack above! 
re = sleeper_ctx._remote_error - assert re is ctxerr + assert re is loc_err for ctx in ctxs: re: BaseException | None = ctx._remote_error @@ -554,3 +580,14 @@ def test_peer_canceller( assert excinfo.value.type == ContextCancelled assert excinfo.value.canceller[0] == 'canceller' + + +def test_client_tree_spawns_and_cancels_service_subactor(): + ... +# TODO: test for the modden `mod wks open piker` bug! +# -> start actor-tree (server) that offers sub-actor spawns via +# context API +# -> start another full actor-tree (client) which requests to the first to +# spawn over its `@context` ep / api. +# -> client actor cancels the context and should exit gracefully +# and the server's spawned child should cancel and terminate! -- 2.34.1 From f568fca98fdbeb88ba0edc2a888b48c9a08767c1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:26:14 -0500 Subject: [PATCH 108/378] Emit warning on any `ContextCancelled.canceller == None` --- tractor/_exceptions.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index d63cf6d4..a6d10de7 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -33,12 +33,15 @@ import exceptiongroup as eg import trio from ._state import current_actor +from .log import get_logger if TYPE_CHECKING: from ._context import Context from ._stream import MsgStream from .log import StackLevelAdapter +log = get_logger('tractor') + _this_mod = importlib.import_module(__name__) @@ -112,11 +115,36 @@ class ContextCancelled(RemoteActorError): ''' @property - def canceller(self) -> tuple[str, str] | None: + def canceller(self) -> tuple[str, str]|None: + ''' + Return the (maybe) `Actor.uid` for the requesting-author + of this ctxc. + + Emit a warning msg when `.canceller` has not been set, + which usually idicates that a `None` msg-loop setinel was + sent before expected in the runtime. This can happen in + a few situations: + + - (simulating) an IPC transport network outage + - a (malicious) pkt sent specifically to cancel an actor's + runtime non-gracefully without ensuring ongoing RPC tasks are + incrementally cancelled as is done with: + `Actor` + |_`.cancel()` + |_`.cancel_soon()` + |_`._cancel_task()` + + ''' value = self.msgdata.get('canceller') if value: return tuple(value) + log.warning( + 'IPC Context cancelled without a requesting actor?\n' + 'Maybe the IPC transport ended abruptly?\n\n' + f'{self}' + ) + class TransportClosed(trio.ClosedResourceError): "Underlying channel transport was closed prior to use" @@ -199,7 +227,6 @@ def pack_error( ): error_msg.update(exc.msgdata) - pkt: dict = {'error': error_msg} if cid: pkt['cid'] = cid @@ -349,8 +376,8 @@ def _raise_from_no_key_in_msg( # raise a ``StopAsyncIteration`` **and** in our catch # block below it will trigger ``.aclose()``. 
raise trio.EndOfChannel( - f'Context stream ended due to msg:\n' - f'{pformat(msg)}' + f'Context stream ended due to msg:\n\n' + f'{pformat(msg)}\n' ) from src_err -- 2.34.1 From 179d7d2b04358f1edfec332bbd6994a93239a431 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:28:11 -0500 Subject: [PATCH 109/378] Add `NamespacePath._ns` todo for `self:` support --- tractor/msg/ptr.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tractor/msg/ptr.py b/tractor/msg/ptr.py index 550626a1..87d7bf2b 100644 --- a/tractor/msg/ptr.py +++ b/tractor/msg/ptr.py @@ -58,6 +58,11 @@ class NamespacePath(str): ''' _ref: object | type | None = None + # TODO: support providing the ns instance in + # order to support 'self.` style to make + # `Portal.run_from_ns()` work! + # _ns: ModuleType|type|None = None + def load_ref(self) -> object | type: if self._ref is None: self._ref = resolve_name(self) @@ -100,5 +105,13 @@ class NamespacePath(str): fqnp: tuple[str, str] = cls._mk_fqnp(ref) return cls(':'.join(fqnp)) - def to_tuple(self) -> tuple[str, str]: - return self._mk_fqnp(self.load_ref()) + def to_tuple( + self, + + # TODO: could this work re `self:` case from above? + # load_ref: bool = True, + + ) -> tuple[str, str]: + return self._mk_fqnp( + self.load_ref() + ) -- 2.34.1 From 114ec36436cfad57e2a7715f26ebbdbff223ae68 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:29:31 -0500 Subject: [PATCH 110/378] Add `stackscope` as dep, drop legacy `pdb` issue cruft --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index d26deb9b..c226661e 100755 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ with open('docs/README.rst', encoding='utf-8') as f: setup( name="tractor", version='0.1.0a6dev0', # alpha zone - description='structured concurrrent `trio`-"actors"', + description='structured concurrent `trio`-"actors"', long_description=readme, license='AGPLv3', author='Tyler Goodlet', @@ -50,6 +50,7 @@ setup( 'exceptiongroup', # tooling + 'stackscope', 'tricycle', 'trio_typing', 'colorlog', @@ -61,16 +62,15 @@ setup( # debug mode REPL 'pdbp', + # TODO: distributed transport using + # linux kernel networking + # 'pyroute2', + # pip ref docs on these specs: # https://pip.pypa.io/en/stable/reference/requirement-specifiers/#examples # and pep: # https://peps.python.org/pep-0440/#version-specifiers - # windows deps workaround for ``pdbpp`` - # https://github.com/pdbpp/pdbpp/issues/498 - # https://github.com/pdbpp/fancycompleter/issues/37 - 'pyreadline3 ; platform_system == "Windows"', - ], tests_require=['pytest'], python_requires=">=3.10", -- 2.34.1 From df50d780423d8603b2bc785d3f50de362f9b5adb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:39:45 -0500 Subject: [PATCH 111/378] Fix `.devx.maybe_wait_for_debugger()` polling deats When entered by the root actor avoid excessive polling cycles by, - blocking on the `Lock.no_remote_has_tty: trio.Event` and breaking *immediately* when set (though we should really also lock it from the root right?) to avoid extra loops.. - shielding the `await trio.sleep(poll_delay)` call to avoid any local cancellation causing the (presumably root-actor task) caller to move on (possibly to cancel its children) and instead to continue poll-blocking until the lock is actually released by its user. - `break` the poll loop immediately if no remote locker is detected. - use `.pdb()` level for reporting lock state changes. 
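In isolation the new polling shape amounts to roughly the following sketch
(the event argument stands in for `Lock.no_remote_has_tty` and the step/delay
values are arbitrary; this is an approximation, not the exact implementation):

    import trio

    async def wait_on_lock_release(
        no_remote_has_tty: trio.Event | None,
        poll_delay: float = 0.1,
        poll_steps: int = 8,
    ) -> None:
        for _ in range(poll_steps):

            # break *immediately* if no remote locker is detected
            # or the release event has already been set.
            if (
                no_remote_has_tty is None
                or no_remote_has_tty.is_set()
            ):
                break

            # shield the wait so a local cancellation can't abort
            # the poll-block while a child still holds the TTY lock.
            with trio.CancelScope(shield=True):
                with trio.move_on_after(poll_delay):
                    await no_remote_has_tty.wait()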
Also add a #TODO to handle calls by non-root actors as it pertains to --- tractor/devx/_debug.py | 139 +++++++++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 53 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index f3550ba6..43fd9018 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -1,18 +1,19 @@ # tractor: structured concurrent "actors". # Copyright 2018-eternity Tyler Goodlet. -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. +# This program is free software: you can redistribute it and/or +# modify it under the terms of the GNU Affero General Public License +# as published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Affero General Public License for more details. -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# You should have received a copy of the GNU Affero General Public +# License along with this program. If not, see +# . """ Multi-core debugging for da peeps! @@ -43,6 +44,7 @@ from types import FrameType import pdbp import tractor import trio +from trio.lowlevel import current_task from trio_typing import ( TaskStatus, # Task, @@ -50,6 +52,7 @@ from trio_typing import ( from ..log import get_logger from .._state import ( + current_actor, is_root_process, debug_mode, ) @@ -238,7 +241,7 @@ async def _acquire_debug_lock_from_root_task( to the ``pdb`` repl. ''' - task_name: str = trio.lowlevel.current_task().name + task_name: str = current_task().name we_acquired: bool = False log.runtime( @@ -323,8 +326,7 @@ async def lock_tty_for_child( highly reliable at releasing the mutex complete! 
''' - task_name = trio.lowlevel.current_task().name - + task_name: str = current_task().name if tuple(subactor_uid) in Lock._blocked: log.warning( f'Actor {subactor_uid} is blocked from acquiring debug lock\n' @@ -407,11 +409,13 @@ async def wait_for_parent_stdin_hijack( assert val == 'Locked' async with ctx.open_stream() as stream: - # unblock local caller - try: + # unblock local caller assert Lock.local_pdb_complete task_status.started(cs) + + # wait for local task to exit and + # release the REPL await Lock.local_pdb_complete.wait() finally: @@ -468,7 +472,7 @@ def shield_sigint_handler( uid_in_debug: tuple[str, str] | None = Lock.global_actor_in_debug - actor = tractor.current_actor() + actor = current_actor() # print(f'{actor.uid} in HANDLER with ') def do_cancel(): @@ -613,7 +617,7 @@ def _set_trace( shield: bool = False, ): __tracebackhide__: bool = True - actor: tractor.Actor = actor or tractor.current_actor() + actor: tractor.Actor = actor or current_actor() # start 2 levels up in user code frame: FrameType | None = sys._getframe() @@ -683,9 +687,9 @@ async def pause( ''' # __tracebackhide__ = True - actor = tractor.current_actor() + actor = current_actor() pdb, undo_sigint = mk_mpdb() - task_name = trio.lowlevel.current_task().name + task_name: str = trio.lowlevel.current_task().name if ( not Lock.local_pdb_complete @@ -836,7 +840,7 @@ async def pause( # runtime aware version which takes care of all . def pause_from_sync() -> None: print("ENTER SYNC PAUSE") - actor: tractor.Actor = tractor.current_actor( + actor: tractor.Actor = current_actor( err_on_no_runtime=False, ) if actor: @@ -971,9 +975,10 @@ async def acquire_debug_lock( ''' Grab root's debug lock on entry, release on exit. - This helper is for actor's who don't actually need - to acquired the debugger but want to wait until the - lock is free in the process-tree root. + This helper is for actor's who don't actually need to acquired + the debugger but want to wait until the lock is free in the + process-tree root such that they don't clobber an ongoing pdb + REPL session in some peer or child! ''' if not debug_mode(): @@ -1013,43 +1018,71 @@ async def maybe_wait_for_debugger( # tearing down. sub_in_debug: tuple[str, str] | None = None - for _ in range(poll_steps): + for istep in range(poll_steps): - if Lock.global_actor_in_debug: - sub_in_debug = tuple(Lock.global_actor_in_debug) - - log.debug('Root polling for debug') - - with trio.CancelScope(shield=True): - await trio.sleep(poll_delay) - - # TODO: could this make things more deterministic? wait - # to see if a sub-actor task will be scheduled and grab - # the tty lock on the next tick? - # XXX: doesn't seem to work + if sub_in_debug := Lock.global_actor_in_debug: + log.pdb( + f'Lock in use by {sub_in_debug}' + ) + # TODO: could this make things more deterministic? + # wait to see if a sub-actor task will be + # scheduled and grab the tty lock on the next + # tick? + # XXX => but it doesn't seem to work.. # await trio.testing.wait_all_tasks_blocked(cushion=0) - debug_complete = Lock.no_remote_has_tty - if ( - debug_complete - and sub_in_debug is not None - and not debug_complete.is_set() - ): - log.pdb( - 'Root has errored but pdb is in use by ' - f'child {sub_in_debug}\n' - 'Waiting on tty lock to release..' 
- ) + debug_complete: trio.Event|None = Lock.no_remote_has_tty + if ( + debug_complete + and not debug_complete.is_set() + and sub_in_debug is not None + ): + log.pdb( + 'Root has errored but pdb is in use by child\n' + 'Waiting on tty lock to release..\n' + f'uid: {sub_in_debug}\n' + ) + await debug_complete.wait() + log.pdb( + f'Child subactor released debug lock!\n' + f'uid: {sub_in_debug}\n' + ) + if debug_complete.is_set(): + break - await debug_complete.wait() + # is no subactor locking debugger currently? + elif ( + debug_complete is None + or sub_in_debug is None + ): + log.pdb( + 'Root acquired debug TTY LOCK from child\n' + f'uid: {sub_in_debug}' + ) + break - await trio.sleep(poll_delay) - continue + else: + # TODO: don't need this right? + # await trio.lowlevel.checkpoint() + + log.debug( + 'Root polling for debug:\n' + f'poll step: {istep}\n' + f'poll delya: {poll_delay}' + ) + with trio.CancelScope(shield=True): + await trio.sleep(poll_delay) + continue else: - log.debug( - 'Root acquired TTY LOCK' - ) + log.pdb('Root acquired debug TTY LOCK') + # else: + # # TODO: non-root call for #320? + # this_uid: tuple[str, str] = current_actor().uid + # async with acquire_debug_lock( + # subactor_uid=this_uid, + # ): + # pass # TODO: better naming and what additionals? # - [ ] optional runtime plugging? -- 2.34.1 From 20a089c3311ed690e90a468637ec7a016f7e5172 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:58:11 -0500 Subject: [PATCH 112/378] Drop extra " " when logging actor nursery errors --- tractor/_supervise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 7319b15b..af83aa5b 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -400,7 +400,7 @@ async def _open_and_supervise_one_cancels_all_nursery( else: log.exception( f"Nursery for {current_actor().uid} " - "errored with\n" + "errored with:" # TODO: same thing as in # `._invoke()` to compute how to -- 2.34.1 From 621b252b0ca3cf6d17d53940d30b46354e3081d3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 20 Feb 2024 15:59:55 -0500 Subject: [PATCH 113/378] Use `NamespacePath` in `Context` mgmt internals The only case where we can't is in `Portal.run_from_ns()` usage (since we pass a path with `self:`) and because `.to_tuple()` internally uses `.load_ref()` which will of course fail on such a path.. So or now impl as, - mk `Actor.start_remote_task()` take a `nsf: NamespacePath` but also offer a `load_nsf: bool = False` such that by default we bypass ref loading (maybe this is fine for perf long run as well?) for the `Actor`/'self:'` case mentioned above. - mk `.get_context()` take an instance `nsf` obvi. More logging msg format tweaks: - change msg-flow related content to show the `Context._nsf`, which, right, is coming follow up commit.. - bunch more `.runtime()` format updates to show `msg: dict` contents and internal primitives with trailing `'\n'` for easier reading. - report import loading `stackscope` in subactors. 
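For reference, the two resolution paths described above (raw `'self:'`-style
splitting vs. actually importing the target as `.load_ref()` does) can be
sketched with just the stdlib; the example values below are purely
illustrative:

    from pkgutil import resolve_name

    def nsf_to_tuple(nsf: str, load_nsf: bool = False) -> tuple[str, str]:
        # the `'self:'` (Actor method) case can't be import-resolved,
        # so just split the raw string, i.e. what `load_nsf=False` keeps.
        if 'self' in nsf or not load_nsf:
            ns, _, func = nsf.partition(':')
            return ns, func

        # otherwise resolve the reference, as `NamespacePath.load_ref()` does.
        ref = resolve_name(nsf)
        return ref.__module__, ref.__name__

    assert nsf_to_tuple('self:cancel') == ('self', 'cancel')
    assert nsf_to_tuple('math:sqrt', load_nsf=True) == ('math', 'sqrt')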
--- tractor/_runtime.py | 175 +++++++++++++++++++++++++++++++------------- 1 file changed, 124 insertions(+), 51 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 4e7f9fac..c41f6f5b 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -48,15 +48,12 @@ import trio from trio import ( CancelScope, ) -from trio.lowlevel import ( - current_task, - Task, -) from trio_typing import ( Nursery, TaskStatus, ) +from .msg import NamespacePath from ._ipc import Channel from ._context import ( mk_context, @@ -145,8 +142,9 @@ async def _invoke( cs: CancelScope | None = None ctx = actor.get_context( - chan, - cid, + chan=chan, + cid=cid, + nsf=NamespacePath.from_ref(func), # We shouldn't ever need to pass this through right? # it's up to the soon-to-be called rpc task to # open the stream with this option. @@ -276,8 +274,8 @@ async def _invoke( # TODO: should would be nice to have our # `TaskMngr` nursery here! - # res: Any = await coro - res = await coro + res: Any = await coro + ctx._result = res # deliver final result to caller side. await chan.send({ @@ -315,11 +313,13 @@ async def _invoke( # associated child isn't in debug any more await maybe_wait_for_debugger() ctx: Context = actor._contexts.pop((chan.uid, cid)) - log.cancel( - f'Context task was terminated:\n' - f'func: {func}\n' - f'ctx: {pformat(ctx)}' + res_msg: str = ( + 'IPC context terminated with result:\n' + f'result={ctx._result}\n' + f'error={ctx._local_error}\n' + f'|_{pformat(ctx)}\n\n' ) + log.cancel(res_msg) if ctx.cancelled_caught: @@ -331,7 +331,6 @@ async def _invoke( ctx._maybe_raise_remote_err(re) # fname: str = func.__name__ - task: Task = current_task() cs: CancelScope = ctx._scope if cs.cancel_called: our_uid: tuple = actor.uid @@ -378,16 +377,16 @@ async def _invoke( div_str + f'<= canceller: {canceller}\n' f'=> uid: {our_uid}\n' - f' |_ task: `{task.name}()`' + f' |_{ctx._task}()\n' ) # TODO: does this ever get set any more or can # we remove it? if ctx._cancel_msg: msg += ( - '------ - ------\n' - 'IPC msg:\n' - f'{ctx._cancel_msg}' + # '------ - ------\n' + # 'IPC msg:\n' + f'\n{ctx._cancel_msg}' ) # task-contex was either cancelled by request using @@ -435,7 +434,12 @@ async def _invoke( task_status.started(ctx) result = await coro fname: str = func.__name__ - log.runtime(f'{fname}() result: {result}') + log.runtime( + 'RPC complete:\n' + f'task: {ctx._task}\n' + f'|_cid={ctx.cid}\n' + f'|_{fname}() -> {pformat(result)}\n' + ) # NOTE: only send result if we know IPC isn't down if ( @@ -965,7 +969,7 @@ class Actor: # and bail after timeout (2-generals on closure). assert chan.msgstream - log.runtime( + log.warning( f'Draining lingering msgs from stream {chan.msgstream}' ) @@ -977,13 +981,24 @@ class Actor: # making sure any RPC response to that call is # delivered the local calling task. # TODO: factor this into a helper? 
- log.runtime(f'drained {msg} for {chan.uid}') + log.warning( + 'Draining msg from disconnected\n' + f'peer: {chan.uid}]\n\n' + f'{pformat(msg)}\n' + ) cid = msg.get('cid') if cid: # deliver response to local caller/waiter - await self._push_result(chan, cid, msg) + await self._push_result( + chan, + cid, + msg, + ) - log.runtime('Waiting on actor nursery to exit..') + log.runtime( + 'Waiting on local actor nursery to exit..\n' + f'|_{local_nursery}\n' + ) await local_nursery.exited.wait() if disconnected: @@ -1167,6 +1182,7 @@ class Actor: self, chan: Channel, cid: str, + nsf: NamespacePath, msg_buffer_size: int | None = None, allow_overruns: bool = False, @@ -1180,11 +1196,15 @@ class Actor: task-as-function invocation. ''' - log.runtime(f"Getting result queue for {chan.uid} cid {cid}") actor_uid = chan.uid assert actor_uid try: ctx = self._contexts[(actor_uid, cid)] + log.runtime( + f'Retreived cached IPC ctx for\n' + f'peer: {chan.uid}\n' + f'cid:{cid}\n' + ) ctx._allow_overruns = allow_overruns # adjust buffer size if specified @@ -1193,9 +1213,15 @@ class Actor: state.max_buffer_size = msg_buffer_size except KeyError: + log.runtime( + f'Creating NEW IPC ctx for\n' + f'peer: {chan.uid}\n' + f'cid: {cid}\n' + ) ctx = mk_context( chan, cid, + nsf=nsf, msg_buffer_size=msg_buffer_size or self.msg_buffer_size, _allow_overruns=allow_overruns, ) @@ -1206,11 +1232,13 @@ class Actor: async def start_remote_task( self, chan: Channel, - ns: str, - func: str, + nsf: NamespacePath, kwargs: dict, + + # IPC channel config msg_buffer_size: int | None = None, allow_overruns: bool = False, + load_nsf: bool = False, ) -> Context: ''' @@ -1225,20 +1253,43 @@ class Actor: cid = str(uuid.uuid4()) assert chan.uid ctx = self.get_context( - chan, - cid, + chan=chan, + cid=cid, + nsf=nsf, msg_buffer_size=msg_buffer_size, allow_overruns=allow_overruns, ) - log.runtime(f"Sending cmd to {chan.uid}: {ns}.{func}({kwargs})") + + if ( + 'self' in nsf + or not load_nsf + ): + ns, _, func = nsf.partition(':') + else: + # TODO: pass nsf directly over wire! + # -[ ] but, how to do `self:`?? + ns, func = nsf.to_tuple() + + log.runtime( + 'Sending cmd to\n' + f'peer: {chan.uid} => \n' + '\n' + f'=> {ns}.{func}({kwargs})\n' + ) await chan.send( - {'cmd': (ns, func, kwargs, self.uid, cid)} + {'cmd': ( + ns, + func, + kwargs, + self.uid, + cid, + )} ) # Wait on first response msg and validate; this should be # immediate. - first_msg = await ctx._recv_chan.receive() - functype = first_msg.get('functype') + first_msg: dict = await ctx._recv_chan.receive() + functype: str = first_msg.get('functype') if 'error' in first_msg: raise unpack_error(first_msg, chan) @@ -1280,14 +1331,19 @@ class Actor: parent_data: dict[str, Any] parent_data = await chan.recv() log.runtime( - "Received state from parent:\n" - f"{parent_data}" + 'Received state from parent:\n\n' + # TODO: eventually all these msgs as + # `msgspec.Struct` with a special mode that + # pformats them in multi-line mode, BUT only + # if "trace"/"util" mode is enabled? 
+ f'{pformat(parent_data)}\n' ) accept_addrs: list[tuple[str, int]] = parent_data.pop('bind_addrs') rvs = parent_data.pop('_runtime_vars') if rvs['_debug_mode']: try: + log.info('Enabling `stackscope` traces on SIGUSR1') from .devx import enable_stack_on_sig enable_stack_on_sig() except ImportError: @@ -1368,7 +1424,8 @@ class Actor: for listener in listeners ] log.runtime( - f'Started tcp server(s) on {sockets}' + 'Started TCP server(s)\n' + f'|_{sockets}\n' ) self._listeners.extend(listeners) @@ -1923,7 +1980,7 @@ async def process_messages( log.runtime( 'Entering IPC msg loop:\n' f'peer: {chan.uid}\n' - f'|_{chan}' + f'|_{chan}\n' ) nursery_cancelled_before_task: bool = False msg: dict | None = None @@ -1969,12 +2026,17 @@ async def process_messages( if cid: # deliver response to local caller/waiter # via its per-remote-context memory channel. - await actor._push_result(chan, cid, msg) + await actor._push_result( + chan, + cid, + msg, + ) log.runtime( - f'Waiting on next IPC msg from {chan.uid}:\n' + 'Waiting on next IPC msg from\n' + f'peer: {chan.uid}:\n' + f'|_{chan}\n' # f'last msg: {msg}\n' - f'|_{chan}' ) continue @@ -1994,9 +2056,11 @@ async def process_messages( raise exc log.runtime( - f"Processing request from {actorid}\n" - f"{ns}.{funcname}({kwargs})") - + 'Handling RPC cmd from\n' + f'peer: {actorid}\n' + '\n' + f'=> {ns}.{funcname}({kwargs})\n' + ) if ns == 'self': if funcname == 'cancel': func: Callable = actor.cancel @@ -2105,17 +2169,18 @@ async def process_messages( # in the lone case where a ``Context`` is not # delivered, it's likely going to be a locally # scoped exception from ``_invoke()`` itself. - if isinstance(ctx, Exception): + if isinstance(err := ctx, Exception): log.warning( - f"Task for RPC func {func} failed with" - f"{ctx}" + 'Task for RPC failed?' + f'|_ {func}()\n\n' + + f'{err}' ) continue else: # mark that we have ongoing rpc tasks actor._ongoing_rpc_tasks = trio.Event() - log.runtime(f"RPC func is {func}") # store cancel scope such that the rpc task can be # cancelled gracefully if requested @@ -2126,7 +2191,10 @@ async def process_messages( ) log.runtime( - f"Waiting on next msg for {chan} from {chan.uid}") + 'Waiting on next IPC msg from\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n' + ) # end of async for, channel disconnect vis # ``trio.EndOfChannel`` @@ -2143,9 +2211,12 @@ async def process_messages( # handshake for them (yet) and instead we simply bail out of # the message loop and expect the teardown sequence to clean # up. + # TODO: don't show this msg if it's an emphemeral + # discovery ep call? 
log.runtime( - f'channel from {chan.uid} closed abruptly:\n' - f'-> {chan.raddr}\n' + f'channel closed abruptly with\n' + f'peer: {chan.uid}\n' + f'|_{chan.raddr}\n' ) # transport **was** disconnected @@ -2187,9 +2258,11 @@ async def process_messages( finally: # msg debugging for when he machinery is brokey log.runtime( - f'Exiting IPC msg loop with {chan.uid} ' - f'final msg: {msg}\n' - f'|_{chan}' + 'Exiting IPC msg loop with\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n\n' + 'final msg:\n' + f'{pformat(msg)}\n' ) # transport **was not** disconnected -- 2.34.1 From 82dcaff8db5c3d303fa86f50131c34d8497f022b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 21 Feb 2024 13:05:22 -0500 Subject: [PATCH 114/378] Better logging for cancel requests in IPC msg loop As similarly improved in other parts of the runtime, adds much more pedantic (`.cancel()`) logging content to indicate the src of remote cancellation request particularly for `Actor.cancel()` and `._cancel_task()` cases prior to `._invoke()` task scheduling. Also add detailed case comments and much more info to the "request-to-cancel-already-terminated-RPC-task" log emission to include the `Channel` and `Context.cid` deats. This helped me find the src of a race condition causing a test to fail where a callee ctx task was returning a result *before* an expected `ctx.cancel()` request arrived B). Adding much more pedantic `.cancel()` msg contents around the requester's deats should ensure these cases are much easier to detect going forward! Also, simplify the `._invoke()` final result/error log msg to only put *one of either* the final error or returned result above the `Context` pprint. --- tractor/_runtime.py | 79 ++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index c41f6f5b..d127d9d3 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -312,14 +312,19 @@ async def _invoke( # don't pop the local context until we know the # associated child isn't in debug any more await maybe_wait_for_debugger() - ctx: Context = actor._contexts.pop((chan.uid, cid)) - res_msg: str = ( - 'IPC context terminated with result:\n' - f'result={ctx._result}\n' - f'error={ctx._local_error}\n' - f'|_{pformat(ctx)}\n\n' + ctx: Context = actor._contexts.pop( + (chan.uid, cid) + ) + + res_str: str = ( + 'error: {ctx._local_error}' + if ctx._local_error + else f'result: {ctx._result}' + ) + log.cancel( + f'IPC context terminated with final {res_str}\n' + f'|_{pformat(ctx)}\n' ) - log.cancel(res_msg) if ctx.cancelled_caught: @@ -1537,8 +1542,20 @@ class Actor: # be cancelled was indeed spawned by a request from this channel ctx, func, is_complete = self._rpc_tasks[(chan, cid)] scope: CancelScope = ctx._scope + except KeyError: - log.cancel(f"{cid} has already completed/terminated?") + # NOTE: during msging race conditions this will often + # emit, some examples: + # - callee returns a result before cancel-msg/ctxc-raised + # - callee self raises ctxc before caller send request, + # - callee errors prior to cancel req. + log.cancel( + 'Cancel request invalid, RPC task already completed?\n' + f'<= canceller: {requesting_uid}\n' + f' |_{chan}\n\n' + + f'=> ctx id: {cid}\n' + ) return True log.cancel( @@ -2017,8 +2034,10 @@ async def process_messages( log.transport( # type: ignore f'<= IPC msg from peer: {chan.uid}\n\n' + # TODO: conditionally avoid fmting depending # on log level (for perf)? + # => specifically `pformat()` sub-call..? 
f'{pformat(msg)}\n' ) @@ -2036,14 +2055,25 @@ async def process_messages( 'Waiting on next IPC msg from\n' f'peer: {chan.uid}:\n' f'|_{chan}\n' + # f'last msg: {msg}\n' ) continue - # TODO: implement with ``match:`` syntax? - # process command request + # process a 'cmd' request-msg upack + # TODO: impl with native `msgspec.Struct` support !! + # -[ ] implement with ``match:`` syntax? + # -[ ] discard un-authed msgs as per, + # try: - ns, funcname, kwargs, actorid, cid = msg['cmd'] + ( + ns, + funcname, + kwargs, + actorid, + cid, + ) = msg['cmd'] + except KeyError: # This is the non-rpc error case, that is, an # error **not** raised inside a call to ``_invoke()`` @@ -2062,25 +2092,27 @@ async def process_messages( f'=> {ns}.{funcname}({kwargs})\n' ) if ns == 'self': + uid: tuple = chan.uid if funcname == 'cancel': func: Callable = actor.cancel - kwargs['requesting_uid'] = chan.uid + kwargs['requesting_uid'] = uid # don't start entire actor runtime cancellation # if this actor is currently in debug mode! - pdb_complete: trio.Event | None = _debug.Lock.local_pdb_complete + pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete if pdb_complete: await pdb_complete.wait() - # we immediately start the runtime machinery - # shutdown + # Either of `Actor.cancel()`/`.cancel_soon()` + # was called, so terminate this IPC msg + # loop, exit back out into `async_main()`, + # and immediately start the core runtime + # machinery shutdown! with CancelScope(shield=True): - # actor.cancel() was called so kill this - # msg loop and break out into - # ``async_main()`` log.cancel( - "Actor runtime for was remotely cancelled " - f"by {chan.uid}" + f'Cancel request for `Actor` runtime\n' + f'<= canceller: {uid}\n' + # f'=> uid: {actor.uid}\n' ) await _invoke( actor, @@ -2107,9 +2139,10 @@ async def process_messages( target_cid = kwargs['cid'] kwargs['requesting_uid'] = chan.uid log.cancel( - f'Remote request to cancel task\n' - f'remote actor: {chan.uid}\n' - f'task: {target_cid}' + f'Rx task cancel request\n' + f'<= canceller: {chan.uid}\n' + f'=> uid: {actor.uid}\n' + f' |_cid: {target_cid}\n' ) try: await _invoke( -- 2.34.1 From 10adf34be582a92f55587259845b09a5054f2265 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 21 Feb 2024 13:17:37 -0500 Subject: [PATCH 115/378] Set any `._eoc` to the err in `_raise_from_no_key_in_msg()` Since that's what we're now doing in `MsgStream._eoc` internal assignments (coming in future patch), do the same in this exception re-raise-helper and include more extensive doc string detailing all the msg-type-to-raised-error cases. Also expose a `hide_tb: bool` like we have already in `unpack_error()`. --- tractor/_exceptions.py | 63 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index a6d10de7..bdd8d411 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -237,8 +237,10 @@ def pack_error( def unpack_error( msg: dict[str, Any], + chan=None, err_type=RemoteActorError, + hide_tb: bool = True, ) -> None|Exception: @@ -314,37 +316,61 @@ def _raise_from_no_key_in_msg( msg: dict, src_err: KeyError, log: StackLevelAdapter, # caller specific `log` obj + expect_key: str = 'yield', stream: MsgStream | None = None, + # allow "deeper" tbs when debugging B^o + hide_tb: bool = True, + ) -> bool: ''' - Raise an appopriate local error when a `MsgStream` msg arrives - which does not contain the expected (under normal operation) - `'yield'` field. 
+ Raise an appopriate local error when a + `MsgStream` msg arrives which does not + contain the expected (at least under normal + operation) `'yield'` field. + + `Context` and any embedded `MsgStream` termination, + as well as remote task errors are handled in order + of priority as: + + - any 'error' msg is re-boxed and raised locally as + -> `RemoteActorError`|`ContextCancelled` + + - a `MsgStream` 'stop' msg is constructed, assigned + and raised locally as -> `trio.EndOfChannel` + + - All other mis-keyed msgss (like say a "final result" + 'return' msg, normally delivered from `Context.result()`) + are re-boxed inside a `MessagingError` with an explicit + exc content describing the missing IPC-msg-key. ''' - __tracebackhide__: bool = True + __tracebackhide__: bool = hide_tb - # internal error should never get here + # an internal error should never get here try: cid: str = msg['cid'] except KeyError as src_err: raise MessagingError( f'IPC `Context` rx-ed msg without a ctx-id (cid)!?\n' - f'cid: {cid}\n' - 'received msg:\n' + f'cid: {cid}\n\n' + f'{pformat(msg)}\n' ) from src_err # TODO: test that shows stream raising an expected error!!! + + # raise the error message in a boxed exception type! if msg.get('error'): - # raise the error message raise unpack_error( msg, ctx.chan, + hide_tb=hide_tb, + ) from None + # `MsgStream` termination msg. elif ( msg.get('stop') or ( @@ -357,29 +383,26 @@ def _raise_from_no_key_in_msg( f'cid: {cid}\n' ) - # XXX: important to set so that a new ``.receive()`` - # call (likely by another task using a broadcast receiver) - # doesn't accidentally pull the ``return`` message - # value out of the underlying feed mem chan! - stream._eoc: bool = True - # TODO: if the a local task is already blocking on # a `Context.result()` and thus a `.receive()` on the # rx-chan, we close the chan and set state ensuring that # an eoc is raised! - # # when the send is closed we assume the stream has - # # terminated and signal this local iterator to stop - # await stream.aclose() - # XXX: this causes ``ReceiveChannel.__anext__()`` to # raise a ``StopAsyncIteration`` **and** in our catch # block below it will trigger ``.aclose()``. - raise trio.EndOfChannel( + eoc = trio.EndOfChannel( f'Context stream ended due to msg:\n\n' f'{pformat(msg)}\n' - ) from src_err + ) + # XXX: important to set so that a new `.receive()` + # call (likely by another task using a broadcast receiver) + # doesn't accidentally pull the `return` message + # value out of the underlying feed mem chan which is + # destined for the `Context.result()` call during ctx-exit! + stream._eoc: Exception = eoc + raise eoc from src_err if ( stream -- 2.34.1 From 28ba5e5435cded0ae8112273344a79447ee82661 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 21 Feb 2024 13:21:28 -0500 Subject: [PATCH 116/378] Add `pformat()` of `ActorNursery._children` to logging Such that you see the children entries prior to exit instead of the prior somewhat detail/use-less logging. Also, rename all `anursery` vars to just `an` as is the convention in most examples. 
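The `error`/`stop`/missing-key priority documented in the previous patch's
`_raise_from_no_key_in_msg()` boils down to a short dispatch; the sketch below
uses stand-in exception types instead of the real
`unpack_error()`/`RemoteActorError` machinery:

    from pprint import pformat
    import trio

    class MessagingError(Exception):
        'stand-in for the real tractor exception type'

    def classify_msg(msg: dict, expect_key: str = 'yield'):
        # happy path: the expected field is present.
        if expect_key in msg:
            return msg[expect_key]

        # remote task error -> re-raise locally
        # (the real code re-boxes via `unpack_error()`).
        if msg.get('error'):
            raise RuntimeError(msg['error'])

        # stream termination -> end-of-channel.
        if msg.get('stop'):
            raise trio.EndOfChannel(
                f'stream ended due to msg:\n{pformat(msg)}'
            )

        # any other mis-keyed msg is a protocol-level problem.
        raise MessagingError(
            f'expected {expect_key!r} in msg:\n{pformat(msg)}'
        )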
--- tractor/_supervise.py | 63 ++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index af83aa5b..86a317d6 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -21,6 +21,7 @@ from contextlib import asynccontextmanager as acm from functools import partial import inspect +from pprint import pformat from typing import TYPE_CHECKING import typing import warnings @@ -189,14 +190,16 @@ class ActorNursery: **kwargs, # explicit args to ``fn`` ) -> Portal: - """Spawn a new actor, run a lone task, then terminate the actor and + ''' + Spawn a new actor, run a lone task, then terminate the actor and return its result. Actors spawned using this method are kept alive at nursery teardown until the task spawned by executing ``fn`` completes at which point the actor is terminated. - """ - mod_path = fn.__module__ + + ''' + mod_path: str = fn.__module__ if name is None: # use the explicit function name if not provided @@ -231,7 +234,11 @@ class ActorNursery: ) return portal - async def cancel(self, hard_kill: bool = False) -> None: + async def cancel( + self, + hard_kill: bool = False, + + ) -> None: ''' Cancel this nursery by instructing each subactor to cancel itself and wait for all subactors to terminate. @@ -242,10 +249,12 @@ class ActorNursery: ''' self.cancelled = True - log.cancel(f"Cancelling nursery in {self._actor.uid}") + log.cancel( + 'Cancelling actor nursery\n' + f'|_{self._children}\n' + ) with trio.move_on_after(3) as cs: - - async with trio.open_nursery() as nursery: + async with trio.open_nursery() as tn: subactor: Actor proc: trio.Process @@ -288,7 +297,7 @@ class ActorNursery: # spawn cancel tasks for each sub-actor assert portal if portal.channel.connected(): - nursery.start_soon(portal.cancel_actor) + tn.start_soon(portal.cancel_actor) # if we cancelled the cancel (we hung cancelling remote actors) # then hard kill all sub-processes @@ -343,7 +352,7 @@ async def _open_and_supervise_one_cancels_all_nursery( # the above "daemon actor" nursery will be notified. async with trio.open_nursery() as ria_nursery: - anursery = ActorNursery( + an = ActorNursery( actor, ria_nursery, da_nursery, @@ -352,16 +361,16 @@ async def _open_and_supervise_one_cancels_all_nursery( try: # spawning of actors happens in the caller's scope # after we yield upwards - yield anursery + yield an # When we didn't error in the caller's scope, # signal all process-monitor-tasks to conduct # the "hard join phase". log.runtime( - f"Waiting on subactors {anursery._children} " - "to complete" + 'Waiting on subactors to complete:\n' + f'{pformat(an._children)}\n' ) - anursery._join_procs.set() + an._join_procs.set() except BaseException as inner_err: errors[actor.uid] = inner_err @@ -373,13 +382,13 @@ async def _open_and_supervise_one_cancels_all_nursery( # Instead try to wait for pdb to be released before # tearing down. await maybe_wait_for_debugger( - child_in_debug=anursery._at_least_one_child_in_debug + child_in_debug=an._at_least_one_child_in_debug ) # if the caller's scope errored then we activate our # one-cancels-all supervisor strategy (don't # worry more are coming). 
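# NB: a condensed sketch of the graceful-cancel-then-hard-kill shape used
# by `ActorNursery.cancel()` above; `portals` and `hard_kill()` are
# placeholders here, not actual `tractor` API.
import trio

async def cancel_subactors(portals, hard_kill, timeout: float = 3):
    # graceful phase: ask each subactor to cancel itself,
    # bounded by an overall timeout.
    with trio.move_on_after(timeout) as cs:
        async with trio.open_nursery() as tn:
            for portal in portals:
                tn.start_soon(portal.cancel_actor)

    # if we "cancelled the cancel" (hung waiting on remote actors)
    # fall back to hard killing the sub-processes.
    if cs.cancelled_caught:
        await hard_kill()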
- anursery._join_procs.set() + an._join_procs.set() # XXX: hypothetically an error could be # raised and then a cancel signal shows up @@ -413,7 +422,7 @@ async def _open_and_supervise_one_cancels_all_nursery( ) # cancel all subactors - await anursery.cancel() + await an.cancel() # ria_nursery scope end @@ -434,7 +443,7 @@ async def _open_and_supervise_one_cancels_all_nursery( # XXX: yet another guard before allowing the cancel # sequence in case a (single) child is in debug. await maybe_wait_for_debugger( - child_in_debug=anursery._at_least_one_child_in_debug + child_in_debug=an._at_least_one_child_in_debug ) # If actor-local error was raised while waiting on @@ -442,9 +451,9 @@ async def _open_and_supervise_one_cancels_all_nursery( # remaining sub-actors (due to our lone strategy: # one-cancels-all). log.cancel(f"Nursery cancelling due to {err}") - if anursery._children: + if an._children: with trio.CancelScope(shield=True): - await anursery.cancel() + await an.cancel() raise finally: # No errors were raised while awaiting ".run_in_actor()" @@ -454,9 +463,9 @@ async def _open_and_supervise_one_cancels_all_nursery( # collected in ``errors`` so cancel all actors, summarize # all errors and re-raise. if errors: - if anursery._children: + if an._children: with trio.CancelScope(shield=True): - await anursery.cancel() + await an.cancel() # use `BaseExceptionGroup` as needed if len(errors) > 1: @@ -511,20 +520,20 @@ async def open_nursery( try: async with _open_and_supervise_one_cancels_all_nursery( actor - ) as anursery: - yield anursery + ) as an: + yield an finally: - anursery.exited.set() + an.exited.set() else: # sub-nursery case try: async with _open_and_supervise_one_cancels_all_nursery( actor - ) as anursery: - yield anursery + ) as an: + yield an finally: - anursery.exited.set() + an.exited.set() finally: log.debug("Nursery teardown complete") -- 2.34.1 From ce1bcf6d36bf92989ab91811f95e21029ca64eb8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 21 Feb 2024 13:24:33 -0500 Subject: [PATCH 117/378] Fix overruns test to avoid return-beats-ctxc race Turns out that py3.11 might be so fast that iterating a EoC-ed `MsgStream` 1k times is faster then a `Context.cancel()` msg transmission from a parent actor to it's child (which i guess makes sense). So tweak the test to delay 5ms between stream async-for iteration attempts when the stream is detected to be `.closed: bool` (coming in patch) or `ctx.cancel_called == true`. --- tests/test_context_stream_semantics.py | 44 +++++++++++++++++++------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 4eb06e8a..e0ffa874 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -8,6 +8,7 @@ sync-opening a ``tractor.Context`` beforehand. # from contextlib import asynccontextmanager as acm from itertools import count import platform +from pprint import pformat from typing import ( Callable, ) @@ -815,7 +816,10 @@ async def echo_back_sequence( # NOTE: ensure that if the caller is expecting to cancel this task # that we stay echoing much longer then they are so we don't # return early instead of receive the cancel msg. 
- total_batches: int = 1000 if wait_for_cancel else 6 + total_batches: int = ( + 1000 if wait_for_cancel + else 6 + ) await ctx.started() # await tractor.breakpoint() @@ -834,8 +838,23 @@ async def echo_back_sequence( ) seq = list(seq) # bleh, msgpack sometimes ain't decoded right - for _ in range(total_batches): + for i in range(total_batches): + print(f'starting new stream batch {i} iter in child') batch = [] + + # EoC case, delay a little instead of hot + # iter-stopping (since apparently py3.11+ can do that + # faster then a ctxc can be sent) on the async for + # loop when child was requested to ctxc. + if ( + stream.closed + or + ctx.cancel_called + ): + print('child stream already closed!?!') + await trio.sleep(0.05) + continue + async for msg in stream: batch.append(msg) if batch == seq: @@ -846,15 +865,18 @@ async def echo_back_sequence( print('callee waiting on next') + print(f'callee echoing back latest batch\n{batch}') for msg in batch: - print(f'callee sending {msg}') + print(f'callee sending msg\n{msg}') await stream.send(msg) - print( - 'EXITING CALLEEE:\n' - f'{ctx.canceller}' - ) - return 'yo' + try: + return 'yo' + finally: + print( + 'exiting callee with context:\n' + f'{pformat(ctx)}\n' + ) @pytest.mark.parametrize( @@ -916,8 +938,8 @@ def test_maybe_allow_overruns_stream( wait_for_cancel=cancel_ctx, be_slow=(slow_side == 'child'), allow_overruns_side=allow_overruns_side, - ) as (ctx, sent): + ) as (ctx, sent): assert sent is None async with ctx.open_stream( @@ -945,10 +967,10 @@ def test_maybe_allow_overruns_stream( if cancel_ctx: # cancel the remote task - print('sending root side cancel') + print('Requesting `ctx.cancel()` in parent!') await ctx.cancel() - res = await ctx.result() + res: str|ContextCancelled = await ctx.result() if cancel_ctx: assert isinstance(res, ContextCancelled) -- 2.34.1 From 5a09ccf459d3fd2109c42be8df1dce1c8bd739eb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 13:42:48 -0500 Subject: [PATCH 118/378] Tweak `Actor` cancel method signatures Besides improving a bunch more log msg contents similarly as before this changes the cancel method signatures slightly with different arg names: for `.cancel()`: - instead of `requesting_uid: str` take in a `req_chan: Channel` since we can always just read its `.uid: tuple` for logging and further we can then offer the `chan=None` case indicating a "self cancel" (since there's no "requesting channel"). - the semantics of "requesting" here better indicate that the IPC connection is an IPC peer and further (eventually) will allow permission checking against given peers for cancellation requests. - when `chan==None` we also define a meth-internal `requester_type: str` differently for logging content :) - add much more detailed `.cancel()` content around the requester, its type, and any debugger related locking steps. for `._cancel_task()`: - change the `chan` arg to `parent_chan: Channel` since "parent" correctly indicates that the channel is the parent of the locally spawned rpc task to cancel; in fact no other chan should be able to cancel tasks parented/spawned by other channels obvi! - also add more extensive meth-internal `.cancel()` logging with a #TODO around showing only the "relevant/lasest" `Context` state vars in such logging content. for `.cancel_rpc_tasks()`: - shorten `requesting_uid` -> `req_uid`. 
- add `parent_chan: Channel` to be similar as above in `._cancel_task()` (since it's internally delegated to anyway) which replaces the prior `only_chan` and use it to filter to only tasks spawned by this channel (thus as their "parent") as before. - instead of `if tasks:` to enter, invert and `return` early on `if not tasks`, for less indentation B) - add WIP str-repr format (for `.cancel()` emissions) to show a multi-address (maddr) + task func (via the new `Context._nsf`) and report all cancel task targets with it a "tree"; include #TODO to finalize and implement some utils for all this! To match ensure we adjust `process_messages()` self/`Actor` cancel handling blocks to provide the new `kwargs` (now with `dict`-merge syntax) to `._invoke()`. --- tractor/_runtime.py | 328 +++++++++++++++++++++++++++++++------------- 1 file changed, 231 insertions(+), 97 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index d127d9d3..516c2900 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -322,7 +322,7 @@ async def _invoke( else f'result: {ctx._result}' ) log.cancel( - f'IPC context terminated with final {res_str}\n' + f'IPC context terminated with final {res_str}\n\n' f'|_{pformat(ctx)}\n' ) @@ -1022,14 +1022,14 @@ class Actor: and poll() is None ): log.cancel( - f'Peer actor IPC broke but proc is alive?\n' - f'uid: {uid}\n' - f'|_{proc}\n' + f'Peer IPC broke but subproc is alive?\n\n' + + f'<=x @{chan.raddr}\n' + f' |_{proc}\n' ) # ``Channel`` teardown and closure sequence - - # Drop ref to channel so it can be gc-ed and disconnected + # drop ref to channel so it can be gc-ed and disconnected log.runtime( f'Disconnected IPC channel:\n' f'uid: {chan.uid}\n' @@ -1177,8 +1177,12 @@ class Actor: ctx: Context = self._contexts[(uid, cid)] except KeyError: log.warning( - f'Ignoring msg from [no-longer/un]known context {uid}:' - f'\n{msg}') + 'Ignoring invalid IPC ctx msg!\n\n' + f'<= sender: {uid}\n' + f'=> cid: {cid}\n\n' + + f'{msg}\n' + ) return return await ctx._deliver_msg(msg) @@ -1381,9 +1385,12 @@ class Actor: except OSError: # failed to connect log.warning( - f"Failed to connect to parent @ {parent_addr}," - " closing server") - await self.cancel(requesting_uid=self.uid) + f'Failed to connect to parent!?\n\n' + 'Closing IPC [TCP] transport server to\n' + f'{parent_addr}\n' + f'|_{self}\n\n' + ) + await self.cancel(chan=None) # self cancel raise async def _serve_forever( @@ -1451,29 +1458,53 @@ class Actor: assert self._service_n self._service_n.start_soon( self.cancel, - self.uid, + None, # self cancel all rpc tasks ) async def cancel( self, - requesting_uid: tuple[str, str], + + # chan whose lifetime limits the lifetime of its remotely + # requested and locally spawned RPC tasks - similar to the + # supervision semantics of a nursery wherein the actual + # implementation does start all such tasks in + # a sub-nursery. + req_chan: Channel|None, ) -> bool: ''' - Cancel this actor's runtime. + Cancel this actor's runtime, eventually resulting in + the exit its containing process. 
- The "deterministic" teardown sequence in order is: - - cancel all ongoing rpc tasks by cancel scope - - cancel the channel server to prevent new inbound - connections - - cancel the "service" nursery reponsible for - spawning new rpc tasks - - return control the parent channel message loop + The ideal "deterministic" teardown sequence in order is: + - cancel all ongoing rpc tasks by cancel scope + - cancel the channel server to prevent new inbound + connections + - cancel the "service" nursery reponsible for + spawning new rpc tasks + - return control the parent channel message loop ''' - log.cancel( - f'{self.uid} requested to cancel by:\n' - f'{requesting_uid}' + ( + requesting_uid, + requester_type, + req_chan, + + ) = ( + req_chan.uid, + 'peer', + req_chan, + + ) if req_chan else ( + + # a self cancel of ALL rpc tasks + self.uid, + 'self', + self + ) + msg: str = ( + f'`Actor.cancel()` request from {requester_type}:\n' + f'<= {requesting_uid}\n' ) # TODO: what happens here when we self-cancel tho? @@ -1487,12 +1518,16 @@ class Actor: # with the root actor in this tree dbcs = _debug.Lock._debugger_request_cs if dbcs is not None: - log.cancel("Cancelling active debugger request") + msg += ( + '>> Cancelling active debugger request..\n' + f'|_{_debug.Lock}\n' + ) dbcs.cancel() - # kill all ongoing tasks + # self-cancel **all** ongoing RPC tasks await self.cancel_rpc_tasks( - requesting_uid=requesting_uid, + req_uid=requesting_uid, + parent_chan=None, ) # stop channel server @@ -1501,13 +1536,14 @@ class Actor: await self._server_down.wait() else: log.warning( - f'{self.uid} was likely cancelled before it started') + 'Transport[TCP] server was cancelled start?' + ) # cancel all rpc tasks permanently if self._service_n: self._service_n.cancel_scope.cancel() - log.cancel(f"{self.uid} called `Actor.cancel()`") + log.cancel(msg) self._cancel_complete.set() return True @@ -1522,7 +1558,7 @@ class Actor: async def _cancel_task( self, cid: str, - chan: Channel, + parent_chan: Channel, requesting_uid: tuple[str, str] | None = None, ) -> bool: @@ -1534,13 +1570,25 @@ class Actor: in the signature (for now). ''' - # right now this is only implicitly called by + # this ctx based lookup ensures the requested task to + # be cancelled was indeed spawned by a request from + # this channel + ctx: Context + func: Callable + is_complete: trio.Event + + # NOTE: right now this is only implicitly called by # streaming IPC but it should be called # to cancel any remotely spawned task try: - # this ctx based lookup ensures the requested task to - # be cancelled was indeed spawned by a request from this channel - ctx, func, is_complete = self._rpc_tasks[(chan, cid)] + ( + ctx, + func, + is_complete, + ) = self._rpc_tasks[( + parent_chan, + cid, + )] scope: CancelScope = ctx._scope except KeyError: @@ -1551,17 +1599,28 @@ class Actor: # - callee errors prior to cancel req. log.cancel( 'Cancel request invalid, RPC task already completed?\n' - f'<= canceller: {requesting_uid}\n' - f' |_{chan}\n\n' - - f'=> ctx id: {cid}\n' + f'<= canceller: {requesting_uid}\n\n' + f'=>{parent_chan}\n' + f' |_ctx-id: {cid}\n' ) return True log.cancel( - f"Cancelling task:\ncid: {cid}\nfunc: {func}\n" - f"peer: {chan.uid}\n") + 'Cancel request for RPC task\n' + f'<= canceller: {requesting_uid}\n\n' + # TODO: better ascii repr for "supervisor" like + # a nursery or context scope? 
+ f'=> ipc-parent: {parent_chan}\n' + # TODO: simplified `Context.__repr__()` fields output + # shows only application state-related stuff like, + # - ._stream + # - .closed + # - .started_called + # - .. etc. + f' |_ctx: {cid}\n' + f' >> {ctx._nsf}()\n' + ) if ( ctx._canceller is None and requesting_uid @@ -1571,6 +1630,7 @@ class Actor: # don't allow cancelling this function mid-execution # (is this necessary?) if func is self._cancel_task: + log.error('Do not cancel a cancel!?') return True # TODO: shouldn't we eventually be calling ``Context.cancel()`` @@ -1580,23 +1640,29 @@ class Actor: scope.cancel() # wait for _invoke to mark the task complete + flow_info: str = ( + f'<= canceller: {requesting_uid}\n' + f'=> ipc-parent: {parent_chan}\n' + f' |_{ctx}\n' + ) log.runtime( - 'Waiting on task to cancel:\n' - f'cid: {cid}\nfunc: {func}\n' - f'peer: {chan.uid}\n' + 'Waiting on RPC task to cancel\n' + f'{flow_info}' ) await is_complete.wait() - log.runtime( - f"Sucessfully cancelled task:\ncid: {cid}\nfunc: {func}\n" - f"peer: {chan.uid}\n") - + f'Sucessfully cancelled RPC task\n' + f'{flow_info}' + ) return True async def cancel_rpc_tasks( self, - only_chan: Channel | None = None, - requesting_uid: tuple[str, str] | None = None, + req_uid: tuple[str, str], + + # NOTE: when None is passed we cancel **all** rpc + # tasks running in this actor! + parent_chan: Channel|None, ) -> None: ''' @@ -1605,38 +1671,76 @@ class Actor: ''' tasks: dict = self._rpc_tasks - if tasks: - tasks_str: str = '' - for (ctx, func, _) in tasks.values(): - tasks_str += ( - f' |_{func.__name__}() [cid={ctx.cid[-6:]}..]\n' - ) - - log.cancel( - f'Cancelling all {len(tasks)} rpc tasks:\n' - f'{tasks_str}' + if not tasks: + log.warning( + 'Actor has no cancellable RPC tasks?\n' + f'<= cancel requester: {req_uid}\n' + f'=> {self}\n\n' ) - for ( - (chan, cid), - (ctx, func, is_complete), - ) in tasks.copy().items(): - if only_chan is not None: - if only_chan != chan: - continue + return - # TODO: this should really done in a nursery batch - if func != self._cancel_task: - await self._cancel_task( - cid, - chan, - requesting_uid=requesting_uid, - ) + # TODO: seriously factor this into some helper funcs XD + tasks_str: str = '' + for (ctx, func, _) in tasks.values(): - log.cancel( - 'Waiting for remaining rpc tasks to complete:\n' - f'{tasks}' + # TODO: std repr of all primitives in + # a hierarchical tree format, since we can!! + # like => repr for funcs/addrs/msg-typing: + # + # -[ ] use a proper utf8 "arm" like + # `stackscope` has! + # -[ ] for typed msging, show the + # py-type-annot style? + # - maybe auto-gen via `inspect` / `typing` type-sig: + # https://stackoverflow.com/a/57110117 + # => see ex. code pasted into `.msg.types` + # + # -[ ] proper .maddr() for IPC primitives? + # - `Channel.maddr() -> str:` obvi! + # - `Context.maddr() -> str:` + tasks_str += ( + f' |_@ /ipv4/tcp/cid="{ctx.cid[-16:]} .."\n' + f' |>> {ctx._nsf}() -> dict:\n' ) - await self._ongoing_rpc_tasks.wait() + + log.cancel( + f'Cancelling all {len(tasks)} rpc tasks:\n\n' + f'<= .cancel() from {req_uid}\n' + f'{self}\n' + f'{tasks_str}' + ) + for ( + (task_caller_chan, cid), + (ctx, func, is_complete), + ) in tasks.copy().items(): + + if ( + # maybe filter to specific IPC channel? 
+ (parent_chan + and + task_caller_chan != parent_chan) + + # never "cancel-a-cancel" XD + or (func == self._cancel_task) + ): + continue + + # if func == self._cancel_task: + # continue + + # TODO: this maybe block on the task cancellation + # and so should really done in a nursery batch? + await self._cancel_task( + cid, + task_caller_chan, + requesting_uid=req_uid, + ) + + log.cancel( + 'Waiting for remaining rpc tasks to complete\n' + f'|_{tasks}' + ) + await self._ongoing_rpc_tasks.wait() def cancel_server(self) -> None: ''' @@ -2092,10 +2196,11 @@ async def process_messages( f'=> {ns}.{funcname}({kwargs})\n' ) if ns == 'self': - uid: tuple = chan.uid if funcname == 'cancel': func: Callable = actor.cancel - kwargs['requesting_uid'] = uid + kwargs |= { + 'req_chan': chan, + } # don't start entire actor runtime cancellation # if this actor is currently in debug mode! @@ -2109,11 +2214,6 @@ async def process_messages( # and immediately start the core runtime # machinery shutdown! with CancelScope(shield=True): - log.cancel( - f'Cancel request for `Actor` runtime\n' - f'<= canceller: {uid}\n' - # f'=> uid: {actor.uid}\n' - ) await _invoke( actor, cid, @@ -2123,25 +2223,32 @@ async def process_messages( is_rpc=False, ) - log.cancel( - f'Cancelling IPC msg-loop with {chan.uid}' + log.runtime( + 'Cancelling IPC transport msg-loop with peer:\n' + f'|_{chan}\n' ) loop_cs.cancel() break if funcname == '_cancel_task': - func = actor._cancel_task + func: Callable = actor._cancel_task # we immediately start the runtime machinery # shutdown # with CancelScope(shield=True): - kwargs['chan'] = chan - target_cid = kwargs['cid'] - kwargs['requesting_uid'] = chan.uid + target_cid: str = kwargs['cid'] + kwargs |= { + # NOTE: ONLY the rpc-task-owning + # parent IPC channel should be able to + # cancel it! + 'parent_chan': chan, + 'requesting_uid': chan.uid, + } log.cancel( f'Rx task cancel request\n' f'<= canceller: {chan.uid}\n' - f'=> uid: {actor.uid}\n' + f' |_{chan}\n\n' + f'=> {actor}\n' f' |_cid: {target_cid}\n' ) try: @@ -2154,8 +2261,13 @@ async def process_messages( is_rpc=False, ) except BaseException: - log.exception("failed to cancel task?") - + log.exception( + 'Failed to cancel task?\n' + f'<= canceller: {chan.uid}\n' + f' |_{chan}\n\n' + f'=> {actor}\n' + f' |_cid: {target_cid}\n' + ) continue else: # normally registry methods, eg. @@ -2174,9 +2286,25 @@ async def process_messages( await chan.send(err_msg) continue - # spin up a task for the requested function - log.runtime(f"Spawning task for {func}") - assert actor._service_n + # schedule a task for the requested RPC function + # in the actor's main "service nursery". + # TODO: possibly a service-tn per IPC channel for + # supervision isolation? would avoid having to + # manage RPC tasks individually in `._rpc_tasks` + # table? + log.runtime( + f'Spawning task for RPC request\n' + f'<= caller: {chan.uid}\n' + f' |_{chan}\n\n' + # TODO: maddr style repr? + # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' + # f'cid="{cid[-16:]} .."\n\n' + + f'=> {actor}\n' + f' |_cid: {cid}\n' + f' |>> {func}()\n' + ) + assert actor._service_n # wait why? do it at top? 
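# NB: the permission rule enforced by `cancel_rpc_tasks()` above, written
# out as a tiny predicate for clarity (a sketch only, not actual API):
def may_cancel_rpc_task(
    task_parent_chan,
    requesting_chan,
    is_cancel_task: bool,
) -> bool:
    # never "cancel-a-cancel", and only the IPC channel which spawned
    # an RPC task (its "parent") may cancel it; `None` means
    # a self-cancel of *all* tasks.
    if is_cancel_task:
        return False
    return (
        requesting_chan is None
        or task_parent_chan == requesting_chan
    )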
try: ctx: Context = await actor._service_n.start( partial( @@ -2234,7 +2362,13 @@ async def process_messages( log.runtime( f"{chan} for {chan.uid} disconnected, cancelling tasks" ) - await actor.cancel_rpc_tasks(chan) + await actor.cancel_rpc_tasks( + req_uid=actor.uid, + # a "self cancel" in terms of the lifetime of the + # IPC connection which is presumed to be the + # source of any requests for spawned tasks. + parent_chan=chan, + ) except ( TransportClosed, -- 2.34.1 From e244747bc3ec02ea76d210cbc6d23934b87d979e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 14:22:45 -0500 Subject: [PATCH 119/378] Add note that maybe `Context._eoc` should be set by caller? --- tractor/_exceptions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index bdd8d411..fe3f2706 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -371,6 +371,8 @@ def _raise_from_no_key_in_msg( ) from None # `MsgStream` termination msg. + # TODO: does it make more sense to pack + # the stream._eoc outside this in the calleer always? elif ( msg.get('stop') or ( -- 2.34.1 From 5ea112699d7ace3b5b529c1332fc2ec9e450c78c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 14:41:28 -0500 Subject: [PATCH 120/378] Tweak broadcast fanout test to never inf loop Since a bug in the new `MsgStream.aclose()` impl's drain block logic was triggering an actual inf loop (by not ever canceller the streamer child actor), make sure we put a loop limit on the `inf_streamer`()` XD Also add a bit more deats to the test `print()`s in each actor and toss in `debug_mode` fixture support. --- tests/test_advanced_streaming.py | 59 ++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index 799a0897..82cc53a0 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -298,52 +298,77 @@ async def inf_streamer( async with ( ctx.open_stream() as stream, - trio.open_nursery() as n, + trio.open_nursery() as tn, ): - async def bail_on_sentinel(): + async def close_stream_on_sentinel(): async for msg in stream: if msg == 'done': + print( + 'streamer RXed "done" sentinel msg!\n' + 'CLOSING `MsgStream`!' + ) await stream.aclose() else: print(f'streamer received {msg}') + else: + print('streamer exited recv loop') # start termination detector - n.start_soon(bail_on_sentinel) + tn.start_soon(close_stream_on_sentinel) - for val in itertools.count(): + cap: int = 10000 # so that we don't spin forever when bug.. + for val in range(cap): try: + print(f'streamer sending {val}') await stream.send(val) + if val > cap: + raise RuntimeError( + 'Streamer never cancelled by setinel?' + ) + await trio.sleep(0.001) + + # close out the stream gracefully except trio.ClosedResourceError: - # close out the stream gracefully + print('msgstream closed on streamer side!') + assert stream.closed break + else: + raise RuntimeError( + 'Streamer not cancelled before finished sending?' + ) - print('terminating streamer') + print('streamer exited .open_streamer() block') -def test_local_task_fanout_from_stream(): +def test_local_task_fanout_from_stream( + debug_mode: bool, +): ''' Single stream with multiple local consumer tasks using the ``MsgStream.subscribe()` api. - Ensure all tasks receive all values after stream completes sending. + Ensure all tasks receive all values after stream completes + sending. 
''' - consumers = 22 + consumers: int = 22 async def main(): counts = Counter() - async with tractor.open_nursery() as tn: - p = await tn.start_actor( + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as tn: + p: tractor.Portal = await tn.start_actor( 'inf_streamer', enable_modules=[__name__], ) + # with trio.fail_after(3): async with ( p.open_context(inf_streamer) as (ctx, _), ctx.open_stream() as stream, ): - async def pull_and_count(name: str): # name = trio.lowlevel.current_task().name async with stream.subscribe() as recver: @@ -352,7 +377,7 @@ def test_local_task_fanout_from_stream(): tractor.trionics.BroadcastReceiver ) async for val in recver: - # print(f'{name}: {val}') + print(f'bx {name} rx: {val}') counts[name] += 1 print(f'{name} bcaster ended') @@ -362,10 +387,14 @@ def test_local_task_fanout_from_stream(): with trio.fail_after(3): async with trio.open_nursery() as nurse: for i in range(consumers): - nurse.start_soon(pull_and_count, i) + nurse.start_soon( + pull_and_count, + i, + ) + # delay to let bcast consumers pull msgs await trio.sleep(0.5) - print('\nterminating') + print('terminating nursery of bcast rxer consumers!') await stream.send('done') print('closed stream connection') -- 2.34.1 From 930d4988413380d06c31291801a7f3a1fca137fb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 14:45:08 -0500 Subject: [PATCH 121/378] Call `actor.cancel(None)` from root to avoid mismatch with (any future) meth sig changes --- tests/test_advanced_streaming.py | 1 - tractor/_root.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index 82cc53a0..8061c3b9 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -364,7 +364,6 @@ def test_local_task_fanout_from_stream( 'inf_streamer', enable_modules=[__name__], ) - # with trio.fail_after(3): async with ( p.open_context(inf_streamer) as (ctx, _), ctx.open_stream() as stream, diff --git a/tractor/_root.py b/tractor/_root.py index 1d147dd5..32cc3d57 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -343,9 +343,7 @@ async def open_root_actor( # tempn.start_soon(an.exited.wait) logger.cancel("Shutting down root actor") - await actor.cancel( - requesting_uid=actor.uid, - ) + await actor.cancel(None) # self cancel finally: _state._current_actor = None -- 2.34.1 From de1843dc8442476bc250f7ef43ff1db112e497f9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 15:06:39 -0500 Subject: [PATCH 122/378] Few more log msg tweaks in runtime --- tractor/_runtime.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 516c2900..b3b87e26 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -973,11 +973,6 @@ class Actor: # Attempt to wait for the far end to close the channel # and bail after timeout (2-generals on closure). assert chan.msgstream - - log.warning( - f'Draining lingering msgs from stream {chan.msgstream}' - ) - async for msg in chan.msgstream.drain(): # try to deliver any lingering msgs # before we destroy the channel. @@ -987,8 +982,11 @@ class Actor: # delivered the local calling task. # TODO: factor this into a helper? 
log.warning( - 'Draining msg from disconnected\n' - f'peer: {chan.uid}]\n\n' + 'Draining msg from disconnected peer\n' + f'{chan.uid}\n' + f'|_{chan}\n' + f' |_{chan.msgstream}\n\n' + f'{pformat(msg)}\n' ) cid = msg.get('cid') @@ -1674,8 +1672,7 @@ class Actor: if not tasks: log.warning( 'Actor has no cancellable RPC tasks?\n' - f'<= cancel requester: {req_uid}\n' - f'=> {self}\n\n' + f'<= canceller: {req_uid}\n' ) return -- 2.34.1 From fc72d7506145bfd26c5582efa899bdfeb6237d9e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 15:08:10 -0500 Subject: [PATCH 123/378] Support `maybe_wait_for_debugger(header_msg: str)` Allow callers to stick in a header to the `.pdb()` level emitted msg(s) such that any "waiting status" content is only shown if the caller actually get's blocked waiting for the debug lock; use it inside the `._spawn` sub-process reaper call. Also, return early if `Lock.global_actor_in_debug == None` and thus only enter the poll loop when actually needed, consequently raise if we fall through the loop without acquisition. --- tractor/_spawn.py | 7 +++-- tractor/devx/_debug.py | 71 +++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 141d7c80..7f50b9eb 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -541,13 +541,14 @@ async def trio_proc( with trio.move_on_after(0.5): await proc.wait() - log.pdb( - 'Delaying subproc reaper while debugger locked..' - ) await maybe_wait_for_debugger( child_in_debug=_runtime_vars.get( '_debug_mode', False ), + header_msg=( + 'Delaying subproc reaper while debugger locked..\n' + ), + # TODO: need a diff value then default? # poll_steps=9999999, ) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 43fd9018..d3bf4fe0 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -999,6 +999,8 @@ async def maybe_wait_for_debugger( poll_delay: float = 0.1, child_in_debug: bool = False, + header_msg: str = '', + ) -> None: if ( @@ -1007,6 +1009,8 @@ async def maybe_wait_for_debugger( ): return + + msg: str = header_msg if ( is_root_process() ): @@ -1016,48 +1020,59 @@ async def maybe_wait_for_debugger( # will make the pdb repl unusable. # Instead try to wait for pdb to be released before # tearing down. - sub_in_debug: tuple[str, str] | None = None + sub_in_debug: tuple[str, str]|None = Lock.global_actor_in_debug + debug_complete: trio.Event|None = Lock.no_remote_has_tty + + if sub_in_debug := Lock.global_actor_in_debug: + msg += ( + 'Debug `Lock` in use by subactor\n' + f'|_{sub_in_debug}\n' + ) + # TODO: could this make things more deterministic? + # wait to see if a sub-actor task will be + # scheduled and grab the tty lock on the next + # tick? + # XXX => but it doesn't seem to work.. + # await trio.testing.wait_all_tasks_blocked(cushion=0) + else: + log.pdb( + msg + + + 'Root immediately acquired debug TTY LOCK' + ) + return for istep in range(poll_steps): - if sub_in_debug := Lock.global_actor_in_debug: - log.pdb( - f'Lock in use by {sub_in_debug}' - ) - # TODO: could this make things more deterministic? - # wait to see if a sub-actor task will be - # scheduled and grab the tty lock on the next - # tick? - # XXX => but it doesn't seem to work.. 
- # await trio.testing.wait_all_tasks_blocked(cushion=0) - debug_complete: trio.Event|None = Lock.no_remote_has_tty if ( debug_complete and not debug_complete.is_set() and sub_in_debug is not None ): log.pdb( - 'Root has errored but pdb is in use by child\n' - 'Waiting on tty lock to release..\n' - f'uid: {sub_in_debug}\n' + msg + + + 'Root is waiting on tty lock to release..\n' ) await debug_complete.wait() log.pdb( - f'Child subactor released debug lock!\n' - f'uid: {sub_in_debug}\n' + f'Child subactor released debug lock:' + f'|_{sub_in_debug}\n' ) - if debug_complete.is_set(): - break # is no subactor locking debugger currently? - elif ( - debug_complete is None - or sub_in_debug is None + if ( + sub_in_debug is None + and ( + debug_complete is None + or debug_complete.is_set() + ) ): log.pdb( - 'Root acquired debug TTY LOCK from child\n' - f'uid: {sub_in_debug}' + msg + + + 'Root acquired tty lock!' ) break @@ -1073,8 +1088,14 @@ async def maybe_wait_for_debugger( with trio.CancelScope(shield=True): await trio.sleep(poll_delay) continue + + # fallthrough on failure to acquire.. else: - log.pdb('Root acquired debug TTY LOCK') + raise RuntimeError( + msg + + + 'Root actor failed to acquire debug lock?' + ) # else: # # TODO: non-root call for #320? -- 2.34.1 From ad5eee5666eedddab1c55916162f9a83c552ca0f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 18:33:18 -0500 Subject: [PATCH 124/378] WIP final impl of ctx-cancellation-semantics --- tractor/_context.py | 280 ++++++++++++++++++++++++++++++------------ tractor/_portal.py | 184 +++++++++++++++++---------- tractor/_streaming.py | 86 ++++++++----- 3 files changed, 378 insertions(+), 172 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 54e309e1..ee05a2ba 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -47,6 +47,7 @@ import trio # maybe_wait_for_debugger, # pause, # ) +from .msg import NamespacePath from ._exceptions import ( # _raise_from_no_key_in_msg, unpack_error, @@ -71,12 +72,23 @@ log = get_logger(__name__) async def _drain_to_final_msg( ctx: Context, -) -> list[dict]: -# ) -> tuple[ -# Any|Exception, -# list[dict], -# ]: + msg_limit: int = 6, + +) -> list[dict]: + ''' + Drain IPC msgs delivered to the underlying rx-mem-chan + `Context._recv_chan` from the runtime in search for a final + result or error msg. + + The motivation here is to ideally capture errors during ctxc + conditions where a canc-request/or local error is sent but the + local task also excepts and enters the + `Portal.open_context().__aexit__()` block wherein we prefer to + capture and raise any remote error or ctxc-ack as part of the + `ctx.result()` cleanup and teardown sequence. + + ''' raise_overrun: bool = not ctx._allow_overruns # wait for a final context result by collecting (but @@ -88,14 +100,14 @@ async def _drain_to_final_msg( # NOTE: this REPL usage actually works here dawg! Bo # from .devx._debug import pause # await pause() - # if re := ctx._remote_error: - # ctx._maybe_raise_remote_err( - # re, - # # NOTE: obvi we don't care if we - # # overran the far end if we're already - # # waiting on a final result (msg). - # raise_overrun_from_self=raise_overrun, - # ) + if re := ctx._remote_error: + ctx._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). + raise_overrun_from_self=raise_overrun, + ) # TODO: bad idea? 
# with trio.CancelScope() as res_cs: @@ -108,7 +120,7 @@ async def _drain_to_final_msg( msg: dict = await ctx._recv_chan.receive() ctx._result: Any = msg['return'] log.runtime( - 'Context delivered final result msg:\n' + 'Context delivered final draining msg:\n' f'{pformat(msg)}' ) pre_result_drained.append(msg) @@ -142,9 +154,47 @@ async def _drain_to_final_msg( if 'yield' in msg: # far end task is still streaming to us so discard - log.warning(f'Discarding std "yield"\n{msg}') - pre_result_drained.append(msg) - continue + # and report per local context state. + if ( + (ctx._stream.closed + and (reason := 'stream was already closed') + ) + or (ctx._cancel_called + and (reason := 'ctx called `.cancel()`') + ) + or (ctx._cancelled_caught + and (reason := 'ctx caught a cancel') + ) + or (len(pre_result_drained) > msg_limit + and (reason := f'"yield" limit={msg_limit}') + ) + ): + log.cancel( + 'Cancelling `MsgStream` drain since ' + f'{reason}\n\n' + f'<= {ctx.chan.uid}\n' + f' |_{ctx._nsf}()\n\n' + f'=> {ctx._task}\n' + f' |_{ctx._stream}\n\n' + + f'{pformat(msg)}\n' + ) + return pre_result_drained + + # drain up to the `msg_limit` hoping to get + # a final result or error/ctxc. + else: + log.warning( + 'Ignoring "yield" msg during `ctx.result()` drain..\n' + f'<= {ctx.chan.uid}\n' + f' |_{ctx._nsf}()\n\n' + f'=> {ctx._task}\n' + f' |_{ctx._stream}\n\n' + + f'{pformat(msg)}\n' + ) + pre_result_drained.append(msg) + continue # TODO: work out edge cases here where # a stream is open but the task also calls @@ -153,8 +203,8 @@ async def _drain_to_final_msg( # right? elif 'stop' in msg: log.cancel( - 'Remote stream terminated due to "stop" msg:\n' - f'{msg}' + 'Remote stream terminated due to "stop" msg:\n\n' + f'{pformat(msg)}\n' ) pre_result_drained.append(msg) continue @@ -260,12 +310,14 @@ class Context: ''' chan: Channel cid: str # "context id", more or less a unique linked-task-pair id - # the "feeder" channels for delivering message values to the # local task from the runtime's msg processing loop. _recv_chan: trio.MemoryReceiveChannel _send_chan: trio.MemorySendChannel + # full "namespace-path" to target RPC function + _nsf: NamespacePath + # the "invocation type" of the far end task-entry-point # function, normally matching a logic block inside # `._runtime.invoke()`. @@ -281,6 +333,7 @@ class Context: # which is exactly the primitive that allows for # cross-actor-task-supervision and thus SC. 
_scope: trio.CancelScope | None = None + _task: trio.lowlevel.Task|None = None # _res_scope: trio.CancelScope|None = None # on a clean exit there should be a final value @@ -384,6 +437,7 @@ class Context: # init and streaming state _started_called: bool = False _stream_opened: bool = False + _stream: MsgStream|None = None # overrun handling machinery # NOTE: none of this provides "backpressure" to the remote @@ -577,13 +631,14 @@ class Context: ''' side: str = self.side - log.cancel( - f'Cancelling {side} side of context to {self.chan.uid}' - ) - - # await pause() self._cancel_called: bool = True + header: str = f'Cancelling "{side.upper()}"-side of ctx with peer\n' + reminfo: str = ( + f'uid: {self.chan.uid}\n' + f' |_ {self._nsf}()\n' + ) + # caller side who entered `Portal.open_context()` # NOTE: on the call side we never manually call # `._scope.cancel()` since we expect the eventual @@ -601,8 +656,9 @@ class Context: with trio.move_on_after(timeout) as cs: cs.shield = True log.cancel( - f'Cancelling stream {cid} to ' - f'{self._portal.channel.uid}' + header + + + reminfo ) # NOTE: we're telling the far end actor to cancel a task @@ -621,13 +677,13 @@ class Context: # if not self._portal.channel.connected(): if not self.chan.connected(): log.cancel( - 'May have failed to cancel remote task ' - f'{cid} for {self._portal.channel.uid}' + 'May have failed to cancel remote task?\n' + f'{reminfo}' ) else: log.cancel( - 'Timed out on cancel request of remote task ' - f'{cid} for {self._portal.channel.uid}' + 'Timed out on cancel request of remote task?\n' + f'{reminfo}' ) # callee side remote task @@ -635,6 +691,11 @@ class Context: # the caller expects a `ContextCancelled` to be sent from # `._runtime._invoke()` back to the other side. else: + log.cancel( + header + + + reminfo + ) # TODO: should we have an explicit cancel message # or is relaying the local `trio.Cancelled` as an # {'error': trio.Cancelled, cid: "blah"} enough? @@ -720,8 +781,9 @@ class Context: # single-direction-stream case you'll get a lookup error # currently. ctx: Context = actor.get_context( - self.chan, - self.cid, + chan=self.chan, + cid=self.cid, + nsf=self._nsf, msg_buffer_size=msg_buffer_size, allow_overruns=allow_overruns, ) @@ -735,7 +797,7 @@ class Context: if ctx._recv_chan._closed: raise trio.ClosedResourceError( - 'The underlying channel for this stream was already closed!?' + 'The underlying channel for this stream was already closed!\n' ) # NOTE: implicitly this will call `MsgStream.aclose()` on @@ -764,6 +826,7 @@ class Context: try: self._stream_opened: bool = True + self._stream = stream # XXX: do we need this? # ensure we aren't cancelled before yielding the stream @@ -1174,35 +1237,47 @@ class Context: self, msg: dict, - # draining: bool = False, - ) -> bool: ''' Deliver an IPC msg received from a transport-channel to - this context's underlying mem chan for handling by - user operating tasks; deliver a bool indicating whether the - msg was immediately sent. + this context's underlying mem chan for handling by local + user application tasks; deliver `bool` indicating whether + the msg was able to be delivered. If `._allow_overruns == True` (maybe) append the msg to an "overflow queue" and start a "drainer task" (inside the `._scope_nursery: trio.Nursery`) which ensures that such - messages are eventually sent if possible. + messages are queued up and eventually sent if possible. 
''' cid: str = self.cid chan: Channel = self.chan from_uid: tuple[str, str] = chan.uid send_chan: trio.MemorySendChannel = self._send_chan + nsf: NamespacePath = self._nsf + re: Exception|None if re := unpack_error( msg, self.chan, ): log.error( - f'Delivering error-msg from {from_uid} to caller {cid}' - f'{re}' + f'Delivering error-msg to caller\n' + f'<= peer: {from_uid}\n' + f' |_ {nsf}()\n\n' + + f'=> cid: {cid}\n' + f' |_{self._task}\n\n' + + f'{pformat(re)}\n' ) - self._cancel_msg = msg + self._cancel_msg: dict = msg + + # NOTE: this will not raise an error, merely set + # `._remote_error` and maybe cancel any task currently + # entered in `Portal.open_context()` presuming the + # error is "cancel causing" (i.e. `ContextCancelled` + # or `RemoteActorError`). self._maybe_cancel_and_set_remote_error(re) # XXX NEVER do this XXX..!! @@ -1218,26 +1293,44 @@ class Context: if self._in_overrun: log.warning( - f'Capturing overrun-msg from {from_uid} to caller {cid}' - f'{msg}' + f'Queueing OVERRUN msg on caller task:\n' + f'<= peer: {from_uid}\n' + f' |_ {nsf}()\n\n' + + f'=> cid: {cid}\n' + f' |_{self._task}\n\n' + + f'{pformat(msg)}\n' ) self._overflow_q.append(msg) return False try: log.runtime( - f'Delivering IPC `Context` msg:\n' + f'Delivering msg from IPC ctx:\n' f'<= {from_uid}\n' - f'=> caller: {cid}\n' - f'{msg}' + f' |_ {nsf}()\n\n' + + f'=> {self._task}\n' + f' |_cid={self.cid}\n\n' + + f'{pformat(msg)}\n' ) # from .devx._debug import pause # await pause() + + # NOTE: if an error is deteced we should always still + # send it through the feeder-mem-chan and expect + # it to be raised by any context (stream) consumer + # task via the consumer APIs on both the `Context` and + # `MsgStream`! + # + # XXX the reason is that this method is always called + # by the IPC msg handling runtime task and that is not + # normally the task that should get cancelled/error + # from some remote fault! send_chan.send_nowait(msg) return True - # if an error is deteced we should always - # expect it to be raised by any context (stream) - # consumer task except trio.BrokenResourceError: # TODO: what is the right way to handle the case where the @@ -1248,7 +1341,13 @@ class Context: # XXX: local consumer has closed their side # so cancel the far end streaming task - log.warning(f"{send_chan} consumer is already closed") + log.warning( + 'Rx chan for `Context` alfready closed?\n' + f'cid: {self.cid}\n' + 'Failed to deliver msg:\n' + f'send_chan: {send_chan}\n\n' + f'{pformat(msg)}\n' + ) return False # NOTE XXX: by default we do **not** maintain context-stream @@ -1257,44 +1356,54 @@ class Context: # msg handling loop which calls into this method! except trio.WouldBlock: - # XXX: always push an error even if the local - # receiver is in overrun state. - # self._maybe_cancel_and_set_remote_error(msg) + # XXX: always push an error even if the local receiver + # is in overrun state - i.e. if an 'error' msg is + # delivered then + # `._maybe_cancel_and_set_remote_error(msg)` should + # have already been called above! + # + # XXX QUESTION XXX: if we rx an error while in an + # overrun state and that msg isn't stuck in an + # overflow queue what happens?!? local_uid = current_actor().uid - lines = [ - f'OVERRUN on actor-task context {cid}@{local_uid}!\n' - # TODO: put remote task name here if possible? - f'sender: {from_uid}', - f'msg: {msg}', - # TODO: put task func name here and maybe an arrow - # from sender to overrunner? 
- # f'local task {self.func_name}' - ] - if not self._stream_opened: - lines.insert( - 1, - f'\n*** No stream open on `{local_uid[0]}` side! ***\n' - ) + txt: str = ( + 'on IPC context:\n' - text = '\n'.join(lines) + f'<= sender: {from_uid}\n' + f' |_ {self._nsf}()\n\n' + + f'=> overrun: {local_uid}\n' + f' |_cid: {cid}\n' + f' |_task: {self._task}\n' + ) + if not self._stream_opened: + txt += ( + f'\n*** No stream open on `{local_uid[0]}` side! ***\n\n' + f'{msg}\n' + ) # XXX: lul, this really can't be backpressure since any # blocking here will block the entire msg loop rpc sched for # a whole channel.. maybe we should rename it? if self._allow_overruns: - text += f'\nStarting overflow queuing task on msg: {msg}' - log.warning(text) + txt += ( + '\n*** Starting overflow queuing task on msg ***\n\n' + f'{msg}\n' + ) + log.warning(txt) if ( not self._in_overrun ): self._overflow_q.append(msg) - n = self._scope_nursery - assert not n.child_tasks + tn: trio.Nursery = self._scope_nursery + assert not tn.child_tasks try: - n.start_soon( + tn.start_soon( self._drain_overflows, ) + return True + except RuntimeError: # if the nursery is already cancelled due to # this context exiting or in error, we ignore @@ -1302,11 +1411,12 @@ class Context: # anything different. return False else: + txt += f'\n{msg}\n' # raise local overrun and immediately pack as IPC # msg for far end. try: raise StreamOverrun( - text, + txt, sender=from_uid, ) except StreamOverrun as err: @@ -1314,20 +1424,28 @@ class Context: err, cid=cid, ) - # err_msg['cid']: str = cid try: + # relay condition to sender side remote task await chan.send(err_msg) + return True + except trio.BrokenResourceError: # XXX: local consumer has closed their side # so cancel the far end streaming task - log.warning(f"{chan} is already closed") + log.warning( + 'Channel for ctx is already closed?\n' + f'|_{chan}\n' + ) + # ow, indicate unable to deliver by default return False def mk_context( chan: Channel, cid: str, + nsf: NamespacePath, + msg_buffer_size: int = 2**6, **kwargs, @@ -1345,10 +1463,12 @@ def mk_context( send_chan, recv_chan = trio.open_memory_channel(msg_buffer_size) ctx = Context( - chan, - cid, + chan=chan, + cid=cid, _send_chan=send_chan, _recv_chan=recv_chan, + _nsf=nsf, + _task=trio.lowlevel.current_task(), **kwargs, ) ctx._result: int | Any = id(ctx) diff --git a/tractor/_portal.py b/tractor/_portal.py index 14f6fbf2..a4f2f618 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -69,18 +69,35 @@ from ._streaming import ( log = get_logger(__name__) +# TODO: rename to `unwrap_result()` and use +# `._raise_from_no_key_in_msg()` (after tweak to +# accept a `chan: Channel` arg) in key block! def _unwrap_msg( msg: dict[str, Any], - channel: Channel + channel: Channel, + + hide_tb: bool = True, ) -> Any: - __tracebackhide__ = True + ''' + Unwrap a final result from a `{return: }` IPC msg. + + ''' + __tracebackhide__: bool = hide_tb + try: return msg['return'] except KeyError as ke: + # internal error should never get here - assert msg.get('cid'), "Received internal error at portal?" - raise unpack_error(msg, channel) from ke + assert msg.get('cid'), ( + "Received internal error at portal?" 
+ ) + + raise unpack_error( + msg, + channel + ) from ke class Portal: @@ -107,7 +124,7 @@ class Portal: cancel_timeout: float = 0.5 def __init__(self, channel: Channel) -> None: - self.channel = channel + self.chan = channel # during the portal's lifetime self._result_msg: Optional[dict] = None @@ -118,6 +135,18 @@ class Portal: self._streams: set[MsgStream] = set() self.actor = current_actor() + @property + def channel(self) -> Channel: + ''' + Proxy to legacy attr name.. + + Consider the shorter `Portal.chan` instead of `.channel` ;) + ''' + log.debug( + 'Consider the shorter `Portal.chan` instead of `.channel` ;)' + ) + return self.chan + async def _submit_for_result( self, ns: str, @@ -125,14 +154,14 @@ class Portal: **kwargs ) -> None: - assert self._expect_result is None, \ - "A pending main result has already been submitted" + assert self._expect_result is None, ( + "A pending main result has already been submitted" + ) self._expect_result = await self.actor.start_remote_task( self.channel, - ns, - func, - kwargs + nsf=NamespacePath(f'{ns}:{func}'), + kwargs=kwargs ) async def _return_once( @@ -173,7 +202,10 @@ class Portal: self._expect_result ) - return _unwrap_msg(self._result_msg, self.channel) + return _unwrap_msg( + self._result_msg, + self.channel, + ) async def _cancel_streams(self): # terminate all locally running async generator @@ -215,26 +247,33 @@ class Portal: purpose. ''' - if not self.channel.connected(): - log.cancel("This channel is already closed can't cancel") + chan: Channel = self.channel + if not chan.connected(): + log.runtime( + 'This channel is already closed, skipping cancel request..' + ) return False + reminfo: str = ( + f'uid: {self.channel.uid}\n' + f' |_{chan}\n' + ) log.cancel( - f"Sending actor cancel request to {self.channel.uid} on " - f"{self.channel}") - - self.channel._cancel_called = True + f'Sending actor cancel request to peer\n' + f'{reminfo}' + ) + self.channel._cancel_called: bool = True try: # send cancel cmd - might not get response # XXX: sure would be nice to make this work with # a proper shield with trio.move_on_after( timeout - or self.cancel_timeout + or + self.cancel_timeout ) as cs: - cs.shield = True - + cs.shield: bool = True await self.run_from_ns( 'self', 'cancel', @@ -242,7 +281,10 @@ class Portal: return True if cs.cancelled_caught: - log.cancel(f"May have failed to cancel {self.channel.uid}") + log.cancel( + 'May have failed to cancel peer?\n' + f'{reminfo}' + ) # if we get here some weird cancellation case happened return False @@ -272,27 +314,33 @@ class Portal: Note:: - A special namespace `self` can be used to invoke `Actor` - instance methods in the remote runtime. Currently this - should only be used solely for ``tractor`` runtime - internals. + A special namespace `self` can be used to invoke `Actor` + instance methods in the remote runtime. Currently this + should only ever be used for `Actor` (method) runtime + internals! 
''' + nsf = NamespacePath( + f'{namespace_path}:{function_name}' + ) ctx = await self.actor.start_remote_task( - self.channel, - namespace_path, - function_name, - kwargs, + chan=self.channel, + nsf=nsf, + kwargs=kwargs, ) ctx._portal = self msg = await self._return_once(ctx) - return _unwrap_msg(msg, self.channel) + return _unwrap_msg( + msg, + self.channel, + ) async def run( self, func: str, - fn_name: Optional[str] = None, + fn_name: str|None = None, **kwargs + ) -> Any: ''' Submit a remote function to be scheduled and run by actor, in @@ -311,8 +359,9 @@ class Portal: DeprecationWarning, stacklevel=2, ) - fn_mod_path = func + fn_mod_path: str = func assert isinstance(fn_name, str) + nsf = NamespacePath(f'{fn_mod_path}:{fn_name}') else: # function reference was passed directly if ( @@ -325,13 +374,12 @@ class Portal: raise TypeError( f'{func} must be a non-streaming async function!') - fn_mod_path, fn_name = NamespacePath.from_ref(func).to_tuple() + nsf = NamespacePath.from_ref(func) ctx = await self.actor.start_remote_task( self.channel, - fn_mod_path, - fn_name, - kwargs, + nsf=nsf, + kwargs=kwargs, ) ctx._portal = self return _unwrap_msg( @@ -355,15 +403,10 @@ class Portal: raise TypeError( f'{async_gen_func} must be an async generator function!') - fn_mod_path, fn_name = NamespacePath.from_ref( - async_gen_func - ).to_tuple() - - ctx = await self.actor.start_remote_task( + ctx: Context = await self.actor.start_remote_task( self.channel, - fn_mod_path, - fn_name, - kwargs + nsf=NamespacePath.from_ref(async_gen_func), + kwargs=kwargs, ) ctx._portal = self @@ -405,7 +448,10 @@ class Portal: self, func: Callable, + allow_overruns: bool = False, + + # proxied to RPC **kwargs, ) -> AsyncGenerator[tuple[Context, Any], None]: @@ -448,13 +494,12 @@ class Portal: # TODO: i think from here onward should probably # just be factored into an `@acm` inside a new # a new `_context.py` mod. 
- fn_mod_path, fn_name = NamespacePath.from_ref(func).to_tuple() + nsf = NamespacePath.from_ref(func) - ctx = await self.actor.start_remote_task( + ctx: Context = await self.actor.start_remote_task( self.channel, - fn_mod_path, - fn_name, - kwargs, + nsf=nsf, + kwargs=kwargs, # NOTE: it's imporant to expose this since you might # get the case where the parent who opened the context does @@ -721,10 +766,10 @@ class Portal: # assert maybe_ctxc if ctx.chan.connected(): - log.info( - 'Waiting on final context-task result for\n' - f'task: {cid}\n' - f'actor: {uid}' + log.runtime( + 'Waiting on final context result for\n' + f'peer: {uid}\n' + f'|_{ctx._task}\n' ) # XXX NOTE XXX: the below call to # `Context.result()` will ALWAYS raise @@ -771,13 +816,19 @@ class Portal: RemoteActorError(), ): log.exception( - f'Context `{fn_name}` remotely errored:\n' - f'`{tbstr}`' + 'Context remotely errored!\n' + f'<= peer: {uid}\n' + f' |_ {nsf}()\n\n' + + f'{tbstr}' ) case (None, _): log.runtime( - f'Context {fn_name} returned value from callee:\n' - f'`{result_or_err}`' + 'Context returned final result from callee task:\n' + f'<= peer: {uid}\n' + f' |_ {nsf}()\n\n' + + f'`{result_or_err}`\n' ) finally: @@ -855,26 +906,31 @@ class Portal: # CASE 2 if ctx._cancel_called: log.cancel( - f'Context {fn_name} cancelled by caller with\n' + 'Context cancelled by caller task\n' + f'|_{ctx._task}\n\n' + f'{etype}' ) # CASE 1 else: log.cancel( - f'Context cancelled by callee with {etype}\n' - f'target: `{fn_name}`\n' - f'task:{cid}\n' - f'actor:{uid}' + f'Context cancelled by remote callee task\n' + f'peer: {uid}\n' + f'|_ {nsf}()\n\n' + + f'{etype}\n' ) # FINALLY, remove the context from runtime tracking and # exit! log.runtime( - f'Exiting context opened with {ctx.chan.uid}' + 'Removing IPC ctx opened with peer\n' + f'{uid}\n' + f'|_{ctx}\n' ) self.actor._contexts.pop( - (self.channel.uid, ctx.cid), + (uid, cid), None, ) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index e8f735ec..64b5dd6d 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -95,9 +95,6 @@ class MsgStream(trio.abc.Channel): try: return msg['yield'] except KeyError as kerr: - # if 'return' in msg: - # return msg - _raise_from_no_key_in_msg( ctx=self._ctx, msg=msg, @@ -128,13 +125,9 @@ class MsgStream(trio.abc.Channel): # introducing this if self._eoc: raise self._eoc - # raise trio.EndOfChannel if self._closed: raise self._closed - # raise trio.ClosedResourceError( - # 'This stream was already closed' - # ) src_err: Exception|None = None try: @@ -143,6 +136,7 @@ class MsgStream(trio.abc.Channel): return msg['yield'] except KeyError as kerr: + # log.exception('GOT KEYERROR') src_err = kerr # NOTE: may raise any of the below error types @@ -161,9 +155,9 @@ class MsgStream(trio.abc.Channel): # trio.ClosedResourceError, # by self._rx_chan trio.EndOfChannel, # by self._rx_chan or `stop` msg from far end ) as eoc: + # log.exception('GOT EOC') src_err = eoc self._eoc = eoc - # await trio.sleep(1) # a ``ClosedResourceError`` indicates that the internal # feeder memory receive channel was closed likely by the @@ -201,6 +195,7 @@ class MsgStream(trio.abc.Channel): # raise eoc except trio.ClosedResourceError as cre: # by self._rx_chan + # log.exception('GOT CRE') src_err = cre log.warning( '`Context._rx_chan` was already closed?' 
@@ -211,6 +206,8 @@ class MsgStream(trio.abc.Channel): # terminated and signal this local iterator to stop drained: list[Exception|dict] = await self.aclose() if drained: + # from .devx import pause + # await pause() log.warning( 'Drained context msgs during closure:\n' f'{drained}' @@ -237,31 +234,32 @@ class MsgStream(trio.abc.Channel): Cancel associated remote actor task and local memory channel on close. + Notes: + - REMEMBER that this is also called by `.__aexit__()` so + careful consideration must be made to handle whatever + internal stsate is mutated, particuarly in terms of + draining IPC msgs! + + - more or less we try to maintain adherance to trio's `.aclose()` semantics: + https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose ''' - # XXX: keep proper adherance to trio's `.aclose()` semantics: - # https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose - rx_chan = self._rx_chan - if ( - rx_chan._closed - or - self._closed - ): - log.cancel( - f'`MsgStream` is already closed\n' - f'.cid: {self._ctx.cid}\n' - f'._rx_chan`: {rx_chan}\n' - f'._eoc: {self._eoc}\n' - f'._closed: {self._eoc}\n' - ) + # rx_chan = self._rx_chan + # XXX NOTE XXX + # it's SUPER IMPORTANT that we ensure we don't DOUBLE + # DRAIN msgs on closure so avoid getting stuck handing on + # the `._rx_chan` since we call this method on + # `.__aexit__()` as well!!! + # => SO ENSURE WE CATCH ALL TERMINATION STATES in this + # block including the EoC.. + if self.closed: # this stream has already been closed so silently succeed as # per ``trio.AsyncResource`` semantics. # https://trio.readthedocs.io/en/stable/reference-io.html#trio.abc.AsyncResource.aclose return [] ctx: Context = self._ctx - # caught_eoc: bool = False drained: list[Exception|dict] = [] while not drained: try: @@ -274,17 +272,26 @@ class MsgStream(trio.abc.Channel): # TODO: inject into parent `Context` buf? drained.append(maybe_final_msg) + # NOTE: we only need these handlers due to the + # `.receive_nowait()` call above which may re-raise + # one of these errors on a msg key error! + except trio.WouldBlock as be: drained.append(be) break except trio.EndOfChannel as eoc: + self._eoc: Exception = eoc drained.append(eoc) - # caught_eoc = True - self._eoc: bool = eoc + break + + except trio.ClosedResourceError as cre: + self._closed = cre + drained.append(cre) break except ContextCancelled as ctxc: + # log.exception('GOT CTXC') log.cancel( 'Context was cancelled during stream closure:\n' f'canceller: {ctxc.canceller}\n' @@ -339,8 +346,11 @@ class MsgStream(trio.abc.Channel): # with trio.CancelScope(shield=True): # await rx_chan.aclose() - # self._eoc: bool = caught_eoc - + if not self._eoc: + self._eoc: bool = trio.EndOfChannel( + f'Context stream closed by {self._ctx.side}\n' + f'|_{self}\n' + ) # ?XXX WAIT, why do we not close the local mem chan `._rx_chan` XXX? # => NO, DEFINITELY NOT! 
<= # if we're a bi-dir ``MsgStream`` BECAUSE this same @@ -379,6 +389,26 @@ class MsgStream(trio.abc.Channel): # self._closed = True return drained + @property + def closed(self) -> bool: + if ( + (rxc := self._rx_chan._closed) + or + (_closed := self._closed) + or + (_eoc := self._eoc) + ): + log.runtime( + f'`MsgStream` is already closed\n' + f'{self}\n' + f' |_cid: {self._ctx.cid}\n' + f' |_rx_chan._closed: {type(rxc)} = {rxc}\n' + f' |_closed: {type(_closed)} = {_closed}\n' + f' |_eoc: {type(_eoc)} = {_eoc}' + ) + return True + return False + @acm async def subscribe( self, -- 2.34.1 From c6ee4e5dc15a6913db660ee39b1e0d3cda146e7a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 22 Feb 2024 20:37:12 -0500 Subject: [PATCH 125/378] Add a `pytest.ini` config --- pytest.ini | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 pytest.ini diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..6a7e51fb --- /dev/null +++ b/pytest.ini @@ -0,0 +1,8 @@ +# vim: ft=ini +# pytest.ini for tractor + +[pytest] +# don't show frickin captured logs AGAIN in the report.. +addopts = --show-capture='no' +log_cli = false +; minversion = 6.0 -- 2.34.1 From d08aeaeafeaff8d9368146bf39bc9b07b1a599ac Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 28 Feb 2024 17:13:01 -0500 Subject: [PATCH 126/378] Make `@context`-cancelled tests more pedantic In order to match a very significant and coming-soon patch set to the IPC `Context` and `Channel` cancellation semantics with significant but subtle changes to the primitives and runtime logic: - a new set of `Context` state pub meth APIs for checking exact inter-actor-linked-task outcomes such as `.outcome`, `.maybe_error`, and `.cancel_acked`. - trying to move away from `Context.cancelled_caught` usage since the semantics from `trio` don't really map well (in terms of cancel requests and how they result in cancel-scope graceful closure) and `.cancel_acked: bool` is a better approach for IPC req-resp msging. - change test usage to access `._scope.cancelled_caught` directly. - more pedantic ctxc-raising expects around the "type of self cancellation" and final outcome in ctxc cases: - `ContextCancelled` is raised by ctx (`Context.result()`) consumer methods when `Portal.cancel_actor()` is called (since it's an out-of-band request) despite `Channel._cancel_called` being set. - also raised by `.open_context().__aexit__()` on close. - `.outcome` is always `.maybe_error` is always one of `._local/remote_error`. --- tests/test_cancellation.py | 10 ++- tests/test_context_stream_semantics.py | 119 ++++++++++++++++++++----- 2 files changed, 106 insertions(+), 23 deletions(-) diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index ce396ace..9a729f3d 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -48,11 +48,13 @@ async def do_nuthin(): ids=['no_args', 'unexpected_args'], ) def test_remote_error(reg_addr, args_err): - """Verify an error raised in a subactor that is propagated + ''' + Verify an error raised in a subactor that is propagated to the parent nursery, contains the underlying boxed builtin error type info and causes cancellation and reraising all the way up the stack. - """ + + ''' args, errtype = args_err async def main(): @@ -65,7 +67,9 @@ def test_remote_error(reg_addr, args_err): # an exception group outside the nursery since the error # here and the far end task error are one in the same? 
portal = await nursery.run_in_actor( - assert_err, name='errorer', **args + assert_err, + name='errorer', + **args ) # get result(s) from main task diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index e0ffa874..19a87453 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -5,7 +5,7 @@ Verify the we raise errors when streams are opened prior to sync-opening a ``tractor.Context`` beforehand. ''' -# from contextlib import asynccontextmanager as acm +from contextlib import asynccontextmanager as acm from itertools import count import platform from pprint import pformat @@ -250,6 +250,17 @@ def test_simple_context( trio.run(main) +@acm +async def expect_ctxc(yay: bool) -> None: + if yay: + try: + yield + except ContextCancelled: + return + else: + yield + + @pytest.mark.parametrize( 'callee_returns_early', [True, False], @@ -280,23 +291,60 @@ def test_caller_cancels( async def check_canceller( ctx: Context, ) -> None: - # should not raise yet return the remote - # context cancelled error. - res = await ctx.result() + actor: Actor = current_actor() + uid: tuple = actor.uid + if ( + cancel_method == 'portal' + and not callee_returns_early + ): + try: + res = await ctx.result() + assert 0, 'Portal cancel should raise!' + + except ContextCancelled as ctxc: + assert ctx.chan._cancel_called + assert ctxc.canceller == uid + assert ctxc is ctx.maybe_error + + # NOTE: should not ever raise even in the `ctx` + # case since self-cancellation should swallow the ctxc + # silently! + else: + res = await ctx.result() + + # we actually get a result if callee_returns_early: assert res == 'yo' + assert ctx.outcome is res + assert ctx.maybe_error is None else: - err = res + err: Exception = ctx.outcome assert isinstance(err, ContextCancelled) assert ( tuple(err.canceller) == - current_actor().uid + uid ) + assert ( + err + is ctx.maybe_error + is ctx._remote_error + ) + if le := ctx._local_error: + assert err is le + + # else: + # TODO: what should this be then? + # not defined until block closes right? + # + # await tractor.pause() + # assert ctx._local_error is None + async def main(): + async with tractor.open_nursery( debug_mode=debug_mode, ) as an: @@ -306,11 +354,16 @@ def test_caller_cancels( ) timeout = 0.5 if not callee_returns_early else 2 with trio.fail_after(timeout): - async with portal.open_context( - simple_setup_teardown, - data=10, - block_forever=not callee_returns_early, - ) as (ctx, sent): + async with ( + + expect_ctxc(yay=cancel_method == 'portal'), + + portal.open_context( + simple_setup_teardown, + data=10, + block_forever=not callee_returns_early, + ) as (ctx, sent), + ): if callee_returns_early: # ensure we block long enough before sending @@ -332,6 +385,16 @@ def test_caller_cancels( if cancel_method != 'portal': await portal.cancel_actor() + # since the `.cancel_actor()` call just above + # will cause the `.open_context().__aexit__()` raise + # a ctxc which should in turn cause `ctx._scope` to + # catch any cancellation? + if ( + not callee_returns_early + and cancel_method == 'portal' + ): + assert ctx._scope.cancelled_caught + trio.run(main) @@ -434,7 +497,6 @@ async def test_callee_closes_ctx_after_stream_open( @tractor.context async def expect_cancelled( - ctx: Context, ) -> None: @@ -454,7 +516,7 @@ async def expect_cancelled( raise else: - assert 0, "Wasn't cancelled!?" + assert 0, "callee wasn't cancelled !?" 
@pytest.mark.parametrize( @@ -473,8 +535,8 @@ async def test_caller_closes_ctx_after_callee_opens_stream( async with tractor.open_nursery( debug_mode=debug_mode, ) as an: - root: Actor = current_actor() + root: Actor = current_actor() portal = await an.start_actor( 'ctx_cancelled', enable_modules=[__name__], @@ -487,11 +549,13 @@ async def test_caller_closes_ctx_after_callee_opens_stream( await portal.run(assert_state, value=True) - # call cancel explicitly + # call `ctx.cancel()` explicitly if use_ctx_cancel_method: - await ctx.cancel() + # NOTE: means the local side `ctx._scope` will + # have been cancelled by an ctxc ack and thus + # `._scope.cancelled_caught` should be set. try: async with ctx.open_stream() as stream: async for msg in stream: @@ -520,20 +584,35 @@ async def test_caller_closes_ctx_after_callee_opens_stream( assert portal.channel.connected() # ctx is closed here - await portal.run(assert_state, value=False) + await portal.run( + assert_state, + value=False, + ) else: try: with trio.fail_after(0.2): await ctx.result() assert 0, "Callee should have blocked!?" + except trio.TooSlowError: # NO-OP -> since already called above await ctx.cancel() - # local scope should have absorbed the cancellation - assert ctx.cancelled_caught - assert ctx._remote_error is ctx._local_error + # NOTE: local scope should have absorbed the cancellation since + # in this case we call `ctx.cancel()` and the local + # `._scope` gets `.cancel_called` on the ctxc ack. + if use_ctx_cancel_method: + assert ctx._scope.cancelled_caught + + # rxed ctxc response from far end + assert ctx.cancel_acked + assert ( + ctx._remote_error + is ctx._local_error + is ctx.maybe_error + is ctx.outcome + ) try: async with ctx.open_stream() as stream: -- 2.34.1 From 3ed309f019536104e351111a4576c73709b669b5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 29 Feb 2024 14:21:45 -0500 Subject: [PATCH 127/378] Add test for `modden` sub-spawner-server hangs on cancel As per a lot of the recent refinements to `Context` cancellation, add a new test case to replicate the original hang-on-cancel found with `modden` when using a client actor to spawn a subactor in some other tree where despite `Context.cancel()` being called the requesting client would hang on the opened context with the server. The specific scenario added here is to have, - root actor spawns 2 children: a client and a spawn server. - the spawn server opens with a spawn-request serve loop and begins to wait for the client. - client spawns and connects to the sibling spawn server, requests to spawn a sub-actor, the "little bro", connects to it then does some echo streaming, cancels the request with it's sibling (the spawn server) which should in turn cancel the root's-grandchild and result in a cancel-ack back to the client's `.open_context()`. - root ensures that it can also connect to the grandchild (little bro), do the same echo streaming, then ensure everything tears down correctly after cancelling all the children. More refinements to come here obvi in the specific cancellation semantics and possibly causes. Also tweaks the other tests in suite to use the new `Context` properties recently introduced and similarly updated in the previous patch to the ctx-semantics suite. 
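For reference, a minimal sketch (not part of this patch; the `just_sleep`
callee and the 'sleeper' actor name are made up purely for illustration) of
the caller-side cancel-ack flow these suites now assert on via the new
outcome property APIs:

import trio
import tractor
from tractor import (
    Context,
    ContextCancelled,
    current_actor,
)


@tractor.context
async def just_sleep(
    ctx: Context,
) -> None:
    # hypothetical callee: ack the ctx open then block until cancelled.
    await ctx.started()
    await trio.sleep_forever()


async def main() -> None:
    async with tractor.open_nursery() as an:
        portal = await an.start_actor(
            'sleeper',
            enable_modules=[__name__],
        )
        async with portal.open_context(
            just_sleep,
        ) as (ctx, first):

            # caller-side cancel request to the remote task.
            await ctx.cancel()

            # the ctxc-ack should be absorbed as a "self cancel"
            # (no raise here) and instead surface via the
            # outcome/error properties.
            res = await ctx.result()
            assert isinstance(res, ContextCancelled)
            assert ctx.cancel_called and ctx.cancel_acked
            assert ctx.maybe_error is ctx.outcome
            assert res.canceller == current_actor().uid

        # tear down the subactor after the ctx block closes.
        await portal.cancel_actor()


if __name__ == '__main__':
    trio.run(main)
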
--- tests/test_inter_peer_cancellation.py | 484 +++++++++++++++++++++++--- 1 file changed, 445 insertions(+), 39 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 1ead6172..082c5e65 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -10,6 +10,9 @@ import pytest import trio import tractor from tractor import ( # typing + Actor, + current_actor, + open_nursery, Portal, Context, ContextCancelled, @@ -238,19 +241,23 @@ async def stream_from_peer( # caller peer should not be the cancel requester assert not ctx.cancel_called + assert not ctx.cancel_acked - # XXX can never be true since `._invoke` only + # XXX can NEVER BE TRUE since `._invoke` only # sets this AFTER the nursery block this task # was started in, exits. - assert not ctx.cancelled_caught + assert not ctx._scope.cancelled_caught - # we never requested cancellation + # we never requested cancellation, it was the 'canceller' + # peer. assert not peer_ctx.cancel_called + assert not peer_ctx.cancel_acked + # the `.open_context()` exit definitely caught # a cancellation in the internal `Context._scope` since # likely the runtime called `_deliver_msg()` after # receiving the remote error from the streaming task. - assert peer_ctx.cancelled_caught + assert not peer_ctx._scope.cancelled_caught # TODO / NOTE `.canceller` won't have been set yet # here because that machinery is inside @@ -259,6 +266,8 @@ async def stream_from_peer( # checkpoint) that cancellation was due to # a remote, we COULD assert this here..see, # https://github.com/goodboy/tractor/issues/368 + # + # assert 'canceller' in ctx.canceller # root/parent actor task should NEVER HAVE cancelled us! assert not ctx.canceller @@ -356,8 +365,7 @@ def test_peer_canceller( 'just_caller', # but i just met her? enable_modules=[__name__], ) - - root = tractor.current_actor() + root: Actor = current_actor() try: async with ( @@ -395,8 +403,8 @@ def test_peer_canceller( # not request the sleeper cancellation ;) except ContextCancelled as ctxerr: print( - 'CAUGHT REMOTE CONTEXT CANCEL FOM\n' - f'{ctxerr}' + 'CAUGHT REMOTE CONTEXT CANCEL\n\n' + f'{ctxerr}\n' ) # canceller and caller peers should not @@ -404,6 +412,9 @@ def test_peer_canceller( assert canceller_ctx.canceller is None assert caller_ctx.canceller is None + # we were not the actor, our peer was + assert not sleeper_ctx.cancel_acked + assert ctxerr.canceller[0] == 'canceller' # XXX NOTE XXX: since THIS `ContextCancelled` @@ -411,11 +422,13 @@ def test_peer_canceller( # `sleeper.open_context().__aexit__()` this # value is not yet set, however outside this # block it should be. - assert not sleeper_ctx.cancelled_caught + assert not sleeper_ctx._scope.cancelled_caught + # CASE_1: error-during-ctxc-handling, if error_during_ctxerr_handling: raise RuntimeError('Simulated error during teardown') + # CASE_2: standard teardown inside in `.open_context()` block raise # XXX SHOULD NEVER EVER GET HERE XXX @@ -436,7 +449,6 @@ def test_peer_canceller( else: pytest.fail( 'did not rx ctxc ?!?\n\n' - f'{ctxs}\n' ) @@ -447,21 +459,48 @@ def test_peer_canceller( _loc_err = loc_err # NOTE: the main state to check on `Context` is: - # - `.cancelled_caught` (maps to nursery cs) # - `.cancel_called` (bool of whether this side # requested) + # - `.cancel_acked` (bool of whether a ctxc + # response was received due to cancel req). 
+ # - `.maybe_error` (highest prio error to raise + # locally) + # - `.outcome` (final error or result value) # - `.canceller` (uid of cancel-causing actor-task) # - `._remote_error` (any `RemoteActorError` # instance from other side of context) + # - `._local_error` (any error caught inside the + # `.open_context()` block). + # + # XXX: Deprecated and internal only + # - `.cancelled_caught` (maps to nursery cs) + # - now just use `._scope.cancelled_caught` + # since it maps to the internal (maps to nursery cs) + # # TODO: are we really planning to use this tho? # - `._cancel_msg` (any msg that caused the # cancel) - # CASE: error raised during handling of - # `ContextCancelled` inside `.open_context()` - # block + # CASE_1: error-during-ctxc-handling, + # - far end cancels due to peer 'canceller', + # - `ContextCancelled` relayed to this scope, + # - inside `.open_context()` ctxc is caught and + # a rte raised instead + # + # => block should raise the rte but all peers + # should be cancelled by US. + # if error_during_ctxerr_handling: assert isinstance(loc_err, RuntimeError) + print(f'_loc_err: {_loc_err}\n') + # assert sleeper_ctx._local_error is _loc_err + # assert sleeper_ctx._local_error is _loc_err + assert not ( + loc_err + is sleeper_ctx.maybe_error + is sleeper_ctx.outcome + is sleeper_ctx._remote_error + ) # NOTE: this root actor task should have # called `Context.cancel()` on the @@ -495,7 +534,25 @@ def test_peer_canceller( root.uid ) - # CASE: standard teardown inside in `.open_context()` block + # since the sleeper errors while handling a + # peer-cancelled (by ctxc) scenario, we expect + # that the `.open_context()` block DOES call + # `.cancel() (despite in this test case it + # being unecessary). + assert ( + sleeper_ctx.cancel_called + and + not sleeper_ctx.cancel_acked + ) + + # CASE_2: standard teardown inside in `.open_context()` block + # - far end cancels due to peer 'canceller', + # - `ContextCancelled` relayed to this scope and + # raised locally without any raise-during-handle, + # + # => inside `.open_context()` ctxc is raised and + # propagated + # else: assert isinstance(loc_err, ContextCancelled) assert loc_err.canceller == sleeper_ctx.canceller @@ -509,24 +566,42 @@ def test_peer_canceller( # the sleeper's remote error is the error bubbled # out of the context-stack above! - re = sleeper_ctx._remote_error - assert re is loc_err + re = sleeper_ctx.outcome + assert ( + re is loc_err + is sleeper_ctx.maybe_error + is sleeper_ctx._remote_error + ) for ctx in ctxs: - re: BaseException | None = ctx._remote_error - assert re + re: BaseException|None = ctx._remote_error + re: BaseException|None = ctx.outcome + assert ( + re and + ( + re is ctx.maybe_error + is ctx._remote_error + ) + ) + le: trio.MultiError = ctx._local_error + assert ( + le + and ctx._local_error + ) # root doesn't cancel sleeper since it's # cancelled by its peer. if ctx is sleeper_ctx: assert not ctx.cancel_called + assert not ctx.cancel_acked + # since sleeper_ctx.result() IS called # above we should have (silently) # absorbed the corresponding # `ContextCancelled` for it and thus # the logic inside `.cancelled_caught` # should trigger! - assert ctx.cancelled_caught + assert ctx._scope.cancelled_caught elif ctx is caller_ctx: # since its context was remotely @@ -535,15 +610,33 @@ def test_peer_canceller( # done by the peer and also we never assert ctx.cancel_called - # TODO: figure out the details of - # this.. + # TODO: figure out the details of this..? 
# if you look the `._local_error` here # is a multi of ctxc + 2 Cancelleds? # assert not ctx.cancelled_caught + elif ctx is canceller_ctx: + + # XXX NOTE XXX: ONLY the canceller + # will get a self-cancelled outcome + # whilst everyone else gets + # a peer-caused cancellation! + # + # TODO: really we should avoid calling + # .cancel() whenever an interpeer + # cancel takes place since each + # reception of a ctxc + assert ( + ctx.cancel_called + and ctx.cancel_acked + ) + assert not ctx._scope.cancelled_caught + else: - assert ctx.cancel_called - assert not ctx.cancelled_caught + pytest.fail( + 'Uhh wut ctx is this?\n' + f'{ctx}\n' + ) # TODO: do we even need this flag? # -> each context should have received @@ -559,14 +652,24 @@ def test_peer_canceller( # `Context.cancel()` SHOULD NOT have been # called inside # `Portal.open_context().__aexit__()`. - assert not sleeper_ctx.cancel_called + assert not ( + sleeper_ctx.cancel_called + or + sleeper_ctx.cancel_acked + ) # XXX NOTE XXX: and see matching comment above but, - # this flag is set only AFTER the `.open_context()` - # has exited and should be set in both outcomes - # including the case where ctx-cancel handling - # itself errors. - assert sleeper_ctx.cancelled_caught + # the `._scope` is only set by `trio` AFTER the + # `.open_context()` block has exited and should be + # set in both outcomes including the case where + # ctx-cancel handling itself errors. + assert sleeper_ctx._scope.cancelled_caught + assert _loc_err is sleeper_ctx._local_error + assert ( + sleeper_ctx.outcome + is sleeper_ctx.maybe_error + is sleeper_ctx._remote_error + ) raise # always to ensure teardown @@ -582,12 +685,315 @@ def test_peer_canceller( assert excinfo.value.canceller[0] == 'canceller' -def test_client_tree_spawns_and_cancels_service_subactor(): - ... -# TODO: test for the modden `mod wks open piker` bug! -# -> start actor-tree (server) that offers sub-actor spawns via -# context API -# -> start another full actor-tree (client) which requests to the first to -# spawn over its `@context` ep / api. -# -> client actor cancels the context and should exit gracefully -# and the server's spawned child should cancel and terminate! +@tractor.context +async def basic_echo_server( + ctx: Context, + peer_name: str = 'stepbro', + +) -> None: + ''' + Just the simplest `MsgStream` echo server which resays what + you told it but with its uid in front ;) + + ''' + actor: Actor = tractor.current_actor() + uid: tuple = actor.uid + await ctx.started(uid) + async with ctx.open_stream() as ipc: + async for msg in ipc: + + # repack msg pair with our uid + # as first element. + ( + client_uid, + i, + ) = msg + resp: tuple = ( + uid, + i, + ) + # OOF! 
looks like my runtime-error is causing a lockup + # assert 0 + await ipc.send(resp) + + +@tractor.context +async def serve_subactors( + ctx: Context, + peer_name: str, + +) -> None: + async with open_nursery() as an: + await ctx.started(peer_name) + async with ctx.open_stream() as reqs: + async for msg in reqs: + peer_name: str = msg + peer: Portal = await an.start_actor( + name=peer_name, + enable_modules=[__name__], + ) + print( + 'Spawning new subactor\n' + f'{peer_name}\n' + f'|_{peer}\n' + ) + await reqs.send(( + peer.chan.uid, + peer.chan.raddr, + )) + + print('Spawner exiting spawn serve loop!') + + +@tractor.context +async def client_req_subactor( + ctx: Context, + peer_name: str, + + # used to simulate a user causing an error to be raised + # directly in thread (like a KBI) to better replicate the + # case where a `modden` CLI client would hang afer requesting + # a `Context.cancel()` to `bigd`'s wks spawner. + reraise_on_cancel: str|None = None, + +) -> None: + # TODO: other cases to do with sub lifetimes: + # -[ ] test that we can have the server spawn a sub + # that lives longer then ctx with this client. + # -[ ] test that + + # open ctx with peer spawn server and ask it to spawn a little + # bro which we'll then connect and stream with. + async with ( + tractor.find_actor( + name='spawn_server', + raise_on_none=True, + + # TODO: we should be isolating this from other runs! + # => ideally so we can eventually use something like + # `pytest-xdist` Bo + # registry_addrs=bigd._reg_addrs, + ) as spawner, + + spawner.open_context( + serve_subactors, + peer_name=peer_name, + ) as (spawner_ctx, first), + ): + assert first == peer_name + await ctx.started( + 'yup i had brudder', + ) + + async with spawner_ctx.open_stream() as reqs: + + # send single spawn request to the server + await reqs.send(peer_name) + with trio.fail_after(3): + ( + sub_uid, + sub_raddr, + ) = await reqs.receive() + + + await tell_little_bro( + actor_name=sub_uid[0], + caller='client', + ) + + # TODO: test different scope-layers of + # cancellation? + # with trio.CancelScope() as cs: + try: + await trio.sleep_forever() + + # TODO: would be super nice to have a special injected + # cancel type here (maybe just our ctxc) but using + # some native mechanism in `trio` :p + except ( + trio.Cancelled + ) as err: + _err = err + if reraise_on_cancel: + errtype = globals()['__builtins__'][reraise_on_cancel] + assert errtype + to_reraise: BaseException = errtype() + print(f'client re-raising on cancel: {repr(to_reraise)}') + raise err + + raise + + # if cs.cancelled_caught: + # print('client handling expected KBI!') + # await ctx. + # await trio.sleep( + # await tractor.pause() + # await spawner_ctx.cancel() + + # cancel spawned sub-actor directly? + # await sub_ctx.cancel() + + # maybe cancel runtime? + # await sub.cancel_actor() + + +async def tell_little_bro( + actor_name: str, + caller: str = '' +): + # contact target actor, do a stream dialog. 
+ async with ( + tractor.wait_for_actor( + name=actor_name + ) as lb, + lb.open_context( + basic_echo_server, + ) as (sub_ctx, first), + sub_ctx.open_stream( + basic_echo_server, + ) as echo_ipc, + ): + actor: Actor = current_actor() + uid: tuple = actor.uid + for i in range(100): + msg: tuple = ( + uid, + i, + ) + await echo_ipc.send(msg) + resp = await echo_ipc.receive() + print( + f'{caller} => {actor_name}: {msg}\n' + f'{caller} <= {actor_name}: {resp}\n' + ) + ( + sub_uid, + _i, + ) = resp + assert sub_uid != uid + assert _i == i + + +@pytest.mark.parametrize( + 'raise_client_error', + [None, 'KeyboardInterrupt'], +) +def test_peer_spawns_and_cancels_service_subactor( + debug_mode: bool, + raise_client_error: str, +): + # NOTE: this tests for the modden `mod wks open piker` bug + # discovered as part of implementing workspace ctx + # open-.pause()-ctx.cancel() as part of the CLI.. + + # -> start actor-tree (server) that offers sub-actor spawns via + # context API + # -> start another full actor-tree (client) which requests to the first to + # spawn over its `@context` ep / api. + # -> client actor cancels the context and should exit gracefully + # and the server's spawned child should cancel and terminate! + peer_name: str = 'little_bro' + + async def main(): + async with tractor.open_nursery( + # NOTE: to halt the peer tasks on ctxc, uncomment this. + debug_mode=debug_mode, + ) as an: + server: Portal = await an.start_actor( + (server_name := 'spawn_server'), + enable_modules=[__name__], + ) + print(f'Spawned `{server_name}`') + + client: Portal = await an.start_actor( + client_name := 'client', + enable_modules=[__name__], + ) + print(f'Spawned `{client_name}`') + + try: + async with ( + server.open_context( + serve_subactors, + peer_name=peer_name, + ) as (spawn_ctx, first), + + client.open_context( + client_req_subactor, + peer_name=peer_name, + reraise_on_cancel=raise_client_error, + ) as (client_ctx, client_says), + ): + print( + f'Server says: {first}\n' + f'Client says: {client_says}\n' + ) + + # attach to client-requested-to-spawn + # (grandchild of this root actor) "little_bro" + # and ensure we can also use it as an echo + # server. + async with tractor.wait_for_actor( + name=peer_name, + ) as sub: + assert sub + + print( + 'Sub-spawn came online\n' + f'portal: {sub}\n' + f'.uid: {sub.actor.uid}\n' + f'chan.raddr: {sub.chan.raddr}\n' + ) + await tell_little_bro( + actor_name=peer_name, + caller='root', + ) + + # signal client to raise a KBI + await client_ctx.cancel() + print('root cancelled client, checking that sub-spawn is down') + + async with tractor.find_actor( + name=peer_name, + ) as sub: + assert not sub + + print('root cancelling server/client sub-actors') + + # await tractor.pause() + res = await client_ctx.result(hide_tb=False) + assert isinstance(res, ContextCancelled) + assert client_ctx.cancel_acked + assert res.canceller == current_actor().uid + + await spawn_ctx.cancel() + # await server.cancel_actor() + + # since we called `.cancel_actor()`, `.cancel_ack` + # will not be set on the ctx bc `ctx.cancel()` was not + # called directly fot this confext. + except ContextCancelled as ctxc: + print('caught ctxc from contexts!') + assert ctxc.canceller == current_actor().uid + assert ctxc is spawn_ctx.outcome + assert ctxc is spawn_ctx.maybe_error + raise + + # assert spawn_ctx.cancel_acked + assert spawn_ctx.cancel_acked + assert client_ctx.cancel_acked + + await client.cancel_actor() + await server.cancel_actor() + + # WOA WOA WOA! we need this to close..!!!?? 
+ # that's super bad XD + + # TODO: why isn't this working!?!? + # we're now outside the `.open_context()` block so + # the internal `Context._scope: CancelScope` should be + # gracefully "closed" ;) + + # assert spawn_ctx.cancelled_caught + + trio.run(main) -- 2.34.1 From b54cb6682cf058d3726276131970f7d710256e27 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 29 Feb 2024 17:21:43 -0500 Subject: [PATCH 128/378] Add #TODO for generating func-sig type-annots as `str` for pprinting --- tractor/msg/types.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 25e7b39b..3ceff845 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -35,6 +35,24 @@ from msgspec import ( structs, ) +# TODO: auto-gen type sig for input func both for +# type-msgs and logging of RPC tasks? +# taken and modified from: +# https://stackoverflow.com/a/57110117 +# import inspect +# from typing import List + +# def my_function(input_1: str, input_2: int) -> list[int]: +# pass + +# def types_of(func): +# specs = inspect.getfullargspec(func) +# return_type = specs.annotations['return'] +# input_types = [t.__name__ for s, t in specs.annotations.items() if s != 'return'] +# return f'{func.__name__}({": ".join(input_types)}) -> {return_type}' + +# types_of(my_function) + class DiffDump(UserList): ''' @@ -161,6 +179,7 @@ class Struct( # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr val_str: str = saferepr(v) + # TODO: LOLOL use `textwrap.indent()` instead dawwwwwg! obj_str += (field_ws + f'{k}: {typ_name} = {val_str},\n') return ( -- 2.34.1 From 1e5810e56c6b66f692a369a945316c6705802d1a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 29 Feb 2024 17:37:02 -0500 Subject: [PATCH 129/378] Make `NamespacePath` kinda support methods.. Obviously we can't deterministic-ally call `.load_ref()` (since you'd have to point to an `id()` or something and presume a particular py-runtime + virt-mem space for it to exist?) but it at least helps with the `str` formatting for logging purposes (like `._cancel_rpc_tasks()`) when `repr`-ing ctxs and their specific "rpc signatures". Maybe in the future getting this working at least for singleton types per process (like `Actor` XD ) will be a thing we can support and make some sense of.. Bo --- tractor/msg/ptr.py | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/tractor/msg/ptr.py b/tractor/msg/ptr.py index 87d7bf2b..4d089c3e 100644 --- a/tractor/msg/ptr.py +++ b/tractor/msg/ptr.py @@ -43,17 +43,24 @@ IPC-compat cross-mem-boundary object pointer. # - https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type from __future__ import annotations -from inspect import isfunction +from inspect import ( + isfunction, + ismethod, +) from pkgutil import resolve_name class NamespacePath(str): ''' - A serializeable description of a (function) Python object - location described by the target's module path and namespace - key meant as a message-native "packet" to allows actors to - point-and-load objects by an absolute ``str`` (and thus - serializable) reference. + A serializeable `str`-subtype implementing a "namespace + pointer" to any Python object reference (like a function) + using the same format as the built-in `pkgutil.resolve_name()` + system. 
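+    (For example, by this convention a value like
+    'pkgutil:resolve_name' would point to that very
+    function.)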
+ + A value describes a target's module-path and namespace-key + separated by a ':' and thus can be easily used as + a IPC-message-native reference-type allowing memory isolated + actors to point-and-load objects via a minimal `str` value. ''' _ref: object | type | None = None @@ -81,13 +88,23 @@ class NamespacePath(str): ''' if ( - isinstance(ref, object) - and not isfunction(ref) + isfunction(ref) ): - name: str = type(ref).__name__ - else: name: str = getattr(ref, '__name__') + elif ismethod(ref): + # build out the path manually i guess..? + # TODO: better way? + name: str = '.'.join([ + type(ref.__self__).__name__, + ref.__func__.__name__, + ]) + + else: # object or other? + # isinstance(ref, object) + # and not isfunction(ref) + name: str = type(ref).__name__ + # fully qualified namespace path, tuple. fqnp: tuple[str, str] = ( ref.__module__, -- 2.34.1 From 23aa97692ec44e90fbdb79429738e9a31e16b757 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 29 Feb 2024 18:20:41 -0500 Subject: [PATCH 130/378] Fix `Channel.__repr__()` safety, renames to `._transport` Hit a reallly weird bug in the `._runtime` IPC msg handling loop where it seems that by `str.format()`-ing a `Channel` before initializing it would put the `._MsgTransport._agen()` in an already started state causing an irrecoverable core startup failure.. I presume it's something to do with delegating to the `MsgpackTCPStream.__repr__()` and, something something.. the `.set_msg_transport(stream)` getting called to too early such that `.msgstream.__init__()` is called thus init-ing the `._agen()` before necessary? I'm sure there's a design lesson to be learned in here somewhere XD This was discovered while trying to add more "fancy" logging throughout said core for the purposes of cobbling together an init attempt at libp2p style multi-address representations for our IPC primitives. Thus I also tinker here with adding some new fields to `MsgpackTCPStream`: - `layer_key`: int = 4 - `name_key`: str = 'tcp' - `codec_key`: str = 'msgpack' Anyway, just changed it so that if `.msgstream` ain't set then we just return a little "null repr" `str` value thinger. Also renames `Channel.msgstream` internally to `._transport` with appropriate pub `@property`s added such that everything else won't break ;p Also drops `Optional` typing vis-a-vi modern union syntax B) --- tractor/_ipc.py | 93 +++++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 37 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 5e286c1d..b108c90e 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -30,7 +30,6 @@ import typing from typing import ( Any, runtime_checkable, - Optional, Protocol, Type, TypeVar, @@ -113,6 +112,13 @@ class MsgpackTCPStream(MsgTransport): using the ``msgspec`` codec lib. ''' + layer_key: int = 4 + name_key: str = 'tcp' + + # TODO: better naming for this? + # -[ ] check how libp2p does naming for such things? + codec_key: str = 'msgpack' + def __init__( self, stream: trio.SocketStream, @@ -268,7 +274,7 @@ class Channel: def __init__( self, - destaddr: Optional[tuple[str, int]], + destaddr: tuple[str, int]|None, msg_transport_type_key: tuple[str, str] = ('msgpack', 'tcp'), @@ -286,14 +292,14 @@ class Channel: # Either created in ``.connect()`` or passed in by # user in ``.from_stream()``. 
- self._stream: Optional[trio.SocketStream] = None - self.msgstream: Optional[MsgTransport] = None + self._stream: trio.SocketStream|None = None + self._transport: MsgTransport|None = None # set after handshake - always uid of far end - self.uid: Optional[tuple[str, str]] = None + self.uid: tuple[str, str]|None = None self._agen = self._aiter_recv() - self._exc: Optional[Exception] = None # set if far end actor errors + self._exc: Exception|None = None # set if far end actor errors self._closed: bool = False # flag set by ``Portal.cancel_actor()`` indicating remote @@ -301,6 +307,15 @@ class Channel: # runtime. self._cancel_called: bool = False + @property + def msgstream(self) -> MsgTransport: + log.info('`Channel.msgstream` is an old name, use `._transport`') + return self._transport + + @property + def transport(self) -> MsgTransport: + return self._transport + @classmethod def from_stream( cls, @@ -310,40 +325,44 @@ class Channel: ) -> Channel: src, dst = get_stream_addrs(stream) - chan = Channel(destaddr=dst, **kwargs) + chan = Channel( + destaddr=dst, + **kwargs, + ) # set immediately here from provided instance - chan._stream = stream + chan._stream: trio.SocketStream = stream chan.set_msg_transport(stream) return chan def set_msg_transport( self, stream: trio.SocketStream, - type_key: Optional[tuple[str, str]] = None, + type_key: tuple[str, str]|None = None, ) -> MsgTransport: type_key = type_key or self._transport_key - self.msgstream = get_msg_transport(type_key)(stream) - return self.msgstream + self._transport = get_msg_transport(type_key)(stream) + return self._transport def __repr__(self) -> str: - if self.msgstream: - return repr( - self.msgstream.stream.socket._sock - ).replace( # type: ignore - "socket.socket", - "Channel", - ) - return object.__repr__(self) + if not self._transport: + return '' + + return repr( + self._transport.stream.socket._sock + ).replace( # type: ignore + "socket.socket", + "Channel", + ) @property - def laddr(self) -> Optional[tuple[str, int]]: - return self.msgstream.laddr if self.msgstream else None + def laddr(self) -> tuple[str, int]|None: + return self._transport.laddr if self._transport else None @property - def raddr(self) -> Optional[tuple[str, int]]: - return self.msgstream.raddr if self.msgstream else None + def raddr(self) -> tuple[str, int]|None: + return self._transport.raddr if self._transport else None async def connect( self, @@ -362,12 +381,12 @@ class Channel: *destaddr, **kwargs ) - msgstream = self.set_msg_transport(stream) + transport = self.set_msg_transport(stream) log.transport( - f'Opened channel[{type(msgstream)}]: {self.laddr} -> {self.raddr}' + f'Opened channel[{type(transport)}]: {self.laddr} -> {self.raddr}' ) - return msgstream + return transport async def send(self, item: Any) -> None: @@ -375,16 +394,16 @@ class Channel: '=> send IPC msg:\n\n' f'{pformat(item)}\n' ) # type: ignore - assert self.msgstream + assert self._transport - await self.msgstream.send(item) + await self._transport.send(item) async def recv(self) -> Any: - assert self.msgstream - return await self.msgstream.recv() + assert self._transport + return await self._transport.recv() # try: - # return await self.msgstream.recv() + # return await self._transport.recv() # except trio.BrokenResourceError: # if self._autorecon: # await self._reconnect() @@ -397,8 +416,8 @@ class Channel: f'Closing channel to {self.uid} ' f'{self.laddr} -> {self.raddr}' ) - assert self.msgstream - await self.msgstream.stream.aclose() + assert self._transport + await 
self._transport.stream.aclose() self._closed = True async def __aenter__(self): @@ -449,16 +468,16 @@ class Channel: Async iterate items from underlying stream. ''' - assert self.msgstream + assert self._transport while True: try: - async for item in self.msgstream: + async for item in self._transport: yield item # sent = yield item # if sent is not None: # # optimization, passing None through all the # # time is pointless - # await self.msgstream.send(sent) + # await self._transport.send(sent) except trio.BrokenResourceError: # if not self._autorecon: @@ -471,7 +490,7 @@ class Channel: # continue def connected(self) -> bool: - return self.msgstream.connected() if self.msgstream else False + return self._transport.connected() if self._transport else False @asynccontextmanager -- 2.34.1 From 9bc6a61c93de61dcff24064431ec48bcdd4b4308 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 29 Feb 2024 18:56:31 -0500 Subject: [PATCH 131/378] Add "fancier" remote-error `.__repr__()`-ing Our remote error box types `RemoteActorError`, `ContextCancelled` and `StreamOverrun` needed a console display makeover particularly for logging content and `repr()` in higher level primitives like `Context`. This adds a more "dramatic" str-representation to showcase the underlying boxed traceback content more sensationally (via ascii-art emphasis) as well as support a more terse `.reprol()` (representation for one-line) format that can be used for types that track remote errors/cancels like with `Context._remote_error`. Impl deats: - change `RemoteActorError.__repr__()` formatting to show (sub-type specific) `.msgdata` fields in a multi-line format (similar to our new `.msg.types.Struct` style) followed by some ascii accented delimiter lines to emphasize any `.msgdata["tb_str"]` packed by the remote - for rme and subtypes allow picking the specifically relevant fields via a type defined `.reprol_fields: list[str]` and pick for each subtype: |_ `RemoteActorError.src_actor_uid` |_ `ContextCancelled.canceller` |_ `StreamOverrun.sender` - add `.reprol()` to show a `repr()`-on-one-line formatted string that can be used by other multi-line-field-`repr()` styled composite types as needed in (high level) logging info. - toss in some mod level `_body_fields: list[str]` for summary of such fields (if needed). - add some new rae (remote-actor-error) props: - `.type` around a newly named `.boxed_type` - `.type_str: str` - `.tb_str: str` --- tractor/_exceptions.py | 134 +++++++++++++++++++++++++++++++++++------ 1 file changed, 116 insertions(+), 18 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index fe3f2706..259a28a7 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -27,6 +27,7 @@ from typing import ( Type, TYPE_CHECKING, ) +import textwrap import traceback import exceptiongroup as eg @@ -37,8 +38,9 @@ from .log import get_logger if TYPE_CHECKING: from ._context import Context - from ._stream import MsgStream from .log import StackLevelAdapter + from ._stream import MsgStream + from ._ipc import Channel log = get_logger('tractor') @@ -49,6 +51,25 @@ class ActorFailure(Exception): "General actor failure" +class InternalError(RuntimeError): + ''' + Entirely unexpected internal machinery error indicating + a completely invalid state or interface. + + ''' + +_body_fields: list[str] = [ + 'src_actor_uid', + 'canceller', + 'sender', +] + +_msgdata_keys: list[str] = [ + 'type_str', +] + _body_fields + + + # TODO: rename to just `RemoteError`? 
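+# NOTE: a rough sketch of the one-line `.reprol()` output fmt
+# described above; the canceller uid is made up for example's sake:
+#
+#   ContextCancelled(canceller=('root', '3c6e24f1-...')
+#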
class RemoteActorError(Exception): ''' @@ -60,6 +81,10 @@ class RemoteActorError(Exception): a special "error" IPC msg sent by some remote actor-runtime. ''' + reprol_fields: list[str] = [ + 'src_actor_uid', + ] + def __init__( self, message: str, @@ -77,23 +102,82 @@ class RemoteActorError(Exception): # - .remote_type # also pertains to our long long oustanding issue XD # https://github.com/goodboy/tractor/issues/5 - self.type: str = suberror_type + self.boxed_type: str = suberror_type self.msgdata: dict[str, Any] = msgdata @property - def src_actor_uid(self) -> tuple[str, str] | None: + def type(self) -> str: + return self.boxed_type + + @property + def type_str(self) -> str: + return str(type(self.boxed_type).__name__) + + @property + def src_actor_uid(self) -> tuple[str, str]|None: return self.msgdata.get('src_actor_uid') - def __repr__(self) -> str: + @property + def tb_str( + self, + indent: str = ' '*3, + ) -> str: if remote_tb := self.msgdata.get('tb_str'): - pformat(remote_tb) - return ( - f'{type(self).__name__}(\n' - f'msgdata={pformat(self.msgdata)}\n' - ')' + return textwrap.indent( + remote_tb, + prefix=indent, ) - return super().__repr__() + return '' + + def reprol(self) -> str: + ''' + Represent this error for "one line" display, like in + a field of our `Context.__repr__()` output. + + ''' + _repr: str = f'{type(self).__name__}(' + for key in self.reprol_fields: + val: Any|None = self.msgdata.get(key) + if val: + _repr += f'{key}={repr(val)} ' + + return _repr + + def __repr__(self) -> str: + + fields: str = '' + for key in _body_fields: + val: str|None = self.msgdata.get(key) + if val: + fields += f'{key}={val}\n' + + fields: str = textwrap.indent( + fields, + # prefix=' '*2, + prefix=' |_', + ) + indent: str = ''*1 + body: str = ( + f'{fields}' + f' |\n' + f' ------ - ------\n\n' + f'{self.tb_str}\n' + f' ------ - ------\n' + f' _|\n' + ) + # f'|\n' + # f' |\n' + if indent: + body: str = textwrap.indent( + body, + prefix=indent, + ) + return ( + f'<{type(self).__name__}(\n' + f'{body}' + ')>' + ) # TODO: local recontruction of remote exception deats # def unbox(self) -> BaseException: @@ -102,8 +186,9 @@ class RemoteActorError(Exception): class InternalActorError(RemoteActorError): ''' - Remote internal ``tractor`` error indicating - failure of some primitive or machinery. + (Remote) internal `tractor` error indicating failure of some + primitive, machinery state or lowlevel task that should never + occur. ''' @@ -114,6 +199,9 @@ class ContextCancelled(RemoteActorError): ``Portal.cancel_actor()`` or ``Context.cancel()``. 
''' + reprol_fields: list[str] = [ + 'canceller', + ] @property def canceller(self) -> tuple[str, str]|None: ''' @@ -145,6 +233,9 @@ class ContextCancelled(RemoteActorError): f'{self}' ) + # to make `.__repr__()` work uniformly + # src_actor_uid = canceller + class TransportClosed(trio.ClosedResourceError): "Underlying channel transport was closed prior to use" @@ -166,6 +257,9 @@ class StreamOverrun( RemoteActorError, trio.TooSlowError, ): + reprol_fields: list[str] = [ + 'sender', + ] ''' This stream was overrun by sender @@ -213,6 +307,7 @@ def pack_error( ] = { 'tb_str': tb_str, 'type_str': type(exc).__name__, + 'boxed_type': type(exc).__name__, 'src_actor_uid': current_actor().uid, } @@ -238,8 +333,8 @@ def unpack_error( msg: dict[str, Any], - chan=None, - err_type=RemoteActorError, + chan: Channel|None = None, + box_type: RemoteActorError = RemoteActorError, hide_tb: bool = True, @@ -264,12 +359,15 @@ def unpack_error( # retrieve the remote error's msg encoded details tb_str: str = error_dict.get('tb_str', '') message: str = f'{chan.uid}\n' + tb_str - type_name: str = error_dict['type_str'] + type_name: str = ( + error_dict.get('type_str') + or error_dict['boxed_type'] + ) suberror_type: Type[BaseException] = Exception if type_name == 'ContextCancelled': - err_type = ContextCancelled - suberror_type = err_type + box_type = ContextCancelled + suberror_type = box_type else: # try to lookup a suitable local error type for ns in [ @@ -285,7 +383,7 @@ def unpack_error( ): break - exc = err_type( + exc = box_type( message, suberror_type=suberror_type, -- 2.34.1 From 4f69af872c9cf3c36c9126656ac5ff47d4262eae Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 29 Feb 2024 20:01:39 -0500 Subject: [PATCH 132/378] Add field-first subproca `.info()` to `._entry` --- tractor/_entry.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tractor/_entry.py b/tractor/_entry.py index b5ab4055..4a1499a0 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -106,25 +106,25 @@ def _trio_main( Entry point for a `trio_run_in_process` subactor. 
''' - log.info(f"Started new trio process for {actor.uid}") - - if actor.loglevel is not None: - log.info( - f"Setting loglevel for {actor.uid} to {actor.loglevel}") - get_console_log(actor.loglevel) - - log.info( - f"Started {actor.uid}") - _state._current_actor = actor - - log.debug(f"parent_addr is {parent_addr}") trio_main = partial( async_main, actor, parent_addr=parent_addr ) + if actor.loglevel is not None: + get_console_log(actor.loglevel) + import os + log.info( + 'Started new trio process:\n' + f'|_{actor}\n' + f' uid: {actor.uid}\n' + f' pid: {os.getpid()}\n' + f' parent_addr: {parent_addr}\n' + f' loglevel: {actor.loglevel}\n' + ) + try: if infect_asyncio: actor._infected_aio = True @@ -133,7 +133,7 @@ def _trio_main( trio.run(trio_main) except KeyboardInterrupt: log.cancel( - f'Actor@{actor.uid} received KBI' + f'@{actor.uid} received KBI' ) finally: -- 2.34.1 From 50465d4b340ec657f07af8e2c5e7bf5e4c1594f6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 1 Mar 2024 10:47:42 -0500 Subject: [PATCH 133/378] Spawn naming and log format tweaks - rename `.soft_wait()` -> `.soft_kill()` - rename `.do_hard_kill()` -> `.hard_kill()` - adjust any `trio.Process.__repr__()` log msg contents to have the little tree branch prefix: `'|_'` --- tractor/_spawn.py | 51 +++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 7f50b9eb..5268b250 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -196,16 +196,16 @@ async def cancel_on_completion( result: Any|Exception = await exhaust_portal(portal, actor) if isinstance(result, Exception): errors[actor.uid]: Exception = result - log.warning( - 'Cancelling subactor due to error:\n' - f'uid: {portal.channel.uid}\n' + log.cancel( + 'Cancelling subactor runtime due to error:\n\n' + f'Portal.cancel_actor() => {portal.channel.uid}\n\n' f'error: {result}\n' ) else: log.runtime( - 'Cancelling subactor gracefully:\n' - f'uid: {portal.channel.uid}\n' + 'Cancelling subactor gracefully:\n\n' + f'Portal.cancel_actor() => {portal.channel.uid}\n\n' f'result: {result}\n' ) @@ -213,7 +213,7 @@ async def cancel_on_completion( await portal.cancel_actor() -async def do_hard_kill( +async def hard_kill( proc: trio.Process, terminate_after: int = 3, @@ -288,7 +288,7 @@ async def do_hard_kill( proc.kill() -async def soft_wait( +async def soft_kill( proc: ProcessType, wait_func: Callable[ @@ -299,17 +299,20 @@ async def soft_wait( ) -> None: ''' - Wait for proc termination but **dont' yet** teardown - std-streams (since it will clobber any ongoing pdb REPL - session). This is our "soft" (and thus itself cancellable) - join/reap on an actor-runtime-in-process. + Wait for proc termination but **don't yet** teardown + std-streams since it will clobber any ongoing pdb REPL + session. + + This is our "soft"/graceful, and thus itself also cancellable, + join/reap on an actor-runtime-in-process shutdown; it is + **not** the same as a "hard kill" via an OS signal (for that + see `.hard_kill()`). ''' uid: tuple[str, str] = portal.channel.uid try: log.cancel( - 'Soft waiting on sub-actor proc:\n' - f'uid: {uid}\n' + 'Soft killing sub-actor via `Portal.cancel_actor()`\n' f'|_{proc}\n' ) # wait on sub-proc to signal termination @@ -326,8 +329,9 @@ async def soft_wait( async def cancel_on_proc_deth(): ''' - "Cancel the (actor) cancel" request if we detect - that that the underlying sub-process terminated. 
+ "Cancel-the-cancel" request: if we detect that the + underlying sub-process exited prior to + a `Portal.cancel_actor()` call completing . ''' await wait_func(proc) @@ -439,19 +443,22 @@ async def trio_proc( spawn_cmd.append("--asyncio") cancelled_during_spawn: bool = False - proc: trio.Process | None = None + proc: trio.Process|None = None try: try: # TODO: needs ``trio_typing`` patch? proc = await trio.lowlevel.open_process(spawn_cmd) - - log.runtime(f"Started {proc}") + log.runtime( + 'Started new sub-proc\n' + f'|_{proc}\n' + ) # wait for actor to spawn and connect back to us # channel should have handshake completed by the # local actor by the time we get a ref to it event, chan = await actor_nursery._actor.wait_for_peer( - subactor.uid) + subactor.uid + ) except trio.Cancelled: cancelled_during_spawn = True @@ -512,7 +519,7 @@ async def trio_proc( # This is a "soft" (cancellable) join/reap which # will remote cancel the actor on a ``trio.Cancelled`` # condition. - await soft_wait( + await soft_kill( proc, trio.Process.wait, portal @@ -574,7 +581,7 @@ async def trio_proc( if proc.poll() is None: log.cancel(f"Attempting to hard kill {proc}") - await do_hard_kill(proc) + await hard_kill(proc) log.debug(f"Joined {proc}") else: @@ -718,7 +725,7 @@ async def mp_proc( # This is a "soft" (cancellable) join/reap which # will remote cancel the actor on a ``trio.Cancelled`` # condition. - await soft_wait( + await soft_kill( proc, proc_waiter, portal -- 2.34.1 From 08a6a51cb8f0e3c0edcf6bc195232eaab3a3fcfe Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 1 Mar 2024 15:44:01 -0500 Subject: [PATCH 134/378] Add `._implicit_runtime_started` mark, better logs After some deep logging improvements to many parts of `._runtime`, I realized a silly detail where we are always waiting on any opened `local_nursery: ActorNursery` to signal exit from `Actor._stream_handler()` even in the case of being an implicitly opened root actor (`open_root_actor()` wasn't called by user/app code) via `._supervise.open_nursery()`.. So, to address this add a `ActorNursery._implicit_runtime_started: bool` that can be set and then checked to avoid doing the unnecessary `.exited.wait()` (and any subsequent warn logging on an exit timeout) in that special but most common case XD Matching with other subsys log format refinements, improve readability and simplicity of the actor-nursery supervisory log msgs, including: - simplify and/or remove any content that more or less duplicates msg content found in emissions from lower-level primitives and sub-systems (like `._runtime`, `_context`, `_portal` etc.). - add a specific `._open_and_supervise_one_cancels_all_nursery()` handler block for `ContextCancelled` to log with `.cancel()` level noting that the case is a "remote cancellation". - put the nursery-exit and actor-tree shutdown status into a single msg in the `implicit_runtime` case. --- tractor/_supervise.py | 96 ++++++++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 23 deletions(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 86a317d6..c27e0e43 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -34,7 +34,10 @@ from ._state import current_actor, is_main_process from .log import get_logger, get_loglevel from ._runtime import Actor from ._portal import Portal -from ._exceptions import is_multi_cancelled +from ._exceptions import ( + is_multi_cancelled, + ContextCancelled, +) from ._root import open_root_actor from . import _state from . 
import _spawn @@ -104,6 +107,14 @@ class ActorNursery: self.errors = errors self.exited = trio.Event() + # NOTE: when no explicit call is made to + # `.open_root_actor()` by application code, + # `.open_nursery()` will implicitly call it to start the + # actor-tree runtime. In this case we mark ourselves as + # such so that runtime components can be aware for logging + # and syncing purposes to any actor opened nurseries. + self._implicit_runtime_started: bool = False + async def start_actor( self, name: str, @@ -249,10 +260,11 @@ class ActorNursery: ''' self.cancelled = True - log.cancel( - 'Cancelling actor nursery\n' - f'|_{self._children}\n' - ) + # TODO: impl a repr for spawn more compact + # then `._children`.. + children: dict = self._children + child_count: int = len(children) + msg: str = f'Cancelling actor nursery with {child_count} children\n' with trio.move_on_after(3) as cs: async with trio.open_nursery() as tn: @@ -263,7 +275,7 @@ class ActorNursery: subactor, proc, portal, - ) in self._children.values(): + ) in children.values(): # TODO: are we ever even going to use this or # is the spawning backend responsible for such @@ -275,12 +287,13 @@ class ActorNursery: if portal is None: # actor hasn't fully spawned yet event = self._actor._peer_connected[subactor.uid] log.warning( - f"{subactor.uid} wasn't finished spawning?") + f"{subactor.uid} never 't finished spawning?" + ) await event.wait() # channel/portal should now be up - _, _, portal = self._children[subactor.uid] + _, _, portal = children[subactor.uid] # XXX should be impossible to get here # unless method was called from within @@ -299,11 +312,13 @@ class ActorNursery: if portal.channel.connected(): tn.start_soon(portal.cancel_actor) + log.cancel(msg) # if we cancelled the cancel (we hung cancelling remote actors) # then hard kill all sub-processes if cs.cancelled_caught: log.error( - f'Failed to cancel {self}\nHard killing process tree!' + f'Failed to cancel {self}?\n' + 'Hard killing underlying subprocess tree!\n' ) subactor: Actor proc: trio.Process @@ -312,7 +327,7 @@ class ActorNursery: subactor, proc, portal, - ) in self._children.values(): + ) in children.values(): log.warning(f"Hard killing process {proc}") proc.terminate() @@ -390,26 +405,39 @@ async def _open_and_supervise_one_cancels_all_nursery( # worry more are coming). an._join_procs.set() - # XXX: hypothetically an error could be - # raised and then a cancel signal shows up + # XXX NOTE XXX: hypothetically an error could + # be raised and then a cancel signal shows up # slightly after in which case the `else:` # block here might not complete? For now, # shield both. with trio.CancelScope(shield=True): - etype = type(inner_err) + etype: type = type(inner_err) if etype in ( trio.Cancelled, - KeyboardInterrupt + KeyboardInterrupt, ) or ( is_multi_cancelled(inner_err) ): log.cancel( - f"Nursery for {current_actor().uid} " - f"was cancelled with {etype}") + f'Actor-nursery cancelled by {etype}\n\n' + + f'{current_actor().uid}\n' + f' |_{an}\n\n' + + # TODO: show tb str? 
+ # f'{tb_str}' + ) + elif etype in { + ContextCancelled, + }: + log.cancel( + 'Actor-nursery caught remote cancellation\n\n' + + f'{inner_err.tb_str}' + ) else: log.exception( - f"Nursery for {current_actor().uid} " - "errored with:" + 'Nursery errored with:\n' # TODO: same thing as in # `._invoke()` to compute how to @@ -450,11 +478,15 @@ async def _open_and_supervise_one_cancels_all_nursery( # ".run_in_actor()" actors then we also want to cancel all # remaining sub-actors (due to our lone strategy: # one-cancels-all). - log.cancel(f"Nursery cancelling due to {err}") if an._children: + log.cancel( + 'Actor-nursery cancelling due error type:\n' + f'{err}\n' + ) with trio.CancelScope(shield=True): await an.cancel() raise + finally: # No errors were raised while awaiting ".run_in_actor()" # actors but those actors may have returned remote errors as @@ -500,7 +532,7 @@ async def open_nursery( which cancellation scopes correspond to each spawned subactor set. ''' - implicit_runtime = False + implicit_runtime: bool = False actor = current_actor(err_on_no_runtime=False) @@ -512,7 +544,7 @@ async def open_nursery( log.info("Starting actor runtime!") # mark us for teardown on exit - implicit_runtime = True + implicit_runtime: bool = True async with open_root_actor(**kwargs) as actor: assert actor is current_actor() @@ -521,8 +553,21 @@ async def open_nursery( async with _open_and_supervise_one_cancels_all_nursery( actor ) as an: + + # NOTE: mark this nursery as having + # implicitly started the root actor so + # that `._runtime` machinery can avoid + # certain teardown synchronization + # blocking/waits and any associated (warn) + # logging when it's known that this + # nursery shouldn't be exited before the + # root actor is. + an._implicit_runtime_started = True yield an finally: + # XXX: this event will be set after the root actor + # runtime is already torn down, so we want to + # avoid any blocking on it. an.exited.set() else: # sub-nursery case @@ -536,8 +581,13 @@ async def open_nursery( an.exited.set() finally: - log.debug("Nursery teardown complete") + msg: str = ( + 'Actor-nursery exited\n' + f'|_{an}\n\n' + ) # shutdown runtime if it was started if implicit_runtime: - log.info("Shutting down actor tree") + msg += '=> Shutting down actor runtime <=\n' + + log.info(msg) -- 2.34.1 From 28fefe4ffecc4bddcfb9ffc01eca1641d04f52c9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 1 Mar 2024 19:27:10 -0500 Subject: [PATCH 135/378] Make stream draining status logs `.debug()` level --- tractor/_root.py | 2 +- tractor/_streaming.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 32cc3d57..c3deac9e 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -326,7 +326,7 @@ async def open_root_actor( not entered and not is_multi_cancelled(err) ): - logger.exception("Root actor crashed:") + logger.exception('Root actor crashed:\n') # ALWAYS re-raise any error bubbled up from the # runtime! 
diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 64b5dd6d..b2cfe485 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -265,7 +265,7 @@ class MsgStream(trio.abc.Channel): try: maybe_final_msg = self.receive_nowait() if maybe_final_msg: - log.cancel( + log.debug( 'Drained un-processed stream msg:\n' f'{pformat(maybe_final_msg)}' ) -- 2.34.1 From 299429a2788635e3cba30839a1512d157137cf3a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 1 Mar 2024 22:37:32 -0500 Subject: [PATCH 136/378] Deep `Context` refinements Spanning from the pub API, to instance `repr()` customization (for logging/REPL content), to the impl details around the notion of a "final outcome" and surrounding IPC msg draining mechanics during teardown. A few API and field updates: - new `.cancel_acked: bool` to replace what we were mostly using `.cancelled_caught: bool` for but, for purposes of better mapping the semantics of remote cancellation of parallel executing tasks; it's set only when `.cancel_called` is set and a ctxc arrives with a `.canceller` field set to the current actor uid indicating we requested and received acknowledgement from the other side's task that is cancelled gracefully. - strongly document and delegate (and prolly eventually remove as a pub attr) the `.cancelled_caught` property entirely to the underlying `._scope: trio.CancelScope`; the `trio` semantics don't really map well to the "parallel with IPC msging" case in the sense that for us it breaks the concept of the ctx/scope closure having "caught" something instead of having "received" a msg that the other side has "acknowledged" (i.e. which for us is the completion of cancellation). - new `.__repr__()`/`.__str__()` format that tries to tersely yet comprehensively as possible display everything you need to know about the 3 main layers of an SC-linked-IPC-context: * ipc: the transport + runtime layers net-addressing and prot info. * rpc: the specific linked caller-callee task signature details including task and msg-stream instances. * state: current execution and final outcome state of the task pair. * a teensie extra `.repr_rpc` for a condensed rpc signature. - new `.dst_maddr` to get a `libp2p` style "multi-address" (though right now it's just showing the transport layers so maybe we should move to to our `Channel`?) - new public instance-var fields supporting more granular remote cancellation/result/error state: * `.maybe_error: Exception|None` for any final (remote) error/ctxc which computes logic on the values of `._remote_error`/`._local_error` to determine the "final error" (if any) on termination. * `.outcome` to the final error or result (or `None` if un-terminated) * `.repr_outcome()` for a console/logging friendly version of the final result or error as needed for the `.__str__()`. - new private interface bits to support all of ^: * a new "no result yet" sentinel value, `Unresolved`, using a module level class singleton that `._result` is set too (instead of `id(self)`) to both determine if and present when no final result from the callee has-yet-been/was delivered (ever). => really we should get rid of `.result()` and change it to `.wait_for_result()` (or something)u * `_final_result_is_set()` predicate to avoid waiting for an already delivered result. * `._maybe_raise()` proto-impl that we should use to replace all the `if re:` blocks it can XD * new `._stream: MsgStream|None` for when a stream is opened to aid with the state repr mentioned above. 
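As a rough caller-side sketch of how this new "final outcome" API is
meant to read (`portal`, `some_ep` and `log` are placeholder names for
an already-connected peer portal, its ctx endpoint and a logger):

    async with portal.open_context(some_ep) as (ctx, first):
        ...

    # after the block exits (or after a manual `.result()` call)
    # the final state can be introspected without re-raising:
    if ctx.maybe_error:
        log.warning(f'ctx errored: {ctx.repr_outcome()}')
    else:
        final = ctx.outcome  # the callee task's return value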
Tweaks to the termination drain loop `_drain_to_final_msg()`: - obviously (obvi) use all the changes above when determining whether or not a "final outcome" has arrived and thus breaking from the loop ;) * like the `.outcome` `.maybe_error` and `._final_ctx_is_set()` in the `while` pred expression. - drop the `_recv_chan.receive_nowait()` + guard logic since it seems with all the surrounding (and coming soon) changes to `Portal.open_context()` using all the new API stuff (mentioned in first bullet set above) we never hit the case of inf-block? Oh right and obviously a ton of (hopefully improved) logging msg content changes, commented code removal and detailed comment-docs strewn about! --- tractor/_context.py | 1048 ++++++++++++++++++++++++++++--------------- 1 file changed, 675 insertions(+), 373 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index ee05a2ba..f8aaf1c9 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -43,19 +43,14 @@ import warnings import trio -# from .devx import ( -# maybe_wait_for_debugger, -# pause, -# ) from .msg import NamespacePath from ._exceptions import ( - # _raise_from_no_key_in_msg, - unpack_error, - pack_error, ContextCancelled, - # MessagingError, + InternalError, RemoteActorError, StreamOverrun, + pack_error, + unpack_error, ) from .log import get_logger from ._ipc import Channel @@ -65,6 +60,7 @@ from ._state import current_actor if TYPE_CHECKING: from ._portal import Portal from ._runtime import Actor + from ._ipc import MsgTransport log = get_logger(__name__) @@ -73,6 +69,7 @@ log = get_logger(__name__) async def _drain_to_final_msg( ctx: Context, + hide_tb: bool = True, msg_limit: int = 6, ) -> list[dict]: @@ -89,47 +86,72 @@ async def _drain_to_final_msg( `ctx.result()` cleanup and teardown sequence. ''' + __tracebackhide__: bool = hide_tb raise_overrun: bool = not ctx._allow_overruns # wait for a final context result by collecting (but # basically ignoring) any bi-dir-stream msgs still in transit # from the far end. pre_result_drained: list[dict] = [] - while not ctx._remote_error: + while not ( + ctx.maybe_error + and not ctx._final_result_is_set() + ): try: + # TODO: can remove? + # await trio.lowlevel.checkpoint() + # NOTE: this REPL usage actually works here dawg! Bo # from .devx._debug import pause # await pause() - if re := ctx._remote_error: - ctx._maybe_raise_remote_err( - re, - # NOTE: obvi we don't care if we - # overran the far end if we're already - # waiting on a final result (msg). - raise_overrun_from_self=raise_overrun, - ) # TODO: bad idea? + # -[ ] wrap final outcome channel wait in a scope so + # it can be cancelled out of band if needed? + # # with trio.CancelScope() as res_cs: # ctx._res_scope = res_cs # msg: dict = await ctx._recv_chan.receive() # if res_cs.cancelled_caught: + # TODO: ensure there's no more hangs, debugging the + # runtime pretty preaase! # from .devx._debug import pause # await pause() + + # TODO: can remove this finally? + # we have no more need for the sync draining right + # since we're can kinda guarantee the async + # `.receive()` below will never block yah? 
+ # + # if ( + # ctx._cancel_called and ( + # ctx.cancel_acked + # # or ctx.chan._cancel_called + # ) + # # or not ctx._final_result_is_set() + # # ctx.outcome is not + # # or ctx.chan._closed + # ): + # try: + # msg: dict = await ctx._recv_chan.receive_nowait()() + # except trio.WouldBlock: + # log.warning( + # 'When draining already `.cancel_called` ctx!\n' + # 'No final msg arrived..\n' + # ) + # break + # else: + # msg: dict = await ctx._recv_chan.receive() + + # TODO: don't need it right jefe? + # with trio.move_on_after(1) as cs: + # if cs.cancelled_caught: + # from .devx._debug import pause + # await pause() + + # pray to the `trio` gawds that we're corrent with this msg: dict = await ctx._recv_chan.receive() - ctx._result: Any = msg['return'] - log.runtime( - 'Context delivered final draining msg:\n' - f'{pformat(msg)}' - ) - pre_result_drained.append(msg) - # NOTE: we don't need to do this right? - # XXX: only close the rx mem chan AFTER - # a final result is retreived. - # if ctx._recv_chan: - # await ctx._recv_chan.aclose() - break # NOTE: we get here if the far end was # `ContextCancelled` in 2 cases: @@ -150,7 +172,22 @@ async def _drain_to_final_msg( # continue to bubble up as normal. raise + try: + ctx._result: Any = msg['return'] + log.runtime( + 'Context delivered final draining msg:\n' + f'{pformat(msg)}' + ) + # XXX: only close the rx mem chan AFTER + # a final result is retreived. + # if ctx._recv_chan: + # await ctx._recv_chan.aclose() + # TODO: ^ we don't need it right? + break + except KeyError: + # always capture unexpected/non-result msgs + pre_result_drained.append(msg) if 'yield' in msg: # far end task is still streaming to us so discard @@ -159,12 +196,12 @@ async def _drain_to_final_msg( (ctx._stream.closed and (reason := 'stream was already closed') ) - or (ctx._cancel_called - and (reason := 'ctx called `.cancel()`') - ) or (ctx._cancelled_caught and (reason := 'ctx caught a cancel') ) + or (ctx._cancel_called + and (reason := 'ctx called `.cancel()`') + ) or (len(pre_result_drained) > msg_limit and (reason := f'"yield" limit={msg_limit}') ) @@ -193,7 +230,6 @@ async def _drain_to_final_msg( f'{pformat(msg)}\n' ) - pre_result_drained.append(msg) continue # TODO: work out edge cases here where @@ -206,13 +242,15 @@ async def _drain_to_final_msg( 'Remote stream terminated due to "stop" msg:\n\n' f'{pformat(msg)}\n' ) - pre_result_drained.append(msg) continue - # internal error should never get here - assert msg.get('cid'), ( - "Received internal error at portal?" - ) + # It's an internal error if any other msg type without + # a`'cid'` field arrives here! + if not msg.get('cid'): + raise InternalError( + 'Unexpected cid-missing msg?\n\n' + f'{msg}\n' + ) # XXX fallthrough to handle expected error XXX re: Exception|None = ctx._remote_error @@ -273,11 +311,27 @@ async def _drain_to_final_msg( else: # bubble the original src key error raise + else: + log.cancel( + 'Skipping `MsgStream` drain since final outcome is set\n\n' + f'{ctx.outcome}\n' + ) return pre_result_drained -# TODO: make this a msgspec.Struct! +class Unresolved: + ''' + Placeholder value for `Context._result` until + a final return value or raised error is resolved. + + ''' + ... + + +# TODO: make this a .msg.types.Struct! 
+# -[ ] ideally we can freeze it +# -[ ] let's us do field diffing nicely in tests Bo @dataclass class Context: ''' @@ -332,28 +386,38 @@ class Context: # NOTE: each side of the context has its own cancel scope # which is exactly the primitive that allows for # cross-actor-task-supervision and thus SC. - _scope: trio.CancelScope | None = None + _scope: trio.CancelScope|None = None _task: trio.lowlevel.Task|None = None + + # TODO: cs around result waiting so we can cancel any + # permanently blocking `._recv_chan.receive()` call in + # a drain loop? # _res_scope: trio.CancelScope|None = None # on a clean exit there should be a final value # delivered from the far end "callee" task, so # this value is only set on one side. - _result: Any | int = None + # _result: Any | int = None + _result: Any|Unresolved = Unresolved - # if the local "caller" task errors this - # value is always set to the error that was - # captured in the `Portal.open_context().__aexit__()` - # teardown. - _local_error: BaseException | None = None + # if the local "caller" task errors this value is always set + # to the error that was captured in the + # `Portal.open_context().__aexit__()` teardown block OR, in + # 2 special cases when an (maybe) expected remote error + # arrives that we purposely swallow silently: + # - `ContextCancelled` with `.canceller` set to our uid: + # a self-cancel, + # - `RemoteActorError[StreamOverrun]` which was caught during + # a self-cancellation teardown msg drain. + _local_error: BaseException|None = None # if the either side gets an error from the other # this value is set to that error unpacked from an # IPC msg. - _remote_error: BaseException | None = None + _remote_error: BaseException|None = None - # only set if the local task called `.cancel()` - _cancel_called: bool = False # did WE cancel the far end? + # only set if an actor-local task called `.cancel()` + _cancel_called: bool = False # did WE request cancel of the far end? # TODO: do we even need this? we can assume that if we're # cancelled that the other side is as well, so maybe we should @@ -379,61 +443,6 @@ class Context: # actors from being able to acquire the debugger lock. _enter_debugger_on_cancel: bool = True - @property - def cancel_called(self) -> bool: - ''' - Records whether cancellation has been requested for this context - by either an explicit call to ``.cancel()`` or an implicit call - due to an error caught inside the ``Portal.open_context()`` - block. - - ''' - return self._cancel_called - - @property - def canceller(self) -> tuple[str, str] | None: - ''' - ``Actor.uid: tuple[str, str]`` of the (remote) - actor-process who's task was cancelled thus causing this - (side of the) context to also be cancelled. - - ''' - return self._canceller - - @property - def cancelled_caught(self) -> bool: - return ( - # the local scope was cancelled either by - # remote error or self-request - self._scope.cancelled_caught - - # the local scope was never cancelled - # and instead likely we received a remote side - # cancellation that was raised inside `.result()` - or ( - (se := self._local_error) - and - isinstance(se, ContextCancelled) - and ( - se.canceller == self.canceller - or - se is self._remote_error - ) - ) - ) - - # @property - # def is_waiting_result(self) -> bool: - # return bool(self._res_scope) - - @property - def side(self) -> str: - ''' - Return string indicating which task this instance is wrapping. 
- - ''' - return 'caller' if self._portal else 'callee' - # init and streaming state _started_called: bool = False _stream_opened: bool = False @@ -450,10 +459,196 @@ class Context: maxlen=616, ) ) - _scope_nursery: trio.Nursery | None = None + + # NOTE: this was originally a legacy interface from when we + # were raising remote errors (set to `._remote_error`) by + # starting a task inside this nursery that simply raised the + # boxed exception. NOW, it's used for spawning overrun queuing + # tasks when `.allow_overruns == True` !!! + _scope_nursery: trio.Nursery|None = None + + # streaming overrun state tracking _in_overrun: bool = False _allow_overruns: bool = False + + def __str__(self) -> str: + ds: str = '=' + # ds: str = ': ' + + # only show if opened + maybe_stream_repr: str = '' + if stream := self._stream: + # TODO: a `MsgStream.reprol()` !! + # f' stream{ds}{self._stream}\n' + # f' {self._stream}\n' + maybe_stream_repr: str = ( + f' {stream}\n' + ) + + return ( + f'\n' + ) + # NOTE: making this return a value that can be passed to + # `eval()` is entirely **optional** dawggg B) + # https://docs.python.org/3/library/functions.html#repr + # https://docs.python.org/3/reference/datamodel.html#object.__repr__ + # + # XXX: Currently we target **readability** from a (console) + # logging perspective over `eval()`-ability since we do NOT + # target serializing non-struct instances! + # def __repr__(self) -> str: + __repr__ = __str__ + + @property + def cancel_called(self) -> bool: + ''' + Records whether cancellation has been requested for this context + by a call to `.cancel()` either due to, + - either an explicit call by some local task, + - or an implicit call due to an error caught inside + the ``Portal.open_context()`` block. + + ''' + return self._cancel_called + + @property + def canceller(self) -> tuple[str, str] | None: + ''' + ``Actor.uid: tuple[str, str]`` of the (remote) + actor-process who's task was cancelled thus causing this + (side of the) context to also be cancelled. + + ''' + return self._canceller + + @property + def cancel_acked(self) -> bool: + ''' + Records whether the task on the remote side of this IPC + context acknowledged a cancel request via a relayed + `ContextCancelled` with the `.canceller` attr set to the + `Actor.uid` of the local actor who's task entered + `Portal.open_context()`. + + This will only be `True` when `.cancel()` is called and + the ctxc response contains a `.canceller: tuple` field + equal to the uid of the calling task's actor. + + ''' + portal: Portal|None = self._portal + if portal: + our_uid: tuple = portal.actor.uid + + return bool( + self._cancel_called + and (re := self._remote_error) + and isinstance(re, ContextCancelled) + and ( + re.canceller + == + self.canceller + == + our_uid + ) + ) + + @property + def cancelled_caught(self) -> bool: + ''' + Exactly the value of `self._scope.cancelled_caught` + (delegation) and should only be (able to be read as) + `True` for a `.side == "caller"` ctx wherein the + `Portal.open_context()` block was exited due to a call to + `._scope.cancel()` - which should only ocurr in 2 cases: + + - a caller side calls `.cancel()`, the far side cancels + and delivers back a `ContextCancelled` (making + `.cancel_acked == True`) and `._scope.cancel()` is + called by `._maybe_cancel_and_set_remote_error()` which + in turn cancels all `.open_context()` started tasks + (including any overrun queuing ones). + => `._scope.cancelled_caught == True` by normal `trio` + cs semantics. 
+ + - a caller side is delivered a `._remote_error: + RemoteActorError` via `._deliver_msg()` and a transitive + call to `_maybe_cancel_and_set_remote_error()` calls + `._scope.cancel()` and that cancellation eventually + results in `trio.Cancelled`(s) caught in the + `.open_context()` handling around the @acm's `yield`. + + Only as an FYI, in the "callee" side case it can also be + set but never is readable by any task outside the RPC + machinery in `._invoke()` since,: + - when a callee side calls `.cancel()`, `._scope.cancel()` + is called immediately and handled specially inside + `._invoke()` to raise a `ContextCancelled` which is then + sent to the caller side. + + However, `._scope.cancelled_caught` can NEVER be + accessed/read as `True` by any RPC invoked task since it + will have terminated before the cs block exit. + + ''' + return bool( + # the local scope was cancelled either by + # remote error or self-request + (self._scope and self._scope.cancelled_caught) + + # the local scope was never cancelled + # and instead likely we received a remote side + # # cancellation that was raised inside `.result()` + # or ( + # (se := self._local_error) + # and se is re + # ) + ) + + # @property + # def is_waiting_result(self) -> bool: + # return bool(self._res_scope) + + @property + def side(self) -> str: + ''' + Return string indicating which task this instance is wrapping. + + ''' + return 'caller' if self._portal else 'callee' + async def send_yield( self, data: Any, @@ -501,17 +696,20 @@ class Context: when called/closed by actor local task(s). - NOTEs & TODOs: + NOTEs: - It is expected that the caller has previously unwrapped the remote error using a call to `unpack_error()` and provides that output exception value as the input - `error` argument here. + `error` argument *here*. + + TODOs: - If this is an error message from a context opened by - `Portal.open_context()` we want to interrupt any - ongoing local tasks operating within that `Context`'s - cancel-scope so as to be notified ASAP of the remote - error and engage any caller handling (eg. for - cross-process task supervision). + `Portal.open_context()` (ideally) we want to interrupt + any ongoing local tasks operating within that + `Context`'s cancel-scope so as to be notified ASAP of + the remote error and engage any caller handling (eg. + for cross-process task supervision). + - In some cases we may want to raise the remote error immediately since there is no guarantee the locally operating task(s) will attempt to execute a checkpoint @@ -519,10 +717,13 @@ class Context: approaches depending on the current task's work and wrapping "thread" type: - - `trio`-native-and-graceful: only ever wait for tasks - to exec a next `trio.lowlevel.checkpoint()` assuming - that any such task must do so to interact with the - actor runtime and IPC interfaces. + - Currently we only support + a `trio`-native-and-graceful approach: we only ever + wait for local tasks to exec a next + `trio.lowlevel.checkpoint()` assuming that any such + task must do so to interact with the actor runtime + and IPC interfaces and will then be cancelled by + the internal `._scope` block. - (NOT IMPLEMENTED) system-level-aggressive: maybe we could eventually interrupt sync code (invoked using @@ -543,80 +744,106 @@ class Context: # do their own error checking at their own call points and # result processing. - # XXX: set the remote side's error so that after we cancel - # whatever task is the opener of this context it can raise - # that error as the reason. 
+ # TODO: never do this right? # if self._remote_error: # return - # breakpoint() - log.cancel( - 'Setting remote error for ctx \n' + # XXX: denote and set the remote side's error so that + # after we cancel whatever task is the opener of this + # context, it can raise or swallow that error + # appropriately. + log.runtime( + 'Setting remote error for ctx\n\n' f'<= remote ctx uid: {self.chan.uid}\n' - f'=>\n{error}' + f'=>{error}' ) self._remote_error: BaseException = error - if ( - isinstance(error, ContextCancelled) - ): - log.cancel( - 'Remote task-context was cancelled for ' - f'actor: {self.chan.uid}\n' - f'task: {self.cid}\n' - f'canceller: {error.canceller}\n' - ) - # always record the cancelling actor's uid since its cancellation - # state is linked and we want to know which process was - # the cause / requester of the cancellation. - # if error.canceller is None: - # import pdbp; pdbp.set_trace() + # self-cancel (ack) or, + # peer propagated remote cancellation. + if isinstance(error, ContextCancelled): + ctxc_src: tuple = error.canceller - # breakpoint() - self._canceller = error.canceller + whom: str = ( + 'us' if ctxc_src == current_actor().uid + else 'peer' + ) + log.cancel( + f'IPC context cancelled by {whom}!\n\n' + f'{error}' + ) + # always record the cancelling actor's uid since its + # cancellation state is linked and we want to know + # which process was the cause / requester of the + # cancellation. + self._canceller = ctxc_src if self._cancel_called: - # this is an expected cancel request response message - # and we **don't need to raise it** in local cancel - # scope since it will potentially override a real error. + # this is an expected cancel request response + # message and we **don't need to raise it** in the + # local cancel `._scope` since it will potentially + # override a real error. After this returns + # `.cancel_acked == True`. return else: log.error( - f'Remote context error:\n' + f'Remote context error:\n\n' + f'{error}\n' f'{pformat(self)}\n' - # f'remote actor: {self.chan.uid}\n' - # f'cid: {self.cid}\n' ) self._canceller = self.chan.uid - # TODO: tempted to **not** do this by-reraising in a - # nursery and instead cancel a surrounding scope, detect - # the cancellation, then lookup the error that was set? - # YES! this is way better and simpler! + # Cancel the local `._scope`, catch that + # `._scope.cancelled_caught` and re-raise any remote error + # once exiting (or manually calling `.result()`) the + # `.open_context()` block. cs: trio.CancelScope = self._scope if ( cs and not cs.cancel_called and not cs.cancelled_caught ): - - # TODO: we can for sure drop this right? - # from trio.testing import wait_all_tasks_blocked - # await wait_all_tasks_blocked() - # TODO: it'd sure be handy to inject our own # `trio.Cancelled` subtype here ;) # https://github.com/goodboy/tractor/issues/368 self._scope.cancel() - # NOTE: this REPL usage actually works here dawg! Bo - # await pause() + # TODO: maybe we should also call `._res_scope.cancel()` if it + # exists to support cancelling any drain loop hangs? - # TODO: maybe we have to use `._res_scope.cancel()` if it - # exists? + # TODO: add to `Channel`? 
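+    # NOTE: rough example of the multiaddr-ish str rendered by the
+    # property below (addr/port made up for illustration):
+    #
+    #   '/ipv4/127.0.0.1/tcp/61029'
+    #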
+ @property + def dst_maddr(self) -> str: + chan: Channel = self.chan + dst_addr, dst_port = chan.raddr + trans: MsgTransport = chan.transport + # cid: str = self.cid + # cid_head, cid_tail = cid[:6], cid[-6:] + return ( + f'/ipv4/{dst_addr}' + f'/{trans.name_key}/{dst_port}' + # f'/{self.chan.uid[0]}' + # f'/{self.cid}' + + # f'/cid={cid_head}..{cid_tail}' + # TODO: ? not use this ^ right ? + ) + + dmaddr = dst_maddr + + @property + def repr_rpc( + self, + ) -> str: + # TODO: how to show the transport interchange fmt? + # codec: str = self.chan.transport.codec_key + return ( + # f'{self._nsf}() -{{{codec}}}-> {repr(self.outcome)}:' + f'{self._nsf}() -> {self.repr_outcome()}:' + ) async def cancel( self, @@ -633,13 +860,23 @@ class Context: side: str = self.side self._cancel_called: bool = True - header: str = f'Cancelling "{side.upper()}"-side of ctx with peer\n' + header: str = ( + f'Cancelling ctx with peer from {side.upper()} side\n\n' + ) reminfo: str = ( - f'uid: {self.chan.uid}\n' - f' |_ {self._nsf}()\n' + # ' =>\n' + f'Context.cancel() => {self.chan.uid}\n' + # f'{self.chan.uid}\n' + f' |_ @{self.dst_maddr}\n' + f' >> {self.repr_rpc}\n' + # f' >> {self._nsf}() -> {codec}[dict]:\n\n' + # TODO: pull msg-type from spec re #320 ) - # caller side who entered `Portal.open_context()` + # CALLER side task + # ------ - ------ + # Aka the one that entered `Portal.open_context()` + # # NOTE: on the call side we never manually call # `._scope.cancel()` since we expect the eventual # `ContextCancelled` from the other side to trigger this @@ -648,8 +885,9 @@ class Context: # `Portal.open_context().__aexit__()`) if side == 'caller': if not self._portal: - raise RuntimeError( - "No portal found, this is likely a callee side context" + raise InternalError( + 'No portal found!?\n' + 'Why is this supposed caller context missing it?' ) cid: str = self.cid @@ -686,10 +924,18 @@ class Context: f'{reminfo}' ) - # callee side remote task - # NOTE: on this side we ALWAYS cancel the local scope since - # the caller expects a `ContextCancelled` to be sent from - # `._runtime._invoke()` back to the other side. + # CALLEE side task + # ------ - ------ + # Aka the one that DID NOT EVER enter a `Portal.open_context()` + # and instead was constructed and scheduled as an + # `_invoke()` RPC task. + # + # NOTE: on this side we ALWAYS cancel the local scope + # since the caller expects a `ContextCancelled` to be sent + # from `._runtime._invoke()` back to the other side. The + # logic for catching the result of the below + # `._scope.cancel()` is inside the `._runtime._invoke()` + # context RPC handling block. else: log.cancel( header @@ -750,7 +996,7 @@ class Context: # sent to the other side! if self._remote_error: # NOTE: this is diff then calling - # `._maybe_raise_from_remote_msg()` specifically + # `._maybe_raise_remote_err()` specifically # because any task entering this `.open_stream()` # AFTER cancellation has already been requested, # we DO NOT want to absorb any ctxc ACK silently! @@ -876,53 +1122,105 @@ class Context: f'ctx id: {self.cid}' ) + # TODO: replace all the instances of this!! 
XD + def maybe_raise( + self, + **kwargs, + ) -> Exception|None: + if re := self._remote_error: + return self._maybe_raise_remote_err( + re, + **kwargs, + ) + def _maybe_raise_remote_err( self, - err: Exception, + remote_error: Exception, raise_ctxc_from_self_call: bool = False, raise_overrun_from_self: bool = True, - ) -> ContextCancelled|None: + ) -> ( + ContextCancelled # `.cancel()` request to far side + |RemoteActorError # stream overrun caused and ignored by us + ): ''' - Maybe raise a remote error depending on who (which task from - which actor) requested a cancellation (if any). + Maybe raise a remote error depending on the type of error + and *who* (i.e. which task from which actor) requested + a cancellation (if any). ''' - # NOTE: whenever the context's "opener" side (task) **is** - # the side which requested the cancellation (likekly via - # ``Context.cancel()``), we don't want to re-raise that - # cancellation signal locally (would be akin to - # a ``trio.Nursery`` nursery raising ``trio.Cancelled`` - # whenever ``CancelScope.cancel()`` was called) and - # instead silently reap the expected cancellation - # "error"-msg. - our_uid: tuple[str, str] = current_actor().uid - if ( - (not raise_ctxc_from_self_call - and isinstance(err, ContextCancelled) - and ( - self._cancel_called - or self.chan._cancel_called - or self.canceller == our_uid - or tuple(err.canceller) == our_uid) - ) - or - (not raise_overrun_from_self - and isinstance(err, RemoteActorError) - and err.msgdata['type_str'] == 'StreamOverrun' - and tuple(err.msgdata['sender']) == our_uid - ) + if (( + # NOTE: whenever the context's "opener" side (task) **is** + # the side which requested the cancellation (likekly via + # ``Context.cancel()``), we don't want to re-raise that + # cancellation signal locally (would be akin to + # a ``trio.Nursery`` nursery raising ``trio.Cancelled`` + # whenever ``CancelScope.cancel()`` was called) and + # instead silently reap the expected cancellation + # "error"-msg-as-ack. In this case the `err: + # ContextCancelled` must have a `.canceller` set to the + # uid of the requesting task's actor and we only do NOT + # raise that error locally if WE ARE THAT ACTOR which + # requested the cancellation. + not raise_ctxc_from_self_call + and isinstance(remote_error, ContextCancelled) + and ( + self._cancel_called + # or self.chan._cancel_called + # TODO: ^ should we have a special separate case + # for this ^ ? + ) + and ( # one of, + + (portal := self._portal) + and (our_uid := portal.actor.uid) + # TODO: ?potentially it is useful to emit certain + # warning/cancel logs for the cases where the + # cancellation is due to a lower level cancel + # request, such as `Portal.cancel_actor()`, since in + # that case it's not actually this specific ctx that + # made a `.cancel()` call, but it is the same + # actor-process? + and tuple(remote_error.canceller) == our_uid + or self.chan._cancel_called + or self.canceller == our_uid + ) + ) or ( + + # NOTE: whenever this context is the cause of an + # overrun on the remote side (aka we sent msgs too + # fast that the remote task was overrun according + # to `MsgStream` buffer settings) AND the caller + # has requested to not raise overruns this side + # caused, we also silently absorb any remotely + # boxed `StreamOverrun`. 
This is mostly useful for + # supressing such faults during + # cancellation/error/final-result handling inside + # `_drain_to_final_msg()` such that we do not + # raise such errors particularly in the case where + # `._cancel_called == True`. + not raise_overrun_from_self + and isinstance(remote_error, RemoteActorError) + and remote_error.msgdata['type_str'] == 'StreamOverrun' + and tuple(remote_error.msgdata['sender']) == our_uid + ) ): # NOTE: we set the local scope error to any "self # cancellation" error-response thus "absorbing" # the error silently B) if self._local_error is None: - self._local_error = err + self._local_error = remote_error - return err + else: + log.warning( + 'Local error already set for ctx?\n' + f'{self._local_error}\n' + ) - # NOTE: currently we are masking underlying runtime errors + return remote_error + + # NOTE: currently we are hiding underlying runtime errors # which are often superfluous to user handler code. not # sure if this is still needed / desired for all operation? # TODO: maybe we can only NOT mask if: @@ -932,10 +1230,15 @@ class Context: # runtime frames from the tb explicitly? # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement # https://stackoverflow.com/a/24752607 - # __tracebackhide__: bool = True - raise err from None + __tracebackhide__: bool = True + raise remote_error from None - async def result(self) -> Any | Exception: + # TODO: change to `.wait_for_result()`? + async def result( + self, + hide_tb: bool = True, + + ) -> Any|Exception: ''' From some (caller) side task, wait for and return the final result from the remote (callee) side's task. @@ -961,182 +1264,53 @@ class Context: of the remote cancellation. ''' - assert self._portal, "Context.result() can not be called from callee!" + __tracebackhide__ = hide_tb + assert self._portal, ( + "Context.result() can not be called from callee side!" + ) + if self._final_result_is_set(): + return self._result + assert self._recv_chan - raise_overrun: bool = not self._allow_overruns - # if re := self._remote_error: - # return self._maybe_raise_remote_err( - # re, - # # NOTE: obvi we don't care if we - # # overran the far end if we're already - # # waiting on a final result (msg). - # raise_overrun_from_self=raise_overrun, - # ) - - res_placeholder: int = id(self) + # res_placeholder: int = id(self) if ( - self._result == res_placeholder - and not self._remote_error + # self._result == res_placeholder + # and not self._remote_error + self.maybe_error is None + # not self._remote_error + # and not self._local_error and not self._recv_chan._closed # type: ignore ): - # wait for a final context result by collecting (but - # basically ignoring) any bi-dir-stream msgs still in transit - # from the far end. - drained_msgs: list[dict] = await _drain_to_final_msg(ctx=self) - log.runtime( + # wait for a final context result/error by "draining" + # (by more or less ignoring) any bi-dir-stream "yield" + # msgs still in transit from the far end. + drained_msgs: list[dict] = await _drain_to_final_msg( + ctx=self, + hide_tb=hide_tb, + ) + for msg in drained_msgs: + + # TODO: mask this by default.. 
+ if 'return' in msg: + # from .devx import pause + # await pause() + raise InternalError( + 'Final `return` msg should never be drained !?!?\n\n' + f'{msg}\n' + ) + + log.cancel( 'Ctx drained pre-result msgs:\n' f'{drained_msgs}' ) - # TODO: implement via helper func ^^^^ - # pre_result_drained: list[dict] = [] - # while not self._remote_error: - # try: - # # NOTE: this REPL usage actually works here dawg! Bo - # # from .devx._debug import pause - # # await pause() - # # if re := self._remote_error: - # # self._maybe_raise_remote_err( - # # re, - # # # NOTE: obvi we don't care if we - # # # overran the far end if we're already - # # # waiting on a final result (msg). - # # raise_overrun_from_self=raise_overrun, - # # ) - - # # TODO: bad idea? - # # with trio.CancelScope() as res_cs: - # # self._res_scope = res_cs - # # msg: dict = await self._recv_chan.receive() - # # if res_cs.cancelled_caught: - - # # from .devx._debug import pause - # # await pause() - # msg: dict = await self._recv_chan.receive() - # self._result: Any = msg['return'] - # log.runtime( - # 'Context delivered final result msg:\n' - # f'{pformat(msg)}' - # ) - # # NOTE: we don't need to do this right? - # # XXX: only close the rx mem chan AFTER - # # a final result is retreived. - # # if self._recv_chan: - # # await self._recv_chan.aclose() - # break - - # # NOTE: we get here if the far end was - # # `ContextCancelled` in 2 cases: - # # 1. we requested the cancellation and thus - # # SHOULD NOT raise that far end error, - # # 2. WE DID NOT REQUEST that cancel and thus - # # SHOULD RAISE HERE! - # except trio.Cancelled: - - # # CASE 2: mask the local cancelled-error(s) - # # only when we are sure the remote error is - # # the source cause of this local task's - # # cancellation. - # if re := self._remote_error: - # self._maybe_raise_remote_err(re) - - # # CASE 1: we DID request the cancel we simply - # # continue to bubble up as normal. - # raise - - # except KeyError: - - # if 'yield' in msg: - # # far end task is still streaming to us so discard - # log.warning(f'Discarding std "yield"\n{msg}') - # pre_result_drained.append(msg) - # continue - - # # TODO: work out edge cases here where - # # a stream is open but the task also calls - # # this? - # # -[ ] should be a runtime error if a stream is open - # # right? - # elif 'stop' in msg: - # log.cancel( - # 'Remote stream terminated due to "stop" msg:\n' - # f'{msg}' - # ) - # pre_result_drained.append(msg) - # continue - - # # internal error should never get here - # assert msg.get('cid'), ( - # "Received internal error at portal?" - # ) - - # # XXX fallthrough to handle expected error XXX - # re: Exception|None = self._remote_error - # if re: - # log.critical( - # 'Remote ctx terminated due to "error" msg:\n' - # f'{re}' - # ) - # assert msg is self._cancel_msg - # # NOTE: this solved a super dupe edge case XD - # # this was THE super duper edge case of: - # # - local task opens a remote task, - # # - requests remote cancellation of far end - # # ctx/tasks, - # # - needs to wait for the cancel ack msg - # # (ctxc) or some result in the race case - # # where the other side's task returns - # # before the cancel request msg is ever - # # rxed and processed, - # # - here this surrounding drain loop (which - # # iterates all ipc msgs until the ack or - # # an early result arrives) was NOT exiting - # # since we are the edge case: local task - # # does not re-raise any ctxc it receives - # # IFF **it** was the cancellation - # # requester.. 
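# Rough comment-only sketch of what the new `_drain_to_final_msg()`
# helper does in place of the inline loop being deleted here (per the
# surrounding diff, not the verbatim implementation):
#
#   - repeatedly receive msgs from `ctx._recv_chan`
#   - collect and discard any in-transit {'yield': ..}/{'stop': ..} msgs
#   - break once a final {'return': ..} arrives or `ctx._remote_error`
#     is set (e.g. a ctxc ack or some boxed remote error)
#   - hand the list of pre-result msgs back to `Context.result()`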
- # # will raise if necessary, ow break from - # # loop presuming any error terminates the - # # context! - # self._maybe_raise_remote_err( - # re, - # # NOTE: obvi we don't care if we - # # overran the far end if we're already - # # waiting on a final result (msg). - # # raise_overrun_from_self=False, - # raise_overrun_from_self=raise_overrun, - # ) - - # break # OOOOOF, yeah obvi we need this.. - - # # XXX we should never really get here - # # right! since `._deliver_msg()` should - # # always have detected an {'error': ..} - # # msg and already called this right!?! - # elif error := unpack_error( - # msg=msg, - # chan=self._portal.channel, - # hide_tb=False, - # ): - # log.critical('SHOULD NEVER GET HERE!?') - # assert msg is self._cancel_msg - # assert error.msgdata == self._remote_error.msgdata - # from .devx._debug import pause - # await pause() - # self._maybe_cancel_and_set_remote_error(error) - # self._maybe_raise_remote_err(error) - - # else: - # # bubble the original src key error - # raise - if ( (re := self._remote_error) - and self._result == res_placeholder + # and self._result == res_placeholder ): - maybe_err: Exception|None = self._maybe_raise_remote_err( + self._maybe_raise_remote_err( re, # NOTE: obvi we don't care if we # overran the far end if we're already @@ -1152,10 +1326,126 @@ class Context: (not self._cancel_called) ), ) - if maybe_err: - self._result = maybe_err + # if maybe_err: + # self._result = maybe_err - return self._result + return self.outcome + # None if self._result == res_placeholder + # else self._result + # ) + + # TODO: switch this with above which should be named + # `.wait_for_outcome()` and instead do + # a `.outcome.Outcome.unwrap()` ? + # @property + # def result(self) -> Any|None: + # if self._final_result_is_set(): + # return self._result + + # raise RuntimeError('No result is available!') + + @property + def maybe_error(self) -> BaseException|None: + le: Exception|None = self._local_error + re: RemoteActorError|ContextCancelled|None = self._remote_error + + match (le, re): + # NOTE: remote errors always get precedence since even + # in the cases where a local error was the cause, the + # received boxed ctxc should include the src info + # caused by us right? + case ( + _, + RemoteActorError(), + ): + # give precedence to remote error if it's + # NOT a cancel ack (ctxc). + return ( + re or le + ) + + # TODO: extra logic to handle ctxc ack case(s)? + # -[ ] eg. we error, call .cancel(), rx ack but should + # raise the _local_error instead? + # -[ ] are there special error conditions where local vs. + # remote should take precedence? + # case ( + # _, + # ContextCancelled(canceller=), + # ): + + error: Exception|None = le or re + if error: + return error + + assert not self._cancel_msg + return None + + def _final_result_is_set(self) -> bool: + # return not (self._result == id(self)) + return self._result is not Unresolved + + # def get_result_nowait(self) -> Any|None: + # TODO: use `outcome.Outcome` here instead? + @property + def outcome(self) -> ( + Any| + RemoteActorError| + ContextCancelled + ): + ''' + The final "outcome" from an IPC context which can either be + some Value returned from the target `@context`-decorated + remote task-as-func, or an `Error` wrapping an exception + raised from an RPC task fault or cancellation. + + Note that if the remote task has not terminated then this + field always resolves to the module defined `Unresolved` handle. + + TODO: implement this using `outcome.Outcome` types? 
+ + ''' + return self.maybe_error or self._result + + # @property + def repr_outcome( + self, + show_error_fields: bool = False, + + ) -> str: + ''' + Deliver a (simplified) `str` representation (as in + `.__repr__()`) of the final `.outcome` + + ''' + merr: Exception|None = self.maybe_error + if merr: + # if the error-type is one of ours and has the custom + # defined "repr-(in)-one-line" method call it, ow + # just deliver the type name. + if ( + (reprol := getattr(merr, 'reprol', False)) + and show_error_fields + ): + return reprol() + + elif isinstance(merr, BaseExceptionGroup): + # TODO: maybe for multis we should just show + # a one-line count per error type, like with + # `collections.Counter`? + # + # just the type name for now to avoid long lines + # when tons of cancels.. + return type(merr).__name__ + + # just the type name + # else: # but wen? + # return type(merr).__name__ + + # for all other errors show their regular output + return str(merr) + + return str(self._result) async def started( self, @@ -1261,8 +1551,14 @@ class Context: msg, self.chan, ): - log.error( - f'Delivering error-msg to caller\n' + if not isinstance(re, ContextCancelled): + log_meth = log.error + else: + log_meth = log.runtime + + log_meth( + f'Delivering error-msg to caller\n\n' + f'<= peer: {from_uid}\n' f' |_ {nsf}()\n\n' @@ -1276,7 +1572,7 @@ class Context: # NOTE: this will not raise an error, merely set # `._remote_error` and maybe cancel any task currently # entered in `Portal.open_context()` presuming the - # error is "cancel causing" (i.e. `ContextCancelled` + # error is "cancel causing" (i.e. a `ContextCancelled` # or `RemoteActorError`). self._maybe_cancel_and_set_remote_error(re) @@ -1288,6 +1584,10 @@ class Context: # return True # # XXX ALSO NO!! XXX + # => NEVER raise remote errors from the calling + # runtime task, they should always be raised by + # consumer side tasks operating on the + # `Portal`/`Context` APIs. # if self._remote_error: # self._maybe_raise_remote_err(error) @@ -1471,7 +1771,9 @@ def mk_context( _task=trio.lowlevel.current_task(), **kwargs, ) - ctx._result: int | Any = id(ctx) + # TODO: we can drop the old placeholder yah? + # ctx._result: int | Any = id(ctx) + ctx._result = Unresolved return ctx -- 2.34.1 From ed10632d97cab4aeb1cbe4cff83354fb4722e7a3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 2 Mar 2024 16:27:46 -0500 Subject: [PATCH 137/378] Avoid `ctx.cancel()` after ctxc rxed in `.open_context()` In the case where the callee side delivers us a ctxc with `.canceller` set we can presume that remote cancellation already has taken place and thus we don't need to do the normal call-`Context.cancel()`-on-error step. Further, in the case where we do call it also handle any `trio.CloseResourceError` gracefully with a `.warning()`. Also, originally I had added a post-`yield`-maybe-raise to attempt handling any remote ctxc the same as for the local case (i.e. raised from `yield` line) wherein if we get a remote ctxc the same handler branch-path would trigger, thus avoiding different behaviour in that case. I ended up masking it out (but can't member why.. ) as it seems the normal `.result()` call and its internal handling gets the same behaviour? I've left in the heavily commented code in case it ends up being the better way to go; likely making the move to having a single code in both cases is better even if it is just a matter of deciding whether to swallow the ctxc or not in the `.cancel_acked` case. 
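For context, a hedged caller-side usage sketch of the semantics being tuned here (actor and endpoint names are invented; this example is not part of the patch):

    import trio
    import tractor


    @tractor.context
    async def sleepy_ep(ctx: tractor.Context) -> None:
        # callee side: signal readiness then wait to be cancelled.
        await ctx.started('ready')
        await trio.sleep_forever()


    async def main() -> None:
        async with tractor.open_nursery() as an:
            portal = await an.start_actor(
                'sleeper',
                enable_modules=[__name__],
            )
            async with portal.open_context(sleepy_ep) as (ctx, first):
                assert first == 'ready'
                # a self-requested cancel: the resulting remote
                # `ContextCancelled` ack should be absorbed when this
                # block exits instead of being re-raised here.
                await ctx.cancel()

            await portal.cancel_actor()


    if __name__ == '__main__':
        trio.run(main)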
Further teensie improvements: - obvi improve/simplify log msg contents as in prior patches. - use the new `maybe_wait_for_debugger(header_msg: str)` if/when waiting to exit in debug mode. - another `hide_tb: bool` frame hider flag. - rando type-annot updates of course :) --- tractor/_portal.py | 248 +++++++++++++++++++++++++-------------------- 1 file changed, 140 insertions(+), 108 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index a4f2f618..04f613e8 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -27,8 +27,9 @@ from __future__ import annotations import importlib import inspect from typing import ( - Any, Optional, - Callable, AsyncGenerator, + Any, + Callable, + AsyncGenerator, Type, ) from functools import partial @@ -52,6 +53,7 @@ from ._ipc import Channel from .log import get_logger from .msg import NamespacePath from ._exceptions import ( + InternalError, _raise_from_no_key_in_msg, unpack_error, NoResult, @@ -126,7 +128,7 @@ class Portal: def __init__(self, channel: Channel) -> None: self.chan = channel # during the portal's lifetime - self._result_msg: Optional[dict] = None + self._result_msg: dict|None = None # When set to a ``Context`` (when _submit_for_result is called) # it is expected that ``result()`` will be awaited at some @@ -171,7 +173,7 @@ class Portal: ) -> dict[str, Any]: assert ctx._remote_func_type == 'asyncfunc' # single response - msg = await ctx._recv_chan.receive() + msg: dict = await ctx._recv_chan.receive() return msg async def result(self) -> Any: @@ -255,11 +257,11 @@ class Portal: return False reminfo: str = ( - f'uid: {self.channel.uid}\n' - f' |_{chan}\n' + f'{self.channel.uid}\n' + f' |_{chan}\n' ) log.cancel( - f'Sending actor cancel request to peer\n' + f'Sending runtime `.cancel()` request to peer\n\n' f'{reminfo}' ) @@ -281,7 +283,9 @@ class Portal: return True if cs.cancelled_caught: - log.cancel( + # may timeout and we never get an ack (obvi racy) + # but that doesn't mean it wasn't cancelled. + log.debug( 'May have failed to cancel peer?\n' f'{reminfo}' ) @@ -293,9 +297,11 @@ class Portal: trio.ClosedResourceError, trio.BrokenResourceError, ): - log.cancel( - f"{self.channel} for {self.channel.uid} was already " - "closed or broken?") + log.debug( + 'IPC chan for actor already closed or broken?\n\n' + f'{self.channel.uid}\n' + f' |_{self.channel}\n' + ) return False async def run_from_ns( @@ -416,7 +422,8 @@ class Portal: try: # deliver receive only stream async with MsgStream( - ctx, ctx._recv_chan, + ctx=ctx, + rx_chan=ctx._recv_chan, ) as rchan: self._streams.add(rchan) yield rchan @@ -443,6 +450,11 @@ class Portal: # await recv_chan.aclose() self._streams.remove(rchan) + # TODO: move this impl to `._context` mod and + # instead just bind it here as a method so that the logic + # for ctx stuff stays all in one place (instead of frickin + # having to open this file in tandem every gd time!!! XD) + # @asynccontextmanager async def open_context( @@ -451,6 +463,11 @@ class Portal: allow_overruns: bool = False, + # TODO: if we set this the wrapping `@acm` body will + # still be shown (awkwardly) on pdb REPL entry. Ideally + # we can similarly annotate that frame to NOT show? + hide_tb: bool = False, + # proxied to RPC **kwargs, @@ -484,6 +501,8 @@ class Portal: collection. See ``tractor.Context`` for more details. 
''' + __tracebackhide__: bool = hide_tb + # conduct target func method structural checks if not inspect.iscoroutinefunction(func) and ( getattr(func, '_tractor_contex_function', False) @@ -536,9 +555,12 @@ class Portal: # placeholder for any exception raised in the runtime # or by user tasks which cause this context's closure. - scope_err: BaseException | None = None + scope_err: BaseException|None = None + ctxc_from_callee: ContextCancelled|None = None try: async with trio.open_nursery() as nurse: + + # NOTE: used to start overrun queuing tasks ctx._scope_nursery: trio.Nursery = nurse ctx._scope: trio.CancelScope = nurse.cancel_scope @@ -546,14 +568,26 @@ class Portal: # in enter tuple. yield ctx, first - # between the caller exiting and arriving here the - # far end may have sent a ctxc-msg or other error, - # so check for it here immediately and maybe raise - # so as to engage the ctxc handling block below! + # ??TODO??: do we still want to consider this or is + # the `else:` block handling via a `.result()` + # call below enough?? + # -[ ] pretty sure `.result()` internals do the + # same as our ctxc handler below so it ended up + # being same (repeated?) behaviour, but ideally we + # wouldn't have that duplication either by somehow + # factoring the `.result()` handler impl in a way + # that we can re-use it around the `yield` ^ here + # or vice versa? + # + # NOTE: between the caller exiting and arriving + # here the far end may have sent a ctxc-msg or + # other error, so check for it here immediately + # and maybe raise so as to engage the ctxc + # handling block below! + # # if re := ctx._remote_error: # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( # re, - # # TODO: do we want this to always raise? # # - means that on self-ctxc, if/when the # # block is exited before the msg arrives @@ -571,7 +605,7 @@ class Portal: # # block? # raise_ctxc_from_self_call=True, # ) - # assert maybe_ctxc + # ctxc_from_callee = maybe_ctxc # when in allow_overruns mode there may be # lingering overflow sender tasks remaining? @@ -583,13 +617,18 @@ class Portal: not ctx._allow_overruns or len(nurse.child_tasks) > 1 ): - raise RuntimeError( + raise InternalError( 'Context has sub-tasks but is ' 'not in `allow_overruns=True` mode!?' ) - # ensure cancel of all overflow sender tasks - # started in the ctx nursery. + # ensure we cancel all overflow sender + # tasks started in the nursery when + # `._allow_overruns == True`. + # + # NOTE: this means `._scope.cancelled_caught` + # will prolly be set! not sure if that's + # non-ideal or not ??? ctx._scope.cancel() # XXX NOTE XXX: maybe shield against @@ -602,14 +641,15 @@ class Portal: # of a `Context`. In both cases any `ContextCancelled` # raised in this scope-block came from a transport msg # relayed from some remote-actor-task which our runtime set - # as a `Context._remote_error` + # as to `Context._remote_error` # # the CASES: # # - if that context IS THE SAME ONE that called # `Context.cancel()`, we want to absorb the error # silently and let this `.open_context()` block to exit - # without raising. + # without raising, ideally eventually receiving the ctxc + # ack msg thus resulting in `ctx.cancel_acked == True`. # # - if it is from some OTHER context (we did NOT call # `.cancel()`), we want to re-RAISE IT whilst also @@ -633,6 +673,7 @@ class Portal: # `Nursery.cancel_scope.cancel()`) except ContextCancelled as ctxc: scope_err = ctxc + ctxc_from_callee = ctxc # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!! 
# using this code and then resuming the REPL will @@ -642,6 +683,7 @@ class Portal: # documenting it as a definittive example of # debugging the tractor-runtime itself using it's # own `.devx.` tooling! + # # await pause() # CASE 2: context was cancelled by local task calling @@ -649,15 +691,10 @@ class Portal: # exit silently. if ( ctx._cancel_called - and ( - ctxc is ctx._remote_error - # ctxc.msgdata == ctx._remote_error.msgdata - - # TODO: uhh `Portal.canceller` ain't a thangg - # dawg? (was `self.canceller` before?!?) - and - ctxc.canceller == self.actor.uid - ) + and + ctxc is ctx._remote_error + and + ctxc.canceller == self.actor.uid ): log.cancel( f'Context (cid=[{ctx.cid[-6:]}..] cancelled gracefully with:\n' @@ -665,9 +702,9 @@ class Portal: ) # CASE 1: this context was never cancelled via a local # task (tree) having called `Context.cancel()`, raise - # the error since it was caused by someone else! + # the error since it was caused by someone else + # -> probably a remote peer! else: - # await pause() raise # the above `._scope` can be cancelled due to: @@ -680,19 +717,29 @@ class Portal: # CASE 3: standard local error in this caller/yieldee Exception, - # CASES 1 & 2: normally manifested as - # a `Context._scope_nursery` raised + # CASES 1 & 2: can manifest as a `ctx._scope_nursery` # exception-group of, + # # 1.-`trio.Cancelled`s, since - # `._scope.cancel()` will have been called and any - # `ContextCancelled` absorbed and thus NOT RAISED in - # any `Context._maybe_raise_remote_err()`, + # `._scope.cancel()` will have been called + # (transitively by the runtime calling + # `._deliver_msg()`) and any `ContextCancelled` + # eventually absorbed and thus absorbed/supressed in + # any `Context._maybe_raise_remote_err()` call. + # # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]` - # from any error raised in the "callee" side with - # a group only raised if there was any more then one - # task started here in the "caller" in the - # `yield`-ed to task. - BaseExceptionGroup, # since overrun handler tasks may have been spawned + # from any error delivered from the "callee" side + # AND a group-exc is only raised if there was > 1 + # tasks started *here* in the "caller" / opener + # block. If any one of those tasks calls + # `.result()` or `MsgStream.receive()` + # `._maybe_raise_remote_err()` will be transitively + # called and the remote error raised causing all + # tasks to be cancelled. + # NOTE: ^ this case always can happen if any + # overrun handler tasks were spawned! + BaseExceptionGroup, + trio.Cancelled, # NOTE: NOT from inside the ctx._scope KeyboardInterrupt, @@ -702,69 +749,48 @@ class Portal: # XXX: ALWAYS request the context to CANCEL ON any ERROR. # NOTE: `Context.cancel()` is conversely NEVER CALLED in # the `ContextCancelled` "self cancellation absorbed" case - # handled in the block above! + # handled in the block above ^^^ !! log.cancel( - 'Context cancelled for task due to\n' + 'Context terminated due to\n\n' f'{caller_err}\n' - 'Sending cancel request..\n' - f'task:{cid}\n' - f'actor:{uid}' ) if debug_mode(): - log.pdb( - 'Delaying `ctx.cancel()` until debug lock ' - 'acquired..' - ) # async with acquire_debug_lock(self.actor.uid): # pass # TODO: factor ^ into below for non-root cases? - await maybe_wait_for_debugger() - log.pdb( - 'Acquired debug lock! ' - 'Calling `ctx.cancel()`!' 
+ was_acquired: bool = await maybe_wait_for_debugger( + header_msg=( + 'Delaying `ctx.cancel()` until debug lock ' + 'acquired..\n' + ), ) + if was_acquired: + log.pdb( + 'Acquired debug lock! ' + 'Calling `ctx.cancel()`!\n' + ) - try: - await ctx.cancel() - except trio.BrokenResourceError: - log.warning( - 'IPC connection for context is broken?\n' - f'task:{cid}\n' - f'actor:{uid}' - ) + + # we don't need to cancel the callee if it already + # told us it's cancelled ;p + if ctxc_from_callee is None: + try: + await ctx.cancel() + except ( + trio.BrokenResourceError, + trio.ClosedResourceError, + ): + log.warning( + 'IPC connection for context is broken?\n' + f'task:{cid}\n' + f'actor:{uid}' + ) raise # duh # no local scope error, the "clean exit with a result" case. else: - # between the caller exiting and arriving here the - # far end may have sent a ctxc-msg or other error, - # so check for it here immediately and maybe raise - # so as to engage the ctxc handling block below! - # if re := ctx._remote_error: - # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( - # re, - - # # TODO: do we want this to always raise? - # # - means that on self-ctxc, if/when the - # # block is exited before the msg arrives - # # but then the msg during __exit__ - # # calling we may not activate the - # # ctxc-handler block below? should we - # # be? - # # - if there's a remote error that arrives - # # after the child has exited, we won't - # # handle until the `finally:` block - # # where `.result()` is always called, - # # again in which case we handle it - # # differently then in the handler block - # # that would normally engage from THIS - # # block? - # raise_ctxc_from_self_call=True, - # ) - # assert maybe_ctxc - if ctx.chan.connected(): log.runtime( 'Waiting on final context result for\n' @@ -794,16 +820,18 @@ class Portal: scope_err = berr raise + # yes! this worx Bp + # from .devx import _debug + # await _debug.pause() + # an exception type boxed in a `RemoteActorError` - # is returned (meaning it was obvi not raised). + # is returned (meaning it was obvi not raised) + # that we want to log-report on. msgdata: str|None = getattr( result_or_err, 'msgdata', None ) - # yes! this worx Bp - # from .devx import _debug - # await _debug.pause() match (msgdata, result_or_err): case ( {'tb_str': tbstr}, @@ -846,9 +874,8 @@ class Portal: # operating *in* this scope to have survived # we tear down the runtime feeder chan last # to avoid premature stream clobbers. - rxchan: trio.ReceiveChannel = ctx._recv_chan if ( - rxchan + (rxchan := ctx._recv_chan) # maybe TODO: yes i know the below check is # touching `trio` memchan internals..BUT, there are @@ -904,22 +931,27 @@ class Portal: etype: Type[BaseException] = type(scope_err) # CASE 2 - if ctx._cancel_called: + if ( + ctx._cancel_called + and ctx.cancel_acked + ): log.cancel( 'Context cancelled by caller task\n' f'|_{ctx._task}\n\n' - f'{etype}' + f'{repr(scope_err)}\n' ) + # TODO: should we add a `._cancel_req_received` + # flag to determine if the callee manually called + # `ctx.cancel()`? + # -[ ] going to need a cid check no? 
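# Hypothetical distillation (not part of the patch) of the exit-report
# branching in this teardown block; `scope_err`, `._cancel_called` and
# `.cancel_acked` are the attributes actually consulted above/below:
#
#   if ctx._cancel_called and ctx.cancel_acked:      # CASE 2
#       report = 'ctx cancelled by caller task (ack received)'
#   else:                                            # CASE 1
#       report = f'ctx terminated due to local error: {type(scope_err)}'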
+ # CASE 1 else: log.cancel( - f'Context cancelled by remote callee task\n' - f'peer: {uid}\n' - f'|_ {nsf}()\n\n' - - f'{etype}\n' + f'Context terminated due to local scope error:\n' + f'{etype.__name__}\n' ) # FINALLY, remove the context from runtime tracking and @@ -967,7 +999,7 @@ class LocalPortal: async def open_portal( channel: Channel, - nursery: Optional[trio.Nursery] = None, + nursery: trio.Nursery|None = None, start_msg_loop: bool = True, shield: bool = False, @@ -992,7 +1024,7 @@ async def open_portal( if channel.uid is None: await actor._do_handshake(channel) - msg_loop_cs: Optional[trio.CancelScope] = None + msg_loop_cs: trio.CancelScope|None = None if start_msg_loop: from ._runtime import process_messages msg_loop_cs = await nursery.start( -- 2.34.1 From 9a18b57d384c3dc16125c8b1154c0f8db4323924 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 2 Mar 2024 19:26:40 -0500 Subject: [PATCH 138/378] Mega-refactor on `._invoke()` targeting `@context`s Since eventually we want to implement all other RPC "func types" as contexts underneath this starts the rework to move all the other cases into a separate func not only to simplify the main `._invoke()` body but also as a reminder of the intention to do it XD Details of re-factor: - add a new `._invoke_non_context()` which just moves all the old blocks for non-context handling to a single def. - factor what was basically just the `finally:` block handler (doing all the task bookkeeping) into a new `@acm`: `_errors_relayed_via_ipc()` with that content packed into the post-`yield` (also with a `hide_tb: bool` flag added of course). * include a `debug_kbis: bool` for when needed. - since the `@context` block is the only type left in the main `_invoke()` body, de-dent it so it's more grok-able B) Obviously this patch also includes a few improvements regarding context-cancellation-semantics (for the `context` RPC case) on the callee side in order to match previous changes to the `Context` api: - always setting any ctxc as the `Context._local_error`. - using the new convenience `.maybe_raise()` topically (for now). - avoiding any previous reliance on `Context.cancelled_caught` for anything public of meaning. Further included is more logging content updates: - being pedantic in `.cancel()` msgs about whether termination is caused by error or ctxc. - optional `._invoke()` traceback hiding via a `hide_tb: bool`. - simpler log headers throughout instead leveraging new `.__repr__()` on primitives. - buncha `<= ` sent some message emissions. - simplified handshake statuses reporting. Other subsys api changes we need to match: - change to `Channel.transport`. - avoiding any `local_nursery: ActorNursery` waiting when the `._implicit_runtime_started` is set. And yes, lotsa more comments for #TODOs dawg.. since there's always somethin! --- tractor/_runtime.py | 937 +++++++++++++++++++++++++++----------------- 1 file changed, 576 insertions(+), 361 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index b3b87e26..4c1181de 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -22,6 +22,10 @@ low-level transport msg handling. 
""" from __future__ import annotations +from contextlib import ( + ExitStack, + asynccontextmanager as acm, +) from collections import defaultdict from functools import partial from itertools import chain @@ -34,12 +38,12 @@ import sys from typing import ( Any, Callable, + Coroutine, TYPE_CHECKING, ) import uuid from types import ModuleType import os -from contextlib import ExitStack import warnings from async_generator import aclosing @@ -99,13 +103,290 @@ async def maybe_import_gb(): await greenback.ensure_portal() except ModuleNotFoundError: - log.warning( + log.debug( '`greenback` is not installed.\n' - 'No sync debug support!' + 'No sync debug support!\n' ) _gb_mod = False +async def _invoke_non_context( + actor: Actor, + cancel_scope: CancelScope, + ctx: Context, + cid: str, + chan: Channel, + func: Callable, + coro: Coroutine, + kwargs: dict[str, Any], + + treat_as_gen: bool, + is_rpc: bool, + + task_status: TaskStatus[ + Context | BaseException + ] = trio.TASK_STATUS_IGNORED, +): + + # TODO: can we unify this with the `context=True` impl below? + if inspect.isasyncgen(coro): + await chan.send({'functype': 'asyncgen', 'cid': cid}) + # XXX: massive gotcha! If the containing scope + # is cancelled and we execute the below line, + # any ``ActorNursery.__aexit__()`` WON'T be + # triggered in the underlying async gen! So we + # have to properly handle the closing (aclosing) + # of the async gen in order to be sure the cancel + # is propagated! + with cancel_scope as cs: + ctx._scope = cs + task_status.started(ctx) + async with aclosing(coro) as agen: + async for item in agen: + # TODO: can we send values back in here? + # it's gonna require a `while True:` and + # some non-blocking way to retrieve new `asend()` + # values from the channel: + # to_send = await chan.recv_nowait() + # if to_send is not None: + # to_yield = await coro.asend(to_send) + await chan.send({'yield': item, 'cid': cid}) + + log.runtime(f"Finished iterating {coro}") + # TODO: we should really support a proper + # `StopAsyncIteration` system here for returning a final + # value if desired + await chan.send({'stop': True, 'cid': cid}) + + # one way @stream func that gets treated like an async gen + # TODO: can we unify this with the `context=True` impl below? + elif treat_as_gen: + await chan.send({'functype': 'asyncgen', 'cid': cid}) + # XXX: the async-func may spawn further tasks which push + # back values like an async-generator would but must + # manualy construct the response dict-packet-responses as + # above + with cancel_scope as cs: + ctx._scope = cs + task_status.started(ctx) + await coro + + if not cs.cancelled_caught: + # task was not cancelled so we can instruct the + # far end async gen to tear down + await chan.send({'stop': True, 'cid': cid}) + else: + # regular async function/method + # XXX: possibly just a scheduled `Actor._cancel_task()` + # from a remote request to cancel some `Context`. + # ------ - ------ + # TODO: ideally we unify this with the above `context=True` + # block such that for any remote invocation ftype, we + # always invoke the far end RPC task scheduling the same + # way: using the linked IPC context machinery. + failed_resp: bool = False + try: + await chan.send({ + 'functype': 'asyncfunc', + 'cid': cid + }) + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ) as ipc_err: + failed_resp = True + if is_rpc: + raise + else: + # TODO: should this be an `.exception()` call? 
+ log.warning( + f'Failed to respond to non-rpc request: {func}\n' + f'{ipc_err}' + ) + + with cancel_scope as cs: + ctx._scope: CancelScope = cs + task_status.started(ctx) + result = await coro + fname: str = func.__name__ + log.runtime( + 'RPC complete:\n' + f'task: {ctx._task}\n' + f'|_cid={ctx.cid}\n' + f'|_{fname}() -> {pformat(result)}\n' + ) + + # NOTE: only send result if we know IPC isn't down + if ( + not failed_resp + and chan.connected() + ): + try: + await chan.send( + {'return': result, + 'cid': cid} + ) + except ( + BrokenPipeError, + trio.BrokenResourceError, + ): + log.warning( + 'Failed to return result:\n' + f'{func}@{actor.uid}\n' + f'remote chan: {chan.uid}' + ) + +@acm +async def _errors_relayed_via_ipc( + actor: Actor, + chan: Channel, + ctx: Context, + is_rpc: bool, + + hide_tb: bool = False, + debug_kbis: bool = False, + task_status: TaskStatus[ + Context | BaseException + ] = trio.TASK_STATUS_IGNORED, + +) -> None: + __tracebackhide__: bool = hide_tb # TODO: use hide_tb here? + try: + yield # run RPC invoke body + + # box and ship RPC errors for wire-transit via + # the task's requesting parent IPC-channel. + except ( + Exception, + BaseExceptionGroup, + KeyboardInterrupt, + ) as err: + + # always hide this frame from debug REPL if the crash + # originated from an rpc task and we DID NOT fail due to + # an IPC transport error! + if ( + is_rpc + and chan.connected() + ): + __tracebackhide__: bool = hide_tb + + if not is_multi_cancelled(err): + + # TODO: maybe we'll want different "levels" of debugging + # eventualy such as ('app', 'supervisory', 'runtime') ? + + # if not isinstance(err, trio.ClosedResourceError) and ( + # if not is_multi_cancelled(err) and ( + + entered_debug: bool = False + if ( + ( + not isinstance(err, ContextCancelled) + or ( + isinstance(err, ContextCancelled) + and ctx._cancel_called + + # if the root blocks the debugger lock request from a child + # we will get a remote-cancelled condition. + and ctx._enter_debugger_on_cancel + ) + ) + and + ( + not isinstance(err, KeyboardInterrupt) + or ( + isinstance(err, KeyboardInterrupt) + and debug_kbis + ) + ) + ): + # await pause() + # XXX QUESTION XXX: is there any case where we'll + # want to debug IPC disconnects as a default? + # => I can't think of a reason that inspecting this + # type of failure will be useful for respawns or + # recovery logic - the only case is some kind of + # strange bug in our transport layer itself? Going + # to keep this open ended for now. + entered_debug = await _debug._maybe_enter_pm(err) + + if not entered_debug: + log.exception('Actor crashed:\n') + + # always ship errors back to caller + err_msg: dict[str, dict] = pack_error( + err, + # tb=tb, # TODO: special tb fmting? + cid=ctx.cid, + ) + + if is_rpc: + try: + await chan.send(err_msg) + + # TODO: tests for this scenario: + # - RPC caller closes connection before getting a response + # should **not** crash this actor.. + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ) as ipc_err: + + # if we can't propagate the error that's a big boo boo + log.exception( + f"Failed to ship error to caller @ {chan.uid} !?\n" + f'{ipc_err}' + + ) + + # error is probably from above coro running code *not from + # the target rpc invocation since a scope was never + # allocated around the coroutine await. + if ctx._scope is None: + # we don't ever raise directly here to allow the + # msg-loop-scheduler to continue running for this + # channel. 
+ task_status.started(err) + + # always reraise KBIs so they propagate at the sys-process + # level. + if isinstance(err, KeyboardInterrupt): + raise + + + # RPC task bookeeping + finally: + try: + ctx, func, is_complete = actor._rpc_tasks.pop( + (chan, ctx.cid) + ) + is_complete.set() + + except KeyError: + if is_rpc: + # If we're cancelled before the task returns then the + # cancel scope will not have been inserted yet + log.warning( + 'RPC task likely errored or cancelled before start?' + f'|_{ctx._task}\n' + f' >> {ctx.repr_rpc}\n' + ) + else: + log.cancel( + 'Failed to de-alloc internal runtime cancel task?\n' + f'|_{ctx._task}\n' + f' >> {ctx.repr_rpc}\n' + ) + + finally: + if not actor._rpc_tasks: + log.runtime("All RPC tasks have completed") + actor._ongoing_rpc_tasks.set() + + async def _invoke( actor: Actor, @@ -115,6 +396,8 @@ async def _invoke( kwargs: dict[str, Any], is_rpc: bool = True, + hide_tb: bool = True, + task_status: TaskStatus[ Context | BaseException ] = trio.TASK_STATUS_IGNORED, @@ -127,8 +410,8 @@ async def _invoke( remotely invoked function, normally in `Actor._service_n: Nursery`. ''' + __tracebackhide__: bool = hide_tb treat_as_gen: bool = False - failed_resp: bool = False if _state.debug_mode(): await maybe_import_gb() @@ -139,7 +422,7 @@ async def _invoke( cancel_scope = CancelScope() # activated cancel scope ref - cs: CancelScope | None = None + cs: CancelScope|None = None ctx = actor.get_context( chan=chan, @@ -160,6 +443,7 @@ async def _invoke( # compat with old api kwargs['ctx'] = ctx + treat_as_gen = True if 'ctx' in params: warnings.warn( @@ -174,7 +458,6 @@ async def _invoke( assert 'stream' in params kwargs['stream'] = ctx - treat_as_gen = True elif getattr(func, '_tractor_context_function', False): # handle decorated ``@tractor.context`` async function @@ -182,65 +465,45 @@ async def _invoke( context = True # errors raised inside this block are propgated back to caller - try: + async with _errors_relayed_via_ipc( + actor, + chan, + ctx, + is_rpc, + hide_tb=hide_tb, + task_status=task_status, + ): if not ( inspect.isasyncgenfunction(func) or inspect.iscoroutinefunction(func) ): raise TypeError(f'{func} must be an async function!') + # init coroutine with `kwargs` to immediately catch any + # type-sig errors. try: coro = func(**kwargs) except TypeError: raise - # TODO: can we unify this with the `context=True` impl below? - if inspect.isasyncgen(coro): - await chan.send({'functype': 'asyncgen', 'cid': cid}) - # XXX: massive gotcha! If the containing scope - # is cancelled and we execute the below line, - # any ``ActorNursery.__aexit__()`` WON'T be - # triggered in the underlying async gen! So we - # have to properly handle the closing (aclosing) - # of the async gen in order to be sure the cancel - # is propagated! - with cancel_scope as cs: - ctx._scope = cs - task_status.started(ctx) - async with aclosing(coro) as agen: - async for item in agen: - # TODO: can we send values back in here? 
- # it's gonna require a `while True:` and - # some non-blocking way to retrieve new `asend()` - # values from the channel: - # to_send = await chan.recv_nowait() - # if to_send is not None: - # to_yield = await coro.asend(to_send) - await chan.send({'yield': item, 'cid': cid}) - - log.runtime(f"Finished iterating {coro}") - # TODO: we should really support a proper - # `StopAsyncIteration` system here for returning a final - # value if desired - await chan.send({'stop': True, 'cid': cid}) - - # one way @stream func that gets treated like an async gen - # TODO: can we unify this with the `context=True` impl below? - elif treat_as_gen: - await chan.send({'functype': 'asyncgen', 'cid': cid}) - # XXX: the async-func may spawn further tasks which push - # back values like an async-generator would but must - # manualy construct the response dict-packet-responses as - # above - with cancel_scope as cs: - ctx._scope = cs - task_status.started(ctx) - await coro - - if not cs.cancelled_caught: - # task was not cancelled so we can instruct the - # far end async gen to tear down - await chan.send({'stop': True, 'cid': cid}) + # TODO: implement all these cases in terms of the + # `Context` one! + if not context: + await _invoke_non_context( + actor, + cancel_scope, + ctx, + cid, + chan, + func, + coro, + kwargs, + treat_as_gen, + is_rpc, + task_status, + ) + # below is only for `@context` funcs + return # our most general case: a remote SC-transitive, # IPC-linked, cross-actor-task "context" @@ -256,77 +519,53 @@ async def _invoke( # here and awaited directly, possibly just with a small # wrapper that calls `Context.started()` and then does # the `await coro()`? - elif context: - # a "context" endpoint type is the most general and - # "least sugary" type of RPC ep with support for - # bi-dir streaming B) - await chan.send({ - 'functype': 'context', - 'cid': cid - }) + # a "context" endpoint type is the most general and + # "least sugary" type of RPC ep with support for + # bi-dir streaming B) + await chan.send({ + 'functype': 'context', + 'cid': cid + }) - try: - async with trio.open_nursery() as nurse: - ctx._scope_nursery = nurse - ctx._scope = nurse.cancel_scope - task_status.started(ctx) + # TODO: should we also use an `.open_context()` equiv + # for this callee side by factoring the impl from + # `Portal.open_context()` into a common helper? + # + # NOTE: there are many different ctx state details + # in a callee side instance according to current impl: + # - `.cancelled_caught` can never be `True`. + # -> the below scope is never exposed to the + # `@context` marked RPC function. + # - `._portal` is never set. + try: + async with trio.open_nursery() as tn: + ctx._scope_nursery = tn + ctx._scope = tn.cancel_scope + task_status.started(ctx) - # TODO: should would be nice to have our - # `TaskMngr` nursery here! - res: Any = await coro - ctx._result = res + # TODO: should would be nice to have our + # `TaskMngr` nursery here! + res: Any = await coro + ctx._result = res - # deliver final result to caller side. - await chan.send({ - 'return': res, - 'cid': cid - }) + # deliver final result to caller side. + await chan.send({ + 'return': res, + 'cid': cid + }) - # XXX: do we ever trigger this block any more? 
- except ( - BaseExceptionGroup, - trio.Cancelled, - ) as scope_error: - - # always set this (callee) side's exception as the - # local error on the context - ctx._local_error: BaseException = scope_error - - # if a remote error was set then likely the - # exception group was raised due to that, so - # and we instead raise that error immediately! - if re := ctx._remote_error: - ctx._maybe_raise_remote_err(re) - - # maybe TODO: pack in - # ``trio.Cancelled.__traceback__`` here so they can - # be unwrapped and displayed on the caller side? - raise - - finally: - # XXX: only pop the context tracking if - # a ``@tractor.context`` entrypoint was called - assert chan.uid - - # don't pop the local context until we know the - # associated child isn't in debug any more - await maybe_wait_for_debugger() - ctx: Context = actor._contexts.pop( - (chan.uid, cid) - ) - - res_str: str = ( - 'error: {ctx._local_error}' - if ctx._local_error - else f'result: {ctx._result}' - ) - log.cancel( - f'IPC context terminated with final {res_str}\n\n' - f'|_{pformat(ctx)}\n' - ) - - if ctx.cancelled_caught: + # NOTE: this happens IFF `ctx._scope.cancel()` is + # called by any of, + # - *this* callee task manually calling `ctx.cancel()`. + # - the runtime calling `ctx._deliver_msg()` which + # itself calls `ctx._maybe_cancel_and_set_remote_error()` + # which cancels the scope presuming the input error + # is not a `.cancel_acked` pleaser. + # - currently a never-should-happen-fallthrough case + # inside ._context._drain_to_final_msg()`.. + # # TODO: remove this ^ right? + if ctx._scope.cancelled_caught: # first check for and raise any remote error # before raising any context cancelled case @@ -335,7 +574,6 @@ async def _invoke( if re := ctx._remote_error: ctx._maybe_raise_remote_err(re) - # fname: str = func.__name__ cs: CancelScope = ctx._scope if cs.cancel_called: our_uid: tuple = actor.uid @@ -382,7 +620,16 @@ async def _invoke( div_str + f'<= canceller: {canceller}\n' f'=> uid: {our_uid}\n' - f' |_{ctx._task}()\n' + f' |_{ctx._task}()' + + # TODO: instead just show the + # ctx.__str__() here? + # -[ ] textwrap.indent() it correctly! + # -[ ] BUT we need to wait until + # the state is filled out before emitting + # this msg right ow its kinda empty? bleh.. + # + # f' |_{ctx}' ) # TODO: does this ever get set any more or can @@ -391,7 +638,7 @@ async def _invoke( msg += ( # '------ - ------\n' # 'IPC msg:\n' - f'\n{ctx._cancel_msg}' + f'\n\n{ctx._cancel_msg}' ) # task-contex was either cancelled by request using @@ -399,180 +646,68 @@ async def _invoke( # on the far end, or it was cancelled by the local # (callee) task, so relay this cancel signal to the # other side. - raise ContextCancelled( + ctxc = ContextCancelled( msg, suberror_type=trio.Cancelled, canceller=canceller, ) + # assign local error so that the `.outcome` + # resolves to an error for both reporting and + # state checks. + ctx._local_error = ctxc + raise ctxc - # regular async function/method - # XXX: possibly just a scheduled `Actor._cancel_task()` - # from a remote request to cancel some `Context`. - # ------ - ------ - # TODO: ideally we unify this with the above `context=True` - # block such that for any remote invocation ftype, we - # always invoke the far end RPC task scheduling the same - # way: using the linked IPC context machinery. 
- else: - try: - await chan.send({ - 'functype': 'asyncfunc', - 'cid': cid - }) - except ( - trio.ClosedResourceError, - trio.BrokenResourceError, - BrokenPipeError, - ) as ipc_err: - failed_resp = True - if is_rpc: - raise - else: - # TODO: should this be an `.exception()` call? - log.warning( - f'Failed to respond to non-rpc request: {func}\n' - f'{ipc_err}' - ) + # XXX: do we ever trigger this block any more? + except ( + BaseExceptionGroup, + trio.Cancelled, + BaseException, - with cancel_scope as cs: - ctx._scope: CancelScope = cs - task_status.started(ctx) - result = await coro - fname: str = func.__name__ - log.runtime( - 'RPC complete:\n' - f'task: {ctx._task}\n' - f'|_cid={ctx.cid}\n' - f'|_{fname}() -> {pformat(result)}\n' - ) + ) as scope_error: - # NOTE: only send result if we know IPC isn't down - if ( - not failed_resp - and chan.connected() - ): - try: - await chan.send( - {'return': result, - 'cid': cid} - ) - except ( - BrokenPipeError, - trio.BrokenResourceError, - ): - log.warning( - 'Failed to return result:\n' - f'{func}@{actor.uid}\n' - f'remote chan: {chan.uid}' - ) + # always set this (callee) side's exception as the + # local error on the context + ctx._local_error: BaseException = scope_error - except ( - Exception, - BaseExceptionGroup, - ) as err: + # if a remote error was set then likely the + # exception group was raised due to that, so + # and we instead raise that error immediately! + ctx.maybe_raise() - # always hide this frame from debug REPL if the crash - # originated from an rpc task and we DID NOT fail - # due to an IPC transport error! - if ( - is_rpc - and chan.connected() - ): - __tracebackhide__: bool = True - - if not is_multi_cancelled(err): - - # TODO: maybe we'll want different "levels" of debugging - # eventualy such as ('app', 'supervisory', 'runtime') ? - - # if not isinstance(err, trio.ClosedResourceError) and ( - # if not is_multi_cancelled(err) and ( - - entered_debug: bool = False - if ( - not isinstance(err, ContextCancelled) - or ( - isinstance(err, ContextCancelled) - and ctx._cancel_called - - # if the root blocks the debugger lock request from a child - # we will get a remote-cancelled condition. - and ctx._enter_debugger_on_cancel - ) - ): - # XXX QUESTION XXX: is there any case where we'll - # want to debug IPC disconnects as a default? - # => I can't think of a reason that inspecting this - # type of failure will be useful for respawns or - # recovery logic - the only case is some kind of - # strange bug in our transport layer itself? Going - # to keep this open ended for now. - entered_debug = await _debug._maybe_enter_pm(err) - - if not entered_debug: - log.exception("Actor crashed:") - - # always ship errors back to caller - err_msg: dict[str, dict] = pack_error( - err, - # tb=tb, # TODO: special tb fmting? - cid=cid, - ) - - if is_rpc: - try: - await chan.send(err_msg) - - # TODO: tests for this scenario: - # - RPC caller closes connection before getting a response - # should **not** crash this actor.. - except ( - trio.ClosedResourceError, - trio.BrokenResourceError, - BrokenPipeError, - ) as ipc_err: - - # if we can't propagate the error that's a big boo boo - log.exception( - f"Failed to ship error to caller @ {chan.uid} !?\n" - f'{ipc_err}' - - ) - - # error is probably from above coro running code *not from the - # underlyingn rpc invocation* since a scope was never allocated - # around actual coroutine await. 
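# Comment-only sketch (simplified, not the verbatim result) of the
# post-refactor `_invoke()` layout this patch is working toward:
#
#   async def _invoke(actor, cid, chan, func, kwargs, is_rpc=True, hide_tb=True, ...):
#       ctx = actor.get_context(chan=chan, cid=cid, ...)
#       async with _errors_relayed_via_ipc(actor, chan, ctx, is_rpc, hide_tb=hide_tb, ...):
#           coro = func(**kwargs)
#           if not context:
#               # legacy asyncgen/stream/asyncfunc endpoints
#               return await _invoke_non_context(...)
#           # only the `@context` endpoint case remains inline below
#           ...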
- if ctx._scope is None: - # we don't ever raise directly here to allow the - # msg-loop-scheduler to continue running for this - # channel. - task_status.started(err) - - finally: - # RPC task bookeeping - try: - ctx, func, is_complete = actor._rpc_tasks.pop( - (chan, cid) - ) - is_complete.set() - - except KeyError: - if is_rpc: - # If we're cancelled before the task returns then the - # cancel scope will not have been inserted yet - log.warning( - f"Task {func} likely errored or cancelled before start") - else: - log.cancel( - 'Failed to de-alloc internal task!?\n' - f'cid: {cid}\n' - f'{func.__name__}({kwargs})' - ) + # maybe TODO: pack in come kinda + # `trio.Cancelled.__traceback__` here so they can be + # unwrapped and displayed on the caller side? no se.. + raise + # `@context` entrypoint task bookeeping. + # i.e. only pop the context tracking if used ;) finally: - if not actor._rpc_tasks: - log.runtime("All RPC tasks have completed") - actor._ongoing_rpc_tasks.set() + assert chan.uid + + # don't pop the local context until we know the + # associated child isn't in debug any more + await maybe_wait_for_debugger() + ctx: Context = actor._contexts.pop( + (chan.uid, cid) + ) + + merr: Exception|None = ctx.maybe_error + + ( + res_type_str, + res_str, + ) = ( + ('error', f'{type(merr)}',) + if merr + else ( + 'result', + f'`{repr(ctx.outcome)}`', + ) + ) + log.cancel( + f'IPC context terminated with a final {res_type_str}\n\n' + f'{ctx}\n' + ) def _get_mod_abspath(module: ModuleType) -> str: @@ -878,20 +1013,29 @@ class Actor: Entry point for new inbound connections to the channel server. ''' - self._no_more_peers = trio.Event() # unset - + self._no_more_peers = trio.Event() # unset by making new chan = Channel.from_stream(stream) - their_uid: tuple[str, str] | None = chan.uid + their_uid: tuple[str, str]|None = chan.uid + + con_msg: str = '' if their_uid: - log.warning( - f'Re-connection from already known {their_uid}' + # NOTE: `.uid` is only set after first contact + con_msg = ( + 'IPC Re-connection from already known peer? ' ) else: - log.runtime(f'New connection to us @{chan.raddr}') + con_msg = ( + 'New IPC connection to us ' + ) + con_msg += ( + f'<= @{chan.raddr}\n' + f'|_{chan}\n' + # f' |_@{chan.raddr}\n\n' + ) # send/receive initial handshake response try: - uid = await self._do_handshake(chan) + uid: tuple|None = await self._do_handshake(chan) except ( # we need this for ``msgspec`` for some reason? # for now, it's been put in the stream backend. @@ -906,44 +1050,66 @@ class Actor: # inside ``open_root_actor()`` where there is a check for # a bound listener on the "arbiter" addr. the reset will be # because the handshake was never meant took place. - log.warning(f"Channel {chan} failed to handshake") + log.warning( + con_msg + + + ' -> But failed to handshake? Ignoring..\n' + ) return - # channel tracking + con_msg += ( + f' -> Handshake with actor `{uid[0]}[{uid[1][-6:]}]` complete\n' + ) + # IPC connection tracking for both peers and new children: + # - if this is a new channel to a locally spawned + # sub-actor there will be a spawn wait even registered + # by a call to `.wait_for_peer()`. + # - if a peer is connecting no such event will exit. event: trio.Event|None = self._peer_connected.pop( uid, None, ) if event: - # Instructing connection: this is likely a new channel to - # a recently spawned actor which we'd like to control via - # async-rpc calls. 
- log.runtime(f"Waking channel waiters {event.statistics()}") - # Alert any task waiting on this connection to come up + con_msg += ( + ' -> Waking subactor spawn waiters: ' + f'{event.statistics().tasks_waiting}\n' + f' -> Registered IPC chan for child actor {uid}@{chan.raddr}\n' + # f' {event}\n' + # f' |{event.statistics()}\n' + ) + # wake tasks waiting on this IPC-transport "connect-back" event.set() + else: + con_msg += ( + f' -> Registered IPC chan for peer actor {uid}@{chan.raddr}\n' + ) # type: ignore + chans: list[Channel] = self._peers[uid] - if chans: - # TODO: re-use channels for new connections instead - # of always new ones? - # => will require changing all the discovery funcs.. - log.runtime( - f"already have channel(s) for {uid}:{chans}?" - ) + # if chans: + # # TODO: re-use channels for new connections instead + # # of always new ones? + # # => will require changing all the discovery funcs.. # append new channel - log.runtime(f"Registered {chan} for {uid}") # type: ignore # TODO: can we just use list-ref directly? - # chans.append(chan) - self._peers[uid].append(chan) + chans.append(chan) + + log.runtime(con_msg) # Begin channel management - respond to remote requests and # process received reponses. disconnected: bool = False try: - disconnected: bool = await process_messages(self, chan) + disconnected: bool = await process_messages( + self, + chan, + ) except trio.Cancelled: - log.cancel(f'Msg loop was cancelled for {chan}') + log.cancel( + 'IPC transport msg loop was cancelled for \n' + f'|_{chan}\n' + ) raise finally: @@ -957,7 +1123,10 @@ class Actor: # moving on with closing our own side. if local_nursery: if chan._cancel_called: - log.cancel(f'Waiting on cancel request to peer {chan.uid}') + log.cancel( + 'Waiting on cancel request to peer\n' + f'`Portal.cancel_actor()` => {chan.uid}\n' + ) # XXX: this is a soft wait on the channel (and its # underlying transport protocol) to close from the @@ -970,10 +1139,13 @@ class Actor: # loop processing. with trio.move_on_after(0.5) as cs: cs.shield = True - # Attempt to wait for the far end to close the channel - # and bail after timeout (2-generals on closure). - assert chan.msgstream - async for msg in chan.msgstream.drain(): + + # attempt to wait for the far end to close the + # channel and bail after timeout (a 2-generals + # problem on closure). + assert chan.transport + async for msg in chan.transport.drain(): + # try to deliver any lingering msgs # before we destroy the channel. # This accomplishes deterministic @@ -985,7 +1157,7 @@ class Actor: 'Draining msg from disconnected peer\n' f'{chan.uid}\n' f'|_{chan}\n' - f' |_{chan.msgstream}\n\n' + f' |_{chan.transport}\n\n' f'{pformat(msg)}\n' ) @@ -998,11 +1170,30 @@ class Actor: msg, ) - log.runtime( - 'Waiting on local actor nursery to exit..\n' + # NOTE: when no call to `open_root_actor()` was + # made, we implicitly make that call inside + # the first `.open_nursery()`, in this case we + # can assume that we are the root actor and do + # not have to wait for the nursery-enterer to + # exit before shutting down the actor runtime. 
+ # + # see matching note inside `._supervise.open_nursery()` + if not local_nursery._implicit_runtime_started: + log.runtime( + 'Waiting on local actor nursery to exit..\n' + f'|_{local_nursery}\n' + ) + await local_nursery.exited.wait() + + if ( + cs.cancelled_caught + and not local_nursery._implicit_runtime_started + ): + log.warning( + 'Failed to exit local actor nursery?\n' f'|_{local_nursery}\n' ) - await local_nursery.exited.wait() + # await _debug.pause() if disconnected: # if the transport died and this actor is still @@ -1022,7 +1213,7 @@ class Actor: log.cancel( f'Peer IPC broke but subproc is alive?\n\n' - f'<=x @{chan.raddr}\n' + f'<=x {chan.uid}@{chan.raddr}\n' f' |_{proc}\n' ) @@ -1033,9 +1224,9 @@ class Actor: f'uid: {chan.uid}\n' f'|_{pformat(chan)}\n' ) - chans = self._peers.get(chan.uid) chans.remove(chan) + # TODO: do we need to be this pedantic? if not chans: log.runtime( f'No more channels with {chan.uid}' @@ -1045,7 +1236,7 @@ class Actor: peers_str: str = '' for uid, chans in self._peers.items(): peers_str += ( - f'- uid: {uid}\n' + f'|_ uid: {uid}\n' ) for i, chan in enumerate(chans): peers_str += ( @@ -1487,22 +1678,27 @@ class Actor: requesting_uid, requester_type, req_chan, + log_meth, ) = ( req_chan.uid, 'peer', req_chan, + log.cancel, ) if req_chan else ( # a self cancel of ALL rpc tasks self.uid, 'self', - self + self, + log.runtime, ) + # TODO: just use the new `Context.repr_rpc: str` (and + # other) repr fields instead of doing this all manual.. msg: str = ( - f'`Actor.cancel()` request from {requester_type}:\n' - f'<= {requesting_uid}\n' + f'Runtime cancel request from {requester_type}:\n\n' + f'<= .cancel(): {requesting_uid}\n' ) # TODO: what happens here when we self-cancel tho? @@ -1541,7 +1737,7 @@ class Actor: if self._service_n: self._service_n.cancel_scope.cancel() - log.cancel(msg) + log_meth(msg) self._cancel_complete.set() return True @@ -1604,20 +1800,23 @@ class Actor: return True log.cancel( - 'Cancel request for RPC task\n' - f'<= canceller: {requesting_uid}\n\n' + 'Cancel request for RPC task\n\n' + f'<= ._cancel_task(): {requesting_uid}\n' + f' |_ @{ctx.dmaddr}\n\n' # TODO: better ascii repr for "supervisor" like # a nursery or context scope? - f'=> ipc-parent: {parent_chan}\n' + # f'=> {parent_chan}\n' + f'=> {ctx._task}\n' # TODO: simplified `Context.__repr__()` fields output # shows only application state-related stuff like, # - ._stream # - .closed # - .started_called # - .. etc. 
- f' |_ctx: {cid}\n' - f' >> {ctx._nsf}()\n' + f' >> {ctx.repr_rpc}\n' + # f' |_ctx: {cid}\n' + # f' >> {ctx._nsf}()\n' ) if ( ctx._canceller is None @@ -1670,7 +1869,7 @@ class Actor: ''' tasks: dict = self._rpc_tasks if not tasks: - log.warning( + log.runtime( 'Actor has no cancellable RPC tasks?\n' f'<= canceller: {req_uid}\n' ) @@ -1700,11 +1899,17 @@ class Actor: f' |>> {ctx._nsf}() -> dict:\n' ) + descr: str = ( + 'all' if not parent_chan + else + "IPC channel's " + ) + log.cancel( - f'Cancelling all {len(tasks)} rpc tasks:\n\n' - f'<= .cancel() from {req_uid}\n' - f'{self}\n' - f'{tasks_str}' + f'Cancelling {descr} {len(tasks)} rpc tasks\n\n' + f'<= .cancel_rpc_tasks(): {req_uid}\n' + # f'{self}\n' + # f'{tasks_str}' ) for ( (task_caller_chan, cid), @@ -1733,10 +1938,11 @@ class Actor: requesting_uid=req_uid, ) - log.cancel( - 'Waiting for remaining rpc tasks to complete\n' - f'|_{tasks}' - ) + if tasks: + log.cancel( + 'Waiting for remaining rpc tasks to complete\n' + f'|_{tasks}' + ) await self._ongoing_rpc_tasks.wait() def cancel_server(self) -> None: @@ -1793,21 +1999,21 @@ class Actor: ) -> tuple[str, str]: ''' - Exchange (name, UUIDs) identifiers as the first communication step. + Exchange `(name, UUIDs)` identifiers as the first + communication step. - These are essentially the "mailbox addresses" found in actor model - parlance. + These are essentially the "mailbox addresses" found in + actor model parlance. ''' await chan.send(self.uid) - value = await chan.recv() + value: tuple = await chan.recv() uid: tuple[str, str] = (str(value[0]), str(value[1])) if not isinstance(uid, tuple): raise ValueError(f"{uid} is not a valid uid?!") chan.uid = str(uid[0]), str(uid[1]) - log.runtime(f"Handshake with actor {uid}@{chan.raddr} complete") return uid def is_infected_aio(self) -> bool: @@ -1970,7 +2176,10 @@ async def async_main( shield=True, ) ) - log.runtime("Waiting on service nursery to complete") + log.runtime( + 'Actor runtime is up!' + # 'Blocking on service nursery to exit..\n' + ) log.runtime( "Service nursery complete\n" "Waiting on root nursery to complete" @@ -2016,11 +2225,13 @@ async def async_main( raise finally: - log.info("Runtime nursery complete") - + log.runtime( + 'Runtime nursery complete' + '-> Closing all actor lifetime contexts..' + ) # tear down all lifetime contexts if not in guest mode # XXX: should this just be in the entrypoint? - log.info("Closing all actor lifetime contexts") + actor.lifetime_stack.close() # TODO: we can't actually do this bc the debugger # uses the _service_n to spawn the lock task, BUT, @@ -2031,9 +2242,7 @@ async def async_main( # with CancelScope(shield=True): # await _debug.breakpoint() - actor.lifetime_stack.close() - - # Unregister actor from the registry + # Unregister actor from the registry-sys / registrar. if ( is_registered and not actor.is_registrar @@ -2241,13 +2450,14 @@ async def process_messages( 'parent_chan': chan, 'requesting_uid': chan.uid, } - log.cancel( - f'Rx task cancel request\n' - f'<= canceller: {chan.uid}\n' - f' |_{chan}\n\n' - f'=> {actor}\n' - f' |_cid: {target_cid}\n' - ) + # TODO: remove? already have emit in meth. 
+ # log.runtime( + # f'Rx RPC task cancel request\n' + # f'<= canceller: {chan.uid}\n' + # f' |_{chan}\n\n' + # f'=> {actor}\n' + # f' |_cid: {target_cid}\n' + # ) try: await _invoke( actor, @@ -2527,6 +2737,11 @@ class Arbiter(Actor): sockaddr: tuple[str, int] for (aname, _), sockaddr in self._registry.items(): + log.info( + f'Actor mailbox info:\n' + f'aname: {aname}\n' + f'sockaddr: {sockaddr}\n' + ) if name == aname: sockaddrs.append(sockaddr) -- 2.34.1 From a5bdc6db668e3344d876574968b3818937ec114c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 5 Mar 2024 10:34:32 -0500 Subject: [PATCH 139/378] Flip rpc tests over to use `ExceptionGroup` on new `trio` --- tests/test_advanced_streaming.py | 2 +- tests/test_child_manages_service_nursery.py | 1 - tests/test_rpc.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index 8061c3b9..e8696346 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -329,7 +329,7 @@ async def inf_streamer( # close out the stream gracefully except trio.ClosedResourceError: - print('msgstream closed on streamer side!') + print('transport closed on streamer side!') assert stream.closed break else: diff --git a/tests/test_child_manages_service_nursery.py b/tests/test_child_manages_service_nursery.py index fd1ceb80..228d6ade 100644 --- a/tests/test_child_manages_service_nursery.py +++ b/tests/test_child_manages_service_nursery.py @@ -10,7 +10,6 @@ from contextlib import asynccontextmanager as acm import pytest import trio -from trio_typing import TaskStatus import tractor from tractor import RemoteActorError from async_generator import aclosing diff --git a/tests/test_rpc.py b/tests/test_rpc.py index 3404c602..1a46666c 100644 --- a/tests/test_rpc.py +++ b/tests/test_rpc.py @@ -134,7 +134,7 @@ def test_rpc_errors( value = err.value # might get multiple `trio.Cancelled`s as well inside an inception - if isinstance(value, trio.MultiError): + if isinstance(value, ExceptionGroup): value = next(itertools.dropwhile( lambda exc: not isinstance(exc, tractor.RemoteActorError), value.exceptions -- 2.34.1 From 1f7f84fdfa2da53ad1acf959b0e57216086d822a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 5 Mar 2024 11:43:23 -0500 Subject: [PATCH 140/378] Mk debugger tests work for arbitrary pre-REPL format Since this was changed as part of overall project wide logging format updates, and i ended up changing the both the crash and pause `.pdb()` msgs to include some multi-line-ascii-"stuff", might as well make the pre-prompt checks in the test suite more flexible to match. As such, this exposes 2 new constants inside the `.devx._debug` mod: - `._pause_msg: str` for the pre `tractor.pause()` header emitted via `log.pdb()` and, - `._crash_msg: str` for the pre `._post_mortem()` equiv when handling errors in debug mode. Adjust the test suite to use these values and thus make us more capable to absorb changes in the future as well: - add a new `in_prompt_msg()` predicate, very similar to `assert_before()` but minus `assert`s which takes in a `parts: list[str]` to match in the pre-prompt stdout. - delegate to `in_prompt_msg()` in `assert_before()` since it was mostly duplicate minus `assert`. - adjust all previous ` in before` asserts to instead use `in_prompt_msg()` with separated pre-prompt-header vs. actor-name `parts`. - use new `._pause/crash_msg` values in all such calls including any `assert_before()` cases. 
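For reference, the matching logic boils down to an all-parts substring
check over the child's pre-prompt stdout; a minimal sketch (assuming
this patch is applied so the new header constants are importable, and
using a made-up actor uid) looks like:

    from tractor.devx._debug import _pause_msg, _crash_msg

    def in_prompt_msg(prompt: str, parts: list[str]) -> bool:
        # all `parts` must appear somewhere in the captured
        # pre-prompt output for a match.
        return all(part in prompt for part in parts)

    # e.g. matching the crash-REPL header of a failed root actor:
    prompt: str = (
        f'{_crash_msg}\n'
        '|\n'
        "|_ ('root', 'deadbeef-1234')\n"
        'AssertionError\n'
    )
    assert in_prompt_msg(prompt, [_crash_msg, "('root'", 'AssertionError'])
    assert not in_prompt_msg(prompt, [_pause_msg])

The real helper in `tests/test_debugger.py` (below) additionally takes
`pause_on_false`/`print_prompt_on_false` flags to ease debugging of
mismatches.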
--- tests/test_debugger.py | 186 +++++++++++++++++++++++++++++---------- tractor/devx/__init__.py | 33 +++---- tractor/devx/_debug.py | 80 +++++++++++------ tractor/devx/cli.py | 7 -- 4 files changed, 202 insertions(+), 104 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 3bd26b61..c314ba62 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -10,12 +10,13 @@ TODO: - wonder if any of it'll work on OS X? """ +from functools import partial import itertools -from os import path +# from os import path from typing import Optional import platform import pathlib -import sys +# import sys import time import pytest @@ -25,6 +26,10 @@ from pexpect.exceptions import ( EOF, ) +from tractor.devx._debug import ( + _pause_msg, + _crash_msg, +) from conftest import ( examples_dir, _ci_env, @@ -123,20 +128,52 @@ def expect( raise +def in_prompt_msg( + prompt: str, + parts: list[str], + + pause_on_false: bool = False, + print_prompt_on_false: bool = True, + +) -> bool: + ''' + Predicate check if (the prompt's) std-streams output has all + `str`-parts in it. + + Can be used in test asserts for bulk matching expected + log/REPL output for a given `pdb` interact point. + + ''' + for part in parts: + if part not in prompt: + + if pause_on_false: + import pdbp + pdbp.set_trace() + + if print_prompt_on_false: + print(prompt) + + return False + + return True + def assert_before( child, patts: list[str], + **kwargs, + ) -> None: - before = str(child.before.decode()) + # as in before the prompt end + before: str = str(child.before.decode()) + assert in_prompt_msg( + prompt=before, + parts=patts, - for patt in patts: - try: - assert patt in before - except AssertionError: - print(before) - raise + **kwargs + ) @pytest.fixture( @@ -195,7 +232,10 @@ def test_root_actor_error(spawn, user_in_out): before = str(child.before.decode()) # make sure expected logging and error arrives - assert "Attaching to pdb in crashed actor: ('root'" in before + assert in_prompt_msg( + before, + [_crash_msg, "('root'"] + ) assert 'AssertionError' in before # send user command @@ -332,7 +372,10 @@ def test_subactor_error( child.expect(PROMPT) before = str(child.before.decode()) - assert "Attaching to pdb in crashed actor: ('name_error'" in before + assert in_prompt_msg( + before, + [_crash_msg, "('name_error'"] + ) if do_next: child.sendline('n') @@ -353,9 +396,15 @@ def test_subactor_error( before = str(child.before.decode()) # root actor gets debugger engaged - assert "Attaching to pdb in crashed actor: ('root'" in before + assert in_prompt_msg( + before, + [_crash_msg, "('root'"] + ) # error is a remote error propagated from the subactor - assert "RemoteActorError: ('name_error'" in before + assert in_prompt_msg( + before, + [_crash_msg, "('name_error'"] + ) # another round if ctlc: @@ -380,7 +429,10 @@ def test_subactor_breakpoint( child.expect(PROMPT) before = str(child.before.decode()) - assert "Attaching pdb to actor: ('breakpoint_forever'" in before + assert in_prompt_msg( + before, + [_pause_msg, "('breakpoint_forever'"] + ) # do some "next" commands to demonstrate recurrent breakpoint # entries @@ -396,7 +448,10 @@ def test_subactor_breakpoint( child.sendline('continue') child.expect(PROMPT) before = str(child.before.decode()) - assert "Attaching pdb to actor: ('breakpoint_forever'" in before + assert in_prompt_msg( + before, + [_pause_msg, "('breakpoint_forever'"] + ) if ctlc: do_ctlc(child) @@ -441,7 +496,10 @@ def test_multi_subactors( child.expect(PROMPT) before = 
str(child.before.decode()) - assert "Attaching pdb to actor: ('breakpoint_forever'" in before + assert in_prompt_msg( + before, + [_pause_msg, "('breakpoint_forever'"] + ) if ctlc: do_ctlc(child) @@ -461,7 +519,10 @@ def test_multi_subactors( # first name_error failure child.expect(PROMPT) before = str(child.before.decode()) - assert "Attaching to pdb in crashed actor: ('name_error'" in before + assert in_prompt_msg( + before, + [_crash_msg, "('name_error'"] + ) assert "NameError" in before if ctlc: @@ -487,7 +548,10 @@ def test_multi_subactors( child.sendline('c') child.expect(PROMPT) before = str(child.before.decode()) - assert "Attaching pdb to actor: ('breakpoint_forever'" in before + assert in_prompt_msg( + before, + [_pause_msg, "('breakpoint_forever'"] + ) if ctlc: do_ctlc(child) @@ -527,17 +591,21 @@ def test_multi_subactors( child.expect(PROMPT) before = str(child.before.decode()) - assert_before(child, [ - # debugger attaches to root - "Attaching to pdb in crashed actor: ('root'", + assert_before( + child, [ + # debugger attaches to root + # "Attaching to pdb in crashed actor: ('root'", + _crash_msg, + "('root'", - # expect a multierror with exceptions for each sub-actor - "RemoteActorError: ('breakpoint_forever'", - "RemoteActorError: ('name_error'", - "RemoteActorError: ('spawn_error'", - "RemoteActorError: ('name_error_1'", - 'bdb.BdbQuit', - ]) + # expect a multierror with exceptions for each sub-actor + "RemoteActorError: ('breakpoint_forever'", + "RemoteActorError: ('name_error'", + "RemoteActorError: ('spawn_error'", + "RemoteActorError: ('name_error_1'", + 'bdb.BdbQuit', + ] + ) if ctlc: do_ctlc(child) @@ -574,15 +642,22 @@ def test_multi_daemon_subactors( # the root's tty lock first so anticipate either crash # message on the first entry. - bp_forever_msg = "Attaching pdb to actor: ('bp_forever'" + bp_forev_parts = [_pause_msg, "('bp_forever'"] + bp_forev_in_msg = partial( + in_prompt_msg, + parts=bp_forev_parts, + ) + name_error_msg = "NameError: name 'doggypants' is not defined" + name_error_parts = [name_error_msg] before = str(child.before.decode()) - if bp_forever_msg in before: - next_msg = name_error_msg + + if bp_forev_in_msg(prompt=before): + next_parts = name_error_parts elif name_error_msg in before: - next_msg = bp_forever_msg + next_parts = bp_forev_parts else: raise ValueError("Neither log msg was found !?") @@ -599,7 +674,10 @@ def test_multi_daemon_subactors( child.sendline('c') child.expect(PROMPT) - assert_before(child, [next_msg]) + assert_before( + child, + next_parts, + ) # XXX: hooray the root clobbering the child here was fixed! # IMO, this demonstrates the true power of SC system design. 
@@ -623,9 +701,15 @@ def test_multi_daemon_subactors( child.expect(PROMPT) try: - assert_before(child, [bp_forever_msg]) + assert_before( + child, + bp_forev_parts, + ) except AssertionError: - assert_before(child, [name_error_msg]) + assert_before( + child, + name_error_parts, + ) else: if ctlc: @@ -637,7 +721,10 @@ def test_multi_daemon_subactors( child.sendline('c') child.expect(PROMPT) - assert_before(child, [name_error_msg]) + assert_before( + child, + name_error_parts, + ) # wait for final error in root # where it crashs with boxed error @@ -647,7 +734,7 @@ def test_multi_daemon_subactors( child.expect(PROMPT) assert_before( child, - [bp_forever_msg] + bp_forev_parts ) except AssertionError: break @@ -656,7 +743,9 @@ def test_multi_daemon_subactors( child, [ # boxed error raised in root task - "Attaching to pdb in crashed actor: ('root'", + # "Attaching to pdb in crashed actor: ('root'", + _crash_msg, + "('root'", "_exceptions.RemoteActorError: ('name_error'", ] ) @@ -770,7 +859,7 @@ def test_multi_nested_subactors_error_through_nurseries( child = spawn('multi_nested_subactors_error_up_through_nurseries') - timed_out_early: bool = False + # timed_out_early: bool = False for send_char in itertools.cycle(['c', 'q']): try: @@ -871,11 +960,14 @@ def test_root_nursery_cancels_before_child_releases_tty_lock( if not timed_out_early: before = str(child.before.decode()) - assert_before(child, [ - "tractor._exceptions.RemoteActorError: ('spawner0'", - "tractor._exceptions.RemoteActorError: ('name_error'", - "NameError: name 'doggypants' is not defined", - ]) + assert_before( + child, + [ + "tractor._exceptions.RemoteActorError: ('spawner0'", + "tractor._exceptions.RemoteActorError: ('name_error'", + "NameError: name 'doggypants' is not defined", + ], + ) def test_root_cancels_child_context_during_startup( @@ -909,8 +1001,10 @@ def test_different_debug_mode_per_actor( # only one actor should enter the debugger before = str(child.before.decode()) - assert "Attaching to pdb in crashed actor: ('debugged_boi'" in before - assert "RuntimeError" in before + assert in_prompt_msg( + before, + [_crash_msg, "('debugged_boi'", "RuntimeError"], + ) if ctlc: do_ctlc(child) diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index 5f832615..c4676e3f 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -21,30 +21,17 @@ and working with/on the actor runtime. """ from ._debug import ( - maybe_wait_for_debugger, - acquire_debug_lock, - breakpoint, - pause, - pause_from_sync, - shield_sigint_handler, - MultiActorPdb, - open_crash_handler, - maybe_open_crash_handler, - post_mortem, + maybe_wait_for_debugger as maybe_wait_for_debugger, + acquire_debug_lock as acquire_debug_lock, + breakpoint as breakpoint, + pause as pause, + pause_from_sync as pause_from_sync, + shield_sigint_handler as shield_sigint_handler, + MultiActorPdb as MultiActorPdb, + open_crash_handler as open_crash_handler, + maybe_open_crash_handler as maybe_open_crash_handler, + post_mortem as post_mortem, ) from ._stackscope import ( enable_stack_on_sig as enable_stack_on_sig, ) - -__all__ = [ - 'maybe_wait_for_debugger', - 'acquire_debug_lock', - 'breakpoint', - 'pause', - 'pause_from_sync', - 'shield_sigint_handler', - 'MultiActorPdb', - 'open_crash_handler', - 'maybe_open_crash_handler', - 'post_mortem', -] diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index d3bf4fe0..e174b848 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -21,18 +21,19 @@ Multi-core debugging for da peeps! 
""" from __future__ import annotations import bdb -import os -import sys -import signal -from functools import ( - partial, - cached_property, -) from contextlib import ( asynccontextmanager as acm, contextmanager as cm, nullcontext, ) +from functools import ( + partial, + cached_property, +) +import os +import signal +import sys +import traceback from typing import ( Any, Callable, @@ -611,6 +612,9 @@ def shield_sigint_handler( # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py +_pause_msg: str = 'Attaching to pdb REPL in actor' + + def _set_trace( actor: tractor.Actor | None = None, pdb: MultiActorPdb | None = None, @@ -632,7 +636,13 @@ def _set_trace( ) or shield ): # pdbp.set_trace() - log.pdb(f"\nAttaching pdb to actor: {actor.uid}\n") + # TODO: maybe print the actor supervion tree up to the + # root here? Bo + log.pdb( + f'{_pause_msg}\n' + '|\n' + f'|_ {actor.uid}\n' + ) # no f!#$&* idea, but when we're in async land # we need 2x frames up? frame = frame.f_back @@ -911,6 +921,11 @@ async def breakpoint(**kwargs): await pause(**kwargs) +_crash_msg: str = ( + 'Attaching to pdb REPL in crashed actor' +) + + def _post_mortem( actor: tractor.Actor, pdb: MultiActorPdb, @@ -921,15 +936,23 @@ def _post_mortem( debugger instance. ''' - log.pdb(f"\nAttaching to pdb in crashed actor: {actor.uid}\n") + # TODO: print the actor supervion tree up to the root + # here! Bo + log.pdb( + f'{_crash_msg}\n' + '|\n' + f'|_ {actor.uid}\n' + ) - # TODO: you need ``pdbpp`` master (at least this commit - # https://github.com/pdbpp/pdbpp/commit/b757794857f98d53e3ebbe70879663d7d843a6c2) - # to fix this and avoid the hang it causes. See issue: - # https://github.com/pdbpp/pdbpp/issues/480 - # TODO: help with a 3.10+ major release if/when it arrives. - - pdbp.xpm(Pdb=lambda: pdb) + # TODO: only replacing this to add the + # `end=''` to the print XD + # pdbp.xpm(Pdb=lambda: pdb) + info = sys.exc_info() + print(traceback.format_exc(), end='') + pdbp.post_mortem( + t=info[2], + Pdb=lambda: pdb, + ) post_mortem = partial( @@ -1001,13 +1024,13 @@ async def maybe_wait_for_debugger( header_msg: str = '', -) -> None: +) -> bool: # was locked and we polled? if ( not debug_mode() and not child_in_debug ): - return + return False msg: str = header_msg @@ -1025,8 +1048,7 @@ async def maybe_wait_for_debugger( if sub_in_debug := Lock.global_actor_in_debug: msg += ( - 'Debug `Lock` in use by subactor\n' - f'|_{sub_in_debug}\n' + f'Debug `Lock` in use by subactor: {sub_in_debug}\n' ) # TODO: could this make things more deterministic? # wait to see if a sub-actor task will be @@ -1035,12 +1057,12 @@ async def maybe_wait_for_debugger( # XXX => but it doesn't seem to work.. # await trio.testing.wait_all_tasks_blocked(cushion=0) else: - log.pdb( + log.debug( msg + 'Root immediately acquired debug TTY LOCK' ) - return + return False for istep in range(poll_steps): @@ -1090,12 +1112,13 @@ async def maybe_wait_for_debugger( continue # fallthrough on failure to acquire.. - else: - raise RuntimeError( - msg - + - 'Root actor failed to acquire debug lock?' - ) + # else: + # raise RuntimeError( + # msg + # + + # 'Root actor failed to acquire debug lock?' + # ) + return True # else: # # TODO: non-root call for #320? @@ -1104,6 +1127,7 @@ async def maybe_wait_for_debugger( # subactor_uid=this_uid, # ): # pass + return False # TODO: better naming and what additionals? # - [ ] optional runtime plugging? 
diff --git a/tractor/devx/cli.py b/tractor/devx/cli.py index 76890669..c44f9686 100644 --- a/tractor/devx/cli.py +++ b/tractor/devx/cli.py @@ -23,10 +23,6 @@ Currently popular frameworks supported are: """ from __future__ import annotations -from contextlib import ( - # asynccontextmanager as acm, - contextmanager as cm, -) from typing import ( Any, Callable, @@ -36,9 +32,6 @@ from typing_extensions import Annotated import typer -from ._debug import open_crash_handler - - _runtime_vars: dict[str, Any] = {} -- 2.34.1 From c6b4da5788b4d7604d8bcd5921d5a516eaa40098 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 5 Mar 2024 12:26:33 -0500 Subject: [PATCH 141/378] Tweak `._portal` log content to use `Context.repr_outcome()` --- tractor/_portal.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index 04f613e8..8148a5d9 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -257,7 +257,7 @@ class Portal: return False reminfo: str = ( - f'{self.channel.uid}\n' + f'`Portal.cancel_actor()` => {self.channel.uid}\n' f' |_{chan}\n' ) log.cancel( @@ -949,9 +949,13 @@ class Portal: # CASE 1 else: + outcome_str: str = ctx.repr_outcome( + show_error_fields=True, + # type_only=True, + ) log.cancel( - f'Context terminated due to local scope error:\n' - f'{etype.__name__}\n' + f'Context terminated due to local scope error:\n\n' + f'{ctx.chan.uid} => {outcome_str}\n' ) # FINALLY, remove the context from runtime tracking and -- 2.34.1 From e536057feaefb6c12f348096bacaf150a80067c3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 5 Mar 2024 12:30:09 -0500 Subject: [PATCH 142/378] `._entry`: use same msg info in start/terminate log --- tractor/_entry.py | 18 ++++++++++++++---- tractor/_root.py | 4 +++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tractor/_entry.py b/tractor/_entry.py index 4a1499a0..0ac0dc47 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -116,14 +116,18 @@ def _trio_main( if actor.loglevel is not None: get_console_log(actor.loglevel) import os - log.info( - 'Started new trio process:\n' + actor_info: str = ( f'|_{actor}\n' f' uid: {actor.uid}\n' f' pid: {os.getpid()}\n' f' parent_addr: {parent_addr}\n' f' loglevel: {actor.loglevel}\n' ) + log.info( + 'Started new trio process:\n' + + + actor_info + ) try: if infect_asyncio: @@ -133,8 +137,14 @@ def _trio_main( trio.run(trio_main) except KeyboardInterrupt: log.cancel( - f'@{actor.uid} received KBI' + 'Actor received KBI\n' + + + actor_info ) finally: - log.info(f"Actor {actor.uid} terminated") + log.info( + 'Actor terminated\n' + + + actor_info + ) diff --git a/tractor/_root.py b/tractor/_root.py index c3deac9e..1d3d4f17 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -342,7 +342,9 @@ async def open_root_actor( # for an in nurseries: # tempn.start_soon(an.exited.wait) - logger.cancel("Shutting down root actor") + logger.info( + 'Closing down root actor' + ) await actor.cancel(None) # self cancel finally: _state._current_actor = None -- 2.34.1 From 04c99c274979b072342b4bdacf3425a0d7db606f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 6 Mar 2024 09:48:46 -0500 Subject: [PATCH 143/378] Woops, add `.msg` sub-pkg to install set --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c226661e..e6a080bb 100755 --- a/setup.py +++ b/setup.py @@ -36,8 +36,9 @@ setup( platforms=['linux', 'windows'], packages=[ 'tractor', - 'tractor.experimental', - 'tractor.trionics', + 
'tractor.experimental', # wacky ideas + 'tractor.trionics', # trio extensions + 'tractor.msg', # lowlevel data types ], install_requires=[ -- 2.34.1 From 7c22f76274bac631aa810d42f384e77c24a5aacd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 6 Mar 2024 09:55:05 -0500 Subject: [PATCH 144/378] Yahh, add `.devx` package to installed subpkgs.. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index e6a080bb..958c8f39 100755 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ setup( 'tractor.experimental', # wacky ideas 'tractor.trionics', # trio extensions 'tractor.msg', # lowlevel data types + 'tractor.devx', # "dev-experience" ], install_requires=[ -- 2.34.1 From 9e3f41a5b154b23a8f4acc0b2d07c2635dc61a45 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 6 Mar 2024 10:13:41 -0500 Subject: [PATCH 145/378] Tweak inter-peer tests for new/refined semantics Buncha subtle details changed mostly to do with when `Context.cancel()` gets called on "real" remote errors vs. (peer requested) cancellation and then local side handling of `ContextCancelled`. Specific changes to make tests pass: - due to raciness with `sleeper_ctx.result()` raising the ctxc locally vs. the child-peers receiving similar ctxcs themselves (and then erroring and propagating back to the root parent), we might not see `._remote_error` set during the sub-ctx loops (except for the sleeper itself obvi). - do not expect `.cancel_called`/`.cancel_caught` to be set on any sub-ctx since currently `Context.cancel()` is only called non-shielded and thus is not in invoked when `._scope.cancel()` is called as part of each root-side ctx ref/block handling the inter-peer ctxc. - do not expect `Context._scope.cancelled_caught` to be set in most cases (even the sleeper) TODO Outstanding adjustments not fixed yet: -[ ] `_scope.cancelled_caught` checks outside the `.open_context()` blocks. --- tests/test_inter_peer_cancellation.py | 97 +++++++++++++++++---------- 1 file changed, 62 insertions(+), 35 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 082c5e65..81e8afa6 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -220,11 +220,12 @@ async def stream_from_peer( # - what about IPC-transport specific errors, should # they bubble from the async for and trigger # other special cases? + # # NOTE: current ctl flow: # - stream raises `trio.EndOfChannel` and # exits the loop - # - `.open_context()` will raise the ctxcanc - # received from the sleeper. + # - `.open_context()` will raise the ctxc received + # from the sleeper. async for msg in stream: assert msg is not None print(msg) @@ -383,11 +384,11 @@ def test_peer_canceller( ) as (canceller_ctx, sent), ): - ctxs: list[Context] = [ - sleeper_ctx, - caller_ctx, - canceller_ctx, - ] + ctxs: dict[str, Context] = { + 'sleeper': sleeper_ctx, + 'caller': caller_ctx, + 'canceller': canceller_ctx, + } try: print('PRE CONTEXT RESULT') @@ -505,14 +506,17 @@ def test_peer_canceller( # NOTE: this root actor task should have # called `Context.cancel()` on the # `.__aexit__()` to every opened ctx. - for ctx in ctxs: - assert ctx.cancel_called + for name, ctx in ctxs.items(): # this root actor task should have # cancelled all opened contexts except the # sleeper which is obvi by the "canceller" # peer. 
re = ctx._remote_error + le = ctx._local_error + + assert ctx.cancel_called + if ( ctx is sleeper_ctx or ctx is caller_ctx @@ -566,32 +570,43 @@ def test_peer_canceller( # the sleeper's remote error is the error bubbled # out of the context-stack above! - re = sleeper_ctx.outcome + final_err = sleeper_ctx.outcome assert ( - re is loc_err + final_err is loc_err is sleeper_ctx.maybe_error is sleeper_ctx._remote_error ) - for ctx in ctxs: + for name, ctx in ctxs.items(): + re: BaseException|None = ctx._remote_error - re: BaseException|None = ctx.outcome - assert ( - re and - ( - re is ctx.maybe_error - is ctx._remote_error - ) - ) - le: trio.MultiError = ctx._local_error + le: BaseException|None = ctx._local_error + err = ctx.maybe_error + out = ctx.outcome + + # every ctx should error! + assert out is err + + # the recorded local erro should always be + # the same as the one raised by the + # `sleeper_ctx.result()` call assert ( le - and ctx._local_error + and + le is loc_err ) # root doesn't cancel sleeper since it's # cancelled by its peer. if ctx is sleeper_ctx: + assert re + assert ( + ctx._remote_error + is ctx.maybe_error + is ctx.outcome + is ctx._local_error + ) + assert not ctx.cancel_called assert not ctx.cancel_acked @@ -601,21 +616,38 @@ def test_peer_canceller( # `ContextCancelled` for it and thus # the logic inside `.cancelled_caught` # should trigger! - assert ctx._scope.cancelled_caught + assert not ctx._scope.cancelled_caught - elif ctx is caller_ctx: + elif ctx in ( + caller_ctx, + canceller_ctx, + ): + + assert not ctx._remote_error + + # the `canceller_ctx` shouldn't + # have called `ctx.cancel()` either! + # # since its context was remotely - # cancelled, we never needed to - # call `Context.cancel()` bc it was - # done by the peer and also we never - assert ctx.cancel_called + # cancelled, we never needed to call + # `Context.cancel()` bc the far end + # task already done by the peer and + # also we never + assert not ctx.cancel_called # TODO: figure out the details of this..? # if you look the `._local_error` here # is a multi of ctxc + 2 Cancelleds? # assert not ctx.cancelled_caught - elif ctx is canceller_ctx: + assert ( + not ctx.cancel_called + and not ctx.cancel_acked + ) + assert not ctx._scope.cancelled_caught + + # elif ctx is canceller_ctx: + # assert not ctx._remote_error # XXX NOTE XXX: ONLY the canceller # will get a self-cancelled outcome @@ -626,11 +658,6 @@ def test_peer_canceller( # .cancel() whenever an interpeer # cancel takes place since each # reception of a ctxc - assert ( - ctx.cancel_called - and ctx.cancel_acked - ) - assert not ctx._scope.cancelled_caught else: pytest.fail( @@ -663,7 +690,7 @@ def test_peer_canceller( # `.open_context()` block has exited and should be # set in both outcomes including the case where # ctx-cancel handling itself errors. 
- assert sleeper_ctx._scope.cancelled_caught + assert not sleeper_ctx._scope.cancelled_caught assert _loc_err is sleeper_ctx._local_error assert ( sleeper_ctx.outcome -- 2.34.1 From 6156ff95f84190d81fca4625c53f27647d164ea7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 6 Mar 2024 14:37:54 -0500 Subject: [PATCH 146/378] Add `shield: bool` support to `.pause()` It's been on the todo for a while and I've given up trying to properly hide the `trio.CancelScope.__exit__()` frame for now instead opting to just `log.pdb()` a big apology XD Users can obvi still just not use the flag and wrap `tractor.pause()` in their own cs block if they want to avoid having to hit `'up'` in the pdb REPL if needed in a cancelled task-scope. Impl deatz: - factor orig `.pause()` impl into new `._pause()` so that we can more tersely wrap the original content depending on `shield: bool` input; only open the cancel-scope when shield is set to avoid aforemented extra strack frame annoyance. - pass through `shield` to underlying `_pause` and `debug_func()` so we can actually know when so log our apology. - add a buncha notes to new `.pause()` wrapper regarding the inability to hide the cancel-scope `.__exit__()`, inluding that overriding the code in `trio._core._run.CancelScope` doesn't seem to solve the issue either.. Unrelated `maybe_wait_for_debugger()` tweaks: - don't read `Lock.global_actor_in_debug` more then needed, rename local read var to `in_debug` (since it can also hold the root actor uid, not just sub-actors). - shield the `await debug_complete.wait()` since ideally we avoid the root cancellation child-actors in debug even when the root calls this func in a cancelled scope. --- tractor/devx/_debug.py | 206 +++++++++++++++++++++++++++++------------ 1 file changed, 145 insertions(+), 61 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index e174b848..2839e597 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -619,13 +619,15 @@ def _set_trace( actor: tractor.Actor | None = None, pdb: MultiActorPdb | None = None, shield: bool = False, + + extra_frames_up_when_async: int = 1, ): __tracebackhide__: bool = True actor: tractor.Actor = actor or current_actor() - # start 2 levels up in user code - frame: FrameType | None = sys._getframe() - if frame: + # always start 1 level up from THIS in user code. + frame: FrameType|None + if frame := sys._getframe(): frame: FrameType = frame.f_back # type: ignore if ( @@ -633,23 +635,39 @@ def _set_trace( and ( pdb and actor is not None - ) or shield + ) + # or shield ): + msg: str = _pause_msg + if shield: + # log.warning( + msg = ( + '\n\n' + ' ------ - ------\n' + 'Debugger invoked with `shield=True` so an extra\n' + '`trio.CancelScope.__exit__()` frame is shown..\n' + '\n' + 'Try going up one frame to see your pause point!\n' + '\n' + ' SORRY we need to fix this!\n' + ' ------ - ------\n\n' + ) + msg + # pdbp.set_trace() # TODO: maybe print the actor supervion tree up to the # root here? Bo log.pdb( - f'{_pause_msg}\n' + f'{msg}\n' '|\n' f'|_ {actor.uid}\n' ) # no f!#$&* idea, but when we're in async land # we need 2x frames up? 
- frame = frame.f_back - # frame = frame.f_back - - # if shield: - # frame = frame.f_back + for i in range(extra_frames_up_when_async): + frame: FrameType = frame.f_back + log.debug( + f'Going up frame {i} -> {frame}\n' + ) else: pdb, undo_sigint = mk_mpdb() @@ -659,10 +677,9 @@ def _set_trace( Lock.local_task_in_debug = 'sync' pdb.set_trace(frame=frame) - # undo_ -async def pause( +async def _pause( debug_func: Callable = _set_trace, release_lock_signal: trio.Event | None = None, @@ -676,27 +693,19 @@ async def pause( # be no way to override it?.. # shield: bool = False, - # TODO: - # shield: bool = False + shield: bool = False, task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED ) -> None: ''' - A pause point (more commonly known as a "breakpoint") interrupt - instruction for engaging a blocking debugger instance to - conduct manual console-based-REPL-interaction from within - `tractor`'s async runtime, normally from some single-threaded - and currently executing actor-hosted-`trio`-task in some - (remote) process. + Inner impl for `pause()` to avoid the `trio.CancelScope.__exit__()` + stack frame when not shielded (since apparently i can't figure out + how to hide it using the normal mechanisms..) - NOTE: we use the semantics "pause" since it better encompasses - the entirety of the necessary global-runtime-state-mutation any - actor-task must access and lock in order to get full isolated - control over the process tree's root TTY: - https://en.wikipedia.org/wiki/Breakpoint + Hopefully we won't need this in the long run. ''' - # __tracebackhide__ = True + __tracebackhide__: bool = True actor = current_actor() pdb, undo_sigint = mk_mpdb() task_name: str = trio.lowlevel.current_task().name @@ -707,24 +716,11 @@ async def pause( ): Lock.local_pdb_complete = trio.Event() - # if shield: debug_func = partial( debug_func, - # shield=shield, ) - # def _exit(self, *args, **kwargs): - # __tracebackhide__: bool = True - # super().__exit__(*args, **kwargs) - - # trio.CancelScope.__exit__.__tracebackhide__ = True - - # import types - # with trio.CancelScope(shield=shield) as cs: - # cs.__exit__ = types.MethodType(_exit, cs) - # cs.__exit__.__tracebackhide__ = True - - # TODO: need a more robust check for the "root" actor + # TODO: need a more robust check for the "root" actor if ( not is_root_process() and actor._parent_chan # a connected child @@ -818,7 +814,7 @@ async def pause( # 'trace func provided!' # ) print(f"{actor.uid} ENTERING WAIT") - task_status.started() + task_status.started(cs) # with trio.CancelScope(shield=True): # await release_lock_signal.wait() @@ -827,22 +823,103 @@ async def pause( # block here one (at the appropriate frame *up*) where # ``breakpoint()`` was awaited and begin handling stdio. log.debug("Entering the synchronous world of pdb") - debug_func(actor, pdb) + debug_func( + actor, + pdb, + extra_frames_up_when_async=2, + shield=shield, + ) + assert cs except bdb.BdbQuit: Lock.release() raise - # XXX: apparently we can't do this without showing this frame - # in the backtrace on first entry to the REPL? Seems like an odd - # behaviour that should have been fixed by now. This is also why - # we scrapped all the @cm approaches that were tried previously. 
- # finally: - # __tracebackhide__ = True - # # frame = sys._getframe() - # # last_f = frame.f_back - # # last_f.f_globals['__tracebackhide__'] = True - # # signal.signal = pdbp.hideframe(signal.signal) +# XXX: apparently we can't do this without showing this frame +# in the backtrace on first entry to the REPL? Seems like an odd +# behaviour that should have been fixed by now. This is also why +# we scrapped all the @cm approaches that were tried previously. +# finally: +# __tracebackhide__ = True +# # frame = sys._getframe() +# # last_f = frame.f_back +# # last_f.f_globals['__tracebackhide__'] = True +# # signal.signal = pdbp.hideframe(signal.signal) + + +async def pause( + + debug_func: Callable = _set_trace, + release_lock_signal: trio.Event | None = None, + + # TODO: allow caller to pause despite task cancellation, + # exactly the same as wrapping with: + # with CancelScope(shield=True): + # await pause() + # => the REMAINING ISSUE is that the scope's .__exit__() frame + # is always show in the debugger on entry.. and there seems to + # be no way to override it?.. + # shield: bool = False, + + shield: bool = False, + task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED + +) -> None: + ''' + A pause point (more commonly known as a "breakpoint") interrupt + instruction for engaging a blocking debugger instance to + conduct manual console-based-REPL-interaction from within + `tractor`'s async runtime, normally from some single-threaded + and currently executing actor-hosted-`trio`-task in some + (remote) process. + + NOTE: we use the semantics "pause" since it better encompasses + the entirety of the necessary global-runtime-state-mutation any + actor-task must access and lock in order to get full isolated + control over the process tree's root TTY: + https://en.wikipedia.org/wiki/Breakpoint + + ''' + __tracebackhide__: bool = True + + if shield: + # NOTE XXX: even hard coding this inside the `class CancelScope:` + # doesn't seem to work for me!? + # ^ XXX ^ + + # def _exit(self, *args, **kwargs): + # __tracebackhide__: bool = True + # super().__exit__(*args, **kwargs) + + trio.CancelScope.__enter__.__tracebackhide__ = True + trio.CancelScope.__exit__.__tracebackhide__ = True + + # import types + # with trio.CancelScope(shield=shield) as cs: + # cs.__exit__ = types.MethodType(_exit, cs) + # cs.__exit__.__tracebackhide__ = True + + with trio.CancelScope(shield=shield) as cs: + # setattr(cs.__exit__.__func__, '__tracebackhide__', True) + # setattr(cs.__enter__.__func__, '__tracebackhide__', True) + + # NOTE: so the caller can always cancel even if shielded + task_status.started(cs) + await _pause( + debug_func=debug_func, + release_lock_signal=release_lock_signal, + shield=True, + task_status=task_status, + ) + else: + await _pause( + debug_func=debug_func, + release_lock_signal=release_lock_signal, + shield=False, + task_status=task_status, + ) + + # TODO: allow pausing from sync code. @@ -1043,12 +1120,20 @@ async def maybe_wait_for_debugger( # will make the pdb repl unusable. # Instead try to wait for pdb to be released before # tearing down. 
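As a usage sketch of the new `shield` support wired through `pause()`
above (illustrative only; per the commit msg either form may still show
an extra `CancelScope.__exit__()` frame on REPL entry for now):

    import trio
    import tractor

    async def repl_even_if_cancelled() -> None:
        # new flag from this patch: shield the REPL entry internally
        # so an already-cancelled surrounding scope can't tear it down.
        await tractor.pause(shield=True)

        # manual equivalent without the flag, as mentioned in the
        # commit msg above:
        with trio.CancelScope(shield=True):
            await tractor.pause()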
- sub_in_debug: tuple[str, str]|None = Lock.global_actor_in_debug + in_debug: tuple[str, str]|None = Lock.global_actor_in_debug debug_complete: trio.Event|None = Lock.no_remote_has_tty - if sub_in_debug := Lock.global_actor_in_debug: + if in_debug == current_actor().uid: + log.debug( + msg + + + 'Root already owns the TTY LOCK' + ) + return True + + elif in_debug: msg += ( - f'Debug `Lock` in use by subactor: {sub_in_debug}\n' + f'Debug `Lock` in use by subactor: {in_debug}\n' ) # TODO: could this make things more deterministic? # wait to see if a sub-actor task will be @@ -1065,27 +1150,26 @@ async def maybe_wait_for_debugger( return False for istep in range(poll_steps): - - if ( debug_complete and not debug_complete.is_set() - and sub_in_debug is not None + and in_debug is not None ): log.pdb( msg + 'Root is waiting on tty lock to release..\n' ) - await debug_complete.wait() + with trio.CancelScope(shield=True): + await debug_complete.wait() log.pdb( f'Child subactor released debug lock:' - f'|_{sub_in_debug}\n' + f'|_{in_debug}\n' ) # is no subactor locking debugger currently? if ( - sub_in_debug is None + in_debug is None and ( debug_complete is None or debug_complete.is_set() -- 2.34.1 From 7ae9b5319b5b784d4d5d75b88df0816e349add2a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 6 Mar 2024 16:07:30 -0500 Subject: [PATCH 147/378] Tweak inter-peer `._scope` state asserts We don't expect `._scope.cancelled_caught` to be set really ever on inter-peer cancellation since no ctx is ever cancelling itself, a peer cancels some other and then bubbles back to all other peers. Also add `ids: lambda` for `error_during_ctxerr_handling` param to `test_peer_canceller()` --- tests/test_inter_peer_cancellation.py | 44 ++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 81e8afa6..d878b06d 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -292,6 +292,7 @@ async def stream_from_peer( @pytest.mark.parametrize( 'error_during_ctxerr_handling', [False, True], + ids=lambda item: f'rte_during_ctxerr={item}', ) def test_peer_canceller( error_during_ctxerr_handling: bool, @@ -492,6 +493,15 @@ def test_peer_canceller( # should be cancelled by US. # if error_during_ctxerr_handling: + # since we do a rte reraise above, the + # `.open_context()` error handling should have + # raised a local rte, thus the internal + # `.open_context()` enterer task's + # cancel-scope should have raised the RTE, NOT + # a `trio.Cancelled` due to a local + # `._scope.cancel()` call. + assert not sleeper_ctx._scope.cancelled_caught + assert isinstance(loc_err, RuntimeError) print(f'_loc_err: {_loc_err}\n') # assert sleeper_ctx._local_error is _loc_err @@ -558,6 +568,13 @@ def test_peer_canceller( # propagated # else: + # since sleeper_ctx.result() IS called above + # we should have (silently) absorbed the + # corresponding `ContextCancelled` for it and + # `._scope.cancel()` should never have been + # called. + assert not sleeper_ctx._scope.cancelled_caught + assert isinstance(loc_err, ContextCancelled) assert loc_err.canceller == sleeper_ctx.canceller assert ( @@ -625,20 +642,31 @@ def test_peer_canceller( assert not ctx._remote_error - # the `canceller_ctx` shouldn't - # have called `ctx.cancel()` either! 
+ # neither of the `caller/canceller_ctx` should + # have called `ctx.cancel()` bc the + # canceller's task internally issues + # a `Portal.cancel_actor()` to the + # sleeper and thus never should call + # `ctx.cancel()` per say UNLESS the + # sleeper's `.result()` call above + # ctxc exception results in the + # canceller's + # `.open_context().__aexit__()` error + # handling to kick in BEFORE a remote + # error is delivered - which since + # we're asserting what we are above, + # that should normally be the case + # right? # - # since its context was remotely - # cancelled, we never needed to call - # `Context.cancel()` bc the far end - # task already done by the peer and - # also we never assert not ctx.cancel_called + # + # assert ctx.cancel_called + # orig ^ # TODO: figure out the details of this..? # if you look the `._local_error` here # is a multi of ctxc + 2 Cancelleds? - # assert not ctx.cancelled_caught + # assert not ctx._scope.cancelled_caught assert ( not ctx.cancel_called -- 2.34.1 From 364ea919835a4f23e27928bedcc1aaa6cf2e7385 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 7 Mar 2024 18:24:00 -0500 Subject: [PATCH 148/378] Set `._cancel_msg` to RPC `{cmd: 'self._cancel_task', ..}` msg Like how we set `Context._cancel_msg` in `._deliver_msg()` (in which case normally it's an `{'error': ..}` msg), do the same when any RPC task is remotely cancelled via `Actor._cancel_task` where that task doesn't yet have a cancel msg set yet. This makes is much easier to distinguish between ctx cancellations due to some remote error vs. Explicit remote requests via any of `Actor.cancel()`, `Portal.cancel_actor()` or `Context.cancel()`. --- tractor/_runtime.py | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 29 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 4c1181de..64549bac 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -302,7 +302,7 @@ async def _errors_relayed_via_ipc( ) ) ): - # await pause() + # await _debug.pause() # XXX QUESTION XXX: is there any case where we'll # want to debug IPC disconnects as a default? # => I can't think of a reason that inspecting this @@ -322,6 +322,12 @@ async def _errors_relayed_via_ipc( cid=ctx.cid, ) + # NOTE: the src actor should always be packed into the + # error.. but how should we verify this? + # assert err_msg['src_actor_uid'] + # if not err_msg['error'].get('src_actor_uid'): + # import pdbp; pdbp.set_trace() + if is_rpc: try: await chan.send(err_msg) @@ -566,6 +572,7 @@ async def _invoke( # inside ._context._drain_to_final_msg()`.. # # TODO: remove this ^ right? if ctx._scope.cancelled_caught: + our_uid: tuple = actor.uid # first check for and raise any remote error # before raising any context cancelled case @@ -575,8 +582,9 @@ async def _invoke( ctx._maybe_raise_remote_err(re) cs: CancelScope = ctx._scope + if cs.cancel_called: - our_uid: tuple = actor.uid + canceller: tuple = ctx.canceller msg: str = ( 'actor was cancelled by ' @@ -632,15 +640,6 @@ async def _invoke( # f' |_{ctx}' ) - # TODO: does this ever get set any more or can - # we remove it? 
- if ctx._cancel_msg: - msg += ( - # '------ - ------\n' - # 'IPC msg:\n' - f'\n\n{ctx._cancel_msg}' - ) - # task-contex was either cancelled by request using # ``Portal.cancel_actor()`` or ``Context.cancel()`` # on the far end, or it was cancelled by the local @@ -1753,7 +1752,9 @@ class Actor: self, cid: str, parent_chan: Channel, - requesting_uid: tuple[str, str] | None = None, + + requesting_uid: tuple[str, str]|None = None, + ipc_msg: dict|None|bool = False, ) -> bool: ''' @@ -1764,16 +1765,13 @@ class Actor: in the signature (for now). ''' - # this ctx based lookup ensures the requested task to - # be cancelled was indeed spawned by a request from - # this channel + + # this ctx based lookup ensures the requested task to be + # cancelled was indeed spawned by a request from its + # parent (or some grandparent's) channel ctx: Context func: Callable is_complete: trio.Event - - # NOTE: right now this is only implicitly called by - # streaming IPC but it should be called - # to cancel any remotely spawned task try: ( ctx, @@ -1801,20 +1799,23 @@ class Actor: log.cancel( 'Cancel request for RPC task\n\n' - f'<= ._cancel_task(): {requesting_uid}\n' - f' |_ @{ctx.dmaddr}\n\n' + f'<= Actor.cancel_task(): {requesting_uid}\n\n' + f'=> {ctx._task}\n' + f' |_ >> {ctx.repr_rpc}\n' + # f' >> Actor._cancel_task() => {ctx._task}\n' + # f' |_ {ctx._task}\n\n' # TODO: better ascii repr for "supervisor" like # a nursery or context scope? # f'=> {parent_chan}\n' - f'=> {ctx._task}\n' + # f' |_{ctx._task}\n' # TODO: simplified `Context.__repr__()` fields output # shows only application state-related stuff like, # - ._stream # - .closed # - .started_called # - .. etc. - f' >> {ctx.repr_rpc}\n' + # f' >> {ctx.repr_rpc}\n' # f' |_ctx: {cid}\n' # f' >> {ctx._nsf}()\n' ) @@ -1824,6 +1825,16 @@ class Actor: ): ctx._canceller: tuple = requesting_uid + # TODO: pack the RPC `{'cmd': }` msg into a ctxc and + # then raise and pack it here? + if ( + ipc_msg + and ctx._cancel_msg is None + ): + # assign RPC msg directly from the loop which usually + # the case with `ctx.cancel()` on the other side. + ctx._cancel_msg = ipc_msg + # don't allow cancelling this function mid-execution # (is this necessary?) if func is self._cancel_task: @@ -1904,10 +1915,15 @@ class Actor: else "IPC channel's " ) - + rent_chan_repr: str = ( + f'|_{parent_chan}' + if parent_chan + else '' + ) log.cancel( f'Cancelling {descr} {len(tasks)} rpc tasks\n\n' - f'<= .cancel_rpc_tasks(): {req_uid}\n' + f'<= `Actor.cancel_rpc_tasks()`: {req_uid}\n' + f' {rent_chan_repr}\n' # f'{self}\n' # f'{tasks_str}' ) @@ -1927,9 +1943,6 @@ class Actor: ): continue - # if func == self._cancel_task: - # continue - # TODO: this maybe block on the task cancellation # and so should really done in a nursery batch? await self._cancel_task( @@ -2339,6 +2352,8 @@ async def process_messages( await actor._cancel_task( cid, channel, + + ipc_msg=msg, ) break @@ -2449,6 +2464,7 @@ async def process_messages( # cancel it! 'parent_chan': chan, 'requesting_uid': chan.uid, + 'ipc_msg': msg, } # TODO: remove? already have emit in meth. 
# log.runtime( @@ -2737,7 +2753,7 @@ class Arbiter(Actor): sockaddr: tuple[str, int] for (aname, _), sockaddr in self._registry.items(): - log.info( + log.runtime( f'Actor mailbox info:\n' f'aname: {aname}\n' f'sockaddr: {sockaddr}\n' -- 2.34.1 From fa7e37d6edaec9afe3bd49cc2014479149a37ebb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 7 Mar 2024 20:35:43 -0500 Subject: [PATCH 149/378] (Event) more pedantic `.cancel_acked: bool` def Changes the condition logic to be more strict and moves it to a private `._is_self_cancelled() -> bool` predicate which can be used elsewhere (instead of having almost similar duplicate checks all over the place..) and allows taking in a specific `remote_error` just for verification purposes (like for tests). Main strictness distinctions are now: - obvi that `.cancel_called` is set (this filters any `Portal.cancel_actor()` or other out-of-band RPC), - the received `ContextCancelled` **must** have its `.canceller` set to this side's `Actor.uid` (indicating we are the requester). - `.src_actor_uid` **must** be the same as the `.chan.uid` (so the error must have originated from the opposite side's task. - `ContextCancelled.canceller` should be already set to the `.chan.uid` indicating we received the msg via the runtime calling `._deliver_msg()` -> `_maybe_cancel_and_set_remote_error()` which ensures the error is specifically destined for this ctx-task exactly the same as how `Actor._cancel_task()` sets it from an input `requesting_uid` arg. In support of the above adjust some impl deats: - add `Context._actor: Actor` which is set once in `mk_context()` to avoid issues (particularly in testing) where `current_actor()` raises after the root actor / runtime is already exited. Use `._actor.uid` in both `.cancel_acked` (obvi) and '_maybe_cancel_and_set_remote_error()` when deciding whether to call `._scope.cancel()`. - always cast `.canceller` to `tuple` if not null. - delegate `.cancel_acked` directly to new private predicate (obvi). - always set `._canceller` from any `RemoteActorError.src_actor_uid` or failing over to the `.chan.uid` when a non-remote error (tho that shouldn't ever happen right?). - more extensive doc-string for `.cancel()` detailing the new strictness rules about whether an eventual `.cancel_acked` might be set. Also tossed in even more logging format tweaks by adding a `type_only: bool` to `.repr_outcome()` as desired for simpler output in the `state: ` and `.repr_rpc()` sections of the `.__str__()`. --- tractor/_context.py | 285 ++++++++++++++++++++++++++++---------------- 1 file changed, 184 insertions(+), 101 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index f8aaf1c9..9179456b 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -364,6 +364,9 @@ class Context: ''' chan: Channel cid: str # "context id", more or less a unique linked-task-pair id + + _actor: Actor + # the "feeder" channels for delivering message values to the # local task from the runtime's msg processing loop. _recv_chan: trio.MemoryReceiveChannel @@ -429,7 +432,7 @@ class Context: # there's always going to be an "underlying reason" that any # context was closed due to either a remote side error or # a call to `.cancel()` which triggers `ContextCancelled`. 
- _cancel_msg: str | dict | None = None + _cancel_msg: str|dict|None = None # NOTE: this state var used by the runtime to determine if the # `pdbp` REPL is allowed to engage on contexts terminated via @@ -486,6 +489,13 @@ class Context: f' {stream}\n' ) + outcome_str: str = self.repr_outcome( + show_error_fields=True + ) + outcome_typ_str: str = self.repr_outcome( + type_only=True + ) + return ( f' tuple[str, str] | None: + def canceller(self) -> tuple[str, str]|None: ''' ``Actor.uid: tuple[str, str]`` of the (remote) actor-process who's task was cancelled thus causing this (side of the) context to also be cancelled. ''' - return self._canceller + if canc := self._canceller: + return tuple(canc) + + return None + + def _is_self_cancelled( + self, + remote_error: Exception|None = None, + + ) -> bool: + + if not self._cancel_called: + return False + + re: BaseException|None = ( + remote_error + or self._remote_error + ) + if not re: + return False + + if from_uid := re.src_actor_uid: + from_uid: tuple = tuple(from_uid) + + our_uid: tuple = self._actor.uid + our_canceller = self.canceller + + return bool( + isinstance(re, ContextCancelled) + and from_uid == self.chan.uid + and re.canceller == our_uid + and our_canceller == from_uid + ) @property def cancel_acked(self) -> bool: @@ -568,22 +618,7 @@ class Context: equal to the uid of the calling task's actor. ''' - portal: Portal|None = self._portal - if portal: - our_uid: tuple = portal.actor.uid - - return bool( - self._cancel_called - and (re := self._remote_error) - and isinstance(re, ContextCancelled) - and ( - re.canceller - == - self.canceller - == - our_uid - ) - ) + return self._is_self_cancelled() @property def cancelled_caught(self) -> bool: @@ -762,30 +797,15 @@ class Context: # self-cancel (ack) or, # peer propagated remote cancellation. if isinstance(error, ContextCancelled): - ctxc_src: tuple = error.canceller whom: str = ( - 'us' if ctxc_src == current_actor().uid + 'us' if error.canceller == self._actor.uid else 'peer' ) log.cancel( f'IPC context cancelled by {whom}!\n\n' f'{error}' ) - # always record the cancelling actor's uid since its - # cancellation state is linked and we want to know - # which process was the cause / requester of the - # cancellation. - self._canceller = ctxc_src - - - if self._cancel_called: - # this is an expected cancel request response - # message and we **don't need to raise it** in the - # local cancel `._scope` since it will potentially - # override a real error. After this returns - # `.cancel_acked == True`. - return else: log.error( @@ -794,7 +814,23 @@ class Context: f'{error}\n' f'{pformat(self)}\n' ) - self._canceller = self.chan.uid + + # always record the cancelling actor's uid since its + # cancellation state is linked and we want to know + # which process was the cause / requester of the + # cancellation. + maybe_error_src: tuple = getattr( + error, + 'src_actor_uid', + None, + ) + self._canceller = ( + maybe_error_src + or + # XXX: in the case we get a non-boxed error? + # -> wait but this should never happen right? + self.chan.uid + ) # Cancel the local `._scope`, catch that # `._scope.cancelled_caught` and re-raise any remote error @@ -803,6 +839,15 @@ class Context: cs: trio.CancelScope = self._scope if ( cs + + # XXX this is an expected cancel request response + # message and we **don't need to raise it** in the + # local cancel `._scope` since it will potentially + # override a real error. 
After this method returns + # if `._cancel_called` then `.cancel_acked and .cancel_called` + # always should be set. + and not self._is_self_cancelled() + and not cs.cancel_called and not cs.cancelled_caught ): @@ -840,9 +885,13 @@ class Context: ) -> str: # TODO: how to show the transport interchange fmt? # codec: str = self.chan.transport.codec_key + outcome_str: str = self.repr_outcome( + show_error_fields=True, + type_only=True, + ) return ( # f'{self._nsf}() -{{{codec}}}-> {repr(self.outcome)}:' - f'{self._nsf}() -> {self.repr_outcome()}:' + f'{self._nsf}() -> {outcome_str}:' ) async def cancel( @@ -851,10 +900,32 @@ class Context: ) -> None: ''' - Cancel this inter-actor-task context. + Cancel this inter-actor IPC context by requestng the + remote side's cancel-scope-linked `trio.Task` by calling + `._scope.cancel()` and delivering an `ContextCancelled` + ack msg in reponse. - Request that the far side cancel it's current linked context, - Timeout quickly in an attempt to sidestep 2-generals... + Behaviour: + --------- + - after the far end cancels, the `.cancel()` calling side + should receive a `ContextCancelled` with the + `.canceller: tuple` uid set to the current `Actor.uid`. + + - timeout (quickly) on failure to rx this ACK error-msg in + an attempt to sidestep 2-generals when the transport + layer fails. + + Note, that calling this method DOES NOT also necessarily + result in `Context._scope.cancel()` being called + **locally**! + + => That is, an IPC `Context` (this) **does not** + have the same semantics as a `trio.CancelScope`. + + If the caller (who entered the `Portal.open_context()`) + desires that the internal block's cancel-scope be + cancelled it should open its own `trio.CancelScope` and + manage it as needed. ''' side: str = self.side @@ -976,7 +1047,7 @@ class Context: ``trio``'s cancellation system. ''' - actor: Actor = current_actor() + actor: Actor = self._actor # If the surrounding context has been cancelled by some # task with a handle to THIS, we error here immediately @@ -1149,62 +1220,58 @@ class Context: a cancellation (if any). ''' - if (( - # NOTE: whenever the context's "opener" side (task) **is** - # the side which requested the cancellation (likekly via - # ``Context.cancel()``), we don't want to re-raise that - # cancellation signal locally (would be akin to - # a ``trio.Nursery`` nursery raising ``trio.Cancelled`` - # whenever ``CancelScope.cancel()`` was called) and - # instead silently reap the expected cancellation - # "error"-msg-as-ack. In this case the `err: - # ContextCancelled` must have a `.canceller` set to the - # uid of the requesting task's actor and we only do NOT - # raise that error locally if WE ARE THAT ACTOR which - # requested the cancellation. - not raise_ctxc_from_self_call - and isinstance(remote_error, ContextCancelled) - and ( - self._cancel_called + our_uid: tuple = self.chan.uid - # or self.chan._cancel_called - # TODO: ^ should we have a special separate case - # for this ^ ? - ) - and ( # one of, + # XXX NOTE XXX: `ContextCancelled`/`StreamOverrun` absorption + # for "graceful cancellation" case: + # + # Whenever a "side" of a context (a `trio.Task` running in + # an actor) **is** the side which requested ctx + # cancellation (likekly via ``Context.cancel()``), we + # **don't** want to re-raise any eventually received + # `ContextCancelled` response locally (would be akin to + # a `trio.Nursery` nursery raising `trio.Cancelled` + # whenever `CancelScope.cancel()` was called). 
+ # + # Instead, silently reap the remote delivered ctxc + # (`ContextCancelled`) as an expected + # error-msg-is-cancellation-ack IFF said + # `remote_error: ContextCancelled` has `.canceller` + # set to the `Actor.uid` of THIS task (i.e. the + # cancellation requesting task's actor is the actor + # checking whether it should absorb the ctxc). + if ( + not raise_ctxc_from_self_call + and self._is_self_cancelled(remote_error) - (portal := self._portal) - and (our_uid := portal.actor.uid) - # TODO: ?potentially it is useful to emit certain - # warning/cancel logs for the cases where the - # cancellation is due to a lower level cancel - # request, such as `Portal.cancel_actor()`, since in - # that case it's not actually this specific ctx that - # made a `.cancel()` call, but it is the same - # actor-process? - and tuple(remote_error.canceller) == our_uid - or self.chan._cancel_called - or self.canceller == our_uid - ) - ) or ( + # TODO: ?potentially it is useful to emit certain + # warning/cancel logs for the cases where the + # cancellation is due to a lower level cancel + # request, such as `Portal.cancel_actor()`, since in + # that case it's not actually this specific ctx that + # made a `.cancel()` call, but it is the same + # actor-process? + # or self.chan._cancel_called + # XXX: ^ should we have a special separate case + # for this ^, NO right? - # NOTE: whenever this context is the cause of an - # overrun on the remote side (aka we sent msgs too - # fast that the remote task was overrun according - # to `MsgStream` buffer settings) AND the caller - # has requested to not raise overruns this side - # caused, we also silently absorb any remotely - # boxed `StreamOverrun`. This is mostly useful for - # supressing such faults during - # cancellation/error/final-result handling inside - # `_drain_to_final_msg()` such that we do not - # raise such errors particularly in the case where - # `._cancel_called == True`. - not raise_overrun_from_self - and isinstance(remote_error, RemoteActorError) - and remote_error.msgdata['type_str'] == 'StreamOverrun' - and tuple(remote_error.msgdata['sender']) == our_uid - ) + ) or ( + # NOTE: whenever this context is the cause of an + # overrun on the remote side (aka we sent msgs too + # fast that the remote task was overrun according + # to `MsgStream` buffer settings) AND the caller + # has requested to not raise overruns this side + # caused, we also silently absorb any remotely + # boxed `StreamOverrun`. This is mostly useful for + # supressing such faults during + # cancellation/error/final-result handling inside + # `_drain_to_final_msg()` such that we do not + # raise such errors particularly in the case where + # `._cancel_called == True`. + not raise_overrun_from_self + and isinstance(remote_error, RemoteActorError) + and remote_error.msgdata['type_str'] == 'StreamOverrun' + and tuple(remote_error.msgdata['sender']) == our_uid ): # NOTE: we set the local scope error to any "self # cancellation" error-response thus "absorbing" @@ -1236,7 +1303,7 @@ class Context: # TODO: change to `.wait_for_result()`? async def result( self, - hide_tb: bool = True, + hide_tb: bool = False, ) -> Any|Exception: ''' @@ -1378,7 +1445,20 @@ class Context: if error: return error - assert not self._cancel_msg + if cancmsg := self._cancel_msg: + # NOTE: means we're prolly in the process of + # processing the cancellation caused by + # this msg (eg. 
logging from `Actor._cancel_task()` + # method after receiving a `Context.cancel()` RPC) + # though there shouldn't ever be a `._cancel_msg` + # without it eventually resulting in this property + # delivering a value! + log.debug( + '`Context._cancel_msg` is set but has not yet resolved to `.maybe_error`?\n\n' + f'{cancmsg}\n' + ) + + # assert not self._cancel_msg return None def _final_result_is_set(self) -> bool: @@ -1411,6 +1491,7 @@ class Context: def repr_outcome( self, show_error_fields: bool = False, + type_only: bool = False, ) -> str: ''' @@ -1420,6 +1501,9 @@ class Context: ''' merr: Exception|None = self.maybe_error if merr: + if type_only: + return type(merr).__name__ + # if the error-type is one of ours and has the custom # defined "repr-(in)-one-line" method call it, ow # just deliver the type name. @@ -1616,8 +1700,6 @@ class Context: f'{pformat(msg)}\n' ) - # from .devx._debug import pause - # await pause() # NOTE: if an error is deteced we should always still # send it through the feeder-mem-chan and expect @@ -1666,7 +1748,7 @@ class Context: # overrun state and that msg isn't stuck in an # overflow queue what happens?!? - local_uid = current_actor().uid + local_uid = self._actor.uid txt: str = ( 'on IPC context:\n' @@ -1765,6 +1847,7 @@ def mk_context( ctx = Context( chan=chan, cid=cid, + _actor=current_actor(), _send_chan=send_chan, _recv_chan=recv_chan, _nsf=nsf, -- 2.34.1 From c36deb1f4d51c34503e0ac9dffe5ba2bb2047267 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 7 Mar 2024 21:14:40 -0500 Subject: [PATCH 150/378] Woops, fix `_post_mortem()` type sig.. We're passing a `extra_frames_up_when_async=2` now (from prior attempt to hide `CancelScope.__exit__()` when `shield=True`) and thus both `debug_func`s must accept it :facepalm: On the brighter side found out that the `TypeError` from the call-sig mismatch was actually being swallowed entirely so add some `.exception()` msgs for such cases to at least alert the dev they broke stuff XD --- tractor/devx/_debug.py | 43 +++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 2839e597..3203af1b 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -95,12 +95,12 @@ class Lock: # and must be cancelled if this actor is cancelled via IPC # request-message otherwise deadlocks with the parent actor may # ensure - _debugger_request_cs: trio.CancelScope | None = None + _debugger_request_cs: trio.CancelScope|None = None # NOTE: set only in the root actor for the **local** root spawned task # which has acquired the lock (i.e. this is on the callee side of # the `lock_tty_for_child()` context entry). - _root_local_task_cs_in_debug: trio.CancelScope | None = None + _root_local_task_cs_in_debug: trio.CancelScope|None = None # actor tree-wide actor uid that supposedly has the tty lock global_actor_in_debug: tuple[str, str] = None @@ -808,33 +808,46 @@ async def _pause( Lock.repl = pdb try: - if debug_func is None: + # TODO: do we want to support using this **just** for the + # locking / common code (prolly to help address #320)? + # + # if debug_func is None: # assert release_lock_signal, ( # 'Must pass `release_lock_signal: trio.Event` if no ' # 'trace func provided!' 
# ) - print(f"{actor.uid} ENTERING WAIT") - task_status.started(cs) - + # print(f"{actor.uid} ENTERING WAIT") # with trio.CancelScope(shield=True): # await release_lock_signal.wait() - else: + # else: # block here one (at the appropriate frame *up*) where # ``breakpoint()`` was awaited and begin handling stdio. - log.debug("Entering the synchronous world of pdb") + log.debug('Entering sync world of the `pdb` REPL..') + try: debug_func( actor, pdb, extra_frames_up_when_async=2, shield=shield, ) - assert cs + except BaseException: + log.exception( + 'Failed to invoke internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + raise except bdb.BdbQuit: Lock.release() raise + except BaseException: + log.exception( + 'Failed to engage debugger via `_pause()` ??\n' + ) + raise + # XXX: apparently we can't do this without showing this frame # in the backtrace on first entry to the REPL? Seems like an odd # behaviour that should have been fixed by now. This is also why @@ -905,14 +918,14 @@ async def pause( # NOTE: so the caller can always cancel even if shielded task_status.started(cs) - await _pause( + return await _pause( debug_func=debug_func, release_lock_signal=release_lock_signal, shield=True, task_status=task_status, ) else: - await _pause( + return await _pause( debug_func=debug_func, release_lock_signal=release_lock_signal, shield=False, @@ -1006,6 +1019,10 @@ _crash_msg: str = ( def _post_mortem( actor: tractor.Actor, pdb: MultiActorPdb, + shield: bool = False, + + # only for compat with `._set_trace()`.. + extra_frames_up_when_async=0, ) -> None: ''' @@ -1034,7 +1051,7 @@ def _post_mortem( post_mortem = partial( pause, - _post_mortem, + debug_func=_post_mortem, ) @@ -1163,7 +1180,7 @@ async def maybe_wait_for_debugger( with trio.CancelScope(shield=True): await debug_complete.wait() log.pdb( - f'Child subactor released debug lock:' + f'Child subactor released debug lock\n' f'|_{in_debug}\n' ) -- 2.34.1 From 2e797ef7ee49e72f52ce0417c43cf75ae5731658 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 7 Mar 2024 21:26:57 -0500 Subject: [PATCH 151/378] Update ctx test suites to stricter semantics Including mostly tweaking asserts on relayed `ContextCancelled`s and the new pub ctx properties: `.outcome`, `.maybe_error`, etc. as it pertains to graceful (absorbed) remote cancellation vs. loud ctxc cases expected to be raised by any `Portal.cancel_actor()` style teardown. Start checking a variety internals like `._remote/local_error`, `._is_self_cancelled()`, `._is_final_result_set()`, `._cancel_msg` where applicable. Also factor out the new `expect_ctxc()` checker to our `conftest.py` for use in other suites. --- tests/conftest.py | 24 ++++++ tests/test_context_stream_semantics.py | 107 +++++++++++++++++-------- tests/test_inter_peer_cancellation.py | 51 ++++++------ 3 files changed, 126 insertions(+), 56 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index c9159f0d..fb82a554 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ """ ``tractor`` testing!! """ +from contextlib import asynccontextmanager as acm import sys import subprocess import os @@ -292,3 +293,26 @@ def daemon( time.sleep(_PROC_SPAWN_WAIT) yield proc sig_prog(proc, _INT_SIGNAL) + + +@acm +async def expect_ctxc( + yay: bool, + reraise: bool = False, +) -> None: + ''' + Small acm to catch `ContextCancelled` errors when expected + below it in a `async with ()` block. 
+ + ''' + if yay: + try: + yield + raise RuntimeError('Never raised ctxc?') + except tractor.ContextCancelled: + if reraise: + raise + else: + return + else: + yield diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 19a87453..d8e946bf 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -5,7 +5,6 @@ Verify the we raise errors when streams are opened prior to sync-opening a ``tractor.Context`` beforehand. ''' -from contextlib import asynccontextmanager as acm from itertools import count import platform from pprint import pformat @@ -26,7 +25,10 @@ from tractor._exceptions import ( ContextCancelled, ) -from conftest import tractor_test +from conftest import ( + tractor_test, + expect_ctxc, +) # ``Context`` semantics are as follows, # ------------------------------------ @@ -194,12 +196,13 @@ def test_simple_context( ) try: - async with portal.open_context( - simple_setup_teardown, - data=10, - block_forever=callee_blocks_forever, - ) as (ctx, sent): - + async with ( + portal.open_context( + simple_setup_teardown, + data=10, + block_forever=callee_blocks_forever, + ) as (ctx, sent), + ): assert sent == 11 if callee_blocks_forever: @@ -250,17 +253,6 @@ def test_simple_context( trio.run(main) -@acm -async def expect_ctxc(yay: bool) -> None: - if yay: - try: - yield - except ContextCancelled: - return - else: - yield - - @pytest.mark.parametrize( 'callee_returns_early', [True, False], @@ -293,6 +285,7 @@ def test_caller_cancels( ) -> None: actor: Actor = current_actor() uid: tuple = actor.uid + _ctxc: ContextCancelled|None = None if ( cancel_method == 'portal' @@ -303,6 +296,9 @@ def test_caller_cancels( assert 0, 'Portal cancel should raise!' except ContextCancelled as ctxc: + # with trio.CancelScope(shield=True): + # await tractor.pause() + _ctxc = ctxc assert ctx.chan._cancel_called assert ctxc.canceller == uid assert ctxc is ctx.maybe_error @@ -311,7 +307,10 @@ def test_caller_cancels( # case since self-cancellation should swallow the ctxc # silently! else: - res = await ctx.result() + try: + res = await ctx.result() + except ContextCancelled as ctxc: + pytest.fail(f'should not have raised ctxc\n{ctxc}') # we actually get a result if callee_returns_early: @@ -342,6 +341,10 @@ def test_caller_cancels( # await tractor.pause() # assert ctx._local_error is None + # TODO: don't need this right? + # if _ctxc: + # raise _ctxc + async def main(): @@ -352,11 +355,19 @@ def test_caller_cancels( 'simple_context', enable_modules=[__name__], ) - timeout = 0.5 if not callee_returns_early else 2 + timeout: float = ( + 0.5 + if not callee_returns_early + else 2 + ) with trio.fail_after(timeout): async with ( - - expect_ctxc(yay=cancel_method == 'portal'), + expect_ctxc( + yay=( + not callee_returns_early + and cancel_method == 'portal' + ) + ), portal.open_context( simple_setup_teardown, @@ -372,10 +383,18 @@ def test_caller_cancels( await trio.sleep(0.5) if cancel_method == 'ctx': + print('cancelling with `Context.cancel()`') await ctx.cancel() - else: + + elif cancel_method == 'portal': + print('cancelling with `Portal.cancel_actor()`') await portal.cancel_actor() + else: + pytest.fail( + f'Unknown `cancel_method={cancel_method} ?' 
+ ) + if chk_ctx_result_before_exit: await check_canceller(ctx) @@ -385,15 +404,22 @@ def test_caller_cancels( if cancel_method != 'portal': await portal.cancel_actor() - # since the `.cancel_actor()` call just above - # will cause the `.open_context().__aexit__()` raise - # a ctxc which should in turn cause `ctx._scope` to + # XXX NOTE XXX: non-normal yet purposeful + # test-specific ctxc suppression is implemented! + # + # WHY: the `.cancel_actor()` case (cancel_method='portal') + # will cause both: + # * the `ctx.result()` inside `.open_context().__aexit__()` + # * AND the `ctx.result()` inside `check_canceller()` + # to raise ctxc. + # + # which should in turn cause `ctx._scope` to # catch any cancellation? if ( not callee_returns_early - and cancel_method == 'portal' + and cancel_method != 'portal' ): - assert ctx._scope.cancelled_caught + assert not ctx._scope.cancelled_caught trio.run(main) @@ -511,6 +537,23 @@ async def expect_cancelled( await stream.send(msg) # echo server except trio.Cancelled: + + # on ctx.cancel() the internal RPC scope is cancelled but + # never caught until the func exits. + assert ctx._scope.cancel_called + assert not ctx._scope.cancelled_caught + + # should be the RPC cmd request for `._cancel_task()` + assert ctx._cancel_msg + # which, has not yet resolved to an error outcome + # since this rpc func has not yet exited. + assert not ctx.maybe_error + assert not ctx._final_result_is_set() + + # debug REPL if needed + # with trio.CancelScope(shield=True): + # await tractor.pause() + # expected case _state = False raise @@ -594,16 +637,16 @@ async def test_caller_closes_ctx_after_callee_opens_stream( with trio.fail_after(0.2): await ctx.result() assert 0, "Callee should have blocked!?" - except trio.TooSlowError: # NO-OP -> since already called above await ctx.cancel() # NOTE: local scope should have absorbed the cancellation since # in this case we call `ctx.cancel()` and the local - # `._scope` gets `.cancel_called` on the ctxc ack. + # `._scope` does not get `.cancel_called` and thus + # `.cancelled_caught` neither will ever bet set. if use_ctx_cancel_method: - assert ctx._scope.cancelled_caught + assert not ctx._scope.cancelled_caught # rxed ctxc response from far end assert ctx.cancel_acked diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index d878b06d..c3d9e4fd 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -238,7 +238,12 @@ async def stream_from_peer( assert peer_ctx._remote_error is ctxerr assert peer_ctx._remote_error.msgdata == ctxerr.msgdata - assert peer_ctx.canceller == ctxerr.canceller + + # the peer ctx is the canceller even though it's canceller + # is the "canceller" XD + assert peer_name in peer_ctx.canceller + + assert "canceller" in ctxerr.canceller # caller peer should not be the cancel requester assert not ctx.cancel_called @@ -272,7 +277,6 @@ async def stream_from_peer( # root/parent actor task should NEVER HAVE cancelled us! 
assert not ctx.canceller - assert 'canceller' in peer_ctx.canceller raise # TODO: IN THEORY we could have other cases depending on @@ -527,27 +531,24 @@ def test_peer_canceller( assert ctx.cancel_called - if ( - ctx is sleeper_ctx - or ctx is caller_ctx - ): - assert ( - re.canceller - == - ctx.canceller - == - canceller.channel.uid - ) + if ctx is sleeper_ctx: + assert 'canceller' in re.canceller + assert 'sleeper' in ctx.canceller - else: + if ctx is canceller_ctx: assert ( re.canceller == - ctx.canceller - == root.uid ) + else: # the other 2 ctxs + assert ( + re.canceller + == + canceller.channel.uid + ) + # since the sleeper errors while handling a # peer-cancelled (by ctxc) scenario, we expect # that the `.open_context()` block DOES call @@ -576,14 +577,16 @@ def test_peer_canceller( assert not sleeper_ctx._scope.cancelled_caught assert isinstance(loc_err, ContextCancelled) - assert loc_err.canceller == sleeper_ctx.canceller - assert ( - loc_err.canceller[0] - == - sleeper_ctx.canceller[0] - == - 'canceller' - ) + + # the received remote error's `.canceller` + # will of course be the "canceller" actor BUT + # the canceller set on the local handle to + # `sleeper_ctx` will be the "sleeper" uid + # since it's the actor that relayed us the + # error which was **caused** by the + # "canceller". + assert 'sleeper' in sleeper_ctx.canceller + assert 'canceller' == loc_err.canceller[0] # the sleeper's remote error is the error bubbled # out of the context-stack above! -- 2.34.1 From c025761f15bb09c308ffc44f4776e71a4d2a60f4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 7 Mar 2024 21:33:07 -0500 Subject: [PATCH 152/378] Adjust `asyncio` test for stricter ctx-self-cancels Use `expect_ctx()` around the portal cancellation case, toss in a `'context'` parametrization and return just the `Context.outcome` from `main()` B) --- tests/test_infected_asyncio.py | 35 +++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 56b5fde5..24bd371f 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -19,6 +19,8 @@ from tractor import ( ) from tractor.trionics import BroadcastReceiver +from conftest import expect_ctxc + async def sleep_and_err( sleep_for: float = 0.1, @@ -190,7 +192,8 @@ async def trio_ctx( @pytest.mark.parametrize( - 'parent_cancels', [False, True], + 'parent_cancels', + ['context', 'actor', False], ids='parent_actor_cancels_child={}'.format ) def test_context_spawns_aio_task_that_errors( @@ -214,18 +217,36 @@ def test_context_spawns_aio_task_that_errors( # debug_mode=True, loglevel='cancel', ) - async with p.open_context( - trio_ctx, - ) as (ctx, first): + async with ( + expect_ctxc( + yay=parent_cancels == 'actor', + ), + p.open_context( + trio_ctx, + ) as (ctx, first), + ): assert first == 'start' - if parent_cancels: + if parent_cancels == 'actor': await p.cancel_actor() - await trio.sleep_forever() + elif parent_cancels == 'context': + await ctx.cancel() - return await ctx.result() + else: + await trio.sleep_forever() + + async with expect_ctxc( + yay=parent_cancels == 'actor', + ): + await ctx.result() + + if parent_cancels == 'context': + # to tear down sub-acor + await p.cancel_actor() + + return ctx.outcome if parent_cancels: # bc the parent made the cancel request, -- 2.34.1 From 1617e0ff2c52371b24ae64c38bd9074c98fdcea3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 13:48:35 -0500 Subject: [PATCH 153/378] Woops, fix 
one last `ctx._cancelled_caught` in drain loop --- tractor/_context.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 9179456b..ee058196 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -196,8 +196,8 @@ async def _drain_to_final_msg( (ctx._stream.closed and (reason := 'stream was already closed') ) - or (ctx._cancelled_caught - and (reason := 'ctx caught a cancel') + or (ctx.cancel_acked + and (reason := 'ctx cancelled other side') ) or (ctx._cancel_called and (reason := 'ctx called `.cancel()`') -- 2.34.1 From b29d33d6031f69ba0b75373862434ad4cd00e326 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 14:03:18 -0500 Subject: [PATCH 154/378] Make `Actor._cancel_task(requesting_uid: tuple)` required arg --- tractor/_runtime.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 64549bac..09778c76 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -1752,8 +1752,8 @@ class Actor: self, cid: str, parent_chan: Channel, + requesting_uid: tuple[str, str]|None, - requesting_uid: tuple[str, str]|None = None, ipc_msg: dict|None|bool = False, ) -> bool: @@ -2352,6 +2352,7 @@ async def process_messages( await actor._cancel_task( cid, channel, + requesting_uid=channel.uid, ipc_msg=msg, ) -- 2.34.1 From 4c3c3e4b565637cd4ee9213501c0bf6a55a3b2dd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 14:11:17 -0500 Subject: [PATCH 155/378] Support a `._state.last_actor()` getter Not sure if it's really that useful other then for reporting errors from `current_actor()` but at least it alerts `tractor` devs and/or users when the runtime has already terminated vs. hasn't been started yet/correctly. Set the `._last_actor_terminated: tuple` in the root's final block which allows testing for an already terminated tree which is the case where `._state._current_actor == None` and the last is set. --- tractor/_root.py | 1 + tractor/_state.py | 53 ++++++++++++++++++++++++++++++++++++++----- tractor/_supervise.py | 11 +++++---- 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 1d3d4f17..f948913d 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -348,6 +348,7 @@ async def open_root_actor( await actor.cancel(None) # self cancel finally: _state._current_actor = None + _state._last_actor_terminated = actor # restore built-in `breakpoint()` hook state sys.breakpointhook = builtin_bp_handler diff --git a/tractor/_state.py b/tractor/_state.py index f94c3ebb..f3917436 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -18,12 +18,18 @@ Per process state """ +from __future__ import annotations from typing import ( - Optional, Any, + TYPE_CHECKING, ) -_current_actor: Optional['Actor'] = None # type: ignore # noqa +if TYPE_CHECKING: + from ._runtime import Actor + + +_current_actor: Actor|None = None # type: ignore # noqa +_last_actor_terminated: Actor|None = None _runtime_vars: dict[str, Any] = { '_debug_mode': False, '_is_root': False, @@ -31,14 +37,49 @@ _runtime_vars: dict[str, Any] = { } -def current_actor(err_on_no_runtime: bool = True) -> 'Actor': # type: ignore # noqa +def last_actor() -> Actor|None: + ''' + Try to return last active `Actor` singleton + for this process. + + For case where runtime already exited but someone is asking + about the "last" actor probably to get its `.uid: tuple`. 
+ + ''' + return _last_actor_terminated + + +def current_actor( + err_on_no_runtime: bool = True, +) -> Actor: ''' Get the process-local actor instance. ''' - from ._exceptions import NoRuntime - if _current_actor is None and err_on_no_runtime: - raise NoRuntime("No local actor has been initialized yet") + if ( + err_on_no_runtime + and _current_actor is None + ): + msg: str = 'No local actor has been initialized yet' + from ._exceptions import NoRuntime + + if last := last_actor(): + msg += ( + f'Apparently the lact active actor was\n' + f'|_{last}\n' + f'|_{last.uid}\n' + ) + # no actor runtime has (as of yet) ever been started for + # this process. + else: + msg += ( + 'No last actor found?\n' + 'Did you forget to open one of:\n\n' + '- `tractor.open_root_actor()`\n' + '- `tractor.open_nursery()`\n' + ) + + raise NoRuntime(msg) return _current_actor diff --git a/tractor/_supervise.py b/tractor/_supervise.py index c27e0e43..ad007ae6 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -533,12 +533,15 @@ async def open_nursery( ''' implicit_runtime: bool = False - - actor = current_actor(err_on_no_runtime=False) + actor: Actor = current_actor( + err_on_no_runtime=False + ) try: - if actor is None and is_main_process(): - + if ( + actor is None + and is_main_process() + ): # if we are the parent process start the # actor runtime implicitly log.info("Starting actor runtime!") -- 2.34.1 From 7458f99733c844309f40baa5d2e437de7c33f7ce Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 15:34:20 -0500 Subject: [PATCH 156/378] Add a `._state._runtime_vars['_registry_addrs']` Such that it's set to whatever `Actor.reg_addrs: list[tuple]` is during the actor's init-after-spawn guaranteeing each actor has at least the registry infos from its parent. Ensure we read this if defined over `_root._default_lo_addrs` in `._discovery` routines, namely `.find_actor()` since it's the one API normally used without expecting the runtime's `current_actor()` to be up. Update the latest inter-peer cancellation test to use the `reg_addr` fixture (and thus test this new runtime-vars value via `find_actor()` usage) since it was failing if run *after* the infected `asyncio` suite due to registry contact failure. --- tests/test_inter_peer_cancellation.py | 2 ++ tractor/_discovery.py | 11 +++++++++-- tractor/_runtime.py | 9 +++++---- tractor/_state.py | 3 ++- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index c3d9e4fd..e3c8a7dd 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -939,6 +939,7 @@ async def tell_little_bro( def test_peer_spawns_and_cancels_service_subactor( debug_mode: bool, raise_client_error: str, + reg_addr: tuple[str, int], ): # NOTE: this tests for the modden `mod wks open piker` bug # discovered as part of implementing workspace ctx @@ -956,6 +957,7 @@ def test_peer_spawns_and_cancels_service_subactor( async with tractor.open_nursery( # NOTE: to halt the peer tasks on ctxc, uncomment this. 
debug_mode=debug_mode, + registry_addrs=[reg_addr], ) as an: server: Portal = await an.start_actor( (server_name := 'spawn_server'), diff --git a/tractor/_discovery.py b/tractor/_discovery.py index 8cccc505..de79edc0 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -35,7 +35,10 @@ from ._portal import ( open_portal, LocalPortal, ) -from ._state import current_actor, _runtime_vars +from ._state import ( + current_actor, + _runtime_vars, +) if TYPE_CHECKING: @@ -205,7 +208,11 @@ async def find_actor( # every call since something may change it globally (eg. # like in our discovery test suite)! from . import _root - registry_addrs = _root._default_lo_addrs + registry_addrs = ( + _runtime_vars['_registry_addrs'] + or + _root._default_lo_addrs + ) maybe_portals: list[ AsyncContextManager[tuple[str, int]] diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 09778c76..607f98ce 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -811,10 +811,10 @@ class Actor: name: str, *, enable_modules: list[str] = [], - uid: str | None = None, - loglevel: str | None = None, - registry_addrs: list[tuple[str, int]] | None = None, - spawn_method: str | None = None, + uid: str|None = None, + loglevel: str|None = None, + registry_addrs: list[tuple[str, int]]|None = None, + spawn_method: str|None = None, # TODO: remove! arbiter_addr: tuple[str, int] | None = None, @@ -896,6 +896,7 @@ class Actor: self._reg_addrs: list[tuple[str, int]] = [] if registry_addrs: self.reg_addrs: list[tuple[str, int]] = registry_addrs + _state._runtime_vars['_registry_addrs'] = registry_addrs @property def reg_addrs(self) -> list[tuple[str, int]]: diff --git a/tractor/_state.py b/tractor/_state.py index f3917436..9e4e9473 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -33,7 +33,8 @@ _last_actor_terminated: Actor|None = None _runtime_vars: dict[str, Any] = { '_debug_mode': False, '_is_root': False, - '_root_mailbox': (None, None) + '_root_mailbox': (None, None), + '_registry_addrs': [], } -- 2.34.1 From 7cafb59ab7f35b4d5fbdfa43ac65849a2c96ffdd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 15:46:42 -0500 Subject: [PATCH 157/378] Tweak `Context.repr_outcome()` for KBIs Since apparently `str(KeyboardInterrupt()) == ''`? So instead add little ` or repr(merr)` expressions throughout to avoid blank strings rendering if various `repr()`/`.__str__()` outputs.. --- tractor/_context.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index ee058196..a7ce5832 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1485,7 +1485,11 @@ class Context: TODO: implement this using `outcome.Outcome` types? ''' - return self.maybe_error or self._result + return ( + self.maybe_error + or + self._result + ) # @property def repr_outcome( @@ -1520,16 +1524,28 @@ class Context: # # just the type name for now to avoid long lines # when tons of cancels.. - return type(merr).__name__ + return ( + str(type(merr).__name__) + or + repr(merr) + ) # just the type name # else: # but wen? 
# return type(merr).__name__ # for all other errors show their regular output - return str(merr) + return ( + str(merr) + or + repr(merr) + ) - return str(self._result) + return ( + str(self._result) + or + repr(self._result) + ) async def started( self, -- 2.34.1 From c56d4b0a794ce8273d4e9c7058a7073a480d7eb7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 15:48:13 -0500 Subject: [PATCH 158/378] Assign `ctx._local_error` ASAP from `.open_context()` Such that `.outcome` related fields render nicely asap for logging withing `Portal.open_context()` itself. --- tractor/_portal.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index 8148a5d9..5e5fd813 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -30,7 +30,7 @@ from typing import ( Any, Callable, AsyncGenerator, - Type, + # Type, ) from functools import partial from dataclasses import dataclass @@ -41,8 +41,7 @@ from async_generator import asynccontextmanager from .trionics import maybe_open_nursery from .devx import ( - # acquire_debug_lock, - # pause, + # _debug, maybe_wait_for_debugger, ) from ._state import ( @@ -673,6 +672,7 @@ class Portal: # `Nursery.cancel_scope.cancel()`) except ContextCancelled as ctxc: scope_err = ctxc + ctx._local_error: BaseException = scope_err ctxc_from_callee = ctxc # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!! @@ -684,7 +684,7 @@ class Portal: # debugging the tractor-runtime itself using it's # own `.devx.` tooling! # - # await pause() + # await _debug.pause() # CASE 2: context was cancelled by local task calling # `.cancel()`, we don't raise and the exit block should @@ -745,18 +745,20 @@ class Portal: ) as caller_err: scope_err = caller_err + ctx._local_error: BaseException = scope_err # XXX: ALWAYS request the context to CANCEL ON any ERROR. # NOTE: `Context.cancel()` is conversely NEVER CALLED in # the `ContextCancelled` "self cancellation absorbed" case # handled in the block above ^^^ !! + # await _debug.pause() log.cancel( 'Context terminated due to\n\n' - f'{caller_err}\n' + f'.outcome => {ctx.repr_outcome()}\n' ) if debug_mode(): - # async with acquire_debug_lock(self.actor.uid): + # async with _debug.acquire_debug_lock(self.actor.uid): # pass # TODO: factor ^ into below for non-root cases? was_acquired: bool = await maybe_wait_for_debugger( @@ -818,6 +820,7 @@ class Portal: # this task didn't know until final teardown # / value collection. scope_err = berr + ctx._local_error: BaseException = scope_err raise # yes! this worx Bp @@ -927,8 +930,10 @@ class Portal: # should be stored as the `Context._local_error` and # used in determining `Context.cancelled_caught: bool`. if scope_err is not None: - ctx._local_error: BaseException = scope_err - etype: Type[BaseException] = type(scope_err) + # sanity, tho can remove? + assert ctx._local_error is scope_err + # ctx._local_error: BaseException = scope_err + # etype: Type[BaseException] = type(scope_err) # CASE 2 if ( -- 2.34.1 From f067cf48a73bbf03072921664f6604017ad81e40 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 8 Mar 2024 16:07:17 -0500 Subject: [PATCH 159/378] Unify some log msgs in `.to_asyncio` Much like similar recent changes throughout the core, build out `msg: str` depending on error cases and emit with `.cancel()` level as appropes. Also mute (via level) some duplication in the cancel case inside `_run_asyncio_task()` for console noise reduction. 
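(Aside: a hedged, standalone sketch of the custom log levels leaned on here, assuming the public `tractor.log.get_logger()` accessor; nothing below is taken from the diff itself.)

    from tractor.log import get_logger

    log = get_logger(__name__)

    # tractor's logger layers custom levels on top of the stdlib ones;
    # `.cancel()` is for expected cancellation/teardown reports while
    # `.runtime()` is for low-level runtime chatter, so both can be
    # filtered by level without also muting real error logs.
    log.cancel('expected cancellation report')
    log.runtime('internal runtime detail')
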
--- tractor/to_asyncio.py | 49 ++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/tractor/to_asyncio.py b/tractor/to_asyncio.py index 788181e6..7c88edd2 100644 --- a/tractor/to_asyncio.py +++ b/tractor/to_asyncio.py @@ -216,7 +216,14 @@ def _run_asyncio_task( try: result = await coro except BaseException as aio_err: - log.exception('asyncio task errored') + if isinstance(aio_err, CancelledError): + log.runtime( + '`asyncio` task was cancelled..\n' + ) + else: + log.exception( + '`asyncio` task errored\n' + ) chan._aio_err = aio_err raise @@ -271,12 +278,22 @@ def _run_asyncio_task( except BaseException as terr: task_err = terr + msg: str = ( + 'Infected `asyncio` task {etype_str}\n' + f'|_{task}\n' + ) if isinstance(terr, CancelledError): - log.cancel(f'`asyncio` task cancelled: {task.get_name()}') + log.cancel( + msg.format(etype_str='cancelled') + ) else: - log.exception(f'`asyncio` task: {task.get_name()} errored') + log.exception( + msg.format(etype_str='cancelled') + ) - assert type(terr) is type(aio_err), 'Asyncio task error mismatch?' + assert type(terr) is type(aio_err), ( + '`asyncio` task error mismatch?!?' + ) if aio_err is not None: # XXX: uhh is this true? @@ -289,18 +306,22 @@ def _run_asyncio_task( # We might want to change this in the future though. from_aio.close() - if type(aio_err) is CancelledError: - log.cancel("infected task was cancelled") - - # TODO: show that the cancellation originated - # from the ``trio`` side? right? - # if cancel_scope.cancelled: - # raise aio_err from err - - elif task_err is None: + if task_err is None: assert aio_err aio_err.with_traceback(aio_err.__traceback__) - log.error('infected task errorred') + # log.error( + # 'infected task errorred' + # ) + + # TODO: show that the cancellation originated + # from the ``trio`` side? right? + # elif type(aio_err) is CancelledError: + # log.cancel( + # 'infected task was cancelled' + # ) + + # if cancel_scope.cancelled: + # raise aio_err from err # XXX: alway cancel the scope on error # in case the trio task is blocking -- 2.34.1 From 37ee477aeef83fa7ac5e9b1478a838603abd70ce Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 11 Mar 2024 10:20:55 -0400 Subject: [PATCH 160/378] Let `MsgStream.receive_nowait()` take in msg key list Call it `allow_msg_keys: list[str] = ['yield']` and set it to accept `['yield', 'return']` from the drain loop in `.aclose()`. Only pass the last key error to `_raise_from_no_key_in_msg()` in the fall-through case. 
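As a rough caller-side sketch of what the widened sig allows (a hypothetical `drain_ready_msgs()` helper, not part of the diff below):

    import trio
    from tractor import MsgStream

    def drain_ready_msgs(stream: MsgStream) -> list:
        # non-blocking drain of whatever payloads are already queued;
        # passing both keys means a final 'return' msg is unpacked and
        # collected just like any streamed 'yield' value.
        drained: list = []
        while True:
            try:
                drained.append(
                    stream.receive_nowait(
                        allow_msg_keys=['yield', 'return'],
                    )
                )
            except (trio.WouldBlock, trio.EndOfChannel):
                return drained
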
Somehow this seems to prevent all the intermittent test failures i was seeing in local runs including when running the entire suite all in sequence; i ain't complaining B) --- tractor/_streaming.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index b2cfe485..50a32ae9 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -90,19 +90,29 @@ class MsgStream(trio.abc.Channel): self._closed: bool|trio.ClosedResourceError = False # delegate directly to underlying mem channel - def receive_nowait(self): - msg = self._rx_chan.receive_nowait() - try: - return msg['yield'] - except KeyError as kerr: - _raise_from_no_key_in_msg( - ctx=self._ctx, - msg=msg, - src_err=kerr, - log=log, - expect_key='yield', - stream=self, - ) + def receive_nowait( + self, + allow_msg_keys: list[str] = ['yield'], + ): + msg: dict = self._rx_chan.receive_nowait() + for ( + i, + key, + ) in enumerate(allow_msg_keys): + try: + return msg[key] + except KeyError as kerr: + if i < (len(allow_msg_keys) - 1): + continue + + _raise_from_no_key_in_msg( + ctx=self._ctx, + msg=msg, + src_err=kerr, + log=log, + expect_key=key, + stream=self, + ) async def receive(self): ''' @@ -263,7 +273,9 @@ class MsgStream(trio.abc.Channel): drained: list[Exception|dict] = [] while not drained: try: - maybe_final_msg = self.receive_nowait() + maybe_final_msg = self.receive_nowait( + allow_msg_keys=['yield', 'return'], + ) if maybe_final_msg: log.debug( 'Drained un-processed stream msg:\n' -- 2.34.1 From dd168184c31e238d916d31fe9850f564c47446f4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 11 Mar 2024 10:24:44 -0400 Subject: [PATCH 161/378] Add a open-ctx-with-self test Found exactly why trying this won't work when playing around with opening workspaces in `modden` using a `Portal.open_context()` back to the 'bigd' root actor: the RPC machinery only registers one entry in `Actor._contexts` which will get overwritten by each task's side and then experience race-based IPC msging errors (eg. rxing `{'started': _}` on the callee side..). Instead make opening a ctx back to the self-actor a runtime error describing it as an invalid op. To match: - add a new test `test_ctx_with_self_actor()` to the context semantics suite. - tried out adding a new `side: str` to the `Actor.get_context()` (and callers) but ran into not being able to determine the value from in `._push_result()` where it's needed to figure out which side to push to.. So, just leaving the commented arg (passing) in the runtime core for now in case we can come back to trying to make it work, tho i'm thinking it's not the right hack anyway XD --- tests/test_context_stream_semantics.py | 51 ++++++++++++++++++++++++++ tractor/_context.py | 4 +- tractor/_portal.py | 10 +++++ tractor/_runtime.py | 50 +++++++++++++++++++++---- 4 files changed, 107 insertions(+), 8 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index d8e946bf..d5767eec 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -1024,6 +1024,8 @@ def test_maybe_allow_overruns_stream( cancel_ctx: bool, slow_side: str, allow_overruns_side: str, + + # conftest wide loglevel: str, debug_mode: bool, ): @@ -1147,3 +1149,52 @@ def test_maybe_allow_overruns_stream( # if this hits the logic blocks from above are not # exhaustive.. 
pytest.fail('PARAMETRIZED CASE GEN PROBLEM YO') + + +def test_ctx_with_self_actor( + loglevel: str, + debug_mode: bool, +): + ''' + NOTE: for now this is an INVALID OP! + + BUT, eventually presuming we add a "side" key to `Actor.get_context()`, + we might be able to get this working symmetrically, but should we?? + + Open a context back to the same actor and ensure all cancellation + and error semantics hold the same. + + ''' + async def main(): + async with tractor.open_nursery( + debug_mode=debug_mode, + enable_modules=[__name__], + ) as an: + assert an + async with ( + tractor.find_actor('root') as portal, + portal.open_context( + expect_cancelled, + # echo_back_sequence, + # seq=seq, + # wait_for_cancel=cancel_ctx, + # be_slow=(slow_side == 'child'), + # allow_overruns_side=allow_overruns_side, + + ) as (ctx, sent), + ctx.open_stream() as ipc, + ): + assert sent is None + + seq = list(range(10)) + for i in seq: + await ipc.send(i) + rx: int = await ipc.receive() + assert rx == i + + await ctx.cancel() + + with pytest.raises(RuntimeError) as excinfo: + trio.run(main) + + assert 'Invalid Operation' in repr(excinfo.value) diff --git a/tractor/_context.py b/tractor/_context.py index a7ce5832..a31c3b1b 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1101,6 +1101,8 @@ class Context: chan=self.chan, cid=self.cid, nsf=self._nsf, + # side=self.side, + msg_buffer_size=msg_buffer_size, allow_overruns=allow_overruns, ) @@ -1298,7 +1300,7 @@ class Context: # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement # https://stackoverflow.com/a/24752607 __tracebackhide__: bool = True - raise remote_error from None + raise remote_error # from None # TODO: change to `.wait_for_result()`? async def result( diff --git a/tractor/_portal.py b/tractor/_portal.py index 5e5fd813..7ac5711a 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -514,6 +514,16 @@ class Portal: # a new `_context.py` mod. nsf = NamespacePath.from_ref(func) + # XXX NOTE XXX: currenly we do NOT allow opening a contex + # with "self" since the local feeder mem-chan processing + # is not built for it. + if self.channel.uid == self.actor.uid: + raise RuntimeError( + '** !! Invalid Operation !! **\n' + 'Can not open an IPC ctx with the local actor!\n' + f'|_{self.actor}\n' + ) + ctx: Context = await self.actor.start_remote_task( self.channel, nsf=nsf, diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 607f98ce..307dacdf 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -434,6 +434,10 @@ async def _invoke( chan=chan, cid=cid, nsf=NamespacePath.from_ref(func), + + # TODO: if we wanted to get cray and support it? + # side='callee', + # We shouldn't ever need to pass this through right? # it's up to the soon-to-be called rpc task to # open the stream with this option. 
@@ -686,9 +690,11 @@ async def _invoke( # don't pop the local context until we know the # associated child isn't in debug any more await maybe_wait_for_debugger() - ctx: Context = actor._contexts.pop( - (chan.uid, cid) - ) + ctx: Context = actor._contexts.pop(( + chan.uid, + cid, + # ctx.side, + )) merr: Exception|None = ctx.maybe_error @@ -879,7 +885,11 @@ class Actor: # map {actor uids -> Context} self._contexts: dict[ - tuple[tuple[str, str], str], + tuple[ + tuple[str, str], # .uid + str, # .cid + str, # .side + ], Context ] = {} @@ -1363,7 +1373,13 @@ class Actor: uid: tuple[str, str] = chan.uid assert uid, f"`chan.uid` can't be {uid}" try: - ctx: Context = self._contexts[(uid, cid)] + ctx: Context = self._contexts[( + uid, + cid, + + # TODO: how to determine this tho? + # side, + )] except KeyError: log.warning( 'Ignoring invalid IPC ctx msg!\n\n' @@ -1382,6 +1398,16 @@ class Actor: cid: str, nsf: NamespacePath, + # TODO: support lookup by `Context.side: str` ? + # -> would allow making a self-context which might have + # certain special use cases where RPC isolation is wanted + # between 2 tasks running in the same process? + # => prolly needs some deeper though on the real use cases + # and whether or not such things should be better + # implemented using a `TaskManager` style nursery.. + # + # side: str|None = None, + msg_buffer_size: int | None = None, allow_overruns: bool = False, @@ -1397,7 +1423,11 @@ class Actor: actor_uid = chan.uid assert actor_uid try: - ctx = self._contexts[(actor_uid, cid)] + ctx = self._contexts[( + actor_uid, + cid, + # side, + )] log.runtime( f'Retreived cached IPC ctx for\n' f'peer: {chan.uid}\n' @@ -1423,7 +1453,11 @@ class Actor: msg_buffer_size=msg_buffer_size or self.msg_buffer_size, _allow_overruns=allow_overruns, ) - self._contexts[(actor_uid, cid)] = ctx + self._contexts[( + actor_uid, + cid, + # side, + )] = ctx return ctx @@ -1454,6 +1488,8 @@ class Actor: chan=chan, cid=cid, nsf=nsf, + + # side='caller', msg_buffer_size=msg_buffer_size, allow_overruns=allow_overruns, ) -- 2.34.1 From ededa2e88fdb3064e60210cc8564932d373c3f62 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 11 Mar 2024 10:33:06 -0400 Subject: [PATCH 162/378] More spaceless union type annots --- tractor/_discovery.py | 4 ++-- tractor/_root.py | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tractor/_discovery.py b/tractor/_discovery.py index de79edc0..99a4dd68 100644 --- a/tractor/_discovery.py +++ b/tractor/_discovery.py @@ -162,8 +162,8 @@ async def query_actor( @acm async def find_actor( name: str, - arbiter_sockaddr: tuple[str, int] | None = None, - registry_addrs: list[tuple[str, int]] | None = None, + arbiter_sockaddr: tuple[str, int]|None = None, + registry_addrs: list[tuple[str, int]]|None = None, only_first: bool = True, raise_on_none: bool = False, diff --git a/tractor/_root.py b/tractor/_root.py index f948913d..6ee78b99 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -63,26 +63,26 @@ async def open_root_actor( *, # defaults are above - registry_addrs: list[tuple[str, int]] | None = None, + registry_addrs: list[tuple[str, int]]|None = None, # defaults are above - arbiter_addr: tuple[str, int] | None = None, + arbiter_addr: tuple[str, int]|None = None, - name: str | None = 'root', + name: str|None = 'root', # either the `multiprocessing` start method: # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods # OR `trio` (the new default). 
- start_method: _spawn.SpawnMethodKey | None = None, + start_method: _spawn.SpawnMethodKey|None = None, # enables the multi-process debugger support debug_mode: bool = False, # internal logging - loglevel: str | None = None, + loglevel: str|None = None, - enable_modules: list | None = None, - rpc_module_paths: list | None = None, + enable_modules: list|None = None, + rpc_module_paths: list|None = None, # NOTE: allow caller to ensure that only one registry exists # and that this call creates it. @@ -108,7 +108,11 @@ async def open_root_actor( _state._runtime_vars['_is_root'] = True # caps based rpc list - enable_modules = enable_modules or [] + enable_modules = ( + enable_modules + or + [] + ) if rpc_module_paths: warnings.warn( -- 2.34.1 From 8c39b8b124de203f5ff244d20016b0416136fc57 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 11 Mar 2024 10:37:34 -0400 Subject: [PATCH 163/378] Tweak some tests for spurious failues With the seeming cause that some cases occasionally raise `ExceptionGroup` instead of a (collapsed out) single error which, in those cases at least try to check that `.exceptions` has the original error. --- pytest.ini | 2 +- tests/test_advanced_faults.py | 5 ++++- tests/test_advanced_streaming.py | 13 +++++++++++-- tests/test_infected_asyncio.py | 30 ++++++++++++++++++++++++++---- tests/test_rpc.py | 21 +++++++++++++++------ 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/pytest.ini b/pytest.ini index 6a7e51fb..b2527228 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ -# vim: ft=ini +# vim: ft=conf # pytest.ini for tractor [pytest] diff --git a/tests/test_advanced_faults.py b/tests/test_advanced_faults.py index a48866ea..f34738bd 100644 --- a/tests/test_advanced_faults.py +++ b/tests/test_advanced_faults.py @@ -18,7 +18,10 @@ from conftest import ( @pytest.mark.parametrize( 'debug_mode', [False, True], - ids=['no_debug_mode', 'debug_mode'], + ids=[ + 'no_debug_mode', + 'debug_mode', + ], ) @pytest.mark.parametrize( 'ipc_break', diff --git a/tests/test_advanced_streaming.py b/tests/test_advanced_streaming.py index e8696346..3134b9c2 100644 --- a/tests/test_advanced_streaming.py +++ b/tests/test_advanced_streaming.py @@ -6,6 +6,7 @@ from collections import Counter import itertools import platform +import pytest import trio import tractor @@ -143,8 +144,16 @@ def test_dynamic_pub_sub(): try: trio.run(main) - except trio.TooSlowError: - pass + except ( + trio.TooSlowError, + ExceptionGroup, + ) as err: + if isinstance(err, ExceptionGroup): + for suberr in err.exceptions: + if isinstance(suberr, trio.TooSlowError): + break + else: + pytest.fail('Never got a `TooSlowError` ?') @tractor.context diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 24bd371f..1ac1fba4 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -70,7 +70,7 @@ def test_trio_cancels_aio_on_actor_side(reg_addr): async def asyncio_actor( target: str, - expect_err: Optional[Exception] = None + expect_err: Exception|None = None ) -> None: @@ -114,10 +114,21 @@ def test_aio_simple_error(reg_addr): infect_asyncio=True, ) - with pytest.raises(RemoteActorError) as excinfo: + with pytest.raises( + expected_exception=(RemoteActorError, ExceptionGroup), + ) as excinfo: trio.run(main) err = excinfo.value + + # might get multiple `trio.Cancelled`s as well inside an inception + if isinstance(err, ExceptionGroup): + err = next(itertools.dropwhile( + lambda exc: not isinstance(exc, tractor.RemoteActorError), + err.exceptions + )) + 
assert err + assert isinstance(err, RemoteActorError) assert err.type == AssertionError @@ -290,11 +301,22 @@ def test_aio_cancelled_from_aio_causes_trio_cancelled(reg_addr): infect_asyncio=True, ) - with pytest.raises(RemoteActorError) as excinfo: + with pytest.raises( + expected_exception=(RemoteActorError, ExceptionGroup), + ) as excinfo: trio.run(main) + # might get multiple `trio.Cancelled`s as well inside an inception + err = excinfo.value + if isinstance(err, ExceptionGroup): + err = next(itertools.dropwhile( + lambda exc: not isinstance(exc, tractor.RemoteActorError), + err.exceptions + )) + assert err + # ensure boxed error is correct - assert excinfo.value.type == to_asyncio.AsyncioCancelled + assert err.type == to_asyncio.AsyncioCancelled # TODO: verify open_channel_from will fail on this.. diff --git a/tests/test_rpc.py b/tests/test_rpc.py index 1a46666c..a18bcb02 100644 --- a/tests/test_rpc.py +++ b/tests/test_rpc.py @@ -1,6 +1,8 @@ -""" -RPC related -""" +''' +RPC (or maybe better labelled as "RTS: remote task scheduling"?) +related API and error checks. + +''' import itertools import pytest @@ -52,8 +54,13 @@ async def short_sleep(): (['tmp_mod'], 'import doggy', ModuleNotFoundError), (['tmp_mod'], '4doggy', SyntaxError), ], - ids=['no_mods', 'this_mod', 'this_mod_bad_func', 'fail_to_import', - 'fail_on_syntax'], + ids=[ + 'no_mods', + 'this_mod', + 'this_mod_bad_func', + 'fail_to_import', + 'fail_on_syntax', + ], ) def test_rpc_errors( reg_addr, @@ -127,7 +134,9 @@ def test_rpc_errors( run() else: # underlying errors aren't propagated upwards (yet) - with pytest.raises(remote_err) as err: + with pytest.raises( + expected_exception=(remote_err, ExceptionGroup), + ) as err: run() # get raw instance from pytest wrapper -- 2.34.1 From 6533285d7d157f6d51f62f57db9ba8954a1f541b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 12 Mar 2024 08:56:17 -0400 Subject: [PATCH 164/378] Add `an: ActorNursery` var placeholder for final log msg --- tractor/_supervise.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index ad007ae6..50f0d5e6 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -533,10 +533,8 @@ async def open_nursery( ''' implicit_runtime: bool = False - actor: Actor = current_actor( - err_on_no_runtime=False - ) - + actor: Actor = current_actor(err_on_no_runtime=False) + an: ActorNursery|None = None try: if ( actor is None -- 2.34.1 From 96992bcbb91ddc1c30a464010a0203a8c68049f8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 12 Mar 2024 15:48:20 -0400 Subject: [PATCH 165/378] Add (back) a `tractor._testing` sub-pkg Since importing from our top level `conftest.py` is not scaleable or as "future forward thinking" in terms of: - LoC-wise (it's only one file), - prevents "external" (aka non-test) example scripts from importing content easily, - seemingly(?) can't be used via abs-import if using a `[tool.pytest.ini_options]` in a `pyproject.toml` vs. a `pytest.ini`, see: https://docs.pytest.org/en/8.0.x/reference/customize.html#pyproject-toml) => Go back to having an internal "testing" pkg like `trio` (kinda) does. Deats: - move generic top level helpers into pkg-mod including the new `expect_ctxc()` (which i needed in the advanced faults testing script. - move `@tractor_test` into `._testing.pytest` sub-mod. 
- adjust all the helper imports to be a `from tractor._testing import <..>` Rework `test_ipc_channel_break_during_stream()` and backing script: - make test(s) pull `debug_mode` from new fixture (which is now controlled manually from `--tpdb` flag) and drop the previous parametrized input. - update logic in ^ test for "which-side-fails" cases to better match recently updated/stricter cancel/failure semantics in terms of `ClosedResouruceError` vs. `EndOfChannel` expectations. - handle `ExceptionGroup`s with expected embedded errors in test. - better pendantics around whether to expect a user simulated KBI. - for `examples/advanced_faults/ipc_failure_during_stream.py` script: - generalize ipc breakage in new `break_ipc()` with support for diff internal `trio` methods and a #TODO for future disti frameworks - only make one sub-actor task break and the other just stream. - use new `._testing.expect_ctxc()` around ctx block. - add a bit of exception handling with `print()`s around ctxc (unused except if 'msg' break method is set) and eoc cases. - don't break parent side ipc in loop any more then once after first break, checked via flag var. - add a `pre_close: bool` flag to control whether `MsgStreama.aclose()` is called *before* any ipc breakage method. Still TODO: - drop `pytest.ini` and add the alt section to `pyproject.py`. -> currently can't get `--rootdir=` opt to work.. not showing in console header. -> ^ also breaks on 'tests' `enable_modules` imports in subactors during discovery tests? --- .../ipc_failure_during_stream.py | 216 ++++++++++++++---- pyproject.toml | 20 ++ pytest.ini | 8 - tests/conftest.py | 144 ++---------- tests/test_advanced_faults.py | 178 ++++++++++----- tests/test_cancellation.py | 6 +- tests/test_clustering.py | 4 +- tests/test_context_stream_semantics.py | 2 +- tests/test_debugger.py | 4 +- tests/test_discovery.py | 3 +- tests/test_docs_examples.py | 3 +- tests/test_infected_asyncio.py | 3 +- tests/test_legacy_one_way_streaming.py | 2 +- tests/test_local.py | 2 +- tests/test_multi_program.py | 4 +- tests/test_pubsub.py | 3 +- tests/test_runtime.py | 2 +- tests/test_spawning.py | 2 +- tractor/_testing/__init__.py | 74 ++++++ tractor/_testing/pytest.py | 113 +++++++++ 20 files changed, 535 insertions(+), 258 deletions(-) delete mode 100644 pytest.ini create mode 100644 tractor/_testing/__init__.py create mode 100644 tractor/_testing/pytest.py diff --git a/examples/advanced_faults/ipc_failure_during_stream.py b/examples/advanced_faults/ipc_failure_during_stream.py index 6728b8d2..c7322a7c 100644 --- a/examples/advanced_faults/ipc_failure_during_stream.py +++ b/examples/advanced_faults/ipc_failure_during_stream.py @@ -6,47 +6,120 @@ been an outage) and we want to ensure that despite being in debug mode actor tree will eventually be cancelled without leaving any zombies. ''' -import trio +from functools import partial + from tractor import ( open_nursery, context, Context, + ContextCancelled, MsgStream, + _testing, ) +import trio -async def break_channel_silently_then_error( +async def break_ipc( stream: MsgStream, + method: str|None = None, + pre_close: bool = False, + + def_method: str = 'eof', + +) -> None: + ''' + XXX: close the channel right after an error is raised + purposely breaking the IPC transport to make sure the parent + doesn't get stuck in debug or hang on the connection join. + this more or less simulates an infinite msg-receive hang on + the other end. 
+ + ''' + # close channel via IPC prot msging before + # any transport breakage + if pre_close: + await stream.aclose() + + method: str = method or def_method + + match method: + case 'trans_aclose': + await stream._ctx.chan.transport.stream.aclose() + + case 'eof': + await stream._ctx.chan.transport.stream.send_eof() + + case 'msg': + await stream._ctx.chan.send(None) + + # TODO: the actual real-world simulated cases like + # transport layer hangs and/or lower layer 2-gens type + # scenarios.. + # + # -[ ] already have some issues for this general testing + # area: + # - https://github.com/goodboy/tractor/issues/97 + # - https://github.com/goodboy/tractor/issues/124 + # - PR from @guille: + # https://github.com/goodboy/tractor/pull/149 + # case 'hang': + # TODO: framework research: + # + # - https://github.com/GuoTengda1993/pynetem + # - https://github.com/shopify/toxiproxy + # - https://manpages.ubuntu.com/manpages/trusty/man1/wirefilter.1.html + + case _: + raise RuntimeError( + f'IPC break method unsupported: {method}' + ) + + +async def break_ipc_then_error( + stream: MsgStream, + break_ipc_with: str|None = None, + pre_close: bool = False, ): async for msg in stream: await stream.send(msg) - - # XXX: close the channel right after an error is raised - # purposely breaking the IPC transport to make sure the parent - # doesn't get stuck in debug or hang on the connection join. - # this more or less simulates an infinite msg-receive hang on - # the other end. - await stream._ctx.chan.send(None) + await break_ipc( + stream=stream, + method=break_ipc_with, + pre_close=pre_close, + ) assert 0 -async def close_stream_and_error( +# async def close_stream_and_error( +async def iter_ipc_stream( stream: MsgStream, + break_ipc_with: str|None = None, + pre_close: bool = False, ): async for msg in stream: await stream.send(msg) # wipe out channel right before raising - await stream._ctx.chan.send(None) - await stream.aclose() - assert 0 + # await break_ipc( + # stream=stream, + # method=break_ipc_with, + # pre_close=pre_close, + # ) + + # send channel close msg at SC-prot level + # + # TODO: what should get raised here if anything? + # await stream.aclose() + + # assert 0 @context async def recv_and_spawn_net_killers( ctx: Context, - break_ipc_after: bool | int = False, + break_ipc_after: bool|int = False, + pre_close: bool = False, ) -> None: ''' @@ -63,27 +136,42 @@ async def recv_and_spawn_net_killers( await stream.send(i) if ( break_ipc_after - and i > break_ipc_after + and + i > break_ipc_after ): '#################################\n' - 'Simulating child-side IPC BREAK!\n' - '#################################' - n.start_soon(break_channel_silently_then_error, stream) - n.start_soon(close_stream_and_error, stream) + 'Simulating CHILD-side IPC BREAK!\n' + '#################################\n' + n.start_soon( + partial( + break_ipc_then_error, + stream=stream, + pre_close=pre_close, + ) + ) + n.start_soon( + iter_ipc_stream, + stream, + ) async def main( debug_mode: bool = False, start_method: str = 'trio', + loglevel: str = 'cancel', # by default we break the parent IPC first (if configured to break # at all), but this can be changed so the child does first (even if # both are set to break). 
- break_parent_ipc_after: int | bool = False, - break_child_ipc_after: int | bool = False, + break_parent_ipc_after: int|bool = False, + break_child_ipc_after: int|bool = False, + pre_close: bool = False, ) -> None: + # from tractor._state import _runtime_vars as rtv + # rtv['_debug_mode'] = debug_mode + async with ( open_nursery( start_method=start_method, @@ -91,57 +179,107 @@ async def main( # NOTE: even debugger is used we shouldn't get # a hang since it never engages due to broken IPC debug_mode=debug_mode, - loglevel='warning', + loglevel=loglevel, ) as an, ): + sub_name: str = 'chitty_hijo' portal = await an.start_actor( - 'chitty_hijo', + sub_name, enable_modules=[__name__], ) - async with portal.open_context( - recv_and_spawn_net_killers, - break_ipc_after=break_child_ipc_after, - - ) as (ctx, sent): + async with ( + _testing.expect_ctxc( + yay=( + break_parent_ipc_after + or break_child_ipc_after, + ), + # TODO: we CAN'T remove this right? + # since we need the ctxc to bubble up from either + # the stream API after the `None` msg is sent + # (which actually implicitly cancels all remote + # tasks in the hijo) or from simluated + # KBI-mash-from-user + # or should we expect that a KBI triggers the ctxc + # and KBI in an eg? + reraise=True, + ), + portal.open_context( + recv_and_spawn_net_killers, + break_ipc_after=break_child_ipc_after, + pre_close=pre_close, + ) as (ctx, sent), + ): + ipc_break_sent: bool = False async with ctx.open_stream() as stream: for i in range(1000): if ( break_parent_ipc_after - and i > break_parent_ipc_after + and + i > break_parent_ipc_after + and + not ipc_break_sent ): print( '#################################\n' - 'Simulating parent-side IPC BREAK!\n' - '#################################' + 'Simulating PARENT-side IPC BREAK!\n' + '#################################\n' ) - await stream._ctx.chan.send(None) + + # await stream._ctx.chan.send(None) + # await stream._ctx.chan.transport.stream.send_eof() + await stream._ctx.chan.transport.stream.aclose() + + ipc_break_sent = True # it actually breaks right here in the # mp_spawn/forkserver backends and thus the zombie # reaper never even kicks in? print(f'parent sending {i}') - await stream.send(i) + try: + await stream.send(i) + except ContextCancelled as ctxc: + print( + 'parent received ctxc on `stream.send()`\n' + f'{ctxc}\n' + ) + assert 'root' in ctxc.canceller + assert sub_name in ctx.canceller - with trio.move_on_after(2) as cs: + # TODO: is this needed or no? + raise + + timeout: int = 1 + print(f'Entering `stream.receive()` with timeout={timeout}\n') + with trio.move_on_after(timeout) as cs: # NOTE: in the parent side IPC failure case this # will raise an ``EndOfChannel`` after the child # is killed and sends a stop msg back to it's # caller/this-parent. - rx = await stream.receive() - - print(f"I'm a happy user and echoed to me is {rx}") + try: + rx = await stream.receive() + print( + "I'm a happy PARENT user and echoed to me is\n" + f'{rx}\n' + ) + except trio.EndOfChannel: + print('MsgStream got EoC for PARENT') + raise if cs.cancelled_caught: # pretend to be a user seeing no streaming action # thinking it's a hang, and then hitting ctl-c.. - print("YOO i'm a user anddd thingz hangin..") + print( + f"YOO i'm a PARENT user anddd thingz hangin..\n" + f'after timeout={timeout}\n' + ) print( - "YOO i'm mad send side dun but thingz hangin..\n" + "YOO i'm mad!\n" + 'The send side is dun but thingz hangin..\n' 'MASHING CTlR-C Ctl-c..' 
) raise KeyboardInterrupt diff --git a/pyproject.toml b/pyproject.toml index e52aa476..84633806 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,3 +26,23 @@ all_bullets = true directory = "trivial" name = "Trivial/Internal Changes" showcontent = true + + +[tool.pytest.ini_options] +minversion = '6.0' +testpaths = [ + 'tests' +] +addopts = [ + # TODO: figure out why this isn't working.. + '--rootdir=./tests', + + '--import-mode=importlib', + # don't show frickin captured logs AGAIN in the report.. + '--show-capture=no', +] +log_cli = false + +# TODO: maybe some of these layout choices? +# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules +# pythonpath = "src" diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index b2527228..00000000 --- a/pytest.ini +++ /dev/null @@ -1,8 +0,0 @@ -# vim: ft=conf -# pytest.ini for tractor - -[pytest] -# don't show frickin captured logs AGAIN in the report.. -addopts = --show-capture='no' -log_cli = false -; minversion = 6.0 diff --git a/tests/conftest.py b/tests/conftest.py index fb82a554..5ce84425 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,105 +1,25 @@ """ ``tractor`` testing!! """ -from contextlib import asynccontextmanager as acm import sys import subprocess import os import random import signal import platform -import pathlib import time -import inspect -from functools import partial, wraps import pytest -import trio import tractor +from tractor._testing import ( + examples_dir as examples_dir, + tractor_test as tractor_test, + expect_ctxc as expect_ctxc, +) +# TODO: include wtv plugin(s) we build in `._testing.pytest`? pytest_plugins = ['pytester'] - -def tractor_test(fn): - """ - Use: - - @tractor_test - async def test_whatever(): - await ... - - If fixtures: - - - ``reg_addr`` (a socket addr tuple where arbiter is listening) - - ``loglevel`` (logging level passed to tractor internals) - - ``start_method`` (subprocess spawning backend) - - are defined in the `pytest` fixture space they will be automatically - injected to tests declaring these funcargs. - """ - @wraps(fn) - def wrapper( - *args, - loglevel=None, - reg_addr=None, - start_method: str|None = None, - debug_mode: bool = False, - **kwargs - ): - # __tracebackhide__ = True - - # NOTE: inject ant test func declared fixture - # names by manually checking! 
- if 'reg_addr' in inspect.signature(fn).parameters: - # injects test suite fixture value to test as well - # as `run()` - kwargs['reg_addr'] = reg_addr - - if 'loglevel' in inspect.signature(fn).parameters: - # allows test suites to define a 'loglevel' fixture - # that activates the internal logging - kwargs['loglevel'] = loglevel - - if start_method is None: - if platform.system() == "Windows": - start_method = 'trio' - - if 'start_method' in inspect.signature(fn).parameters: - # set of subprocess spawning backends - kwargs['start_method'] = start_method - - if 'debug_mode' in inspect.signature(fn).parameters: - # set of subprocess spawning backends - kwargs['debug_mode'] = debug_mode - - - if kwargs: - - # use explicit root actor start - async def _main(): - async with tractor.open_root_actor( - # **kwargs, - registry_addrs=[reg_addr] if reg_addr else None, - loglevel=loglevel, - start_method=start_method, - - # TODO: only enable when pytest is passed --pdb - debug_mode=debug_mode, - - ): - await fn(*args, **kwargs) - - main = _main - - else: - # use implicit root actor start - main = partial(fn, *args, **kwargs) - - return trio.run(main) - - return wrapper - - # Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives if platform.system() == 'Windows': _KILL_SIGNAL = signal.CTRL_BREAK_EVENT @@ -119,23 +39,6 @@ no_windows = pytest.mark.skipif( ) -def repodir() -> pathlib.Path: - ''' - Return the abspath to the repo directory. - - ''' - # 2 parents up to step up through tests/ - return pathlib.Path(__file__).parent.parent.absolute() - - -def examples_dir() -> pathlib.Path: - ''' - Return the abspath to the examples directory as `pathlib.Path`. - - ''' - return repodir() / 'examples' - - def pytest_addoption(parser): parser.addoption( "--ll", @@ -194,11 +97,18 @@ _ci_env: bool = os.environ.get('CI', False) @pytest.fixture(scope='session') def ci_env() -> bool: - """Detect CI envoirment. - """ + ''' + Detect CI envoirment. + + ''' return _ci_env +# TODO: also move this to `._testing` for now? +# -[ ] possibly generalize and re-use for multi-tree spawning +# along with the new stuff for multi-addrs in distribute_dis +# branch? +# # choose randomly at import time _reg_addr: tuple[str, int] = ( '127.0.0.1', @@ -252,6 +162,7 @@ def sig_prog(proc, sig): assert ret +# TODO: factor into @cm and move to `._testing`? @pytest.fixture def daemon( loglevel: str, @@ -293,26 +204,3 @@ def daemon( time.sleep(_PROC_SPAWN_WAIT) yield proc sig_prog(proc, _INT_SIGNAL) - - -@acm -async def expect_ctxc( - yay: bool, - reraise: bool = False, -) -> None: - ''' - Small acm to catch `ContextCancelled` errors when expected - below it in a `async with ()` block. - - ''' - if yay: - try: - yield - raise RuntimeError('Never raised ctxc?') - except tractor.ContextCancelled: - if reraise: - raise - else: - return - else: - yield diff --git a/tests/test_advanced_faults.py b/tests/test_advanced_faults.py index f34738bd..8b73b4c2 100644 --- a/tests/test_advanced_faults.py +++ b/tests/test_advanced_faults.py @@ -3,24 +3,28 @@ Sketchy network blackoutz, ugly byzantine gens, puedes eschuchar la cancelacion?.. 
''' +import itertools from functools import partial +from types import ModuleType import pytest from _pytest.pathlib import import_path import trio import tractor - -from conftest import ( +from tractor._testing import ( examples_dir, ) @pytest.mark.parametrize( - 'debug_mode', - [False, True], + 'pre_aclose_msgstream', + [ + False, + True, + ], ids=[ - 'no_debug_mode', - 'debug_mode', + 'no_msgstream_aclose', + 'pre_aclose_msgstream', ], ) @pytest.mark.parametrize( @@ -66,8 +70,10 @@ from conftest import ( ) def test_ipc_channel_break_during_stream( debug_mode: bool, + loglevel: str, spawn_backend: str, - ipc_break: dict | None, + ipc_break: dict|None, + pre_aclose_msgstream: bool, ): ''' Ensure we can have an IPC channel break its connection during @@ -79,77 +85,123 @@ def test_ipc_channel_break_during_stream( ''' if spawn_backend != 'trio': - if debug_mode: - pytest.skip('`debug_mode` only supported on `trio` spawner') + # if debug_mode: + # pytest.skip('`debug_mode` only supported on `trio` spawner') # non-`trio` spawners should never hit the hang condition that # requires the user to do ctl-c to cancel the actor tree. expect_final_exc = trio.ClosedResourceError - mod = import_path( + mod: ModuleType = import_path( examples_dir() / 'advanced_faults' / 'ipc_failure_during_stream.py', root=examples_dir(), ) - expect_final_exc = KeyboardInterrupt - - # when ONLY the child breaks we expect the parent to get a closed - # resource error on the next `MsgStream.receive()` and then fail out - # and cancel the child from there. + # by def we expect KBI from user after a simulated "hang + # period" wherein the user eventually hits ctl-c to kill the + # root-actor tree. + expect_final_exc: BaseException = KeyboardInterrupt if ( - - # only child breaks - ( - ipc_break['break_child_ipc_after'] - and ipc_break['break_parent_ipc_after'] is False - ) - - # both break but, parent breaks first - or ( - ipc_break['break_child_ipc_after'] is not False - and ( - ipc_break['break_parent_ipc_after'] - > ipc_break['break_child_ipc_after'] - ) - ) - - ): - expect_final_exc = trio.ClosedResourceError - - # when the parent IPC side dies (even if the child's does as well - # but the child fails BEFORE the parent) we expect the channel to be - # sent a stop msg from the child at some point which will signal the - # parent that the stream has been terminated. - # NOTE: when the parent breaks "after" the child you get this same - # case as well, the child breaks the IPC channel with a stop msg - # before any closure takes place. - elif ( - # only parent breaks - ( - ipc_break['break_parent_ipc_after'] - and ipc_break['break_child_ipc_after'] is False - ) - - # both break but, child breaks first - or ( - ipc_break['break_parent_ipc_after'] is not False - and ( - ipc_break['break_child_ipc_after'] - > ipc_break['break_parent_ipc_after'] - ) - ) + # only expect EoC if trans is broken on the child side, + ipc_break['break_child_ipc_after'] is not False + # AND we tell the child to call `MsgStream.aclose()`. + and pre_aclose_msgstream ): expect_final_exc = trio.EndOfChannel - with pytest.raises(expect_final_exc): - trio.run( - partial( - mod.main, - debug_mode=debug_mode, - start_method=spawn_backend, - **ipc_break, + # NOTE when ONLY the child breaks or it breaks BEFORE the + # parent we expect the parent to get a closed resource error + # on the next `MsgStream.receive()` and then fail out and + # cancel the child from there. 
+ # + # ONLY CHILD breaks + if ( + ipc_break['break_child_ipc_after'] + and + ipc_break['break_parent_ipc_after'] is False + ): + expect_final_exc = trio.ClosedResourceError + + # if child calls `MsgStream.aclose()` then expect EoC. + if pre_aclose_msgstream: + expect_final_exc = trio.EndOfChannel + + # BOTH but, CHILD breaks FIRST + elif ( + ipc_break['break_child_ipc_after'] is not False + and ( + ipc_break['break_parent_ipc_after'] + > ipc_break['break_child_ipc_after'] + ) + ): + expect_final_exc = trio.ClosedResourceError + + # child will send a 'stop' msg before it breaks + # the transport channel. + if pre_aclose_msgstream: + expect_final_exc = trio.EndOfChannel + + # NOTE when the parent IPC side dies (even if the child's does as well + # but the child fails BEFORE the parent) we always expect the + # IPC layer to raise a closed-resource, NEVER do we expect + # a stop msg since the parent-side ctx apis will error out + # IMMEDIATELY before the child ever sends any 'stop' msg. + # + # ONLY PARENT breaks + elif ( + ipc_break['break_parent_ipc_after'] + and + ipc_break['break_child_ipc_after'] is False + ): + expect_final_exc = trio.ClosedResourceError + + # BOTH but, PARENT breaks FIRST + elif ( + ipc_break['break_parent_ipc_after'] is not False + and ( + ipc_break['break_child_ipc_after'] + > ipc_break['break_parent_ipc_after'] + ) + ): + expect_final_exc = trio.ClosedResourceError + + with pytest.raises( + expected_exception=( + expect_final_exc, + ExceptionGroup, + ), + ) as excinfo: + try: + trio.run( + partial( + mod.main, + debug_mode=debug_mode, + start_method=spawn_backend, + loglevel=loglevel, + pre_close=pre_aclose_msgstream, + **ipc_break, + ) + ) + except KeyboardInterrupt as kbi: + _err = kbi + if expect_final_exc is not KeyboardInterrupt: + pytest.fail( + 'Rxed unexpected KBI !?\n' + f'{repr(kbi)}' + ) + + raise + + # get raw instance from pytest wrapper + value = excinfo.value + if isinstance(value, ExceptionGroup): + value = next( + itertools.dropwhile( + lambda exc: not isinstance(exc, expect_final_exc), + value.exceptions, ) ) + assert value @tractor.context diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 9a729f3d..5b589f6a 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -15,8 +15,10 @@ from exceptiongroup import ( import pytest import trio import tractor - -from conftest import tractor_test, no_windows +from tractor._testing import ( + tractor_test, +) +from conftest import no_windows def is_win(): diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 02b1f8fa..92362b58 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -5,9 +5,7 @@ import trio import tractor from tractor import open_actor_cluster from tractor.trionics import gather_contexts - -from conftest import tractor_test - +from tractor._testing import tractor_test MESSAGE = 'tractoring at full speed' diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index d5767eec..42b1f7d0 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -25,7 +25,7 @@ from tractor._exceptions import ( ContextCancelled, ) -from conftest import ( +from tractor._testing import ( tractor_test, expect_ctxc, ) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index c314ba62..20e67aba 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -30,8 +30,10 @@ from tractor.devx._debug import ( _pause_msg, _crash_msg, ) -from conftest import ( 
+from tractor._testing import ( examples_dir, +) +from conftest import ( _ci_env, ) diff --git a/tests/test_discovery.py b/tests/test_discovery.py index 8b47700c..cd9dc022 100644 --- a/tests/test_discovery.py +++ b/tests/test_discovery.py @@ -9,10 +9,9 @@ import itertools import pytest import tractor +from tractor._testing import tractor_test import trio -from conftest import tractor_test - @tractor_test async def test_reg_then_unreg(reg_addr): diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py index 1eefdb40..79a22009 100644 --- a/tests/test_docs_examples.py +++ b/tests/test_docs_examples.py @@ -11,8 +11,7 @@ import platform import shutil import pytest - -from conftest import ( +from tractor._testing import ( examples_dir, ) diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 1ac1fba4..a3f96ee8 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -18,8 +18,7 @@ from tractor import ( ContextCancelled, ) from tractor.trionics import BroadcastReceiver - -from conftest import expect_ctxc +from tractor._testing import expect_ctxc async def sleep_and_err( diff --git a/tests/test_legacy_one_way_streaming.py b/tests/test_legacy_one_way_streaming.py index 0cbda4d8..1e7ec987 100644 --- a/tests/test_legacy_one_way_streaming.py +++ b/tests/test_legacy_one_way_streaming.py @@ -9,7 +9,7 @@ import trio import tractor import pytest -from conftest import tractor_test +from tractor._testing import tractor_test def test_must_define_ctx(): diff --git a/tests/test_local.py b/tests/test_local.py index 009d0d71..a019d771 100644 --- a/tests/test_local.py +++ b/tests/test_local.py @@ -7,7 +7,7 @@ import pytest import trio import tractor -from conftest import tractor_test +from tractor._testing import tractor_test @pytest.mark.trio diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py index d1ee0f5e..0b6b5baf 100644 --- a/tests/test_multi_program.py +++ b/tests/test_multi_program.py @@ -7,8 +7,10 @@ import time import pytest import trio import tractor -from conftest import ( +from tractor._testing import ( tractor_test, +) +from conftest import ( sig_prog, _INT_SIGNAL, _INT_RETURN_CODE, diff --git a/tests/test_pubsub.py b/tests/test_pubsub.py index 20554fa5..6d416f89 100644 --- a/tests/test_pubsub.py +++ b/tests/test_pubsub.py @@ -5,8 +5,7 @@ import pytest import trio import tractor from tractor.experimental import msgpub - -from conftest import tractor_test +from tractor._testing import tractor_test def test_type_checks(): diff --git a/tests/test_runtime.py b/tests/test_runtime.py index 127138c2..3755af1b 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -8,7 +8,7 @@ import pytest import trio import tractor -from conftest import tractor_test +from tractor._testing import tractor_test _file_path: str = '' diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 1a07610a..5995ed2d 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -8,7 +8,7 @@ import pytest import trio import tractor -from conftest import tractor_test +from tractor._testing import tractor_test data_to_pass_down = {'doggy': 10, 'kitty': 4} diff --git a/tractor/_testing/__init__.py b/tractor/_testing/__init__.py new file mode 100644 index 00000000..876c87e8 --- /dev/null +++ b/tractor/_testing/__init__.py @@ -0,0 +1,74 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. 
+ +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Various helpers/utils for auditing your `tractor` app and/or the +core runtime. + +''' +from contextlib import asynccontextmanager as acm +import pathlib + +import tractor +from .pytest import ( + tractor_test as tractor_test +) + + +def repodir() -> pathlib.Path: + ''' + Return the abspath to the repo directory. + + ''' + # 2 parents up to step up through tests/ + return pathlib.Path( + __file__ + + # 3 .parents bc: + # <._testing-pkg>.. + # /$HOME/..//tractor/_testing/__init__.py + ).parent.parent.parent.absolute() + + +def examples_dir() -> pathlib.Path: + ''' + Return the abspath to the examples directory as `pathlib.Path`. + + ''' + return repodir() / 'examples' + + +@acm +async def expect_ctxc( + yay: bool, + reraise: bool = False, +) -> None: + ''' + Small acm to catch `ContextCancelled` errors when expected + below it in a `async with ()` block. + + ''' + if yay: + try: + yield + raise RuntimeError('Never raised ctxc?') + except tractor.ContextCancelled: + if reraise: + raise + else: + return + else: + yield diff --git a/tractor/_testing/pytest.py b/tractor/_testing/pytest.py new file mode 100644 index 00000000..93eeaf72 --- /dev/null +++ b/tractor/_testing/pytest.py @@ -0,0 +1,113 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +`pytest` utils helpers and plugins for testing `tractor`'s runtime +and applications. + +''' +from functools import ( + partial, + wraps, +) +import inspect +import platform + +import tractor +import trio + + +def tractor_test(fn): + ''' + Decorator for async test funcs to present them as "native" + looking sync funcs runnable by `pytest` using `trio.run()`. + + Use: + + @tractor_test + async def test_whatever(): + await ... + + If fixtures: + + - ``reg_addr`` (a socket addr tuple where arbiter is listening) + - ``loglevel`` (logging level passed to tractor internals) + - ``start_method`` (subprocess spawning backend) + + are defined in the `pytest` fixture space they will be automatically + injected to tests declaring these funcargs. 
+ ''' + @wraps(fn) + def wrapper( + *args, + loglevel=None, + reg_addr=None, + start_method: str|None = None, + debug_mode: bool = False, + **kwargs + ): + # __tracebackhide__ = True + + # NOTE: inject ant test func declared fixture + # names by manually checking! + if 'reg_addr' in inspect.signature(fn).parameters: + # injects test suite fixture value to test as well + # as `run()` + kwargs['reg_addr'] = reg_addr + + if 'loglevel' in inspect.signature(fn).parameters: + # allows test suites to define a 'loglevel' fixture + # that activates the internal logging + kwargs['loglevel'] = loglevel + + if start_method is None: + if platform.system() == "Windows": + start_method = 'trio' + + if 'start_method' in inspect.signature(fn).parameters: + # set of subprocess spawning backends + kwargs['start_method'] = start_method + + if 'debug_mode' in inspect.signature(fn).parameters: + # set of subprocess spawning backends + kwargs['debug_mode'] = debug_mode + + + if kwargs: + + # use explicit root actor start + async def _main(): + async with tractor.open_root_actor( + # **kwargs, + registry_addrs=[reg_addr] if reg_addr else None, + loglevel=loglevel, + start_method=start_method, + + # TODO: only enable when pytest is passed --pdb + debug_mode=debug_mode, + + ): + await fn(*args, **kwargs) + + main = _main + + else: + # use implicit root actor start + main = partial(fn, *args, **kwargs) + + return trio.run(main) + + return wrapper -- 2.34.1 From da913ef2bb3682dab8e669cfac40a91f2a6a00f3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 13 Mar 2024 09:55:47 -0400 Subject: [PATCH 166/378] Attempt at better internal traceback hiding Previously i was trying to approach this using lots of `__tracebackhide__`'s in various internal funcs but since it's not exactly straight forward to do this inside core deps like `trio` and the stdlib, it makes a bit more sense to optionally catch and re-raise certain classes of errors from their originals using `raise from` syntax as per: https://docs.python.org/3/library/exceptions.html#exception-context Deats: - litter `._context` methods with `__tracebackhide__`/`hide_tb` which were previously being shown but that don't need to be to application code now that cancel semantics testing is finished up. - i originally did the same but later commented it all out in `._ipc` since error catch and re-raise instead in higher level layers (above the transport) seems to be a much saner approach. - add catch-n-reraise-from in `MsgStream.send()`/.`receive()` to avoid seeing the depths of `trio` and/or our `._ipc` layers on comms errors. Further this patch adds some refactoring to use the same remote-error shipper routine from both the actor-core in the RPC invoker: - rename it as `try_ship_error_to_remote()` and call it from `._invoke()` as well as it's prior usage. - make it optionally accept `cid: str` a `remote_descr: str` and of course a `hide_tb: bool`. Other misc tweaks: - add some todo notes around `Actor.load_modules()` debug hooking. 
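
  (The catch-n-reraise-from pattern, as it lands in
   `MsgStream.send()`/`.receive()` in the `._streaming` hunk below,
   boils down to roughly:

       try:
           await self._ctx.chan.send(payload)
       except (
           trio.ClosedResourceError,
           trio.BrokenResourceError,
           BrokenPipeError,
       ) as trans_err:
           if hide_tb:
               # raise a fresh instance at this (higher) layer so the
               # primary tb stays shallow; the original low-level error
               # is still chained as `__cause__` via `raise .. from ..`.
               raise type(trans_err)(*trans_err.args) from trans_err
           else:
               raise
  )
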
- tweak the zombie reaper log msg and timeout value ;) --- tractor/_context.py | 9 +++- tractor/_ipc.py | 50 +++++++++++++----- tractor/_portal.py | 2 +- tractor/_runtime.py | 116 +++++++++++++++++++++++++----------------- tractor/_spawn.py | 7 ++- tractor/_streaming.py | 51 ++++++++++++++----- 6 files changed, 157 insertions(+), 78 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index a31c3b1b..7a562155 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1198,8 +1198,12 @@ class Context: # TODO: replace all the instances of this!! XD def maybe_raise( self, + + hide_tb: bool = True, **kwargs, + ) -> Exception|None: + __tracebackhide__: bool = hide_tb if re := self._remote_error: return self._maybe_raise_remote_err( re, @@ -1209,8 +1213,10 @@ class Context: def _maybe_raise_remote_err( self, remote_error: Exception, + raise_ctxc_from_self_call: bool = False, raise_overrun_from_self: bool = True, + hide_tb: bool = True, ) -> ( ContextCancelled # `.cancel()` request to far side @@ -1222,6 +1228,7 @@ class Context: a cancellation (if any). ''' + __tracebackhide__: bool = hide_tb our_uid: tuple = self.chan.uid # XXX NOTE XXX: `ContextCancelled`/`StreamOverrun` absorption @@ -1305,7 +1312,7 @@ class Context: # TODO: change to `.wait_for_result()`? async def result( self, - hide_tb: bool = False, + hide_tb: bool = True, ) -> Any|Exception: ''' diff --git a/tractor/_ipc.py b/tractor/_ipc.py index b108c90e..f57d3bd8 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -19,13 +19,14 @@ Inter-process comms abstractions """ from __future__ import annotations -import struct -import platform -from pprint import pformat from collections.abc import ( AsyncGenerator, AsyncIterator, ) +from contextlib import asynccontextmanager as acm +import platform +from pprint import pformat +import struct import typing from typing import ( Any, @@ -35,18 +36,16 @@ from typing import ( TypeVar, ) -from tricycle import BufferedReceiveStream import msgspec +from tricycle import BufferedReceiveStream import trio -from async_generator import asynccontextmanager -from .log import get_logger -from ._exceptions import TransportClosed +from tractor.log import get_logger +from tractor._exceptions import TransportClosed + log = get_logger(__name__) - _is_windows = platform.system() == 'Windows' -log = get_logger(__name__) def get_stream_addrs(stream: trio.SocketStream) -> tuple: @@ -206,7 +205,17 @@ class MsgpackTCPStream(MsgTransport): else: raise - async def send(self, msg: Any) -> None: + async def send( + self, + msg: Any, + + # hide_tb: bool = False, + ) -> None: + ''' + Send a msgpack coded blob-as-msg over TCP. + + ''' + # __tracebackhide__: bool = hide_tb async with self._send_lock: bytes_data: bytes = self.encode(msg) @@ -388,15 +397,28 @@ class Channel: ) return transport - async def send(self, item: Any) -> None: + async def send( + self, + payload: Any, + # hide_tb: bool = False, + + ) -> None: + ''' + Send a coded msg-blob over the transport. 
+ + ''' + # __tracebackhide__: bool = hide_tb log.transport( '=> send IPC msg:\n\n' - f'{pformat(item)}\n' + f'{pformat(payload)}\n' ) # type: ignore assert self._transport - await self._transport.send(item) + await self._transport.send( + payload, + # hide_tb=hide_tb, + ) async def recv(self) -> Any: assert self._transport @@ -493,7 +515,7 @@ class Channel: return self._transport.connected() if self._transport else False -@asynccontextmanager +@acm async def _connect_chan( host: str, port: int diff --git a/tractor/_portal.py b/tractor/_portal.py index 7ac5711a..5e649439 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -465,7 +465,7 @@ class Portal: # TODO: if we set this the wrapping `@acm` body will # still be shown (awkwardly) on pdb REPL entry. Ideally # we can similarly annotate that frame to NOT show? - hide_tb: bool = False, + hide_tb: bool = True, # proxied to RPC **kwargs, diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 307dacdf..587d636c 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -315,38 +315,19 @@ async def _errors_relayed_via_ipc( if not entered_debug: log.exception('Actor crashed:\n') - # always ship errors back to caller - err_msg: dict[str, dict] = pack_error( - err, - # tb=tb, # TODO: special tb fmting? - cid=ctx.cid, - ) - - # NOTE: the src actor should always be packed into the - # error.. but how should we verify this? - # assert err_msg['src_actor_uid'] - # if not err_msg['error'].get('src_actor_uid'): - # import pdbp; pdbp.set_trace() - + # always (try to) ship RPC errors back to caller if is_rpc: - try: - await chan.send(err_msg) - + # # TODO: tests for this scenario: # - RPC caller closes connection before getting a response # should **not** crash this actor.. - except ( - trio.ClosedResourceError, - trio.BrokenResourceError, - BrokenPipeError, - ) as ipc_err: - - # if we can't propagate the error that's a big boo boo - log.exception( - f"Failed to ship error to caller @ {chan.uid} !?\n" - f'{ipc_err}' - - ) + await try_ship_error_to_remote( + chan, + err, + cid=ctx.cid, + remote_descr='caller', + hide_tb=hide_tb, + ) # error is probably from above coro running code *not from # the target rpc invocation since a scope was never @@ -719,9 +700,13 @@ def _get_mod_abspath(module: ModuleType) -> str: return os.path.abspath(module.__file__) -async def try_ship_error_to_parent( +async def try_ship_error_to_remote( channel: Channel, - err: Exception | BaseExceptionGroup, + err: Exception|BaseExceptionGroup, + + cid: str|None = None, + remote_descr: str = 'parent', + hide_tb: bool = True, ) -> None: ''' @@ -730,22 +715,39 @@ async def try_ship_error_to_parent( local cancellation ignored but logged as critical(ly bad). ''' + __tracebackhide__: bool = hide_tb with CancelScope(shield=True): try: - await channel.send( - # NOTE: normally only used for internal runtime errors - # so ship to peer actor without a cid. - pack_error(err) + # NOTE: normally only used for internal runtime errors + # so ship to peer actor without a cid. + msg: dict = pack_error( + err, + cid=cid, + + # TODO: special tb fmting for ctxc cases? + # tb=tb, ) + # NOTE: the src actor should always be packed into the + # error.. but how should we verify this? + # actor: Actor = _state.current_actor() + # assert err_msg['src_actor_uid'] + # if not err_msg['error'].get('src_actor_uid'): + # import pdbp; pdbp.set_trace() + await channel.send(msg) + + # XXX NOTE XXX in SC terms this is one of the worst things + # that can happen and provides for a 2-general's dilemma.. 
except ( trio.ClosedResourceError, trio.BrokenResourceError, + BrokenPipeError, ): - # in SC terms this is one of the worst things that can - # happen and provides for a 2-general's dilemma.. + err_msg: dict = msg['error']['tb_str'] log.critical( - f'Failed to ship error to parent ' - f'{channel.uid}, IPC transport failure!' + 'IPC transport failure -> ' + f'failed to ship error to {remote_descr}!\n\n' + f'X=> {channel.uid}\n\n' + f'{err_msg}\n' ) @@ -954,7 +956,10 @@ class Actor: log.runtime(f"{uid} successfully connected back to us") return event, self._peers[uid][-1] - def load_modules(self) -> None: + def load_modules( + self, + debug_mode: bool = False, + ) -> None: ''' Load allowed RPC modules locally (after fork). @@ -986,7 +991,9 @@ class Actor: except ModuleNotFoundError: # it is expected the corresponding `ModuleNotExposed` error # will be raised later - log.error(f"Failed to import {modpath} in {self.name}") + log.error( + f"Failed to import {modpath} in {self.name}" + ) raise def _get_rpc_func(self, ns, funcname): @@ -1836,7 +1843,7 @@ class Actor: log.cancel( 'Cancel request for RPC task\n\n' - f'<= Actor.cancel_task(): {requesting_uid}\n\n' + f'<= Actor._cancel_task(): {requesting_uid}\n\n' f'=> {ctx._task}\n' f' |_ >> {ctx.repr_rpc}\n' # f' >> Actor._cancel_task() => {ctx._task}\n' @@ -2117,11 +2124,6 @@ async def async_main( ): accept_addrs = set_accept_addr_says_rent - # load exposed/allowed RPC modules - # XXX: do this **after** establishing a channel to the parent - # but **before** starting the message loop for that channel - # such that import errors are properly propagated upwards - actor.load_modules() # The "root" nursery ensures the channel with the immediate # parent is kept alive as a resilient service until @@ -2139,6 +2141,24 @@ async def async_main( actor._service_n = service_nursery assert actor._service_n + # load exposed/allowed RPC modules + # XXX: do this **after** establishing a channel to the parent + # but **before** starting the message loop for that channel + # such that import errors are properly propagated upwards + actor.load_modules() + + # XXX TODO XXX: figuring out debugging of this + # would somemwhat guarantee "self-hosted" runtime + # debugging (since it hits all the ede cases?) + # + # `tractor.pause()` right? + # try: + # actor.load_modules() + # except ModuleNotFoundError as err: + # _debug.pause_from_sync() + # import pdbp; pdbp.set_trace() + # raise + # Startup up the transport(-channel) server with, # - subactor: the bind address is sent by our parent # over our established channel @@ -2258,7 +2278,7 @@ async def async_main( ) if actor._parent_chan: - await try_ship_error_to_parent( + await try_ship_error_to_remote( actor._parent_chan, err, ) @@ -2674,7 +2694,7 @@ async def process_messages( log.exception("Actor errored:") if actor._parent_chan: - await try_ship_error_to_parent( + await try_ship_error_to_remote( actor._parent_chan, err, ) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 5268b250..e23d70f1 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -215,7 +215,7 @@ async def cancel_on_completion( async def hard_kill( proc: trio.Process, - terminate_after: int = 3, + terminate_after: int = 1.6, # NOTE: for mucking with `.pause()`-ing inside the runtime # whilst also hacking on it XD @@ -281,8 +281,11 @@ async def hard_kill( # zombies (as a feature) we ask the OS to do send in the # removal swad as the last resort. if cs.cancelled_caught: + # TODO: toss in the skynet-logo face as ascii art? 
log.critical( - 'Well, the #ZOMBIE_LORD_IS_HERE# to collect\n' + # 'Well, the #ZOMBIE_LORD_IS_HERE# to collect\n' + '#T-800 deployed to collect zombie B0\n' + f'|\n' f'|_{proc}\n' ) proc.kill() diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 50a32ae9..149bb350 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -114,13 +114,19 @@ class MsgStream(trio.abc.Channel): stream=self, ) - async def receive(self): + async def receive( + self, + + hide_tb: bool = True, + ): ''' Receive a single msg from the IPC transport, the next in sequence sent by the far end task (possibly in order as determined by the underlying protocol). ''' + __tracebackhide__: bool = hide_tb + # NOTE: `trio.ReceiveChannel` implements # EOC handling as follows (aka uses it # to gracefully exit async for loops): @@ -139,7 +145,7 @@ class MsgStream(trio.abc.Channel): if self._closed: raise self._closed - src_err: Exception|None = None + src_err: Exception|None = None # orig tb try: try: msg = await self._rx_chan.receive() @@ -186,7 +192,7 @@ class MsgStream(trio.abc.Channel): # TODO: Locally, we want to close this stream gracefully, by # terminating any local consumers tasks deterministically. - # One we have broadcast support, we **don't** want to be + # Once we have broadcast support, we **don't** want to be # closing this stream and not flushing a final value to # remaining (clone) consumers who may not have been # scheduled to receive it yet. @@ -237,7 +243,12 @@ class MsgStream(trio.abc.Channel): raise_ctxc_from_self_call=True, ) - raise src_err # propagate + # propagate any error but hide low-level frames from + # caller by default. + if hide_tb: + raise type(src_err)(*src_err.args) from src_err + else: + raise src_err async def aclose(self) -> list[Exception|dict]: ''' @@ -475,23 +486,39 @@ class MsgStream(trio.abc.Channel): async def send( self, - data: Any + data: Any, + + hide_tb: bool = True, ) -> None: ''' Send a message over this stream to the far end. ''' - if self._ctx._remote_error: - raise self._ctx._remote_error # from None + __tracebackhide__: bool = hide_tb + self._ctx.maybe_raise() if self._closed: raise self._closed - # raise trio.ClosedResourceError('This stream was already closed') - await self._ctx.chan.send({ - 'yield': data, - 'cid': self._ctx.cid, - }) + try: + await self._ctx.chan.send( + payload={ + 'yield': data, + 'cid': self._ctx.cid, + }, + # hide_tb=hide_tb, + ) + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ) as trans_err: + if hide_tb: + raise type(trans_err)( + *trans_err.args + ) from trans_err + else: + raise def stream(func: Callable) -> Callable: -- 2.34.1 From 58cc57a422459970b2ad78e9dc0e86337a8b6d6d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 13 Mar 2024 11:59:39 -0400 Subject: [PATCH 167/378] Move `Portal.open_context()` impl to `._context` Finally, since normally you need the content from `._context.Context` and surroundings in order to effectively grok `Portal.open_context()` anyways, might as well move the impl to the ctx module as `open_context_from_portal()` and just bind it on the `Portal` class def. Associated/required tweaks: - avoid circ import on `.devx` by only import `.maybe_wait_for_debugger()` when debug mode is set. - drop `async_generator` usage, not sure why this hadn't already been changed to `contextlib`? 
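
  (Presumably the binding amounts to something along these lines -
   a sketch only, not the verbatim code, with `...` eliding the rest
   of the class body:

       # tractor/_portal.py
       from ._context import open_context_from_portal

       class Portal:
           ...
           # bind the module level `@acm` as a method so the public
           # `Portal.open_context()` API stays unchanged.
           open_context = open_context_from_portal
  )
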
- use `@acm` alias throughout `._portal` --- tractor/_context.py | 545 ++++++++++++++++++++++++++++++++++++++++++- tractor/_portal.py | 557 +------------------------------------------- 2 files changed, 553 insertions(+), 549 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 7a562155..55902281 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -43,7 +43,6 @@ import warnings import trio -from .msg import NamespacePath from ._exceptions import ( ContextCancelled, InternalError, @@ -51,11 +50,16 @@ from ._exceptions import ( StreamOverrun, pack_error, unpack_error, + _raise_from_no_key_in_msg, ) from .log import get_logger +from .msg import NamespacePath from ._ipc import Channel from ._streaming import MsgStream -from ._state import current_actor +from ._state import ( + current_actor, + debug_mode, +) if TYPE_CHECKING: from ._portal import Portal @@ -1021,6 +1025,8 @@ class Context: assert self._scope self._scope.cancel() + # TODO? should we move this to `._streaming` much like we + # moved `Portal.open_context()`'s def to this mod? @acm async def open_stream( self, @@ -1848,6 +1854,541 @@ class Context: return False +@acm +async def open_context_from_portal( + portal: Portal, + func: Callable, + + allow_overruns: bool = False, + + # TODO: if we set this the wrapping `@acm` body will + # still be shown (awkwardly) on pdb REPL entry. Ideally + # we can similarly annotate that frame to NOT show? + hide_tb: bool = True, + + # proxied to RPC + **kwargs, + +) -> AsyncGenerator[tuple[Context, Any], None]: + ''' + Open an inter-actor "task context"; a remote task is + scheduled and cancel-scope-state-linked to a `trio.run()` across + memory boundaries in another actor's runtime. + + This is an `@acm` API bound as `Portal.open_context()` which + allows for deterministic setup and teardown of a remotely + scheduled task in another remote actor. Once opened, the 2 now + "linked" tasks run completely in parallel in each actor's + runtime with their enclosing `trio.CancelScope`s kept in + a synced state wherein if either side errors or cancels an + equivalent error is relayed to the other side via an SC-compat + IPC protocol. + + The yielded `tuple` is a pair delivering a `tractor.Context` + and any first value "sent" by the "callee" task via a call + to `Context.started()`; this side of the + context does not unblock until the "callee" task calls + `.started()` in similar style to `trio.Nursery.start()`. + When the "callee" (side that is "called"/started by a call + to *this* method) returns, the caller side (this) unblocks + and any final value delivered from the other end can be + retrieved using the `Contex.result()` api. + + The yielded ``Context`` instance further allows for opening + bidirectional streams, explicit cancellation and + structurred-concurrency-synchronized final result-msg + collection. See ``tractor.Context`` for more details. + + ''' + __tracebackhide__: bool = hide_tb + + # conduct target func method structural checks + if not inspect.iscoroutinefunction(func) and ( + getattr(func, '_tractor_contex_function', False) + ): + raise TypeError( + f'{func} must be an async generator function!') + + # TODO: i think from here onward should probably + # just be factored into an `@acm` inside a new + # a new `_context.py` mod. + nsf = NamespacePath.from_ref(func) + + # XXX NOTE XXX: currenly we do NOT allow opening a contex + # with "self" since the local feeder mem-chan processing + # is not built for it. 
+ if portal.channel.uid == portal.actor.uid: + raise RuntimeError( + '** !! Invalid Operation !! **\n' + 'Can not open an IPC ctx with the local actor!\n' + f'|_{portal.actor}\n' + ) + + ctx: Context = await portal.actor.start_remote_task( + portal.channel, + nsf=nsf, + kwargs=kwargs, + + # NOTE: it's imporant to expose this since you might + # get the case where the parent who opened the context does + # not open a stream until after some slow startup/init + # period, in which case when the first msg is read from + # the feeder mem chan, say when first calling + # `Context.open_stream(allow_overruns=True)`, the overrun condition will be + # raised before any ignoring of overflow msgs can take + # place.. + allow_overruns=allow_overruns, + ) + + assert ctx._remote_func_type == 'context' + msg: dict = await ctx._recv_chan.receive() + + try: + # the "first" value here is delivered by the callee's + # ``Context.started()`` call. + first: Any = msg['started'] + ctx._started_called: bool = True + + except KeyError as src_error: + _raise_from_no_key_in_msg( + ctx=ctx, + msg=msg, + src_err=src_error, + log=log, + expect_key='started', + ) + + ctx._portal: Portal = portal + uid: tuple = portal.channel.uid + cid: str = ctx.cid + + # placeholder for any exception raised in the runtime + # or by user tasks which cause this context's closure. + scope_err: BaseException|None = None + ctxc_from_callee: ContextCancelled|None = None + try: + async with trio.open_nursery() as nurse: + + # NOTE: used to start overrun queuing tasks + ctx._scope_nursery: trio.Nursery = nurse + ctx._scope: trio.CancelScope = nurse.cancel_scope + + # deliver context instance and .started() msg value + # in enter tuple. + yield ctx, first + + # ??TODO??: do we still want to consider this or is + # the `else:` block handling via a `.result()` + # call below enough?? + # -[ ] pretty sure `.result()` internals do the + # same as our ctxc handler below so it ended up + # being same (repeated?) behaviour, but ideally we + # wouldn't have that duplication either by somehow + # factoring the `.result()` handler impl in a way + # that we can re-use it around the `yield` ^ here + # or vice versa? + # + # NOTE: between the caller exiting and arriving + # here the far end may have sent a ctxc-msg or + # other error, so check for it here immediately + # and maybe raise so as to engage the ctxc + # handling block below! + # + # if re := ctx._remote_error: + # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( + # re, + # # TODO: do we want this to always raise? + # # - means that on self-ctxc, if/when the + # # block is exited before the msg arrives + # # but then the msg during __exit__ + # # calling we may not activate the + # # ctxc-handler block below? should we + # # be? + # # - if there's a remote error that arrives + # # after the child has exited, we won't + # # handle until the `finally:` block + # # where `.result()` is always called, + # # again in which case we handle it + # # differently then in the handler block + # # that would normally engage from THIS + # # block? + # raise_ctxc_from_self_call=True, + # ) + # ctxc_from_callee = maybe_ctxc + + # when in allow_overruns mode there may be + # lingering overflow sender tasks remaining? + if nurse.child_tasks: + # XXX: ensure we are in overrun state + # with ``._allow_overruns=True`` bc otherwise + # there should be no tasks in this nursery! 
+ if ( + not ctx._allow_overruns + or len(nurse.child_tasks) > 1 + ): + raise InternalError( + 'Context has sub-tasks but is ' + 'not in `allow_overruns=True` mode!?' + ) + + # ensure we cancel all overflow sender + # tasks started in the nursery when + # `._allow_overruns == True`. + # + # NOTE: this means `._scope.cancelled_caught` + # will prolly be set! not sure if that's + # non-ideal or not ??? + ctx._scope.cancel() + + # XXX NOTE XXX: maybe shield against + # self-context-cancellation (which raises a local + # `ContextCancelled`) when requested (via + # `Context.cancel()`) by the same task (tree) which entered + # THIS `.open_context()`. + # + # NOTE: There are 2 operating cases for a "graceful cancel" + # of a `Context`. In both cases any `ContextCancelled` + # raised in this scope-block came from a transport msg + # relayed from some remote-actor-task which our runtime set + # as to `Context._remote_error` + # + # the CASES: + # + # - if that context IS THE SAME ONE that called + # `Context.cancel()`, we want to absorb the error + # silently and let this `.open_context()` block to exit + # without raising, ideally eventually receiving the ctxc + # ack msg thus resulting in `ctx.cancel_acked == True`. + # + # - if it is from some OTHER context (we did NOT call + # `.cancel()`), we want to re-RAISE IT whilst also + # setting our own ctx's "reason for cancel" to be that + # other context's cancellation condition; we set our + # `.canceller: tuple[str, str]` to be same value as + # caught here in a `ContextCancelled.canceller`. + # + # AGAIN to restate the above, there are 2 cases: + # + # 1-some other context opened in this `.open_context()` + # block cancelled due to a self or peer cancellation + # request in which case we DO let the error bubble to the + # opener. + # + # 2-THIS "caller" task somewhere invoked `Context.cancel()` + # and received a `ContextCanclled` from the "callee" + # task, in which case we mask the `ContextCancelled` from + # bubbling to this "caller" (much like how `trio.Nursery` + # swallows any `trio.Cancelled` bubbled by a call to + # `Nursery.cancel_scope.cancel()`) + except ContextCancelled as ctxc: + scope_err = ctxc + ctx._local_error: BaseException = scope_err + ctxc_from_callee = ctxc + + # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!! + # using this code and then resuming the REPL will + # cause a SIGINT-ignoring HANG! + # -> prolly due to a stale debug lock entry.. + # -[ ] USE `.stackscope` to demonstrate that (possibly + # documenting it as a definittive example of + # debugging the tractor-runtime itself using it's + # own `.devx.` tooling! + # + # await _debug.pause() + + # CASE 2: context was cancelled by local task calling + # `.cancel()`, we don't raise and the exit block should + # exit silently. + if ( + ctx._cancel_called + and + ctxc is ctx._remote_error + and + ctxc.canceller == portal.actor.uid + ): + log.cancel( + f'Context (cid=[{ctx.cid[-6:]}..] cancelled gracefully with:\n' + f'{ctxc}' + ) + # CASE 1: this context was never cancelled via a local + # task (tree) having called `Context.cancel()`, raise + # the error since it was caused by someone else + # -> probably a remote peer! + else: + raise + + # the above `._scope` can be cancelled due to: + # 1. an explicit self cancel via `Context.cancel()` or + # `Actor.cancel()`, + # 2. any "callee"-side remote error, possibly also a cancellation + # request by some peer, + # 3. 
any "caller" (aka THIS scope's) local error raised in the above `yield` + except ( + # CASE 3: standard local error in this caller/yieldee + Exception, + + # CASES 1 & 2: can manifest as a `ctx._scope_nursery` + # exception-group of, + # + # 1.-`trio.Cancelled`s, since + # `._scope.cancel()` will have been called + # (transitively by the runtime calling + # `._deliver_msg()`) and any `ContextCancelled` + # eventually absorbed and thus absorbed/supressed in + # any `Context._maybe_raise_remote_err()` call. + # + # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]` + # from any error delivered from the "callee" side + # AND a group-exc is only raised if there was > 1 + # tasks started *here* in the "caller" / opener + # block. If any one of those tasks calls + # `.result()` or `MsgStream.receive()` + # `._maybe_raise_remote_err()` will be transitively + # called and the remote error raised causing all + # tasks to be cancelled. + # NOTE: ^ this case always can happen if any + # overrun handler tasks were spawned! + BaseExceptionGroup, + + trio.Cancelled, # NOTE: NOT from inside the ctx._scope + KeyboardInterrupt, + + ) as caller_err: + scope_err = caller_err + ctx._local_error: BaseException = scope_err + + # XXX: ALWAYS request the context to CANCEL ON any ERROR. + # NOTE: `Context.cancel()` is conversely NEVER CALLED in + # the `ContextCancelled` "self cancellation absorbed" case + # handled in the block above ^^^ !! + # await _debug.pause() + log.cancel( + 'Context terminated due to\n\n' + f'.outcome => {ctx.repr_outcome()}\n' + ) + + if debug_mode(): + # async with _debug.acquire_debug_lock(portal.actor.uid): + # pass + # TODO: factor ^ into below for non-root cases? + # + from .devx import maybe_wait_for_debugger + was_acquired: bool = await maybe_wait_for_debugger( + header_msg=( + 'Delaying `ctx.cancel()` until debug lock ' + 'acquired..\n' + ), + ) + if was_acquired: + log.pdb( + 'Acquired debug lock! ' + 'Calling `ctx.cancel()`!\n' + ) + + # we don't need to cancel the callee if it already + # told us it's cancelled ;p + if ctxc_from_callee is None: + try: + await ctx.cancel() + except ( + trio.BrokenResourceError, + trio.ClosedResourceError, + ): + log.warning( + 'IPC connection for context is broken?\n' + f'task:{cid}\n' + f'actor:{uid}' + ) + + raise # duh + + # no local scope error, the "clean exit with a result" case. + else: + if ctx.chan.connected(): + log.runtime( + 'Waiting on final context result for\n' + f'peer: {uid}\n' + f'|_{ctx._task}\n' + ) + # XXX NOTE XXX: the below call to + # `Context.result()` will ALWAYS raise + # a `ContextCancelled` (via an embedded call to + # `Context._maybe_raise_remote_err()`) IFF + # a `Context._remote_error` was set by the runtime + # via a call to + # `Context._maybe_cancel_and_set_remote_error()`. + # As per `Context._deliver_msg()`, that error IS + # ALWAYS SET any time "callee" side fails and causes "caller + # side" cancellation via a `ContextCancelled` here. + try: + result_or_err: Exception|Any = await ctx.result() + except BaseException as berr: + # on normal teardown, if we get some error + # raised in `Context.result()` we still want to + # save that error on the ctx's state to + # determine things like `.cancelled_caught` for + # cases where there was remote cancellation but + # this task didn't know until final teardown + # / value collection. + scope_err = berr + ctx._local_error: BaseException = scope_err + raise + + # yes! 
this worx Bp + # from .devx import _debug + # await _debug.pause() + + # an exception type boxed in a `RemoteActorError` + # is returned (meaning it was obvi not raised) + # that we want to log-report on. + msgdata: str|None = getattr( + result_or_err, + 'msgdata', + None + ) + match (msgdata, result_or_err): + case ( + {'tb_str': tbstr}, + ContextCancelled(), + ): + log.cancel(tbstr) + + case ( + {'tb_str': tbstr}, + RemoteActorError(), + ): + log.exception( + 'Context remotely errored!\n' + f'<= peer: {uid}\n' + f' |_ {nsf}()\n\n' + + f'{tbstr}' + ) + case (None, _): + log.runtime( + 'Context returned final result from callee task:\n' + f'<= peer: {uid}\n' + f' |_ {nsf}()\n\n' + + f'`{result_or_err}`\n' + ) + + finally: + # XXX: (MEGA IMPORTANT) if this is a root opened process we + # wait for any immediate child in debug before popping the + # context from the runtime msg loop otherwise inside + # ``Actor._push_result()`` the msg will be discarded and in + # the case where that msg is global debugger unlock (via + # a "stop" msg for a stream), this can result in a deadlock + # where the root is waiting on the lock to clear but the + # child has already cleared it and clobbered IPC. + if debug_mode(): + from .devx import maybe_wait_for_debugger + await maybe_wait_for_debugger() + + # though it should be impossible for any tasks + # operating *in* this scope to have survived + # we tear down the runtime feeder chan last + # to avoid premature stream clobbers. + if ( + (rxchan := ctx._recv_chan) + + # maybe TODO: yes i know the below check is + # touching `trio` memchan internals..BUT, there are + # only a couple ways to avoid a `trio.Cancelled` + # bubbling from the `.aclose()` call below: + # + # - catch and mask it via the cancel-scope-shielded call + # as we are rn (manual and frowned upon) OR, + # - specially handle the case where `scope_err` is + # one of {`BaseExceptionGroup`, `trio.Cancelled`} + # and then presume that the `.aclose()` call will + # raise a `trio.Cancelled` and just don't call it + # in those cases.. + # + # that latter approach is more logic, LOC, and more + # convoluted so for now stick with the first + # psuedo-hack-workaround where we just try to avoid + # the shielded call as much as we can detect from + # the memchan's `._closed` state.. + # + # XXX MOTIVATION XXX-> we generally want to raise + # any underlying actor-runtime/internals error that + # surfaces from a bug in tractor itself so it can + # be easily detected/fixed AND, we also want to + # minimize noisy runtime tracebacks (normally due + # to the cross-actor linked task scope machinery + # teardown) displayed to user-code and instead only + # displaying `ContextCancelled` traces where the + # cause of crash/exit IS due to something in + # user/app code on either end of the context. + and not rxchan._closed + ): + # XXX NOTE XXX: and again as per above, we mask any + # `trio.Cancelled` raised here so as to NOT mask + # out any exception group or legit (remote) ctx + # error that sourced from the remote task or its + # runtime. + # + # NOTE: further, this should be the only place the + # underlying feeder channel is + # once-and-only-CLOSED! + with trio.CancelScope(shield=True): + await ctx._recv_chan.aclose() + + # XXX: we always raise remote errors locally and + # generally speaking mask runtime-machinery related + # multi-`trio.Cancelled`s. 
As such, any `scope_error` + # which was the underlying cause of this context's exit + # should be stored as the `Context._local_error` and + # used in determining `Context.cancelled_caught: bool`. + if scope_err is not None: + # sanity, tho can remove? + assert ctx._local_error is scope_err + # ctx._local_error: BaseException = scope_err + # etype: Type[BaseException] = type(scope_err) + + # CASE 2 + if ( + ctx._cancel_called + and ctx.cancel_acked + ): + log.cancel( + 'Context cancelled by caller task\n' + f'|_{ctx._task}\n\n' + + f'{repr(scope_err)}\n' + ) + + # TODO: should we add a `._cancel_req_received` + # flag to determine if the callee manually called + # `ctx.cancel()`? + # -[ ] going to need a cid check no? + + # CASE 1 + else: + outcome_str: str = ctx.repr_outcome( + show_error_fields=True, + # type_only=True, + ) + log.cancel( + f'Context terminated due to local scope error:\n\n' + f'{ctx.chan.uid} => {outcome_str}\n' + ) + + # FINALLY, remove the context from runtime tracking and + # exit! + log.runtime( + 'Removing IPC ctx opened with peer\n' + f'{uid}\n' + f'|_{ctx}\n' + ) + portal.actor._contexts.pop( + (uid, cid), + None, + ) + + def mk_context( chan: Channel, cid: str, diff --git a/tractor/_portal.py b/tractor/_portal.py index 5e649439..ac602dd5 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -24,6 +24,7 @@ OS processes, possibly on different (hardware) hosts. ''' from __future__ import annotations +from contextlib import asynccontextmanager as acm import importlib import inspect from typing import ( @@ -37,30 +38,21 @@ from dataclasses import dataclass import warnings import trio -from async_generator import asynccontextmanager from .trionics import maybe_open_nursery -from .devx import ( - # _debug, - maybe_wait_for_debugger, -) from ._state import ( current_actor, - debug_mode, ) from ._ipc import Channel from .log import get_logger from .msg import NamespacePath from ._exceptions import ( - InternalError, - _raise_from_no_key_in_msg, unpack_error, NoResult, - ContextCancelled, - RemoteActorError, ) from ._context import ( Context, + open_context_from_portal, ) from ._streaming import ( MsgStream, @@ -392,7 +384,7 @@ class Portal: self.channel, ) - @asynccontextmanager + @acm async def open_stream_from( self, async_gen_func: Callable, # typing: ignore @@ -449,541 +441,12 @@ class Portal: # await recv_chan.aclose() self._streams.remove(rchan) - # TODO: move this impl to `._context` mod and - # instead just bind it here as a method so that the logic - # for ctx stuff stays all in one place (instead of frickin - # having to open this file in tandem every gd time!!! XD) - # - @asynccontextmanager - async def open_context( - - self, - func: Callable, - - allow_overruns: bool = False, - - # TODO: if we set this the wrapping `@acm` body will - # still be shown (awkwardly) on pdb REPL entry. Ideally - # we can similarly annotate that frame to NOT show? - hide_tb: bool = True, - - # proxied to RPC - **kwargs, - - ) -> AsyncGenerator[tuple[Context, Any], None]: - ''' - Open an inter-actor "task context"; a remote task is - scheduled and cancel-scope-state-linked to a `trio.run()` across - memory boundaries in another actor's runtime. - - This is an `@acm` API which allows for deterministic setup - and teardown of a remotely scheduled task in another remote - actor. 
Once opened, the 2 now "linked" tasks run completely - in parallel in each actor's runtime with their enclosing - `trio.CancelScope`s kept in a synced state wherein if - either side errors or cancels an equivalent error is - relayed to the other side via an SC-compat IPC protocol. - - The yielded `tuple` is a pair delivering a `tractor.Context` - and any first value "sent" by the "callee" task via a call - to `Context.started()`; this side of the - context does not unblock until the "callee" task calls - `.started()` in similar style to `trio.Nursery.start()`. - When the "callee" (side that is "called"/started by a call - to *this* method) returns, the caller side (this) unblocks - and any final value delivered from the other end can be - retrieved using the `Contex.result()` api. - - The yielded ``Context`` instance further allows for opening - bidirectional streams, explicit cancellation and - structurred-concurrency-synchronized final result-msg - collection. See ``tractor.Context`` for more details. - - ''' - __tracebackhide__: bool = hide_tb - - # conduct target func method structural checks - if not inspect.iscoroutinefunction(func) and ( - getattr(func, '_tractor_contex_function', False) - ): - raise TypeError( - f'{func} must be an async generator function!') - - # TODO: i think from here onward should probably - # just be factored into an `@acm` inside a new - # a new `_context.py` mod. - nsf = NamespacePath.from_ref(func) - - # XXX NOTE XXX: currenly we do NOT allow opening a contex - # with "self" since the local feeder mem-chan processing - # is not built for it. - if self.channel.uid == self.actor.uid: - raise RuntimeError( - '** !! Invalid Operation !! **\n' - 'Can not open an IPC ctx with the local actor!\n' - f'|_{self.actor}\n' - ) - - ctx: Context = await self.actor.start_remote_task( - self.channel, - nsf=nsf, - kwargs=kwargs, - - # NOTE: it's imporant to expose this since you might - # get the case where the parent who opened the context does - # not open a stream until after some slow startup/init - # period, in which case when the first msg is read from - # the feeder mem chan, say when first calling - # `Context.open_stream(allow_overruns=True)`, the overrun condition will be - # raised before any ignoring of overflow msgs can take - # place.. - allow_overruns=allow_overruns, - ) - - assert ctx._remote_func_type == 'context' - msg: dict = await ctx._recv_chan.receive() - - try: - # the "first" value here is delivered by the callee's - # ``Context.started()`` call. - first: Any = msg['started'] - ctx._started_called: bool = True - - except KeyError as src_error: - _raise_from_no_key_in_msg( - ctx=ctx, - msg=msg, - src_err=src_error, - log=log, - expect_key='started', - ) - - ctx._portal: Portal = self - uid: tuple = self.channel.uid - cid: str = ctx.cid - - # placeholder for any exception raised in the runtime - # or by user tasks which cause this context's closure. - scope_err: BaseException|None = None - ctxc_from_callee: ContextCancelled|None = None - try: - async with trio.open_nursery() as nurse: - - # NOTE: used to start overrun queuing tasks - ctx._scope_nursery: trio.Nursery = nurse - ctx._scope: trio.CancelScope = nurse.cancel_scope - - # deliver context instance and .started() msg value - # in enter tuple. - yield ctx, first - - # ??TODO??: do we still want to consider this or is - # the `else:` block handling via a `.result()` - # call below enough?? 
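# A minimal usage sketch of the caller/callee API described in the
# docstring above; the `echo()` endpoint and `run_caller()` wrapper are
# hypothetical names used only for illustration:
import tractor

@tractor.context
async def echo(
    ctx: tractor.Context,
    msg: str,
) -> str:
    # unblocks the opener's `open_context()` enter with this first value
    await ctx.started('ready')
    # the return value is collected opener-side via `ctx.result()`
    return msg

async def run_caller(portal: tractor.Portal) -> None:
    async with portal.open_context(
        echo,
        msg='hi',
    ) as (ctx, first):
        assert first == 'ready'
        # wait for the remote task's final return value
        result = await ctx.result()
        assert result == 'hi'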
- # -[ ] pretty sure `.result()` internals do the - # same as our ctxc handler below so it ended up - # being same (repeated?) behaviour, but ideally we - # wouldn't have that duplication either by somehow - # factoring the `.result()` handler impl in a way - # that we can re-use it around the `yield` ^ here - # or vice versa? - # - # NOTE: between the caller exiting and arriving - # here the far end may have sent a ctxc-msg or - # other error, so check for it here immediately - # and maybe raise so as to engage the ctxc - # handling block below! - # - # if re := ctx._remote_error: - # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( - # re, - # # TODO: do we want this to always raise? - # # - means that on self-ctxc, if/when the - # # block is exited before the msg arrives - # # but then the msg during __exit__ - # # calling we may not activate the - # # ctxc-handler block below? should we - # # be? - # # - if there's a remote error that arrives - # # after the child has exited, we won't - # # handle until the `finally:` block - # # where `.result()` is always called, - # # again in which case we handle it - # # differently then in the handler block - # # that would normally engage from THIS - # # block? - # raise_ctxc_from_self_call=True, - # ) - # ctxc_from_callee = maybe_ctxc - - # when in allow_overruns mode there may be - # lingering overflow sender tasks remaining? - if nurse.child_tasks: - # XXX: ensure we are in overrun state - # with ``._allow_overruns=True`` bc otherwise - # there should be no tasks in this nursery! - if ( - not ctx._allow_overruns - or len(nurse.child_tasks) > 1 - ): - raise InternalError( - 'Context has sub-tasks but is ' - 'not in `allow_overruns=True` mode!?' - ) - - # ensure we cancel all overflow sender - # tasks started in the nursery when - # `._allow_overruns == True`. - # - # NOTE: this means `._scope.cancelled_caught` - # will prolly be set! not sure if that's - # non-ideal or not ??? - ctx._scope.cancel() - - # XXX NOTE XXX: maybe shield against - # self-context-cancellation (which raises a local - # `ContextCancelled`) when requested (via - # `Context.cancel()`) by the same task (tree) which entered - # THIS `.open_context()`. - # - # NOTE: There are 2 operating cases for a "graceful cancel" - # of a `Context`. In both cases any `ContextCancelled` - # raised in this scope-block came from a transport msg - # relayed from some remote-actor-task which our runtime set - # as to `Context._remote_error` - # - # the CASES: - # - # - if that context IS THE SAME ONE that called - # `Context.cancel()`, we want to absorb the error - # silently and let this `.open_context()` block to exit - # without raising, ideally eventually receiving the ctxc - # ack msg thus resulting in `ctx.cancel_acked == True`. - # - # - if it is from some OTHER context (we did NOT call - # `.cancel()`), we want to re-RAISE IT whilst also - # setting our own ctx's "reason for cancel" to be that - # other context's cancellation condition; we set our - # `.canceller: tuple[str, str]` to be same value as - # caught here in a `ContextCancelled.canceller`. - # - # AGAIN to restate the above, there are 2 cases: - # - # 1-some other context opened in this `.open_context()` - # block cancelled due to a self or peer cancellation - # request in which case we DO let the error bubble to the - # opener. 
- # - # 2-THIS "caller" task somewhere invoked `Context.cancel()` - # and received a `ContextCanclled` from the "callee" - # task, in which case we mask the `ContextCancelled` from - # bubbling to this "caller" (much like how `trio.Nursery` - # swallows any `trio.Cancelled` bubbled by a call to - # `Nursery.cancel_scope.cancel()`) - except ContextCancelled as ctxc: - scope_err = ctxc - ctx._local_error: BaseException = scope_err - ctxc_from_callee = ctxc - - # XXX TODO XXX: FIX THIS debug_mode BUGGGG!!! - # using this code and then resuming the REPL will - # cause a SIGINT-ignoring HANG! - # -> prolly due to a stale debug lock entry.. - # -[ ] USE `.stackscope` to demonstrate that (possibly - # documenting it as a definittive example of - # debugging the tractor-runtime itself using it's - # own `.devx.` tooling! - # - # await _debug.pause() - - # CASE 2: context was cancelled by local task calling - # `.cancel()`, we don't raise and the exit block should - # exit silently. - if ( - ctx._cancel_called - and - ctxc is ctx._remote_error - and - ctxc.canceller == self.actor.uid - ): - log.cancel( - f'Context (cid=[{ctx.cid[-6:]}..] cancelled gracefully with:\n' - f'{ctxc}' - ) - # CASE 1: this context was never cancelled via a local - # task (tree) having called `Context.cancel()`, raise - # the error since it was caused by someone else - # -> probably a remote peer! - else: - raise - - # the above `._scope` can be cancelled due to: - # 1. an explicit self cancel via `Context.cancel()` or - # `Actor.cancel()`, - # 2. any "callee"-side remote error, possibly also a cancellation - # request by some peer, - # 3. any "caller" (aka THIS scope's) local error raised in the above `yield` - except ( - # CASE 3: standard local error in this caller/yieldee - Exception, - - # CASES 1 & 2: can manifest as a `ctx._scope_nursery` - # exception-group of, - # - # 1.-`trio.Cancelled`s, since - # `._scope.cancel()` will have been called - # (transitively by the runtime calling - # `._deliver_msg()`) and any `ContextCancelled` - # eventually absorbed and thus absorbed/supressed in - # any `Context._maybe_raise_remote_err()` call. - # - # 2.-`BaseExceptionGroup[ContextCancelled | RemoteActorError]` - # from any error delivered from the "callee" side - # AND a group-exc is only raised if there was > 1 - # tasks started *here* in the "caller" / opener - # block. If any one of those tasks calls - # `.result()` or `MsgStream.receive()` - # `._maybe_raise_remote_err()` will be transitively - # called and the remote error raised causing all - # tasks to be cancelled. - # NOTE: ^ this case always can happen if any - # overrun handler tasks were spawned! - BaseExceptionGroup, - - trio.Cancelled, # NOTE: NOT from inside the ctx._scope - KeyboardInterrupt, - - ) as caller_err: - scope_err = caller_err - ctx._local_error: BaseException = scope_err - - # XXX: ALWAYS request the context to CANCEL ON any ERROR. - # NOTE: `Context.cancel()` is conversely NEVER CALLED in - # the `ContextCancelled` "self cancellation absorbed" case - # handled in the block above ^^^ !! - # await _debug.pause() - log.cancel( - 'Context terminated due to\n\n' - f'.outcome => {ctx.repr_outcome()}\n' - ) - - if debug_mode(): - # async with _debug.acquire_debug_lock(self.actor.uid): - # pass - # TODO: factor ^ into below for non-root cases? - was_acquired: bool = await maybe_wait_for_debugger( - header_msg=( - 'Delaying `ctx.cancel()` until debug lock ' - 'acquired..\n' - ), - ) - if was_acquired: - log.pdb( - 'Acquired debug lock! 
' - 'Calling `ctx.cancel()`!\n' - ) - - - # we don't need to cancel the callee if it already - # told us it's cancelled ;p - if ctxc_from_callee is None: - try: - await ctx.cancel() - except ( - trio.BrokenResourceError, - trio.ClosedResourceError, - ): - log.warning( - 'IPC connection for context is broken?\n' - f'task:{cid}\n' - f'actor:{uid}' - ) - - raise # duh - - # no local scope error, the "clean exit with a result" case. - else: - if ctx.chan.connected(): - log.runtime( - 'Waiting on final context result for\n' - f'peer: {uid}\n' - f'|_{ctx._task}\n' - ) - # XXX NOTE XXX: the below call to - # `Context.result()` will ALWAYS raise - # a `ContextCancelled` (via an embedded call to - # `Context._maybe_raise_remote_err()`) IFF - # a `Context._remote_error` was set by the runtime - # via a call to - # `Context._maybe_cancel_and_set_remote_error()`. - # As per `Context._deliver_msg()`, that error IS - # ALWAYS SET any time "callee" side fails and causes "caller - # side" cancellation via a `ContextCancelled` here. - try: - result_or_err: Exception|Any = await ctx.result() - except BaseException as berr: - # on normal teardown, if we get some error - # raised in `Context.result()` we still want to - # save that error on the ctx's state to - # determine things like `.cancelled_caught` for - # cases where there was remote cancellation but - # this task didn't know until final teardown - # / value collection. - scope_err = berr - ctx._local_error: BaseException = scope_err - raise - - # yes! this worx Bp - # from .devx import _debug - # await _debug.pause() - - # an exception type boxed in a `RemoteActorError` - # is returned (meaning it was obvi not raised) - # that we want to log-report on. - msgdata: str|None = getattr( - result_or_err, - 'msgdata', - None - ) - match (msgdata, result_or_err): - case ( - {'tb_str': tbstr}, - ContextCancelled(), - ): - log.cancel(tbstr) - - case ( - {'tb_str': tbstr}, - RemoteActorError(), - ): - log.exception( - 'Context remotely errored!\n' - f'<= peer: {uid}\n' - f' |_ {nsf}()\n\n' - - f'{tbstr}' - ) - case (None, _): - log.runtime( - 'Context returned final result from callee task:\n' - f'<= peer: {uid}\n' - f' |_ {nsf}()\n\n' - - f'`{result_or_err}`\n' - ) - - finally: - # XXX: (MEGA IMPORTANT) if this is a root opened process we - # wait for any immediate child in debug before popping the - # context from the runtime msg loop otherwise inside - # ``Actor._push_result()`` the msg will be discarded and in - # the case where that msg is global debugger unlock (via - # a "stop" msg for a stream), this can result in a deadlock - # where the root is waiting on the lock to clear but the - # child has already cleared it and clobbered IPC. - await maybe_wait_for_debugger() - - # though it should be impossible for any tasks - # operating *in* this scope to have survived - # we tear down the runtime feeder chan last - # to avoid premature stream clobbers. - if ( - (rxchan := ctx._recv_chan) - - # maybe TODO: yes i know the below check is - # touching `trio` memchan internals..BUT, there are - # only a couple ways to avoid a `trio.Cancelled` - # bubbling from the `.aclose()` call below: - # - # - catch and mask it via the cancel-scope-shielded call - # as we are rn (manual and frowned upon) OR, - # - specially handle the case where `scope_err` is - # one of {`BaseExceptionGroup`, `trio.Cancelled`} - # and then presume that the `.aclose()` call will - # raise a `trio.Cancelled` and just don't call it - # in those cases.. 
- # - # that latter approach is more logic, LOC, and more - # convoluted so for now stick with the first - # psuedo-hack-workaround where we just try to avoid - # the shielded call as much as we can detect from - # the memchan's `._closed` state.. - # - # XXX MOTIVATION XXX-> we generally want to raise - # any underlying actor-runtime/internals error that - # surfaces from a bug in tractor itself so it can - # be easily detected/fixed AND, we also want to - # minimize noisy runtime tracebacks (normally due - # to the cross-actor linked task scope machinery - # teardown) displayed to user-code and instead only - # displaying `ContextCancelled` traces where the - # cause of crash/exit IS due to something in - # user/app code on either end of the context. - and not rxchan._closed - ): - # XXX NOTE XXX: and again as per above, we mask any - # `trio.Cancelled` raised here so as to NOT mask - # out any exception group or legit (remote) ctx - # error that sourced from the remote task or its - # runtime. - # - # NOTE: further, this should be the only place the - # underlying feeder channel is - # once-and-only-CLOSED! - with trio.CancelScope(shield=True): - await ctx._recv_chan.aclose() - - # XXX: we always raise remote errors locally and - # generally speaking mask runtime-machinery related - # multi-`trio.Cancelled`s. As such, any `scope_error` - # which was the underlying cause of this context's exit - # should be stored as the `Context._local_error` and - # used in determining `Context.cancelled_caught: bool`. - if scope_err is not None: - # sanity, tho can remove? - assert ctx._local_error is scope_err - # ctx._local_error: BaseException = scope_err - # etype: Type[BaseException] = type(scope_err) - - # CASE 2 - if ( - ctx._cancel_called - and ctx.cancel_acked - ): - log.cancel( - 'Context cancelled by caller task\n' - f'|_{ctx._task}\n\n' - - f'{repr(scope_err)}\n' - ) - - # TODO: should we add a `._cancel_req_received` - # flag to determine if the callee manually called - # `ctx.cancel()`? - # -[ ] going to need a cid check no? - - # CASE 1 - else: - outcome_str: str = ctx.repr_outcome( - show_error_fields=True, - # type_only=True, - ) - log.cancel( - f'Context terminated due to local scope error:\n\n' - f'{ctx.chan.uid} => {outcome_str}\n' - ) - - # FINALLY, remove the context from runtime tracking and - # exit! - log.runtime( - 'Removing IPC ctx opened with peer\n' - f'{uid}\n' - f'|_{ctx}\n' - ) - self.actor._contexts.pop( - (uid, cid), - None, - ) + # NOTE: impl is found in `._context`` mod to make + # reading/groking the details simpler code-org-wise. This + # method does not have to be used over that `@acm` module func + # directly, it is for conventience and from the original API + # design. + open_context = open_context_from_portal @dataclass @@ -1014,7 +477,7 @@ class LocalPortal: return await func(**kwargs) -@asynccontextmanager +@acm async def open_portal( channel: Channel, -- 2.34.1 From d23d8c1779d3456cfdde24f77ed240ae71349273 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 13 Mar 2024 15:57:15 -0400 Subject: [PATCH 168/378] Start a `._rpc` module Since `._runtime` was getting pretty long (> 2k LOC) and much of the RPC low-level machinery is fairly isolated to a handful of task-funcs, it makes sense to re-org the RPC task scheduling and driving msg loop to its own code space. The move includes: - `process_messages()` which is the main IPC business logic. - `try_ship_error_to_remote()` helper, to box local errors for the wire. 
- `_invoke()`, the core task scheduler entrypoing used in the msg loop. - `_invoke_non_context()`, holds impls for non-`@context` task starts. - `_errors_relayed_via_ipc()` which does all error catch-n-boxing for wire-msg shipment using `try_ship_error_to_remote()` internally. Also inside `._runtime` improve some `Actor` methods docs. --- tractor/_rpc.py | 1118 ++++++++++++++++++++++++++++++++++++++++ tractor/_runtime.py | 1184 ++++--------------------------------------- 2 files changed, 1224 insertions(+), 1078 deletions(-) create mode 100644 tractor/_rpc.py diff --git a/tractor/_rpc.py b/tractor/_rpc.py new file mode 100644 index 00000000..54a60be6 --- /dev/null +++ b/tractor/_rpc.py @@ -0,0 +1,1118 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Remote (task) Procedure Call (scheduling) with SC transitive semantics. + +''' +from __future__ import annotations +from contextlib import ( + asynccontextmanager as acm, +) +from functools import partial +import inspect +from pprint import pformat +from types import ModuleType +from typing import ( + Any, + Callable, + Coroutine, + TYPE_CHECKING, +) +import warnings + +from async_generator import aclosing +from exceptiongroup import BaseExceptionGroup +import trio +from trio import ( + CancelScope, + Nursery, + TaskStatus, +) +# from trio_typing import ( +# TaskStatus, +# ) + +from .msg import NamespacePath +from ._ipc import Channel +from ._context import ( + Context, +) +from ._exceptions import ( + ModuleNotExposed, + is_multi_cancelled, + ContextCancelled, + pack_error, + unpack_error, + TransportClosed, +) +from .devx import ( + # pause, + maybe_wait_for_debugger, + _debug, +) +from . import _state +from .log import get_logger + + +if TYPE_CHECKING: + from ._runtime import Actor + +log = get_logger('tractor') + + +async def _invoke_non_context( + actor: Actor, + cancel_scope: CancelScope, + ctx: Context, + cid: str, + chan: Channel, + func: Callable, + coro: Coroutine, + kwargs: dict[str, Any], + + treat_as_gen: bool, + is_rpc: bool, + + task_status: TaskStatus[ + Context | BaseException + ] = trio.TASK_STATUS_IGNORED, +): + + # TODO: can we unify this with the `context=True` impl below? + if inspect.isasyncgen(coro): + await chan.send({'functype': 'asyncgen', 'cid': cid}) + # XXX: massive gotcha! If the containing scope + # is cancelled and we execute the below line, + # any ``ActorNursery.__aexit__()`` WON'T be + # triggered in the underlying async gen! So we + # have to properly handle the closing (aclosing) + # of the async gen in order to be sure the cancel + # is propagated! + with cancel_scope as cs: + ctx._scope = cs + task_status.started(ctx) + async with aclosing(coro) as agen: + async for item in agen: + # TODO: can we send values back in here? 
+ # it's gonna require a `while True:` and + # some non-blocking way to retrieve new `asend()` + # values from the channel: + # to_send = await chan.recv_nowait() + # if to_send is not None: + # to_yield = await coro.asend(to_send) + await chan.send({'yield': item, 'cid': cid}) + + log.runtime(f"Finished iterating {coro}") + # TODO: we should really support a proper + # `StopAsyncIteration` system here for returning a final + # value if desired + await chan.send({'stop': True, 'cid': cid}) + + # one way @stream func that gets treated like an async gen + # TODO: can we unify this with the `context=True` impl below? + elif treat_as_gen: + await chan.send({'functype': 'asyncgen', 'cid': cid}) + # XXX: the async-func may spawn further tasks which push + # back values like an async-generator would but must + # manualy construct the response dict-packet-responses as + # above + with cancel_scope as cs: + ctx._scope = cs + task_status.started(ctx) + await coro + + if not cs.cancelled_caught: + # task was not cancelled so we can instruct the + # far end async gen to tear down + await chan.send({'stop': True, 'cid': cid}) + else: + # regular async function/method + # XXX: possibly just a scheduled `Actor._cancel_task()` + # from a remote request to cancel some `Context`. + # ------ - ------ + # TODO: ideally we unify this with the above `context=True` + # block such that for any remote invocation ftype, we + # always invoke the far end RPC task scheduling the same + # way: using the linked IPC context machinery. + failed_resp: bool = False + try: + await chan.send({ + 'functype': 'asyncfunc', + 'cid': cid + }) + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ) as ipc_err: + failed_resp = True + if is_rpc: + raise + else: + # TODO: should this be an `.exception()` call? + log.warning( + f'Failed to respond to non-rpc request: {func}\n' + f'{ipc_err}' + ) + + with cancel_scope as cs: + ctx._scope: CancelScope = cs + task_status.started(ctx) + result = await coro + fname: str = func.__name__ + log.runtime( + 'RPC complete:\n' + f'task: {ctx._task}\n' + f'|_cid={ctx.cid}\n' + f'|_{fname}() -> {pformat(result)}\n' + ) + + # NOTE: only send result if we know IPC isn't down + if ( + not failed_resp + and chan.connected() + ): + try: + await chan.send( + {'return': result, + 'cid': cid} + ) + except ( + BrokenPipeError, + trio.BrokenResourceError, + ): + log.warning( + 'Failed to return result:\n' + f'{func}@{actor.uid}\n' + f'remote chan: {chan.uid}' + ) + +@acm +async def _errors_relayed_via_ipc( + actor: Actor, + chan: Channel, + ctx: Context, + is_rpc: bool, + + hide_tb: bool = False, + debug_kbis: bool = False, + task_status: TaskStatus[ + Context | BaseException + ] = trio.TASK_STATUS_IGNORED, + +) -> None: + __tracebackhide__: bool = hide_tb # TODO: use hide_tb here? + try: + yield # run RPC invoke body + + # box and ship RPC errors for wire-transit via + # the task's requesting parent IPC-channel. + except ( + Exception, + BaseExceptionGroup, + KeyboardInterrupt, + ) as err: + + # always hide this frame from debug REPL if the crash + # originated from an rpc task and we DID NOT fail due to + # an IPC transport error! + if ( + is_rpc + and chan.connected() + ): + __tracebackhide__: bool = hide_tb + + if not is_multi_cancelled(err): + + # TODO: maybe we'll want different "levels" of debugging + # eventualy such as ('app', 'supervisory', 'runtime') ? 
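# Reading aid for the conditional that follows (the shorthand `ctxc`/`kbi`
# names are illustrative only): a plain crash always qualifies for
# debugger entry, a `ContextCancelled` qualifies only when this ctx
# requested the cancel AND `._enter_debugger_on_cancel` is set, and
# a `KeyboardInterrupt` qualifies only when `debug_kbis` is set:
#
#   should_debug = (
#       (not ctxc
#        or (ctxc and ctx._cancel_called
#            and ctx._enter_debugger_on_cancel))
#       and
#       (not kbi or (kbi and debug_kbis))
#   )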
+ + # if not isinstance(err, trio.ClosedResourceError) and ( + # if not is_multi_cancelled(err) and ( + + entered_debug: bool = False + if ( + ( + not isinstance(err, ContextCancelled) + or ( + isinstance(err, ContextCancelled) + and ctx._cancel_called + + # if the root blocks the debugger lock request from a child + # we will get a remote-cancelled condition. + and ctx._enter_debugger_on_cancel + ) + ) + and + ( + not isinstance(err, KeyboardInterrupt) + or ( + isinstance(err, KeyboardInterrupt) + and debug_kbis + ) + ) + ): + # await _debug.pause() + # XXX QUESTION XXX: is there any case where we'll + # want to debug IPC disconnects as a default? + # => I can't think of a reason that inspecting this + # type of failure will be useful for respawns or + # recovery logic - the only case is some kind of + # strange bug in our transport layer itself? Going + # to keep this open ended for now. + entered_debug = await _debug._maybe_enter_pm(err) + + if not entered_debug: + log.exception('Actor crashed:\n') + + # always (try to) ship RPC errors back to caller + if is_rpc: + # + # TODO: tests for this scenario: + # - RPC caller closes connection before getting a response + # should **not** crash this actor.. + await try_ship_error_to_remote( + chan, + err, + cid=ctx.cid, + remote_descr='caller', + hide_tb=hide_tb, + ) + + # error is probably from above coro running code *not from + # the target rpc invocation since a scope was never + # allocated around the coroutine await. + if ctx._scope is None: + # we don't ever raise directly here to allow the + # msg-loop-scheduler to continue running for this + # channel. + task_status.started(err) + + # always reraise KBIs so they propagate at the sys-process + # level. + if isinstance(err, KeyboardInterrupt): + raise + + + # RPC task bookeeping + finally: + try: + ctx, func, is_complete = actor._rpc_tasks.pop( + (chan, ctx.cid) + ) + is_complete.set() + + except KeyError: + if is_rpc: + # If we're cancelled before the task returns then the + # cancel scope will not have been inserted yet + log.warning( + 'RPC task likely errored or cancelled before start?' + f'|_{ctx._task}\n' + f' >> {ctx.repr_rpc}\n' + ) + else: + log.cancel( + 'Failed to de-alloc internal runtime cancel task?\n' + f'|_{ctx._task}\n' + f' >> {ctx.repr_rpc}\n' + ) + + finally: + if not actor._rpc_tasks: + log.runtime("All RPC tasks have completed") + actor._ongoing_rpc_tasks.set() + + +_gb_mod: ModuleType|None|False = None + + +async def maybe_import_gb(): + global _gb_mod + if _gb_mod is False: + return + + try: + import greenback + _gb_mod = greenback + await greenback.ensure_portal() + + except ModuleNotFoundError: + log.debug( + '`greenback` is not installed.\n' + 'No sync debug support!\n' + ) + _gb_mod = False + + +async def _invoke( + + actor: Actor, + cid: str, + chan: Channel, + func: Callable, + kwargs: dict[str, Any], + + is_rpc: bool = True, + hide_tb: bool = True, + + task_status: TaskStatus[ + Context | BaseException + ] = trio.TASK_STATUS_IGNORED, +): + ''' + Schedule a `trio` task-as-func and deliver result(s) over + connected IPC channel. + + This is the core "RPC" `trio.Task` scheduling machinery used to start every + remotely invoked function, normally in `Actor._service_n: Nursery`. + + ''' + __tracebackhide__: bool = hide_tb + treat_as_gen: bool = False + + if _state.debug_mode(): + await maybe_import_gb() + + # TODO: possibly a specially formatted traceback + # (not sure what typing is for this..)? 
+ # tb = None + + cancel_scope = CancelScope() + # activated cancel scope ref + cs: CancelScope|None = None + + ctx = actor.get_context( + chan=chan, + cid=cid, + nsf=NamespacePath.from_ref(func), + + # TODO: if we wanted to get cray and support it? + # side='callee', + + # We shouldn't ever need to pass this through right? + # it's up to the soon-to-be called rpc task to + # open the stream with this option. + # allow_overruns=True, + ) + context: bool = False + + # TODO: deprecate this style.. + if getattr(func, '_tractor_stream_function', False): + # handle decorated ``@tractor.stream`` async functions + sig = inspect.signature(func) + params = sig.parameters + + # compat with old api + kwargs['ctx'] = ctx + treat_as_gen = True + + if 'ctx' in params: + warnings.warn( + "`@tractor.stream decorated funcs should now declare " + "a `stream` arg, `ctx` is now designated for use with " + "@tractor.context", + DeprecationWarning, + stacklevel=2, + ) + + elif 'stream' in params: + assert 'stream' in params + kwargs['stream'] = ctx + + + elif getattr(func, '_tractor_context_function', False): + # handle decorated ``@tractor.context`` async function + kwargs['ctx'] = ctx + context = True + + # errors raised inside this block are propgated back to caller + async with _errors_relayed_via_ipc( + actor, + chan, + ctx, + is_rpc, + hide_tb=hide_tb, + task_status=task_status, + ): + if not ( + inspect.isasyncgenfunction(func) or + inspect.iscoroutinefunction(func) + ): + raise TypeError(f'{func} must be an async function!') + + # init coroutine with `kwargs` to immediately catch any + # type-sig errors. + try: + coro = func(**kwargs) + except TypeError: + raise + + # TODO: implement all these cases in terms of the + # `Context` one! + if not context: + await _invoke_non_context( + actor, + cancel_scope, + ctx, + cid, + chan, + func, + coro, + kwargs, + treat_as_gen, + is_rpc, + task_status, + ) + # below is only for `@context` funcs + return + + # our most general case: a remote SC-transitive, + # IPC-linked, cross-actor-task "context" + # ------ - ------ + # TODO: every other "func type" should be implemented from + # a special case of this impl eventually! + # -[ ] streaming funcs should instead of being async-for + # handled directly here wrapped in + # a async-with-open_stream() closure that does the + # normal thing you'd expect a far end streaming context + # to (if written by the app-dev). + # -[ ] one off async funcs can literally just be called + # here and awaited directly, possibly just with a small + # wrapper that calls `Context.started()` and then does + # the `await coro()`? + + # a "context" endpoint type is the most general and + # "least sugary" type of RPC ep with support for + # bi-dir streaming B) + await chan.send({ + 'functype': 'context', + 'cid': cid + }) + + # TODO: should we also use an `.open_context()` equiv + # for this callee side by factoring the impl from + # `Portal.open_context()` into a common helper? + # + # NOTE: there are many different ctx state details + # in a callee side instance according to current impl: + # - `.cancelled_caught` can never be `True`. + # -> the below scope is never exposed to the + # `@context` marked RPC function. + # - `._portal` is never set. + try: + async with trio.open_nursery() as tn: + ctx._scope_nursery = tn + ctx._scope = tn.cancel_scope + task_status.started(ctx) + + # TODO: should would be nice to have our + # `TaskMngr` nursery here! + res: Any = await coro + ctx._result = res + + # deliver final result to caller side. 
+ await chan.send({ + 'return': res, + 'cid': cid + }) + + # NOTE: this happens IFF `ctx._scope.cancel()` is + # called by any of, + # - *this* callee task manually calling `ctx.cancel()`. + # - the runtime calling `ctx._deliver_msg()` which + # itself calls `ctx._maybe_cancel_and_set_remote_error()` + # which cancels the scope presuming the input error + # is not a `.cancel_acked` pleaser. + # - currently a never-should-happen-fallthrough case + # inside ._context._drain_to_final_msg()`.. + # # TODO: remove this ^ right? + if ctx._scope.cancelled_caught: + our_uid: tuple = actor.uid + + # first check for and raise any remote error + # before raising any context cancelled case + # so that real remote errors don't get masked as + # ``ContextCancelled``s. + if re := ctx._remote_error: + ctx._maybe_raise_remote_err(re) + + cs: CancelScope = ctx._scope + + if cs.cancel_called: + + canceller: tuple = ctx.canceller + msg: str = ( + 'actor was cancelled by ' + ) + + # NOTE / TODO: if we end up having + # ``Actor._cancel_task()`` call + # ``Context.cancel()`` directly, we're going to + # need to change this logic branch since it + # will always enter.. + if ctx._cancel_called: + # TODO: test for this!!!!! + canceller: tuple = our_uid + msg += 'itself ' + + # if the channel which spawned the ctx is the + # one that cancelled it then we report that, vs. + # it being some other random actor that for ex. + # some actor who calls `Portal.cancel_actor()` + # and by side-effect cancels this ctx. + elif canceller == ctx.chan.uid: + msg += 'its caller' + + else: + msg += 'a remote peer' + + div_chars: str = '------ - ------' + div_offset: int = ( + round(len(msg)/2)+1 + + + round(len(div_chars)/2)+1 + ) + div_str: str = ( + '\n' + + + ' '*div_offset + + + f'{div_chars}\n' + ) + msg += ( + div_str + + f'<= canceller: {canceller}\n' + f'=> uid: {our_uid}\n' + f' |_{ctx._task}()' + + # TODO: instead just show the + # ctx.__str__() here? + # -[ ] textwrap.indent() it correctly! + # -[ ] BUT we need to wait until + # the state is filled out before emitting + # this msg right ow its kinda empty? bleh.. + # + # f' |_{ctx}' + ) + + # task-contex was either cancelled by request using + # ``Portal.cancel_actor()`` or ``Context.cancel()`` + # on the far end, or it was cancelled by the local + # (callee) task, so relay this cancel signal to the + # other side. + ctxc = ContextCancelled( + msg, + suberror_type=trio.Cancelled, + canceller=canceller, + ) + # assign local error so that the `.outcome` + # resolves to an error for both reporting and + # state checks. + ctx._local_error = ctxc + raise ctxc + + # XXX: do we ever trigger this block any more? + except ( + BaseExceptionGroup, + trio.Cancelled, + BaseException, + + ) as scope_error: + + # always set this (callee) side's exception as the + # local error on the context + ctx._local_error: BaseException = scope_error + + # if a remote error was set then likely the + # exception group was raised due to that, so + # and we instead raise that error immediately! + ctx.maybe_raise() + + # maybe TODO: pack in come kinda + # `trio.Cancelled.__traceback__` here so they can be + # unwrapped and displayed on the caller side? no se.. + raise + + # `@context` entrypoint task bookeeping. + # i.e. 
only pop the context tracking if used ;) + finally: + assert chan.uid + + # don't pop the local context until we know the + # associated child isn't in debug any more + await maybe_wait_for_debugger() + ctx: Context = actor._contexts.pop(( + chan.uid, + cid, + # ctx.side, + )) + + merr: Exception|None = ctx.maybe_error + + ( + res_type_str, + res_str, + ) = ( + ('error', f'{type(merr)}',) + if merr + else ( + 'result', + f'`{repr(ctx.outcome)}`', + ) + ) + log.cancel( + f'IPC context terminated with a final {res_type_str}\n\n' + f'{ctx}\n' + ) + + +async def try_ship_error_to_remote( + channel: Channel, + err: Exception|BaseExceptionGroup, + + cid: str|None = None, + remote_descr: str = 'parent', + hide_tb: bool = True, + +) -> None: + ''' + Box, pack and encode a local runtime(-internal) exception for + an IPC channel `.send()` with transport/network failures and + local cancellation ignored but logged as critical(ly bad). + + ''' + __tracebackhide__: bool = hide_tb + with CancelScope(shield=True): + try: + # NOTE: normally only used for internal runtime errors + # so ship to peer actor without a cid. + msg: dict = pack_error( + err, + cid=cid, + + # TODO: special tb fmting for ctxc cases? + # tb=tb, + ) + # NOTE: the src actor should always be packed into the + # error.. but how should we verify this? + # actor: Actor = _state.current_actor() + # assert err_msg['src_actor_uid'] + # if not err_msg['error'].get('src_actor_uid'): + # import pdbp; pdbp.set_trace() + await channel.send(msg) + + # XXX NOTE XXX in SC terms this is one of the worst things + # that can happen and provides for a 2-general's dilemma.. + except ( + trio.ClosedResourceError, + trio.BrokenResourceError, + BrokenPipeError, + ): + err_msg: dict = msg['error']['tb_str'] + log.critical( + 'IPC transport failure -> ' + f'failed to ship error to {remote_descr}!\n\n' + f'X=> {channel.uid}\n\n' + f'{err_msg}\n' + ) + + +async def process_messages( + actor: Actor, + chan: Channel, + shield: bool = False, + task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED, + +) -> bool: + ''' + This is the low-level, per-IPC-channel, RPC task scheduler loop. + + Receive (multiplexed) per-`Channel` RPC requests as msgs from + remote processes; schedule target async funcs as local + `trio.Task`s inside the `Actor._service_n: Nursery`. + + Depending on msg type, non-`cmd` (task spawning/starting) + request payloads (eg. `started`, `yield`, `return`, `error`) + are delivered to locally running, linked-via-`Context`, tasks + with any (boxed) errors and/or final results shipped back to + the remote side. + + All higher level inter-actor comms ops are delivered in some + form by the msg processing here, including: + + - lookup and invocation of any (async) funcs-as-tasks requested + by remote actors presuming the local actor has enabled their + containing module. + + - IPC-session oriented `Context` and `MsgStream` msg payload + delivery such as `started`, `yield` and `return` msgs. + + - cancellation handling for both `Context.cancel()` (which + translate to `Actor._cancel_task()` RPCs server side) + and `Actor.cancel()` process-wide-runtime-shutdown requests + (as utilized inside `Portal.cancel_actor()` ). + + + ''' + # TODO: once `trio` get's an "obvious way" for req/resp we + # should use it? 
+ # https://github.com/python-trio/trio/issues/467 + log.runtime( + 'Entering IPC msg loop:\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n' + ) + nursery_cancelled_before_task: bool = False + msg: dict | None = None + try: + # NOTE: this internal scope allows for keeping this + # message loop running despite the current task having + # been cancelled (eg. `open_portal()` may call this method + # from a locally spawned task) and recieve this scope + # using ``scope = Nursery.start()`` + with CancelScope(shield=shield) as loop_cs: + task_status.started(loop_cs) + async for msg in chan: + + # dedicated loop terminate sentinel + if msg is None: + + tasks: dict[ + tuple[Channel, str], + tuple[Context, Callable, trio.Event] + ] = actor._rpc_tasks.copy() + log.cancel( + f'Peer IPC channel terminated via `None` setinel msg?\n' + f'=> Cancelling all {len(tasks)} local RPC tasks..\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n' + ) + for (channel, cid) in tasks: + if channel is chan: + await actor._cancel_task( + cid, + channel, + requesting_uid=channel.uid, + + ipc_msg=msg, + ) + break + + log.transport( # type: ignore + f'<= IPC msg from peer: {chan.uid}\n\n' + + # TODO: conditionally avoid fmting depending + # on log level (for perf)? + # => specifically `pformat()` sub-call..? + f'{pformat(msg)}\n' + ) + + cid = msg.get('cid') + if cid: + # deliver response to local caller/waiter + # via its per-remote-context memory channel. + await actor._push_result( + chan, + cid, + msg, + ) + + log.runtime( + 'Waiting on next IPC msg from\n' + f'peer: {chan.uid}:\n' + f'|_{chan}\n' + + # f'last msg: {msg}\n' + ) + continue + + # process a 'cmd' request-msg upack + # TODO: impl with native `msgspec.Struct` support !! + # -[ ] implement with ``match:`` syntax? + # -[ ] discard un-authed msgs as per, + # + try: + ( + ns, + funcname, + kwargs, + actorid, + cid, + ) = msg['cmd'] + + except KeyError: + # This is the non-rpc error case, that is, an + # error **not** raised inside a call to ``_invoke()`` + # (i.e. no cid was provided in the msg - see above). + # Push this error to all local channel consumers + # (normally portals) by marking the channel as errored + assert chan.uid + exc = unpack_error(msg, chan=chan) + chan._exc = exc + raise exc + + log.runtime( + 'Handling RPC cmd from\n' + f'peer: {actorid}\n' + '\n' + f'=> {ns}.{funcname}({kwargs})\n' + ) + if ns == 'self': + if funcname == 'cancel': + func: Callable = actor.cancel + kwargs |= { + 'req_chan': chan, + } + + # don't start entire actor runtime cancellation + # if this actor is currently in debug mode! + pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete + if pdb_complete: + await pdb_complete.wait() + + # Either of `Actor.cancel()`/`.cancel_soon()` + # was called, so terminate this IPC msg + # loop, exit back out into `async_main()`, + # and immediately start the core runtime + # machinery shutdown! + with CancelScope(shield=True): + await _invoke( + actor, + cid, + chan, + func, + kwargs, + is_rpc=False, + ) + + log.runtime( + 'Cancelling IPC transport msg-loop with peer:\n' + f'|_{chan}\n' + ) + loop_cs.cancel() + break + + if funcname == '_cancel_task': + func: Callable = actor._cancel_task + + # we immediately start the runtime machinery + # shutdown + # with CancelScope(shield=True): + target_cid: str = kwargs['cid'] + kwargs |= { + # NOTE: ONLY the rpc-task-owning + # parent IPC channel should be able to + # cancel it! + 'parent_chan': chan, + 'requesting_uid': chan.uid, + 'ipc_msg': msg, + } + # TODO: remove? already have emit in meth. 
+ # log.runtime( + # f'Rx RPC task cancel request\n' + # f'<= canceller: {chan.uid}\n' + # f' |_{chan}\n\n' + # f'=> {actor}\n' + # f' |_cid: {target_cid}\n' + # ) + try: + await _invoke( + actor, + cid, + chan, + func, + kwargs, + is_rpc=False, + ) + except BaseException: + log.exception( + 'Failed to cancel task?\n' + f'<= canceller: {chan.uid}\n' + f' |_{chan}\n\n' + f'=> {actor}\n' + f' |_cid: {target_cid}\n' + ) + continue + else: + # normally registry methods, eg. + # ``.register_actor()`` etc. + func: Callable = getattr(actor, funcname) + + else: + # complain to client about restricted modules + try: + func = actor._get_rpc_func(ns, funcname) + except ( + ModuleNotExposed, + AttributeError, + ) as err: + err_msg: dict[str, dict] = pack_error( + err, + cid=cid, + ) + await chan.send(err_msg) + continue + + # schedule a task for the requested RPC function + # in the actor's main "service nursery". + # TODO: possibly a service-tn per IPC channel for + # supervision isolation? would avoid having to + # manage RPC tasks individually in `._rpc_tasks` + # table? + log.runtime( + f'Spawning task for RPC request\n' + f'<= caller: {chan.uid}\n' + f' |_{chan}\n\n' + # TODO: maddr style repr? + # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' + # f'cid="{cid[-16:]} .."\n\n' + + f'=> {actor}\n' + f' |_cid: {cid}\n' + f' |>> {func}()\n' + ) + assert actor._service_n # wait why? do it at top? + try: + ctx: Context = await actor._service_n.start( + partial( + _invoke, + actor, + cid, + chan, + func, + kwargs, + ), + name=funcname, + ) + + except ( + RuntimeError, + BaseExceptionGroup, + ): + # avoid reporting a benign race condition + # during actor runtime teardown. + nursery_cancelled_before_task: bool = True + break + + # in the lone case where a ``Context`` is not + # delivered, it's likely going to be a locally + # scoped exception from ``_invoke()`` itself. + if isinstance(err := ctx, Exception): + log.warning( + 'Task for RPC failed?' + f'|_ {func}()\n\n' + + f'{err}' + ) + continue + + else: + # mark that we have ongoing rpc tasks + actor._ongoing_rpc_tasks = trio.Event() + + # store cancel scope such that the rpc task can be + # cancelled gracefully if requested + actor._rpc_tasks[(chan, cid)] = ( + ctx, + func, + trio.Event(), + ) + + log.runtime( + 'Waiting on next IPC msg from\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n' + ) + + # end of async for, channel disconnect vis + # ``trio.EndOfChannel`` + log.runtime( + f"{chan} for {chan.uid} disconnected, cancelling tasks" + ) + await actor.cancel_rpc_tasks( + req_uid=actor.uid, + # a "self cancel" in terms of the lifetime of the + # IPC connection which is presumed to be the + # source of any requests for spawned tasks. + parent_chan=chan, + ) + + except ( + TransportClosed, + ): + # channels "breaking" (for TCP streams by EOF or 104 + # connection-reset) is ok since we don't have a teardown + # handshake for them (yet) and instead we simply bail out of + # the message loop and expect the teardown sequence to clean + # up. + # TODO: don't show this msg if it's an emphemeral + # discovery ep call? 
+ log.runtime( + f'channel closed abruptly with\n' + f'peer: {chan.uid}\n' + f'|_{chan.raddr}\n' + ) + + # transport **was** disconnected + return True + + except ( + Exception, + BaseExceptionGroup, + ) as err: + + if nursery_cancelled_before_task: + sn: Nursery = actor._service_n + assert sn and sn.cancel_scope.cancel_called # sanity + log.cancel( + f'Service nursery cancelled before it handled {funcname}' + ) + else: + # ship any "internal" exception (i.e. one from internal + # machinery not from an rpc task) to parent + match err: + case ContextCancelled(): + log.cancel( + f'Actor: {actor.uid} was context-cancelled with,\n' + f'str(err)' + ) + case _: + log.exception("Actor errored:") + + if actor._parent_chan: + await try_ship_error_to_remote( + actor._parent_chan, + err, + ) + + # if this is the `MainProcess` we expect the error broadcasting + # above to trigger an error at consuming portal "checkpoints" + raise + + finally: + # msg debugging for when he machinery is brokey + log.runtime( + 'Exiting IPC msg loop with\n' + f'peer: {chan.uid}\n' + f'|_{chan}\n\n' + 'final msg:\n' + f'{pformat(msg)}\n' + ) + + # transport **was not** disconnected + return False diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 587d636c..838c648c 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -14,31 +14,43 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -""" -The fundamental core machinery implementing every "actor" including -the process-local (python-interpreter global) `Actor` state-type -primitive(s), RPC-in-task scheduling, and IPC connectivity and -low-level transport msg handling. +''' +The fundamental core machinery implementing every "actor" +including the process-local, or "python-interpreter (aka global) +singleton) `Actor` primitive(s) and its internal `trio` machinery +implementing the low level runtime system supporting the +discovery, communication, spawning, supervision and cancellation +of other actors in a hierarchincal process tree. -""" +The runtime's main entry point: `async_main()` opens the top level +supervision and service `trio.Nursery`s which manage the tasks responsible +for running all lower level spawning, supervision and msging layers: + +- lowlevel transport-protocol init and persistent connectivity on + top of `._ipc` primitives; the transport layer. +- bootstrapping of connection/runtime config from the spawning + parent (actor). +- starting and supervising IPC-channel msg processing loops around + tranport connections from parent/peer actors in order to deliver + SC-transitive RPC via scheduling of `trio` tasks. +- registration of newly spawned actors with the discovery sys. 
+ +''' from __future__ import annotations from contextlib import ( ExitStack, - asynccontextmanager as acm, ) from collections import defaultdict from functools import partial from itertools import chain import importlib import importlib.util -import inspect from pprint import pformat import signal import sys from typing import ( Any, Callable, - Coroutine, TYPE_CHECKING, ) import uuid @@ -46,8 +58,6 @@ from types import ModuleType import os import warnings -from async_generator import aclosing -from exceptiongroup import BaseExceptionGroup import trio from trio import ( CancelScope, @@ -65,10 +75,8 @@ from ._context import ( ) from .log import get_logger from ._exceptions import ( - pack_error, unpack_error, ModuleNotExposed, - is_multi_cancelled, ContextCancelled, TransportClosed, ) @@ -81,6 +89,10 @@ from ._discovery import get_registry from ._portal import Portal from . import _state from . import _mp_fixup_main +from ._rpc import ( + process_messages, + try_ship_error_to_remote, +) if TYPE_CHECKING: @@ -89,668 +101,11 @@ if TYPE_CHECKING: log = get_logger('tractor') -_gb_mod: ModuleType|None|False = None - - -async def maybe_import_gb(): - global _gb_mod - if _gb_mod is False: - return - - try: - import greenback - _gb_mod = greenback - await greenback.ensure_portal() - - except ModuleNotFoundError: - log.debug( - '`greenback` is not installed.\n' - 'No sync debug support!\n' - ) - _gb_mod = False - - -async def _invoke_non_context( - actor: Actor, - cancel_scope: CancelScope, - ctx: Context, - cid: str, - chan: Channel, - func: Callable, - coro: Coroutine, - kwargs: dict[str, Any], - - treat_as_gen: bool, - is_rpc: bool, - - task_status: TaskStatus[ - Context | BaseException - ] = trio.TASK_STATUS_IGNORED, -): - - # TODO: can we unify this with the `context=True` impl below? - if inspect.isasyncgen(coro): - await chan.send({'functype': 'asyncgen', 'cid': cid}) - # XXX: massive gotcha! If the containing scope - # is cancelled and we execute the below line, - # any ``ActorNursery.__aexit__()`` WON'T be - # triggered in the underlying async gen! So we - # have to properly handle the closing (aclosing) - # of the async gen in order to be sure the cancel - # is propagated! - with cancel_scope as cs: - ctx._scope = cs - task_status.started(ctx) - async with aclosing(coro) as agen: - async for item in agen: - # TODO: can we send values back in here? - # it's gonna require a `while True:` and - # some non-blocking way to retrieve new `asend()` - # values from the channel: - # to_send = await chan.recv_nowait() - # if to_send is not None: - # to_yield = await coro.asend(to_send) - await chan.send({'yield': item, 'cid': cid}) - - log.runtime(f"Finished iterating {coro}") - # TODO: we should really support a proper - # `StopAsyncIteration` system here for returning a final - # value if desired - await chan.send({'stop': True, 'cid': cid}) - - # one way @stream func that gets treated like an async gen - # TODO: can we unify this with the `context=True` impl below? 
- elif treat_as_gen: - await chan.send({'functype': 'asyncgen', 'cid': cid}) - # XXX: the async-func may spawn further tasks which push - # back values like an async-generator would but must - # manualy construct the response dict-packet-responses as - # above - with cancel_scope as cs: - ctx._scope = cs - task_status.started(ctx) - await coro - - if not cs.cancelled_caught: - # task was not cancelled so we can instruct the - # far end async gen to tear down - await chan.send({'stop': True, 'cid': cid}) - else: - # regular async function/method - # XXX: possibly just a scheduled `Actor._cancel_task()` - # from a remote request to cancel some `Context`. - # ------ - ------ - # TODO: ideally we unify this with the above `context=True` - # block such that for any remote invocation ftype, we - # always invoke the far end RPC task scheduling the same - # way: using the linked IPC context machinery. - failed_resp: bool = False - try: - await chan.send({ - 'functype': 'asyncfunc', - 'cid': cid - }) - except ( - trio.ClosedResourceError, - trio.BrokenResourceError, - BrokenPipeError, - ) as ipc_err: - failed_resp = True - if is_rpc: - raise - else: - # TODO: should this be an `.exception()` call? - log.warning( - f'Failed to respond to non-rpc request: {func}\n' - f'{ipc_err}' - ) - - with cancel_scope as cs: - ctx._scope: CancelScope = cs - task_status.started(ctx) - result = await coro - fname: str = func.__name__ - log.runtime( - 'RPC complete:\n' - f'task: {ctx._task}\n' - f'|_cid={ctx.cid}\n' - f'|_{fname}() -> {pformat(result)}\n' - ) - - # NOTE: only send result if we know IPC isn't down - if ( - not failed_resp - and chan.connected() - ): - try: - await chan.send( - {'return': result, - 'cid': cid} - ) - except ( - BrokenPipeError, - trio.BrokenResourceError, - ): - log.warning( - 'Failed to return result:\n' - f'{func}@{actor.uid}\n' - f'remote chan: {chan.uid}' - ) - -@acm -async def _errors_relayed_via_ipc( - actor: Actor, - chan: Channel, - ctx: Context, - is_rpc: bool, - - hide_tb: bool = False, - debug_kbis: bool = False, - task_status: TaskStatus[ - Context | BaseException - ] = trio.TASK_STATUS_IGNORED, - -) -> None: - __tracebackhide__: bool = hide_tb # TODO: use hide_tb here? - try: - yield # run RPC invoke body - - # box and ship RPC errors for wire-transit via - # the task's requesting parent IPC-channel. - except ( - Exception, - BaseExceptionGroup, - KeyboardInterrupt, - ) as err: - - # always hide this frame from debug REPL if the crash - # originated from an rpc task and we DID NOT fail due to - # an IPC transport error! - if ( - is_rpc - and chan.connected() - ): - __tracebackhide__: bool = hide_tb - - if not is_multi_cancelled(err): - - # TODO: maybe we'll want different "levels" of debugging - # eventualy such as ('app', 'supervisory', 'runtime') ? - - # if not isinstance(err, trio.ClosedResourceError) and ( - # if not is_multi_cancelled(err) and ( - - entered_debug: bool = False - if ( - ( - not isinstance(err, ContextCancelled) - or ( - isinstance(err, ContextCancelled) - and ctx._cancel_called - - # if the root blocks the debugger lock request from a child - # we will get a remote-cancelled condition. - and ctx._enter_debugger_on_cancel - ) - ) - and - ( - not isinstance(err, KeyboardInterrupt) - or ( - isinstance(err, KeyboardInterrupt) - and debug_kbis - ) - ) - ): - # await _debug.pause() - # XXX QUESTION XXX: is there any case where we'll - # want to debug IPC disconnects as a default? 
- # => I can't think of a reason that inspecting this - # type of failure will be useful for respawns or - # recovery logic - the only case is some kind of - # strange bug in our transport layer itself? Going - # to keep this open ended for now. - entered_debug = await _debug._maybe_enter_pm(err) - - if not entered_debug: - log.exception('Actor crashed:\n') - - # always (try to) ship RPC errors back to caller - if is_rpc: - # - # TODO: tests for this scenario: - # - RPC caller closes connection before getting a response - # should **not** crash this actor.. - await try_ship_error_to_remote( - chan, - err, - cid=ctx.cid, - remote_descr='caller', - hide_tb=hide_tb, - ) - - # error is probably from above coro running code *not from - # the target rpc invocation since a scope was never - # allocated around the coroutine await. - if ctx._scope is None: - # we don't ever raise directly here to allow the - # msg-loop-scheduler to continue running for this - # channel. - task_status.started(err) - - # always reraise KBIs so they propagate at the sys-process - # level. - if isinstance(err, KeyboardInterrupt): - raise - - - # RPC task bookeeping - finally: - try: - ctx, func, is_complete = actor._rpc_tasks.pop( - (chan, ctx.cid) - ) - is_complete.set() - - except KeyError: - if is_rpc: - # If we're cancelled before the task returns then the - # cancel scope will not have been inserted yet - log.warning( - 'RPC task likely errored or cancelled before start?' - f'|_{ctx._task}\n' - f' >> {ctx.repr_rpc}\n' - ) - else: - log.cancel( - 'Failed to de-alloc internal runtime cancel task?\n' - f'|_{ctx._task}\n' - f' >> {ctx.repr_rpc}\n' - ) - - finally: - if not actor._rpc_tasks: - log.runtime("All RPC tasks have completed") - actor._ongoing_rpc_tasks.set() - - -async def _invoke( - - actor: Actor, - cid: str, - chan: Channel, - func: Callable, - kwargs: dict[str, Any], - - is_rpc: bool = True, - hide_tb: bool = True, - - task_status: TaskStatus[ - Context | BaseException - ] = trio.TASK_STATUS_IGNORED, -): - ''' - Schedule a `trio` task-as-func and deliver result(s) over - connected IPC channel. - - This is the core "RPC" `trio.Task` scheduling machinery used to start every - remotely invoked function, normally in `Actor._service_n: Nursery`. - - ''' - __tracebackhide__: bool = hide_tb - treat_as_gen: bool = False - - if _state.debug_mode(): - await maybe_import_gb() - - # TODO: possibly a specially formatted traceback - # (not sure what typing is for this..)? - # tb = None - - cancel_scope = CancelScope() - # activated cancel scope ref - cs: CancelScope|None = None - - ctx = actor.get_context( - chan=chan, - cid=cid, - nsf=NamespacePath.from_ref(func), - - # TODO: if we wanted to get cray and support it? - # side='callee', - - # We shouldn't ever need to pass this through right? - # it's up to the soon-to-be called rpc task to - # open the stream with this option. - # allow_overruns=True, - ) - context: bool = False - - # TODO: deprecate this style.. 
- if getattr(func, '_tractor_stream_function', False): - # handle decorated ``@tractor.stream`` async functions - sig = inspect.signature(func) - params = sig.parameters - - # compat with old api - kwargs['ctx'] = ctx - treat_as_gen = True - - if 'ctx' in params: - warnings.warn( - "`@tractor.stream decorated funcs should now declare " - "a `stream` arg, `ctx` is now designated for use with " - "@tractor.context", - DeprecationWarning, - stacklevel=2, - ) - - elif 'stream' in params: - assert 'stream' in params - kwargs['stream'] = ctx - - - elif getattr(func, '_tractor_context_function', False): - # handle decorated ``@tractor.context`` async function - kwargs['ctx'] = ctx - context = True - - # errors raised inside this block are propgated back to caller - async with _errors_relayed_via_ipc( - actor, - chan, - ctx, - is_rpc, - hide_tb=hide_tb, - task_status=task_status, - ): - if not ( - inspect.isasyncgenfunction(func) or - inspect.iscoroutinefunction(func) - ): - raise TypeError(f'{func} must be an async function!') - - # init coroutine with `kwargs` to immediately catch any - # type-sig errors. - try: - coro = func(**kwargs) - except TypeError: - raise - - # TODO: implement all these cases in terms of the - # `Context` one! - if not context: - await _invoke_non_context( - actor, - cancel_scope, - ctx, - cid, - chan, - func, - coro, - kwargs, - treat_as_gen, - is_rpc, - task_status, - ) - # below is only for `@context` funcs - return - - # our most general case: a remote SC-transitive, - # IPC-linked, cross-actor-task "context" - # ------ - ------ - # TODO: every other "func type" should be implemented from - # a special case of this impl eventually! - # -[ ] streaming funcs should instead of being async-for - # handled directly here wrapped in - # a async-with-open_stream() closure that does the - # normal thing you'd expect a far end streaming context - # to (if written by the app-dev). - # -[ ] one off async funcs can literally just be called - # here and awaited directly, possibly just with a small - # wrapper that calls `Context.started()` and then does - # the `await coro()`? - - # a "context" endpoint type is the most general and - # "least sugary" type of RPC ep with support for - # bi-dir streaming B) - await chan.send({ - 'functype': 'context', - 'cid': cid - }) - - # TODO: should we also use an `.open_context()` equiv - # for this callee side by factoring the impl from - # `Portal.open_context()` into a common helper? - # - # NOTE: there are many different ctx state details - # in a callee side instance according to current impl: - # - `.cancelled_caught` can never be `True`. - # -> the below scope is never exposed to the - # `@context` marked RPC function. - # - `._portal` is never set. - try: - async with trio.open_nursery() as tn: - ctx._scope_nursery = tn - ctx._scope = tn.cancel_scope - task_status.started(ctx) - - # TODO: should would be nice to have our - # `TaskMngr` nursery here! - res: Any = await coro - ctx._result = res - - # deliver final result to caller side. - await chan.send({ - 'return': res, - 'cid': cid - }) - - # NOTE: this happens IFF `ctx._scope.cancel()` is - # called by any of, - # - *this* callee task manually calling `ctx.cancel()`. - # - the runtime calling `ctx._deliver_msg()` which - # itself calls `ctx._maybe_cancel_and_set_remote_error()` - # which cancels the scope presuming the input error - # is not a `.cancel_acked` pleaser. - # - currently a never-should-happen-fallthrough case - # inside ._context._drain_to_final_msg()`.. 
- # # TODO: remove this ^ right? - if ctx._scope.cancelled_caught: - our_uid: tuple = actor.uid - - # first check for and raise any remote error - # before raising any context cancelled case - # so that real remote errors don't get masked as - # ``ContextCancelled``s. - if re := ctx._remote_error: - ctx._maybe_raise_remote_err(re) - - cs: CancelScope = ctx._scope - - if cs.cancel_called: - - canceller: tuple = ctx.canceller - msg: str = ( - 'actor was cancelled by ' - ) - - # NOTE / TODO: if we end up having - # ``Actor._cancel_task()`` call - # ``Context.cancel()`` directly, we're going to - # need to change this logic branch since it - # will always enter.. - if ctx._cancel_called: - # TODO: test for this!!!!! - canceller: tuple = our_uid - msg += 'itself ' - - # if the channel which spawned the ctx is the - # one that cancelled it then we report that, vs. - # it being some other random actor that for ex. - # some actor who calls `Portal.cancel_actor()` - # and by side-effect cancels this ctx. - elif canceller == ctx.chan.uid: - msg += 'its caller' - - else: - msg += 'a remote peer' - - div_chars: str = '------ - ------' - div_offset: int = ( - round(len(msg)/2)+1 - + - round(len(div_chars)/2)+1 - ) - div_str: str = ( - '\n' - + - ' '*div_offset - + - f'{div_chars}\n' - ) - msg += ( - div_str + - f'<= canceller: {canceller}\n' - f'=> uid: {our_uid}\n' - f' |_{ctx._task}()' - - # TODO: instead just show the - # ctx.__str__() here? - # -[ ] textwrap.indent() it correctly! - # -[ ] BUT we need to wait until - # the state is filled out before emitting - # this msg right ow its kinda empty? bleh.. - # - # f' |_{ctx}' - ) - - # task-contex was either cancelled by request using - # ``Portal.cancel_actor()`` or ``Context.cancel()`` - # on the far end, or it was cancelled by the local - # (callee) task, so relay this cancel signal to the - # other side. - ctxc = ContextCancelled( - msg, - suberror_type=trio.Cancelled, - canceller=canceller, - ) - # assign local error so that the `.outcome` - # resolves to an error for both reporting and - # state checks. - ctx._local_error = ctxc - raise ctxc - - # XXX: do we ever trigger this block any more? - except ( - BaseExceptionGroup, - trio.Cancelled, - BaseException, - - ) as scope_error: - - # always set this (callee) side's exception as the - # local error on the context - ctx._local_error: BaseException = scope_error - - # if a remote error was set then likely the - # exception group was raised due to that, so - # and we instead raise that error immediately! - ctx.maybe_raise() - - # maybe TODO: pack in come kinda - # `trio.Cancelled.__traceback__` here so they can be - # unwrapped and displayed on the caller side? no se.. - raise - - # `@context` entrypoint task bookeeping. - # i.e. 
only pop the context tracking if used ;) - finally: - assert chan.uid - - # don't pop the local context until we know the - # associated child isn't in debug any more - await maybe_wait_for_debugger() - ctx: Context = actor._contexts.pop(( - chan.uid, - cid, - # ctx.side, - )) - - merr: Exception|None = ctx.maybe_error - - ( - res_type_str, - res_str, - ) = ( - ('error', f'{type(merr)}',) - if merr - else ( - 'result', - f'`{repr(ctx.outcome)}`', - ) - ) - log.cancel( - f'IPC context terminated with a final {res_type_str}\n\n' - f'{ctx}\n' - ) - def _get_mod_abspath(module: ModuleType) -> str: return os.path.abspath(module.__file__) -async def try_ship_error_to_remote( - channel: Channel, - err: Exception|BaseExceptionGroup, - - cid: str|None = None, - remote_descr: str = 'parent', - hide_tb: bool = True, - -) -> None: - ''' - Box, pack and encode a local runtime(-internal) exception for - an IPC channel `.send()` with transport/network failures and - local cancellation ignored but logged as critical(ly bad). - - ''' - __tracebackhide__: bool = hide_tb - with CancelScope(shield=True): - try: - # NOTE: normally only used for internal runtime errors - # so ship to peer actor without a cid. - msg: dict = pack_error( - err, - cid=cid, - - # TODO: special tb fmting for ctxc cases? - # tb=tb, - ) - # NOTE: the src actor should always be packed into the - # error.. but how should we verify this? - # actor: Actor = _state.current_actor() - # assert err_msg['src_actor_uid'] - # if not err_msg['error'].get('src_actor_uid'): - # import pdbp; pdbp.set_trace() - await channel.send(msg) - - # XXX NOTE XXX in SC terms this is one of the worst things - # that can happen and provides for a 2-general's dilemma.. - except ( - trio.ClosedResourceError, - trio.BrokenResourceError, - BrokenPipeError, - ): - err_msg: dict = msg['error']['tb_str'] - log.critical( - 'IPC transport failure -> ' - f'failed to ship error to {remote_descr}!\n\n' - f'X=> {channel.uid}\n\n' - f'{err_msg}\n' - ) - - class Actor: ''' The fundamental "runtime" concurrency primitive. @@ -946,8 +301,8 @@ class Actor: self, uid: tuple[str, str] ) -> tuple[trio.Event, Channel]: ''' - Wait for a connection back from a spawned actor with a given - ``uid``. + Wait for a connection back from a spawned actor with a `uid` + using a `trio.Event` for sync. ''' log.runtime(f"Waiting for peer {uid} to connect") @@ -961,11 +316,11 @@ class Actor: debug_mode: bool = False, ) -> None: ''' - Load allowed RPC modules locally (after fork). + Load enabled RPC py-modules locally (after process fork/spawn). Since this actor may be spawned on a different machine from the original nursery we need to try and load the local module - code (if it exists). + code (presuming it exists). ''' try: @@ -997,6 +352,11 @@ class Actor: raise def _get_rpc_func(self, ns, funcname): + ''' + Try to lookup and return a target RPC func from the + post-fork enabled module set. + + ''' try: return getattr(self._mods[ns], funcname) except KeyError as err: @@ -1027,7 +387,8 @@ class Actor: ) -> None: ''' - Entry point for new inbound connections to the channel server. + Entry point for new inbound IPC connections on a specific + transport server. 
''' self._no_more_peers = trio.Event() # unset by making new @@ -1366,6 +727,8 @@ class Actor: except trio.BrokenResourceError: log.runtime(f"Channel {chan.uid} was already closed") + # TODO: rename to `._deliver_payload()` since this handles + # more then just `result` msgs now obvi XD async def _push_result( self, chan: Channel, @@ -1374,7 +737,8 @@ class Actor: ) -> None|bool: ''' - Push an RPC result to the local consumer's queue. + Push an RPC msg-payload to the local consumer peer-task's + queue. ''' uid: tuple[str, str] = chan.uid @@ -1420,11 +784,16 @@ class Actor: ) -> Context: ''' - Look up or create a new inter-actor-task-IPC-linked task - "context" which encapsulates the local task's scheduling - enviroment including a ``trio`` cancel scope, a pair of IPC - messaging "feeder" channels, and an RPC id unique to the - task-as-function invocation. + Look-up (existing) or create a new + inter-actor-SC-linked task "context" (a `Context`) which + encapsulates the local RPC task's execution enviroment + around `Channel` relayed msg handling including, + + - a dedicated `trio` cancel scope (`Context._scope`), + - a pair of IPC-msg-relay "feeder" mem-channels + (`Context._recv/send_chan`), + - and a "context id" (cid) unique to the task-pair + msging session's lifetime. ''' actor_uid = chan.uid @@ -1481,15 +850,17 @@ class Actor: ) -> Context: ''' - Send a ``'cmd'`` message to a remote actor, which starts - a remote task-as-function entrypoint. + Send a `'cmd'` msg to a remote actor, which requests the + start and schedule of a remote task-as-function's + entrypoint. - Synchronously validates the endpoint type and return a caller - side task ``Context`` that can be used to wait for responses - delivered by the local runtime's message processing loop. + Synchronously validates the endpoint type and returns + a (caller side) `Context` that can be used to accept + delivery of msg payloads from the local runtime's + processing loop: `._rpc.process_messages()`. ''' - cid = str(uuid.uuid4()) + cid: str = str(uuid.uuid4()) assert chan.uid ctx = self.get_context( chan=chan, @@ -1553,6 +924,12 @@ class Actor: Channel, list[tuple[str, int]] | None, ]: + ''' + Bootstrap this local actor's runtime config from its parent by + connecting back via the IPC transport, handshaking and then + `Channel.recv()`-ing seeded data. + + ''' try: # Connect back to the parent actor and conduct initial # handshake. From this point on if we error, we @@ -1635,10 +1012,11 @@ class Actor: task_status: TaskStatus[Nursery] = trio.TASK_STATUS_IGNORED, ) -> None: ''' - Start the channel server, begin listening for new connections. + Start the IPC transport server, begin listening for new connections. - This will cause an actor to continue living (blocking) until - ``cancel_server()`` is called. + This will cause an actor to continue living (and thus + blocking at the process/OS-thread level) until + `.cancel_server()` is called. ''' if listen_sockaddrs is None: @@ -1683,8 +1061,8 @@ class Actor: ''' Cancel this actor asap; can be called from a sync context. - Schedules `.cancel()` to be run immediately just like when - cancelled by the parent. + Schedules runtime cancellation via `Actor.cancel()` inside + the RPC service nursery. ''' assert self._service_n @@ -1706,15 +1084,15 @@ class Actor: ) -> bool: ''' Cancel this actor's runtime, eventually resulting in - the exit its containing process. + termination of its containing OS process. 
The ideal "deterministic" teardown sequence in order is: - - cancel all ongoing rpc tasks by cancel scope + - cancel all ongoing rpc tasks by cancel scope. - cancel the channel server to prevent new inbound - connections + connections. - cancel the "service" nursery reponsible for - spawning new rpc tasks - - return control the parent channel message loop + spawning new rpc tasks. + - return control the parent channel message loop. ''' ( @@ -1802,11 +1180,9 @@ class Actor: ) -> bool: ''' - Cancel a local task by call-id / channel. - - Note this method will be treated as a streaming function - by remote actor-callers due to the declaration of ``ctx`` - in the signature (for now). + Cancel a local (RPC) task by context-id/channel by calling + `trio.CancelScope.cancel()` on it's surrounding cancel + scope. ''' @@ -1918,8 +1294,9 @@ class Actor: ) -> None: ''' - Cancel all existing RPC responder tasks using the cancel scope - registered for each. + Cancel all ongoing RPC tasks owned/spawned for a given + `parent_chan: Channel` or simply all tasks (inside + `._service_n`) when `parent_chan=None`. ''' tasks: dict = self._rpc_tasks @@ -2004,8 +1381,8 @@ class Actor: def cancel_server(self) -> None: ''' - Cancel the internal channel server nursery thereby - preventing any new inbound connections from being established. + Cancel the internal IPC transport server nursery thereby + preventing any new inbound IPC connections establishing. ''' if self._server_n: @@ -2015,8 +1392,8 @@ class Actor: @property def accept_addrs(self) -> list[tuple[str, int]]: ''' - All addresses to which the transport-channel server binds - and listens for new connections. + All addresses to which the IPC-transport-channel server + binds and listens for new connections. ''' # throws OSError on failure @@ -2028,7 +1405,8 @@ class Actor: @property def accept_addr(self) -> tuple[str, int]: ''' - Primary address to which the channel server is bound. + Primary address to which the IPC transport server is + bound. ''' # throws OSError on failure @@ -2036,7 +1414,7 @@ class Actor: def get_parent(self) -> Portal: ''' - Return a portal to our parent actor. + Return a `Portal` to our parent. ''' assert self._parent_chan, "No parent channel for this actor?" @@ -2044,7 +1422,7 @@ class Actor: def get_chans(self, uid: tuple[str, str]) -> list[Channel]: ''' - Return all channels to the actor with provided uid. + Return all IPC channels to the actor with provided `uid`. ''' return self._peers[uid] @@ -2057,10 +1435,10 @@ class Actor: ) -> tuple[str, str]: ''' Exchange `(name, UUIDs)` identifiers as the first - communication step. + communication step with any (peer) remote `Actor`. These are essentially the "mailbox addresses" found in - actor model parlance. + "actor model" parlance. ''' await chan.send(self.uid) @@ -2074,6 +1452,13 @@ class Actor: return uid def is_infected_aio(self) -> bool: + ''' + If `True`, this actor is running `trio` in guest mode on + the `asyncio` event loop and thus can use the APIs in + `.to_asyncio` to coordinate tasks running in each + framework but within the same actor runtime. + + ''' return self._infected_aio @@ -2093,11 +1478,14 @@ async def async_main( ) -> None: ''' - Actor runtime entrypoint; start the IPC channel server, maybe connect - back to the parent, and startup all core machinery tasks. 
+ Main `Actor` runtime entrypoint; start the transport-specific + IPC channel server, (maybe) connect back to parent (to receive + additional config), startup all core `trio` machinery for + delivering RPCs, register with the discovery system. - A "root" (or "top-level") nursery for this actor is opened here and - when cancelled/terminated effectively closes the actor's "runtime". + The "root" (or "top-level") and "service" `trio.Nursery`s are + opened here and when cancelled/terminated effectively shutdown + the actor's "runtime" and all thus all ongoing RPC tasks. ''' # attempt to retreive ``trio``'s sigint handler and stash it @@ -2356,367 +1744,7 @@ async def async_main( log.runtime("Runtime completed") -async def process_messages( - actor: Actor, - chan: Channel, - shield: bool = False, - task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED, - -) -> bool: - ''' - This is the per-channel, low level RPC task scheduler loop. - - Receive multiplexed RPC request messages from some remote process, - spawn handler tasks depending on request type and deliver responses - or boxed errors back to the remote caller (task). - - ''' - # TODO: once `trio` get's an "obvious way" for req/resp we - # should use it? - # https://github.com/python-trio/trio/issues/467 - log.runtime( - 'Entering IPC msg loop:\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n' - ) - nursery_cancelled_before_task: bool = False - msg: dict | None = None - try: - # NOTE: this internal scope allows for keeping this - # message loop running despite the current task having - # been cancelled (eg. `open_portal()` may call this method - # from a locally spawned task) and recieve this scope - # using ``scope = Nursery.start()`` - with CancelScope(shield=shield) as loop_cs: - task_status.started(loop_cs) - async for msg in chan: - - # dedicated loop terminate sentinel - if msg is None: - - tasks: dict[ - tuple[Channel, str], - tuple[Context, Callable, trio.Event] - ] = actor._rpc_tasks.copy() - log.cancel( - f'Peer IPC channel terminated via `None` setinel msg?\n' - f'=> Cancelling all {len(tasks)} local RPC tasks..\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n' - ) - for (channel, cid) in tasks: - if channel is chan: - await actor._cancel_task( - cid, - channel, - requesting_uid=channel.uid, - - ipc_msg=msg, - ) - break - - log.transport( # type: ignore - f'<= IPC msg from peer: {chan.uid}\n\n' - - # TODO: conditionally avoid fmting depending - # on log level (for perf)? - # => specifically `pformat()` sub-call..? - f'{pformat(msg)}\n' - ) - - cid = msg.get('cid') - if cid: - # deliver response to local caller/waiter - # via its per-remote-context memory channel. - await actor._push_result( - chan, - cid, - msg, - ) - - log.runtime( - 'Waiting on next IPC msg from\n' - f'peer: {chan.uid}:\n' - f'|_{chan}\n' - - # f'last msg: {msg}\n' - ) - continue - - # process a 'cmd' request-msg upack - # TODO: impl with native `msgspec.Struct` support !! - # -[ ] implement with ``match:`` syntax? - # -[ ] discard un-authed msgs as per, - # - try: - ( - ns, - funcname, - kwargs, - actorid, - cid, - ) = msg['cmd'] - - except KeyError: - # This is the non-rpc error case, that is, an - # error **not** raised inside a call to ``_invoke()`` - # (i.e. no cid was provided in the msg - see above). 
- # Push this error to all local channel consumers - # (normally portals) by marking the channel as errored - assert chan.uid - exc = unpack_error(msg, chan=chan) - chan._exc = exc - raise exc - - log.runtime( - 'Handling RPC cmd from\n' - f'peer: {actorid}\n' - '\n' - f'=> {ns}.{funcname}({kwargs})\n' - ) - if ns == 'self': - if funcname == 'cancel': - func: Callable = actor.cancel - kwargs |= { - 'req_chan': chan, - } - - # don't start entire actor runtime cancellation - # if this actor is currently in debug mode! - pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete - if pdb_complete: - await pdb_complete.wait() - - # Either of `Actor.cancel()`/`.cancel_soon()` - # was called, so terminate this IPC msg - # loop, exit back out into `async_main()`, - # and immediately start the core runtime - # machinery shutdown! - with CancelScope(shield=True): - await _invoke( - actor, - cid, - chan, - func, - kwargs, - is_rpc=False, - ) - - log.runtime( - 'Cancelling IPC transport msg-loop with peer:\n' - f'|_{chan}\n' - ) - loop_cs.cancel() - break - - if funcname == '_cancel_task': - func: Callable = actor._cancel_task - - # we immediately start the runtime machinery - # shutdown - # with CancelScope(shield=True): - target_cid: str = kwargs['cid'] - kwargs |= { - # NOTE: ONLY the rpc-task-owning - # parent IPC channel should be able to - # cancel it! - 'parent_chan': chan, - 'requesting_uid': chan.uid, - 'ipc_msg': msg, - } - # TODO: remove? already have emit in meth. - # log.runtime( - # f'Rx RPC task cancel request\n' - # f'<= canceller: {chan.uid}\n' - # f' |_{chan}\n\n' - # f'=> {actor}\n' - # f' |_cid: {target_cid}\n' - # ) - try: - await _invoke( - actor, - cid, - chan, - func, - kwargs, - is_rpc=False, - ) - except BaseException: - log.exception( - 'Failed to cancel task?\n' - f'<= canceller: {chan.uid}\n' - f' |_{chan}\n\n' - f'=> {actor}\n' - f' |_cid: {target_cid}\n' - ) - continue - else: - # normally registry methods, eg. - # ``.register_actor()`` etc. - func: Callable = getattr(actor, funcname) - - else: - # complain to client about restricted modules - try: - func = actor._get_rpc_func(ns, funcname) - except (ModuleNotExposed, AttributeError) as err: - err_msg: dict[str, dict] = pack_error( - err, - cid=cid, - ) - await chan.send(err_msg) - continue - - # schedule a task for the requested RPC function - # in the actor's main "service nursery". - # TODO: possibly a service-tn per IPC channel for - # supervision isolation? would avoid having to - # manage RPC tasks individually in `._rpc_tasks` - # table? - log.runtime( - f'Spawning task for RPC request\n' - f'<= caller: {chan.uid}\n' - f' |_{chan}\n\n' - # TODO: maddr style repr? - # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' - # f'cid="{cid[-16:]} .."\n\n' - - f'=> {actor}\n' - f' |_cid: {cid}\n' - f' |>> {func}()\n' - ) - assert actor._service_n # wait why? do it at top? - try: - ctx: Context = await actor._service_n.start( - partial( - _invoke, - actor, - cid, - chan, - func, - kwargs, - ), - name=funcname, - ) - - except ( - RuntimeError, - BaseExceptionGroup, - ): - # avoid reporting a benign race condition - # during actor runtime teardown. - nursery_cancelled_before_task: bool = True - break - - # in the lone case where a ``Context`` is not - # delivered, it's likely going to be a locally - # scoped exception from ``_invoke()`` itself. - if isinstance(err := ctx, Exception): - log.warning( - 'Task for RPC failed?' 
- f'|_ {func}()\n\n' - - f'{err}' - ) - continue - - else: - # mark that we have ongoing rpc tasks - actor._ongoing_rpc_tasks = trio.Event() - - # store cancel scope such that the rpc task can be - # cancelled gracefully if requested - actor._rpc_tasks[(chan, cid)] = ( - ctx, - func, - trio.Event(), - ) - - log.runtime( - 'Waiting on next IPC msg from\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n' - ) - - # end of async for, channel disconnect vis - # ``trio.EndOfChannel`` - log.runtime( - f"{chan} for {chan.uid} disconnected, cancelling tasks" - ) - await actor.cancel_rpc_tasks( - req_uid=actor.uid, - # a "self cancel" in terms of the lifetime of the - # IPC connection which is presumed to be the - # source of any requests for spawned tasks. - parent_chan=chan, - ) - - except ( - TransportClosed, - ): - # channels "breaking" (for TCP streams by EOF or 104 - # connection-reset) is ok since we don't have a teardown - # handshake for them (yet) and instead we simply bail out of - # the message loop and expect the teardown sequence to clean - # up. - # TODO: don't show this msg if it's an emphemeral - # discovery ep call? - log.runtime( - f'channel closed abruptly with\n' - f'peer: {chan.uid}\n' - f'|_{chan.raddr}\n' - ) - - # transport **was** disconnected - return True - - except ( - Exception, - BaseExceptionGroup, - ) as err: - - if nursery_cancelled_before_task: - sn: Nursery = actor._service_n - assert sn and sn.cancel_scope.cancel_called # sanity - log.cancel( - f'Service nursery cancelled before it handled {funcname}' - ) - else: - # ship any "internal" exception (i.e. one from internal - # machinery not from an rpc task) to parent - match err: - case ContextCancelled(): - log.cancel( - f'Actor: {actor.uid} was context-cancelled with,\n' - f'str(err)' - ) - case _: - log.exception("Actor errored:") - - if actor._parent_chan: - await try_ship_error_to_remote( - actor._parent_chan, - err, - ) - - # if this is the `MainProcess` we expect the error broadcasting - # above to trigger an error at consuming portal "checkpoints" - raise - - finally: - # msg debugging for when he machinery is brokey - log.runtime( - 'Exiting IPC msg loop with\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n\n' - 'final msg:\n' - f'{pformat(msg)}\n' - ) - - # transport **was not** disconnected - return False - - +# TODO: rename to `Registry` and move to `._discovery`! 
class Arbiter(Actor): ''' A special registrar actor who can contact all other actors -- 2.34.1 From d28c7e17c650a356868eaf7f738965dfbd5afbc1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 13 Mar 2024 16:09:31 -0400 Subject: [PATCH 169/378] Add `.trionics._broadcast` todos for py 3.12 --- tractor/trionics/_broadcast.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tractor/trionics/_broadcast.py b/tractor/trionics/_broadcast.py index 244a42d4..a5d31871 100644 --- a/tractor/trionics/_broadcast.py +++ b/tractor/trionics/_broadcast.py @@ -26,7 +26,6 @@ from contextlib import asynccontextmanager from functools import partial from operator import ne from typing import ( - Optional, Callable, Awaitable, Any, @@ -45,6 +44,11 @@ from tractor.log import get_logger log = get_logger(__name__) +# TODO: use new type-vars syntax from 3.12 +# https://realpython.com/python312-new-features/#dedicated-type-variable-syntax +# https://docs.python.org/3/whatsnew/3.12.html#whatsnew312-pep695 +# https://docs.python.org/3/reference/simple_stmts.html#type +# # A regular invariant generic type T = TypeVar("T") @@ -110,7 +114,7 @@ class BroadcastState(Struct): # broadcast event to wake up all sleeping consumer tasks # on a newly produced value from the sender. - recv_ready: Optional[tuple[int, trio.Event]] = None + recv_ready: tuple[int, trio.Event]|None = None # if a ``trio.EndOfChannel`` is received on any # consumer all consumers should be placed in this state @@ -164,7 +168,7 @@ class BroadcastReceiver(ReceiveChannel): rx_chan: AsyncReceiver, state: BroadcastState, - receive_afunc: Optional[Callable[[], Awaitable[Any]]] = None, + receive_afunc: Callable[[], Awaitable[Any]]|None = None, raise_on_lag: bool = True, ) -> None: @@ -452,7 +456,7 @@ def broadcast_receiver( recv_chan: AsyncReceiver, max_buffer_size: int, - receive_afunc: Optional[Callable[[], Awaitable[Any]]] = None, + receive_afunc: Callable[[], Awaitable[Any]]|None = None, raise_on_lag: bool = True, ) -> BroadcastReceiver: -- 2.34.1 From e5cb39804c77b8bb6f5ea3d728e6a656bb1246a5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 13 Mar 2024 16:21:30 -0400 Subject: [PATCH 170/378] Pin to `trio>=0.24` to avoid `trio_typing` --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 958c8f39..7f6d8f52 100755 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ setup( # trio related # proper range spec: # https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/#id5 - 'trio >= 0.22', + 'trio >= 0.24', 'async_generator', 'trio_typing', 'exceptiongroup', -- 2.34.1 From 71de56b09a73dcaf14d5222f49bc68bedcfcb4c9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 13 Mar 2024 18:41:24 -0400 Subject: [PATCH 171/378] Drop now-deprecated deps on modern `trio`/Python - `trio_typing` is nearly obsolete since `trio >= 0.23` - `exceptiongroup` is built-in to python 3.11 - `async_generator` primitives have lived in `contextlib` for quite a while! 
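For reference only (purely illustrative, not literal patch content), a
minimal sketch of the stdlib/`trio` stand-ins assumed by this cleanup on
python >= 3.11 and `trio >= 0.24`; the `drain()` helper and its names are
hypothetical:

import trio
from contextlib import aclosing   # replaces `async_generator.aclosing`
from trio import TaskStatus       # replaces `trio_typing.TaskStatus`


async def drain(
    agen,
    task_status: TaskStatus[None] = trio.TASK_STATUS_IGNORED,
) -> None:
    # signal readiness to a `Nursery.start()` caller
    task_status.started()

    # ensure the async-gen is finalized even on cancellation
    async with aclosing(agen) as safe_agen:
        async for item in safe_agen:
            print(item)


try:
    ...  # run some task tree
except BaseExceptionGroup:
    # a builtin since py 3.11, so no `exceptiongroup` import is needed
    raise
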
--- examples/debugging/debug_mode_hang.py | 9 +++++++++ .../parallelism/concurrent_actors_primes.py | 8 +++++--- examples/rpc_bidir_streaming.py | 2 +- setup.py | 7 ++++--- tests/test_cancellation.py | 4 ---- tests/test_child_manages_service_nursery.py | 6 ++++-- tests/test_infected_asyncio.py | 1 - tests/test_runtime.py | 3 ++- tests/test_trioisms.py | 2 +- tractor/__init__.py | 2 -- tractor/_exceptions.py | 11 +++++------ tractor/_root.py | 3 ++- tractor/_rpc.py | 6 +----- tractor/_runtime.py | 2 -- tractor/_shm.py | 2 +- tractor/_spawn.py | 17 ++++++++--------- tractor/_supervise.py | 1 - tractor/experimental/_pubsub.py | 2 +- tractor/trionics/_mngrs.py | 7 +++---- 19 files changed, 47 insertions(+), 48 deletions(-) create mode 100644 examples/debugging/debug_mode_hang.py diff --git a/examples/debugging/debug_mode_hang.py b/examples/debugging/debug_mode_hang.py new file mode 100644 index 00000000..a81890ed --- /dev/null +++ b/examples/debugging/debug_mode_hang.py @@ -0,0 +1,9 @@ +''' +Reproduce a bug where enabling debug mode for a sub-actor actually causes +a hang on teardown... + +''' +import asyncio + +import trio +import tractor diff --git a/examples/parallelism/concurrent_actors_primes.py b/examples/parallelism/concurrent_actors_primes.py index feaaca79..748861e6 100644 --- a/examples/parallelism/concurrent_actors_primes.py +++ b/examples/parallelism/concurrent_actors_primes.py @@ -8,7 +8,10 @@ This uses no extra threads, fancy semaphores or futures; all we need is ``tractor``'s channels. """ -from contextlib import asynccontextmanager +from contextlib import ( + asynccontextmanager as acm, + aclosing, +) from typing import Callable import itertools import math @@ -16,7 +19,6 @@ import time import tractor import trio -from async_generator import aclosing PRIMES = [ @@ -44,7 +46,7 @@ async def is_prime(n): return True -@asynccontextmanager +@acm async def worker_pool(workers=4): """Though it's a trivial special case for ``tractor``, the well known "worker pool" seems to be the defacto "but, I want this diff --git a/examples/rpc_bidir_streaming.py b/examples/rpc_bidir_streaming.py index 73200814..c961bf20 100644 --- a/examples/rpc_bidir_streaming.py +++ b/examples/rpc_bidir_streaming.py @@ -13,7 +13,7 @@ async def simple_rpc( ''' # signal to parent that we're up much like - # ``trio_typing.TaskStatus.started()`` + # ``trio.TaskStatus.started()`` await ctx.started(data + 1) async with ctx.open_stream() as stream: diff --git a/setup.py b/setup.py index 7f6d8f52..50ee92ec 100755 --- a/setup.py +++ b/setup.py @@ -47,9 +47,10 @@ setup( # proper range spec: # https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/#id5 'trio >= 0.24', - 'async_generator', - 'trio_typing', - 'exceptiongroup', + + # 'async_generator', # in stdlib mostly! + # 'trio_typing', # trio==0.23.0 has type hints! + # 'exceptiongroup', # in stdlib as of 3.11! 
# tooling 'stackscope', diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 5b589f6a..b8c14af3 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -8,10 +8,6 @@ import platform import time from itertools import repeat -from exceptiongroup import ( - BaseExceptionGroup, - ExceptionGroup, -) import pytest import trio import tractor diff --git a/tests/test_child_manages_service_nursery.py b/tests/test_child_manages_service_nursery.py index 228d6ade..350f939b 100644 --- a/tests/test_child_manages_service_nursery.py +++ b/tests/test_child_manages_service_nursery.py @@ -6,13 +6,15 @@ sub-sub-actor daemons. ''' from typing import Optional import asyncio -from contextlib import asynccontextmanager as acm +from contextlib import ( + asynccontextmanager as acm, + aclosing, +) import pytest import trio import tractor from tractor import RemoteActorError -from async_generator import aclosing async def aio_streamer( diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index a3f96ee8..568708a2 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -8,7 +8,6 @@ import builtins import itertools import importlib -from exceptiongroup import BaseExceptionGroup import pytest import trio import tractor diff --git a/tests/test_runtime.py b/tests/test_runtime.py index 3755af1b..55553dd9 100644 --- a/tests/test_runtime.py +++ b/tests/test_runtime.py @@ -64,7 +64,8 @@ async def test_lifetime_stack_wipes_tmpfile( except ( tractor.RemoteActorError, - tractor.BaseExceptionGroup, + # tractor.BaseExceptionGroup, + BaseExceptionGroup, ): pass diff --git a/tests/test_trioisms.py b/tests/test_trioisms.py index 5b19f50d..27dc6c34 100644 --- a/tests/test_trioisms.py +++ b/tests/test_trioisms.py @@ -5,7 +5,7 @@ want to see changed. ''' import pytest import trio -from trio_typing import TaskStatus +from trio import TaskStatus @pytest.mark.parametrize( diff --git a/tractor/__init__.py b/tractor/__init__.py index 01d00ec9..c7d21c9d 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -18,8 +18,6 @@ tractor: structured concurrent ``trio``-"actors". """ -from exceptiongroup import BaseExceptionGroup as BaseExceptionGroup - from ._clustering import ( open_actor_cluster as open_actor_cluster, ) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 259a28a7..344f0c33 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -30,11 +30,10 @@ from typing import ( import textwrap import traceback -import exceptiongroup as eg import trio -from ._state import current_actor -from .log import get_logger +from tractor._state import current_actor +from tractor.log import get_logger if TYPE_CHECKING: from ._context import Context @@ -373,7 +372,6 @@ def unpack_error( for ns in [ builtins, _this_mod, - eg, trio, ]: if suberror_type := getattr( @@ -396,12 +394,13 @@ def unpack_error( def is_multi_cancelled(exc: BaseException) -> bool: ''' - Predicate to determine if a possible ``eg.BaseExceptionGroup`` contains + Predicate to determine if a possible ``BaseExceptionGroup`` contains only ``trio.Cancelled`` sub-exceptions (and is likely the result of cancelling a collection of subtasks. 
''' - if isinstance(exc, eg.BaseExceptionGroup): + # if isinstance(exc, eg.BaseExceptionGroup): + if isinstance(exc, BaseExceptionGroup): return exc.subgroup( lambda exc: isinstance(exc, trio.Cancelled) ) is not None diff --git a/tractor/_root.py b/tractor/_root.py index 6ee78b99..54451918 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -28,12 +28,13 @@ import os import warnings -from exceptiongroup import BaseExceptionGroup import trio from ._runtime import ( Actor, Arbiter, + # TODO: rename and make a non-actor subtype? + # Arbiter as Registry, async_main, ) from .devx import _debug diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 54a60be6..6bdc0c6b 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -21,6 +21,7 @@ Remote (task) Procedure Call (scheduling) with SC transitive semantics. from __future__ import annotations from contextlib import ( asynccontextmanager as acm, + aclosing, ) from functools import partial import inspect @@ -34,17 +35,12 @@ from typing import ( ) import warnings -from async_generator import aclosing -from exceptiongroup import BaseExceptionGroup import trio from trio import ( CancelScope, Nursery, TaskStatus, ) -# from trio_typing import ( -# TaskStatus, -# ) from .msg import NamespacePath from ._ipc import Channel diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 838c648c..ff929c0b 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -61,8 +61,6 @@ import warnings import trio from trio import ( CancelScope, -) -from trio_typing import ( Nursery, TaskStatus, ) diff --git a/tractor/_shm.py b/tractor/_shm.py index f8295105..da6d8ddb 100644 --- a/tractor/_shm.py +++ b/tractor/_shm.py @@ -46,7 +46,7 @@ if _USE_POSIX: try: import numpy as np from numpy.lib import recfunctions as rfn - import nptyping + # import nptyping except ImportError: pass diff --git a/tractor/_spawn.py b/tractor/_spawn.py index e23d70f1..741a2f87 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -31,25 +31,24 @@ from typing import ( TYPE_CHECKING, ) -from exceptiongroup import BaseExceptionGroup import trio -from trio_typing import TaskStatus +from trio import TaskStatus -from .devx import ( +from tractor.devx import ( maybe_wait_for_debugger, acquire_debug_lock, ) -from ._state import ( +from tractor._state import ( current_actor, is_main_process, is_root_process, debug_mode, ) -from .log import get_logger -from ._portal import Portal -from ._runtime import Actor -from ._entry import _mp_main -from ._exceptions import ActorFailure +from tractor.log import get_logger +from tractor._portal import Portal +from tractor._runtime import Actor +from tractor._entry import _mp_main +from tractor._exceptions import ActorFailure if TYPE_CHECKING: diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 50f0d5e6..733dd53c 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -26,7 +26,6 @@ from typing import TYPE_CHECKING import typing import warnings -from exceptiongroup import BaseExceptionGroup import trio from .devx._debug import maybe_wait_for_debugger diff --git a/tractor/experimental/_pubsub.py b/tractor/experimental/_pubsub.py index 89f286d2..b894ed49 100644 --- a/tractor/experimental/_pubsub.py +++ b/tractor/experimental/_pubsub.py @@ -31,7 +31,7 @@ from typing import ( Callable, ) from functools import partial -from async_generator import aclosing +from contextlib import aclosing import trio import wrapt diff --git a/tractor/trionics/_mngrs.py b/tractor/trionics/_mngrs.py index 1c079cdb..f57be0a7 100644 --- 
a/tractor/trionics/_mngrs.py +++ b/tractor/trionics/_mngrs.py @@ -33,10 +33,9 @@ from typing import ( ) import trio -from trio_typing import TaskStatus -from .._state import current_actor -from ..log import get_logger +from tractor._state import current_actor +from tractor.log import get_logger log = get_logger(__name__) @@ -184,7 +183,7 @@ class _Cache: cls, mng, ctx_key: tuple, - task_status: TaskStatus[T] = trio.TASK_STATUS_IGNORED, + task_status: trio.TaskStatus[T] = trio.TASK_STATUS_IGNORED, ) -> None: async with mng as value: -- 2.34.1 From 5fb568226986d8f2cc3cbabbfd39bc1dfc2b6b3f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 18 Mar 2024 10:21:37 -0400 Subject: [PATCH 172/378] First try "relayed boxed errors", or "inceptions" Since adding more complex inter-peer (actor) testing scenarios, we definitely have an immediate need for `trio`'s style of "inceptions" but for nesting `RemoteActorError`s as they're relayed through multiple actor-IPC hops. So for example, a remote error relayed "through" some proxy actor to another ends up packing a `RemoteActorError` into another one such that there are 2 layers of RAEs with the first containing/boxing an original src actor error (type). In support of this extension to `RemoteActorError` we add: - `get_err_type()` error type resolver helper (factored fromthe body of `unpack_error()`) to be used whenever rendering `.src_type`/`.boxed_type`. - `.src_type_str: str` which is pulled from `.msgdata` and holds the above (eventually when unpacked) type as `str`. - `._src_type: BaseException|None` for the original "source" actor's error as unpacked in any remote (actor's) env and exposed as a readonly property `.src_type`. - `.boxed_type_str: str` the same as above but for the "last" boxed error's type; when the RAE is unpacked at its first hop this will be **the same as** `.src_type_str`. - `._boxed_type: BaseException` which now similarly should be "rendered" from the below type-`str` field instead of passed in as a error-type via `boxed_type` (though we still do for the ctxc case atm, see notes). |_ new sanity checks in `.__init__()` mostly as a reminder to handle that ^ ctxc case ^ more elegantly at some point.. |_ obvi we discard the previous `suberror_type` input arg. - fully remove the `.type`/`.type_str` properties instead expecting usage of `.boxed_/.src_` equivalents. - start deprecation of `.src_actor_uid` and make it delegate to new `.src_uid` - add `.relay_uid` propery for the last relay/hop's actor uid. - add `.relay_path: list[str]` which holds the per-hop updated sequence of relay actor uid's which consecutively did boxing of an RAE. - only include `.src_uid` and `.relay_path` in reprol() output. - factor field-to-str rendering into a new `_mk_fields_str()` and use it in `.__repr__()`/`.reprol()`. - add an `.unwrap()` to (attempt to) render the src error. - rework `pack_error()` to handle inceptions including, - packing the correct field-values for the new `boxed_type_str`, `relay_uid`, `src_uid`, `src_type_str`. - always updating the `relay_path` sequence with the uid of the current actor. - adjust `unpack_error()` to match all these changes, - pulling `boxed_type_str` and passing any resolved `boxed_type` to `RemoteActorError.__init__()`. - use the new `Context.maybe_raise()` convenience method. Adjust `._rpc` packing to `ContextCancelled(boxed_type=trio.Cancelled)` and tweak some more log msg formats. 
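As a rough usage sketch (purely illustrative, not literal patch content;
`check_relayed_error()` and the `portal` it takes are hypothetical
stand-ins for any remote call that fails 2+ actor-hops away), inspecting
such an "inception" might look like:

import tractor


async def check_relayed_error(portal) -> None:
    try:
        await portal.result()
    except tractor.RemoteActorError as rae:
        # the original ("source") actor's error by type-name, resolved
        # type (when importable locally) and actor uid.
        print(rae.src_type_str)   # e.g. 'ValueError'
        print(rae.src_type)
        print(rae.src_uid)

        # after at least one relay hop the last boxing is itself
        # another `RemoteActorError`..
        assert rae.boxed_type_str == 'RemoteActorError'

        # ..and every relaying actor's uid is appended in hop order.
        print(rae.relay_path)

        # best-effort reconstruction of the src error from its
        # packed traceback-str.
        inner: BaseException = rae.unwrap()
        print(inner)
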
--- tractor/_exceptions.py | 367 ++++++++++++++++++++++++++++++++++------- tractor/_rpc.py | 16 +- 2 files changed, 311 insertions(+), 72 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 344f0c33..d6629ad4 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -58,16 +58,44 @@ class InternalError(RuntimeError): ''' _body_fields: list[str] = [ - 'src_actor_uid', + 'boxed_type', + 'src_type', + # TODO: format this better if we're going to include it. + # 'relay_path', + 'src_uid', + + # only in sub-types 'canceller', 'sender', ] _msgdata_keys: list[str] = [ - 'type_str', + 'boxed_type_str', ] + _body_fields +def get_err_type(type_name: str) -> BaseException|None: + ''' + Look up an exception type by name from the set of locally + known namespaces: + + - `builtins` + - `tractor._exceptions` + - `trio` + + ''' + for ns in [ + builtins, + _this_mod, + trio, + ]: + if type_ref := getattr( + ns, + type_name, + False, + ): + return type_ref + # TODO: rename to just `RemoteError`? class RemoteActorError(Exception): @@ -81,13 +109,16 @@ class RemoteActorError(Exception): ''' reprol_fields: list[str] = [ - 'src_actor_uid', + # 'src_actor_uid', + 'src_uid', + 'relay_path', + # 'relay_uid', ] def __init__( self, message: str, - suberror_type: Type[BaseException] | None = None, + boxed_type: Type[BaseException]|None = None, **msgdata ) -> None: @@ -101,20 +132,124 @@ class RemoteActorError(Exception): # - .remote_type # also pertains to our long long oustanding issue XD # https://github.com/goodboy/tractor/issues/5 - self.boxed_type: str = suberror_type + # + # TODO: always set ._boxed_type` as `None` by default + # and instead render if from `.boxed_type_str`? + self._boxed_type: BaseException = boxed_type + self._src_type: BaseException|None = None self.msgdata: dict[str, Any] = msgdata - @property - def type(self) -> str: - return self.boxed_type + # TODO: mask out eventually or place in `pack_error()` + # pre-`return` lines? + # sanity on inceptions + if boxed_type is RemoteActorError: + if self.src_type_str == 'RemoteActorError': + import pdbp; pdbp.set_trace() + + assert self.src_type_str != 'RemoteActorError' + assert self.src_uid not in self.relay_path + + # ensure type-str matches and round-tripping from that + # str results in same error type. + # + # TODO NOTE: this is currently exclusively for the + # `ContextCancelled(boxed_type=trio.Cancelled)` case as is + # used inside `._rpc._invoke()` atm though probably we + # should better emphasize that special (one off?) case + # either by customizing `ContextCancelled.__init__()` or + # through a special factor func? + else: + if not self.msgdata.get('boxed_type_str'): + self.msgdata['boxed_type_str'] = str( + type(boxed_type).__name__ + ) + + assert self.boxed_type_str == self.msgdata['boxed_type_str'] + assert self.boxed_type is boxed_type @property - def type_str(self) -> str: - return str(type(self.boxed_type).__name__) + def src_type_str(self) -> str: + ''' + String-name of the source error's type. + This should be the same as `.boxed_type_str` when unpacked + at the first relay/hop's receiving actor. + + ''' + return self.msgdata['src_type_str'] + + @property + def src_type(self) -> str: + ''' + Error type raised by original remote faulting actor. + + ''' + if self._src_type is None: + self._src_type = get_err_type( + self.msgdata['src_type_str'] + ) + + return self._src_type + + @property + def boxed_type_str(self) -> str: + ''' + String-name of the (last hop's) boxed error type. 
+ + ''' + return self.msgdata['boxed_type_str'] + + @property + def boxed_type(self) -> str: + ''' + Error type boxed by last actor IPC hop. + + ''' + if self._boxed_type is None: + self._src_type = get_err_type( + self.msgdata['boxed_type_str'] + ) + + return self._boxed_type + + @property + def relay_path(self) -> list[tuple]: + ''' + Return the list of actors which consecutively relayed + a boxed `RemoteActorError` the src error up until THIS + actor's hop. + + NOTE: a `list` field with the same name is expected to be + passed/updated in `.msgdata`. + + ''' + return self.msgdata['relay_path'] + + @property + def relay_uid(self) -> tuple[str, str]|None: + return tuple( + self.msgdata['relay_path'][-1] + ) + + @property + def src_uid(self) -> tuple[str, str]|None: + if src_uid := ( + self.msgdata.get('src_uid') + # TODO: remove! + or + self.msgdata.get('src_actor_uid') + ): + return tuple(src_uid) + # TODO: use path lookup instead? + # return tuple( + # self.msgdata['relay_path'][0] + # ) + + # TODO: deprecate this for ^! @property def src_actor_uid(self) -> tuple[str, str]|None: - return self.msgdata.get('src_actor_uid') + log.warning('.src_actor_uid` is deprecated, use `.src_uid` instead!') + return self.src_uid @property def tb_str( @@ -129,28 +264,56 @@ class RemoteActorError(Exception): return '' + def _mk_fields_str( + self, + fields: list[str], + end_char: str = '\n', + ) -> str: + _repr: str = '' + for key in fields: + val: Any|None = ( + getattr(self, key, None) + or + self.msgdata.get(key) + ) + # TODO: for `.relay_path` on multiline? + # if not isinstance(val, str): + # val_str = pformat(val) + # else: + val_str: str = repr(val) + + if val: + _repr += f'{key}={val_str}{end_char}' + + return _repr + def reprol(self) -> str: ''' Represent this error for "one line" display, like in a field of our `Context.__repr__()` output. ''' - _repr: str = f'{type(self).__name__}(' - for key in self.reprol_fields: - val: Any|None = self.msgdata.get(key) - if val: - _repr += f'{key}={repr(val)} ' - - return _repr + # TODO: use this matryoshka emjoi XD + # => 🪆 + reprol_str: str = f'{type(self).__name__}(' + _repr: str = self._mk_fields_str( + self.reprol_fields, + end_char=' ', + ) + return ( + reprol_str + + + _repr + ) def __repr__(self) -> str: + ''' + Nicely formatted boxed error meta data + traceback. - fields: str = '' - for key in _body_fields: - val: str|None = self.msgdata.get(key) - if val: - fields += f'{key}={val}\n' - + ''' + fields: str = self._mk_fields_str( + _body_fields, + ) fields: str = textwrap.indent( fields, # prefix=' '*2, @@ -165,8 +328,6 @@ class RemoteActorError(Exception): f' ------ - ------\n' f' _|\n' ) - # f'|\n' - # f' |\n' if indent: body: str = textwrap.indent( body, @@ -178,9 +339,47 @@ class RemoteActorError(Exception): ')>' ) - # TODO: local recontruction of remote exception deats + def unwrap( + self, + ) -> BaseException: + ''' + Unpack the inner-most source error from it's original IPC msg data. + + We attempt to reconstruct (as best as we can) the original + `Exception` from as it would have been raised in the + failing actor's remote env. + + ''' + src_type_ref: Type[BaseException] = self.src_type + if not src_type_ref: + raise TypeError( + 'Failed to lookup src error type:\n' + f'{self.src_type_str}' + ) + + # TODO: better tb insertion and all the fancier dunder + # metadata stuff as per `.__context__` etc. 
and friends: + # https://github.com/python-trio/trio/issues/611 + return src_type_ref(self.tb_str) + + # TODO: local recontruction of nested inception for a given + # "hop" / relay-node in this error's relay_path? + # => so would render a `RAE[RAE[RAE[Exception]]]` instance + # with all inner errors unpacked? + # -[ ] if this is useful shouldn't be too hard to impl right? # def unbox(self) -> BaseException: - # ... + # ''' + # Unbox to the prior relays (aka last boxing actor's) + # inner error. + + # ''' + # if not self.relay_path: + # return self.unwrap() + + # # TODO.. + # # return self.boxed_type( + # # boxed_type=get_type_ref(.. + # raise NotImplementedError class InternalActorError(RemoteActorError): @@ -232,7 +431,7 @@ class ContextCancelled(RemoteActorError): f'{self}' ) - # to make `.__repr__()` work uniformly + # TODO: to make `.__repr__()` work uniformly? # src_actor_uid = canceller @@ -283,7 +482,8 @@ class MessagingError(Exception): def pack_error( - exc: BaseException, + exc: BaseException|RemoteActorError, + tb: str|None = None, cid: str|None = None, @@ -300,27 +500,60 @@ def pack_error( else: tb_str = traceback.format_exc() + our_uid: tuple = current_actor().uid error_msg: dict[ str, str | tuple[str, str] ] = { 'tb_str': tb_str, - 'type_str': type(exc).__name__, - 'boxed_type': type(exc).__name__, - 'src_actor_uid': current_actor().uid, + 'relay_uid': our_uid, } - # TODO: ?just wholesale proxy `.msgdata: dict`? - # XXX WARNING, when i swapped these ctx-semantics - # tests started hanging..???!!!??? - # if msgdata := exc.getattr('msgdata', {}): - # error_msg.update(msgdata) if ( - isinstance(exc, ContextCancelled) - or isinstance(exc, StreamOverrun) + isinstance(exc, RemoteActorError) ): error_msg.update(exc.msgdata) + # an onion/inception we need to pack + if ( + type(exc) is RemoteActorError + and exc.boxed_type != RemoteActorError + ): + # sanity on source error (if needed when tweaking this) + assert (src_type := exc.src_type) != RemoteActorError + assert error_msg['src_type_str'] != 'RemoteActorError' + assert error_msg['src_type_str'] == src_type.__name__ + assert error_msg['src_uid'] != our_uid + + # set the boxed type to be another boxed type thus + # creating an "inception" when unpacked by + # `unpack_error()` in another actor who gets "relayed" + # this error Bo + # + # NOTE on WHY: since we are re-boxing and already + # boxed src error, we want to overwrite the original + # `boxed_type_str` and instead set it to the type of + # the input `exc` type. 
+ error_msg['boxed_type_str'] = 'RemoteActorError' + + # import pdbp; pdbp.set_trace() + # log.debug( + # 'INCEPTION packing!\n\n' + # f'{pformat(exc.msgdata)}\n\n' + # f'{exc}\n' + # ) + + else: + error_msg['src_uid'] = our_uid + error_msg['src_type_str'] = type(exc).__name__ + error_msg['boxed_type_str'] = type(exc).__name__ + + # XXX alawys append us the last relay in error propagation path + error_msg.setdefault( + 'relay_path', + [], + ).append(our_uid) + pkt: dict = {'error': error_msg} if cid: pkt['cid'] = cid @@ -329,12 +562,10 @@ def pack_error( def unpack_error( - msg: dict[str, Any], chan: Channel|None = None, box_type: RemoteActorError = RemoteActorError, - hide_tb: bool = True, ) -> None|Exception: @@ -357,33 +588,38 @@ def unpack_error( # retrieve the remote error's msg encoded details tb_str: str = error_dict.get('tb_str', '') - message: str = f'{chan.uid}\n' + tb_str - type_name: str = ( - error_dict.get('type_str') - or error_dict['boxed_type'] + message: str = ( + f'{chan.uid}\n' + + + tb_str ) - suberror_type: Type[BaseException] = Exception + boxed_type_str: str = ( + # TODO: deprecate this! + error_dict.get('boxed_type_str') + # or error_dict['boxed_type'] + ) + boxed_type: Type[BaseException] = Exception - if type_name == 'ContextCancelled': - box_type = ContextCancelled - suberror_type = box_type + if boxed_type_str == 'ContextCancelled': + boxed_type = box_type = ContextCancelled - else: # try to lookup a suitable local error type - for ns in [ - builtins, - _this_mod, - trio, - ]: - if suberror_type := getattr( - ns, - type_name, - False, - ): - break + # TODO: already included by `_this_mod` in else loop right? + # + # we have an inception/onion-error so ensure + # we include the relay_path info and the + # original source error. + elif boxed_type_str == 'RemoteActorError': + boxed_type = RemoteActorError + assert len(error_dict['relay_path']) >= 1 + + # try to lookup a suitable error type + # from the local runtime env. + else: + boxed_type = get_err_type(boxed_type_str) exc = box_type( message, - suberror_type=suberror_type, + boxed_type=boxed_type, # unpack other fields into error type init **error_dict, @@ -501,6 +737,11 @@ def _raise_from_no_key_in_msg( # destined for the `Context.result()` call during ctx-exit! stream._eoc: Exception = eoc + # in case there already is some underlying remote error + # that arrived which is probably the source of this stream + # closure + ctx.maybe_raise() + raise eoc from src_err if ( diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 6bdc0c6b..d369b41c 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -273,7 +273,10 @@ async def _errors_relayed_via_ipc( entered_debug = await _debug._maybe_enter_pm(err) if not entered_debug: - log.exception('Actor crashed:\n') + log.exception( + 'RPC task crashed\n' + f'|_{ctx}' + ) # always (try to) ship RPC errors back to caller if is_rpc: @@ -613,7 +616,8 @@ async def _invoke( # other side. ctxc = ContextCancelled( msg, - suberror_type=trio.Cancelled, + boxed_type=trio.Cancelled, + # boxed_type_str='Cancelled', canceller=canceller, ) # assign local error so that the `.outcome` @@ -671,7 +675,7 @@ async def _invoke( f'`{repr(ctx.outcome)}`', ) ) - log.cancel( + log.runtime( f'IPC context terminated with a final {res_type_str}\n\n' f'{ctx}\n' ) @@ -704,12 +708,6 @@ async def try_ship_error_to_remote( # TODO: special tb fmting for ctxc cases? # tb=tb, ) - # NOTE: the src actor should always be packed into the - # error.. but how should we verify this? 
- # actor: Actor = _state.current_actor() - # assert err_msg['src_actor_uid'] - # if not err_msg['error'].get('src_actor_uid'): - # import pdbp; pdbp.set_trace() await channel.send(msg) # XXX NOTE XXX in SC terms this is one of the worst things -- 2.34.1 From 78434f631729dfb20ba6895d83d0270e6a479abd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 19 Mar 2024 14:20:59 -0400 Subject: [PATCH 173/378] Fix `.boxed_type` facepalm, drop `.src_actor_uid` The misname of `._boxed_type` as `._src_type` was only manifesting as a reallly strange boxing error with a packed exception-group, not sure how or why only that but it's fixed now XD Start refining/cleaning out stuff for sure we don't need (based on multiple local test runs): - discard `.src_actor_uid` fully since test set has been moved over to `.src_uid`; this means also removing the `.msgdata` insertion from `pack_error()`; a patch to all internals is coming next obvi! - don't pass `boxed_type` to `RemoteActorError.__init__()` from `unpack_error()` since it's now set directly via the `.msgdata["boxed_type_str"]`/`error_msg: dict` input , but in the case where **it is passed as an arg** (only for ctxc in `._rpc._invoke()` rn) make sure we only do the `.__init__()` insert when `boxed_type is not None`. --- tractor/_exceptions.py | 52 +++++++++++------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index d6629ad4..b28a4a75 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -109,7 +109,6 @@ class RemoteActorError(Exception): ''' reprol_fields: list[str] = [ - # 'src_actor_uid', 'src_uid', 'relay_path', # 'relay_uid', @@ -143,9 +142,6 @@ class RemoteActorError(Exception): # pre-`return` lines? # sanity on inceptions if boxed_type is RemoteActorError: - if self.src_type_str == 'RemoteActorError': - import pdbp; pdbp.set_trace() - assert self.src_type_str != 'RemoteActorError' assert self.src_uid not in self.relay_path @@ -158,7 +154,7 @@ class RemoteActorError(Exception): # should better emphasize that special (one off?) case # either by customizing `ContextCancelled.__init__()` or # through a special factor func? - else: + elif boxed_type: if not self.msgdata.get('boxed_type_str'): self.msgdata['boxed_type_str'] = str( type(boxed_type).__name__ @@ -206,7 +202,7 @@ class RemoteActorError(Exception): ''' if self._boxed_type is None: - self._src_type = get_err_type( + self._boxed_type = get_err_type( self.msgdata['boxed_type_str'] ) @@ -235,9 +231,6 @@ class RemoteActorError(Exception): def src_uid(self) -> tuple[str, str]|None: if src_uid := ( self.msgdata.get('src_uid') - # TODO: remove! - or - self.msgdata.get('src_actor_uid') ): return tuple(src_uid) # TODO: use path lookup instead? @@ -245,12 +238,6 @@ class RemoteActorError(Exception): # self.msgdata['relay_path'][0] # ) - # TODO: deprecate this for ^! - @property - def src_actor_uid(self) -> tuple[str, str]|None: - log.warning('.src_actor_uid` is deprecated, use `.src_uid` instead!') - return self.src_uid - @property def tb_str( self, @@ -517,7 +504,8 @@ def pack_error( # an onion/inception we need to pack if ( type(exc) is RemoteActorError - and exc.boxed_type != RemoteActorError + and (boxed := exc.boxed_type) + and boxed != RemoteActorError ): # sanity on source error (if needed when tweaking this) assert (src_type := exc.src_type) != RemoteActorError @@ -536,13 +524,6 @@ def pack_error( # the input `exc` type. 
error_msg['boxed_type_str'] = 'RemoteActorError' - # import pdbp; pdbp.set_trace() - # log.debug( - # 'INCEPTION packing!\n\n' - # f'{pformat(exc.msgdata)}\n\n' - # f'{exc}\n' - # ) - else: error_msg['src_uid'] = our_uid error_msg['src_type_str'] = type(exc).__name__ @@ -566,6 +547,7 @@ def unpack_error( chan: Channel|None = None, box_type: RemoteActorError = RemoteActorError, + hide_tb: bool = True, ) -> None|Exception: @@ -593,15 +575,15 @@ def unpack_error( + tb_str ) - boxed_type_str: str = ( - # TODO: deprecate this! - error_dict.get('boxed_type_str') - # or error_dict['boxed_type'] - ) - boxed_type: Type[BaseException] = Exception + + # try to lookup a suitable error type from the local runtime + # env then use it to construct a local instance. + boxed_type_str: str = error_dict['boxed_type_str'] + boxed_type: Type[BaseException] = get_err_type(boxed_type_str) if boxed_type_str == 'ContextCancelled': - boxed_type = box_type = ContextCancelled + box_type = ContextCancelled + assert boxed_type is box_type # TODO: already included by `_this_mod` in else loop right? # @@ -609,19 +591,11 @@ def unpack_error( # we include the relay_path info and the # original source error. elif boxed_type_str == 'RemoteActorError': - boxed_type = RemoteActorError + assert boxed_type is RemoteActorError assert len(error_dict['relay_path']) >= 1 - # try to lookup a suitable error type - # from the local runtime env. - else: - boxed_type = get_err_type(boxed_type_str) - exc = box_type( message, - boxed_type=boxed_type, - - # unpack other fields into error type init **error_dict, ) -- 2.34.1 From 9221c572346ec783cb16c9860c14351861afdd51 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 19 Mar 2024 18:08:54 -0400 Subject: [PATCH 174/378] Adjust all `RemoteActorError.type` using tests To instead use the new `.boxed_type` B) --- examples/debugging/multi_daemon_subactors.py | 2 +- tests/test_cancellation.py | 26 ++++++++++---------- tests/test_child_manages_service_nursery.py | 2 +- tests/test_context_stream_semantics.py | 12 ++++----- tests/test_infected_asyncio.py | 10 ++++---- tests/test_rpc.py | 4 +-- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/examples/debugging/multi_daemon_subactors.py b/examples/debugging/multi_daemon_subactors.py index 6c2d5750..ea5fe005 100644 --- a/examples/debugging/multi_daemon_subactors.py +++ b/examples/debugging/multi_daemon_subactors.py @@ -32,7 +32,7 @@ async def main(): try: await p1.run(name_error) except tractor.RemoteActorError as rae: - assert rae.type is NameError + assert rae.boxed_type is NameError async for i in stream: diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index b8c14af3..5fd58fbc 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -77,7 +77,7 @@ def test_remote_error(reg_addr, args_err): # of this actor nursery. 
await portal.result() except tractor.RemoteActorError as err: - assert err.type == errtype + assert err.boxed_type == errtype print("Look Maa that actor failed hard, hehh") raise @@ -86,7 +86,7 @@ def test_remote_error(reg_addr, args_err): with pytest.raises(tractor.RemoteActorError) as excinfo: trio.run(main) - assert excinfo.value.type == errtype + assert excinfo.value.boxed_type == errtype else: # the root task will also error on the `.result()` call @@ -96,7 +96,7 @@ def test_remote_error(reg_addr, args_err): # ensure boxed errors for exc in excinfo.value.exceptions: - assert exc.type == errtype + assert exc.boxed_type == errtype def test_multierror(reg_addr): @@ -117,7 +117,7 @@ def test_multierror(reg_addr): try: await portal2.result() except tractor.RemoteActorError as err: - assert err.type == AssertionError + assert err.boxed_type == AssertionError print("Look Maa that first actor failed hard, hehh") raise @@ -169,7 +169,7 @@ def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay): for exc in exceptions: assert isinstance(exc, tractor.RemoteActorError) - assert exc.type == AssertionError + assert exc.boxed_type == AssertionError async def do_nothing(): @@ -310,7 +310,7 @@ async def test_some_cancels_all(num_actors_and_errs, start_method, loglevel): await portal.run(func, **kwargs) except tractor.RemoteActorError as err: - assert err.type == err_type + assert err.boxed_type == err_type # we only expect this first error to propogate # (all other daemons are cancelled before they # can be scheduled) @@ -329,11 +329,11 @@ async def test_some_cancels_all(num_actors_and_errs, start_method, loglevel): assert len(err.exceptions) == num_actors for exc in err.exceptions: if isinstance(exc, tractor.RemoteActorError): - assert exc.type == err_type + assert exc.boxed_type == err_type else: assert isinstance(exc, trio.Cancelled) elif isinstance(err, tractor.RemoteActorError): - assert err.type == err_type + assert err.boxed_type == err_type assert n.cancelled is True assert not n._children @@ -412,7 +412,7 @@ async def test_nested_multierrors(loglevel, start_method): elif isinstance(subexc, tractor.RemoteActorError): # on windows it seems we can't exactly be sure wtf # will happen.. 
- assert subexc.type in ( + assert subexc.boxed_type in ( tractor.RemoteActorError, trio.Cancelled, BaseExceptionGroup, @@ -422,7 +422,7 @@ async def test_nested_multierrors(loglevel, start_method): for subsub in subexc.exceptions: if subsub in (tractor.RemoteActorError,): - subsub = subsub.type + subsub = subsub.boxed_type assert type(subsub) in ( trio.Cancelled, @@ -437,16 +437,16 @@ async def test_nested_multierrors(loglevel, start_method): # we get back the (sent) cancel signal instead if is_win(): if isinstance(subexc, tractor.RemoteActorError): - assert subexc.type in ( + assert subexc.boxed_type in ( BaseExceptionGroup, tractor.RemoteActorError ) else: assert isinstance(subexc, BaseExceptionGroup) else: - assert subexc.type is ExceptionGroup + assert subexc.boxed_type is ExceptionGroup else: - assert subexc.type in ( + assert subexc.boxed_type in ( tractor.RemoteActorError, trio.Cancelled ) diff --git a/tests/test_child_manages_service_nursery.py b/tests/test_child_manages_service_nursery.py index 350f939b..21fb3920 100644 --- a/tests/test_child_manages_service_nursery.py +++ b/tests/test_child_manages_service_nursery.py @@ -171,4 +171,4 @@ def test_actor_managed_trio_nursery_task_error_cancels_aio( # verify boxed error err = excinfo.value - assert isinstance(err.type(), NameError) + assert err.boxed_type is NameError diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 42b1f7d0..1f5e3dbb 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -795,7 +795,7 @@ async def test_callee_cancels_before_started( # raises a special cancel signal except tractor.ContextCancelled as ce: - ce.type == trio.Cancelled + ce.boxed_type == trio.Cancelled # the traceback should be informative assert 'itself' in ce.msgdata['tb_str'] @@ -903,7 +903,7 @@ def test_one_end_stream_not_opened( with pytest.raises(tractor.RemoteActorError) as excinfo: trio.run(main) - assert excinfo.value.type == StreamOverrun + assert excinfo.value.boxed_type == StreamOverrun elif overrunner == 'callee': with pytest.raises(tractor.RemoteActorError) as excinfo: @@ -912,7 +912,7 @@ def test_one_end_stream_not_opened( # TODO: embedded remote errors so that we can verify the source # error? the callee delivers an error which is an overrun # wrapped in a remote actor error. - assert excinfo.value.type == tractor.RemoteActorError + assert excinfo.value.boxed_type == tractor.RemoteActorError else: trio.run(main) @@ -1131,7 +1131,7 @@ def test_maybe_allow_overruns_stream( # NOTE: i tried to isolate to a deterministic case here # based on timeing, but i was kinda wasted, and i don't # think it's sane to catch them.. 
- assert err.type in ( + assert err.boxed_type in ( tractor.RemoteActorError, StreamOverrun, ) @@ -1139,10 +1139,10 @@ def test_maybe_allow_overruns_stream( elif ( slow_side == 'child' ): - assert err.type == StreamOverrun + assert err.boxed_type == StreamOverrun elif slow_side == 'parent': - assert err.type == tractor.RemoteActorError + assert err.boxed_type == tractor.RemoteActorError assert 'StreamOverrun' in err.msgdata['tb_str'] else: diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 568708a2..5ac463ea 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -128,7 +128,7 @@ def test_aio_simple_error(reg_addr): assert err assert isinstance(err, RemoteActorError) - assert err.type == AssertionError + assert err.boxed_type == AssertionError def test_tractor_cancels_aio(reg_addr): @@ -272,7 +272,7 @@ def test_context_spawns_aio_task_that_errors( err = excinfo.value assert isinstance(err, expect) - assert err.type == AssertionError + assert err.boxed_type == AssertionError async def aio_cancel(): @@ -314,7 +314,7 @@ def test_aio_cancelled_from_aio_causes_trio_cancelled(reg_addr): assert err # ensure boxed error is correct - assert err.type == to_asyncio.AsyncioCancelled + assert err.boxed_type == to_asyncio.AsyncioCancelled # TODO: verify open_channel_from will fail on this.. @@ -466,7 +466,7 @@ def test_trio_error_cancels_intertask_chan(reg_addr): # ensure boxed errors for exc in excinfo.value.exceptions: - assert exc.type == Exception + assert exc.boxed_type == Exception def test_trio_closes_early_and_channel_exits(reg_addr): @@ -500,7 +500,7 @@ def test_aio_errors_and_channel_propagates_and_closes(reg_addr): # ensure boxed errors for exc in excinfo.value.exceptions: - assert exc.type == Exception + assert exc.boxed_type == Exception @tractor.context diff --git a/tests/test_rpc.py b/tests/test_rpc.py index a18bcb02..9581708f 100644 --- a/tests/test_rpc.py +++ b/tests/test_rpc.py @@ -36,7 +36,7 @@ async def sleep_back_actor( if not exposed_mods: expect = tractor.ModuleNotExposed - assert err.type is expect + assert err.boxed_type is expect raise else: await trio.sleep(float('inf')) @@ -150,4 +150,4 @@ def test_rpc_errors( )) if getattr(value, 'type', None): - assert value.type is inside_err + assert value.boxed_type is inside_err -- 2.34.1 From 668016d37b89d3e140d45e11914d606bc371f4f0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 19 Mar 2024 18:40:50 -0400 Subject: [PATCH 175/378] Absorb EoCs via `Context.open_stream()` silently I swear long ago it used to operate this way but, I guess this finalizes the design decision. It makes a lot more sense to *not* propagate any `trio.EndOfChannel` raised from a `Context.open_stream() as stream:` block when that EoC is due to graceful-explicit stream termination. We use the EoC much like a `StopAsyncIteration` where the error indicates termination of the stream due to either: - reception of a stop IPC msg indicating the far end ended the stream (gracecfully), - closure of the underlying `Context._recv_chan` either by the runtime or due to user code having called `MsgStream.aclose()`. 
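In user-code terms this boils down to roughly the following consumer-side
shape (just a sketch, assuming an already-connected `portal` and some
`@tractor.context` streaming func `echo_server` defined elsewhere):

    async with (
        portal.open_context(echo_server) as (ctx, first),
        ctx.open_stream() as stream,
    ):
        async for msg in stream:
            print(msg)
    # a graceful EoC (a remote 'stop' msg or runtime closure of the
    # feeder mem-chan) is absorbed by `.open_stream()`'s exit; no
    # `trio.EndOfChannel` escapes to this caller frame.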
User code shouldn't expect to handle EoC outside the block since the `@acm` having closed should indicate the exactly same lifetime state (of said stream) ;) Deats: - add special EoC handler in `.open_stream()` which silently "absorbs" the error only when the stream is already marked as closed (meaning the EoC indeed corresponds to IPC closure) with an assert for now ensuring the error is the same as set to `MsgStream._eoc`. - in `MsgStream.receive()` break up the handlers for EoC and `trio.ClosedResourceError` since the error instances are saved to different variables and we **don't** want to rewrite the exception in the eoc case (normally to mask `trio` internals in tbs) bc we need the instance to be the exact one for doing checks inside `.open_stream().__aexit__()` to absorb it. Other surrounding "improvements": - start using the new `Context.maybe_raise()` helper where it can easily replace existing equivalent block-sections. - use new `RemoteActorError.src_uid` as required. --- tractor/_context.py | 117 ++++++++++++++++++++++++++---------------- tractor/_streaming.py | 89 ++++++++++++++++++-------------- 2 files changed, 123 insertions(+), 83 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 55902281..11975bae 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -169,8 +169,7 @@ async def _drain_to_final_msg( # only when we are sure the remote error is # the source cause of this local task's # cancellation. - if re := ctx._remote_error: - ctx._maybe_raise_remote_err(re) + ctx.maybe_raise() # CASE 1: we DID request the cancel we simply # continue to bubble up as normal. @@ -257,6 +256,13 @@ async def _drain_to_final_msg( ) # XXX fallthrough to handle expected error XXX + # TODO: replace this with `ctx.maybe_raise()` + # + # TODO: would this be handier for this case maybe? + # async with maybe_raise_on_exit() as raises: + # if raises: + # log.error('some msg about raising..') + re: Exception|None = ctx._remote_error if re: log.critical( @@ -595,7 +601,7 @@ class Context: if not re: return False - if from_uid := re.src_actor_uid: + if from_uid := re.src_uid: from_uid: tuple = tuple(from_uid) our_uid: tuple = self._actor.uid @@ -825,7 +831,7 @@ class Context: # cancellation. maybe_error_src: tuple = getattr( error, - 'src_actor_uid', + 'src_uid', None, ) self._canceller = ( @@ -1030,8 +1036,8 @@ class Context: @acm async def open_stream( self, - allow_overruns: bool | None = False, - msg_buffer_size: int | None = None, + allow_overruns: bool|None = False, + msg_buffer_size: int|None = None, ) -> AsyncGenerator[MsgStream, None]: ''' @@ -1071,13 +1077,16 @@ class Context: # absorbed there (silently) and we DO NOT want to # actually try to stream - a cancel msg was already # sent to the other side! - if self._remote_error: - # NOTE: this is diff then calling - # `._maybe_raise_remote_err()` specifically - # because any task entering this `.open_stream()` - # AFTER cancellation has already been requested, - # we DO NOT want to absorb any ctxc ACK silently! - raise self._remote_error + self.maybe_raise( + raise_ctxc_from_self_call=True, + ) + # NOTE: this is diff then calling + # `._maybe_raise_remote_err()` specifically + # because we want to raise a ctxc on any task entering this `.open_stream()` + # AFTER cancellation was already been requested, + # we DO NOT want to absorb any ctxc ACK silently! 
+ # if self._remote_error: + # raise self._remote_error # XXX NOTE: if no `ContextCancelled` has been responded # back from the other side (yet), we raise a different @@ -1158,7 +1167,6 @@ class Context: # await trio.lowlevel.checkpoint() yield stream - # XXX: (MEGA IMPORTANT) if this is a root opened process we # wait for any immediate child in debug before popping the # context from the runtime msg loop otherwise inside @@ -1183,12 +1191,23 @@ class Context: # # await stream.aclose() - # if re := ctx._remote_error: - # ctx._maybe_raise_remote_err( - # re, - # raise_ctxc_from_self_call=True, - # ) - # await trio.lowlevel.checkpoint() + # NOTE: absorb and do not raise any + # EoC received from the other side such that + # it is not raised inside the surrounding + # context block's scope! + except trio.EndOfChannel as eoc: + if ( + eoc + and stream.closed + ): + # sanity, can remove? + assert eoc is stream._eoc + # from .devx import pause + # await pause() + log.warning( + 'Stream was terminated by EoC\n\n' + f'{repr(eoc)}\n' + ) finally: if self._portal: @@ -1204,7 +1223,6 @@ class Context: # TODO: replace all the instances of this!! XD def maybe_raise( self, - hide_tb: bool = True, **kwargs, @@ -1388,33 +1406,41 @@ class Context: f'{drained_msgs}' ) - if ( - (re := self._remote_error) - # and self._result == res_placeholder - ): - self._maybe_raise_remote_err( - re, - # NOTE: obvi we don't care if we - # overran the far end if we're already - # waiting on a final result (msg). - # raise_overrun_from_self=False, - raise_overrun_from_self=( - raise_overrun - and - # only when we ARE NOT the canceller - # should we raise overruns, bc ow we're - # raising something we know might happen - # during cancellation ;) - (not self._cancel_called) - ), + self.maybe_raise( + raise_overrun_from_self=( + raise_overrun + and + # only when we ARE NOT the canceller + # should we raise overruns, bc ow we're + # raising something we know might happen + # during cancellation ;) + (not self._cancel_called) ) + ) + # if ( + # (re := self._remote_error) + # # and self._result == res_placeholder + # ): + # self._maybe_raise_remote_err( + # re, + # # NOTE: obvi we don't care if we + # # overran the far end if we're already + # # waiting on a final result (msg). + # # raise_overrun_from_self=False, + # raise_overrun_from_self=( + # raise_overrun + # and + # # only when we ARE NOT the canceller + # # should we raise overruns, bc ow we're + # # raising something we know might happen + # # during cancellation ;) + # (not self._cancel_called) + # ), + # ) # if maybe_err: # self._result = maybe_err return self.outcome - # None if self._result == res_placeholder - # else self._result - # ) # TODO: switch this with above which should be named # `.wait_for_outcome()` and instead do @@ -1863,8 +1889,9 @@ async def open_context_from_portal( # TODO: if we set this the wrapping `@acm` body will # still be shown (awkwardly) on pdb REPL entry. Ideally - # we can similarly annotate that frame to NOT show? - hide_tb: bool = True, + # we can similarly annotate that frame to NOT show? for now + # we DO SHOW this frame since it's awkward ow.. 
+ hide_tb: bool = False, # proxied to RPC **kwargs, diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 149bb350..e0015fe4 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -136,7 +136,7 @@ class MsgStream(trio.abc.Channel): # return await self.receive() # except trio.EndOfChannel: # raise StopAsyncIteration - + # # see ``.aclose()`` for notes on the old behaviour prior to # introducing this if self._eoc: @@ -152,7 +152,6 @@ class MsgStream(trio.abc.Channel): return msg['yield'] except KeyError as kerr: - # log.exception('GOT KEYERROR') src_err = kerr # NOTE: may raise any of the below error types @@ -166,30 +165,20 @@ class MsgStream(trio.abc.Channel): stream=self, ) - # XXX: we close the stream on any of these error conditions: + # XXX: the stream terminates on either of: + # - via `self._rx_chan.receive()` raising after manual closure + # by the rpc-runtime OR, + # - via a received `{'stop': ...}` msg from remote side. + # |_ NOTE: previously this was triggered by calling + # ``._rx_chan.aclose()`` on the send side of the channel inside + # `Actor._push_result()`, but now the 'stop' message handling + # has been put just above inside `_raise_from_no_key_in_msg()`. except ( - # trio.ClosedResourceError, # by self._rx_chan - trio.EndOfChannel, # by self._rx_chan or `stop` msg from far end + trio.EndOfChannel, ) as eoc: - # log.exception('GOT EOC') src_err = eoc self._eoc = eoc - # a ``ClosedResourceError`` indicates that the internal - # feeder memory receive channel was closed likely by the - # runtime after the associated transport-channel - # disconnected or broke. - - # an ``EndOfChannel`` indicates either the internal recv - # memchan exhausted **or** we raisesd it just above after - # receiving a `stop` message from the far end of the stream. - - # Previously this was triggered by calling ``.aclose()`` on - # the send side of the channel inside - # ``Actor._push_result()`` (should still be commented code - # there - which should eventually get removed), but now the - # 'stop' message handling has been put just above. - # TODO: Locally, we want to close this stream gracefully, by # terminating any local consumers tasks deterministically. # Once we have broadcast support, we **don't** want to be @@ -210,8 +199,11 @@ class MsgStream(trio.abc.Channel): # raise eoc - except trio.ClosedResourceError as cre: # by self._rx_chan - # log.exception('GOT CRE') + # a ``ClosedResourceError`` indicates that the internal + # feeder memory receive channel was closed likely by the + # runtime after the associated transport-channel + # disconnected or broke. + except trio.ClosedResourceError as cre: # by self._rx_chan.receive() src_err = cre log.warning( '`Context._rx_chan` was already closed?' @@ -237,15 +229,30 @@ class MsgStream(trio.abc.Channel): # over the end-of-stream connection error since likely # the remote error was the source cause? ctx: Context = self._ctx - if re := ctx._remote_error: - ctx._maybe_raise_remote_err( - re, - raise_ctxc_from_self_call=True, - ) + ctx.maybe_raise( + raise_ctxc_from_self_call=True, + ) - # propagate any error but hide low-level frames from - # caller by default. - if hide_tb: + # propagate any error but hide low-level frame details + # from the caller by default for debug noise reduction. 
+ if ( + hide_tb + + # XXX NOTE XXX don't reraise on certain + # stream-specific internal error types like, + # + # - `trio.EoC` since we want to use the exact instance + # to ensure that it is the error that bubbles upward + # for silent absorption by `Context.open_stream()`. + and not self._eoc + + # - `RemoteActorError` (or `ContextCancelled`) if it gets + # raised from `_raise_from_no_key_in_msg()` since we + # want the same (as the above bullet) for any + # `.open_context()` block bubbled error raised by + # any nearby ctx API remote-failures. + # and not isinstance(src_err, RemoteActorError) + ): raise type(src_err)(*src_err.args) from src_err else: raise src_err @@ -370,6 +377,10 @@ class MsgStream(trio.abc.Channel): # await rx_chan.aclose() if not self._eoc: + log.cancel( + 'Stream closed before it received an EoC?\n' + 'Setting eoc manually..\n..' + ) self._eoc: bool = trio.EndOfChannel( f'Context stream closed by {self._ctx.side}\n' f'|_{self}\n' @@ -414,13 +425,11 @@ class MsgStream(trio.abc.Channel): @property def closed(self) -> bool: - if ( - (rxc := self._rx_chan._closed) - or - (_closed := self._closed) - or - (_eoc := self._eoc) - ): + + rxc: bool = self._rx_chan._closed + _closed: bool|Exception = self._closed + _eoc: bool|trio.EndOfChannel = self._eoc + if rxc or _closed or _eoc: log.runtime( f'`MsgStream` is already closed\n' f'{self}\n' @@ -496,7 +505,11 @@ class MsgStream(trio.abc.Channel): ''' __tracebackhide__: bool = hide_tb + # raise any alreay known error immediately self._ctx.maybe_raise() + if self._eoc: + raise self._eoc + if self._closed: raise self._closed -- 2.34.1 From 8ab5e0883025fa0a04f6e92d496554a6a97517f9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 19 Mar 2024 19:33:06 -0400 Subject: [PATCH 176/378] Adjust advanced faults test(s) for absorbed EoCs More or less just simplifies to not seeing the stream closure errors and instead expecting KBIs from the simulated user who 'ctl-cs after hang'. Toss in a little `stuff_hangin_ctlc()` to the script to wrap all that and always check stream closure before sending the final KBI. --- .../ipc_failure_during_stream.py | 120 ++++++++++-------- tests/test_advanced_faults.py | 74 +++++++---- 2 files changed, 116 insertions(+), 78 deletions(-) diff --git a/examples/advanced_faults/ipc_failure_during_stream.py b/examples/advanced_faults/ipc_failure_during_stream.py index c7322a7c..9dca92b1 100644 --- a/examples/advanced_faults/ipc_failure_during_stream.py +++ b/examples/advanced_faults/ipc_failure_during_stream.py @@ -6,6 +6,7 @@ been an outage) and we want to ensure that despite being in debug mode actor tree will eventually be cancelled without leaving any zombies. 
''' +from contextlib import asynccontextmanager as acm from functools import partial from tractor import ( @@ -17,6 +18,7 @@ from tractor import ( _testing, ) import trio +import pytest async def break_ipc( @@ -41,6 +43,13 @@ async def break_ipc( await stream.aclose() method: str = method or def_method + print( + '#################################\n' + 'Simulating CHILD-side IPC BREAK!\n' + f'method: {method}\n' + f'pre `.aclose()`: {pre_close}\n' + '#################################\n' + ) match method: case 'trans_aclose': @@ -80,17 +89,17 @@ async def break_ipc_then_error( break_ipc_with: str|None = None, pre_close: bool = False, ): + await break_ipc( + stream=stream, + method=break_ipc_with, + pre_close=pre_close, + ) async for msg in stream: await stream.send(msg) - await break_ipc( - stream=stream, - method=break_ipc_with, - pre_close=pre_close, - ) - assert 0 + + assert 0 -# async def close_stream_and_error( async def iter_ipc_stream( stream: MsgStream, break_ipc_with: str|None = None, @@ -99,20 +108,6 @@ async def iter_ipc_stream( async for msg in stream: await stream.send(msg) - # wipe out channel right before raising - # await break_ipc( - # stream=stream, - # method=break_ipc_with, - # pre_close=pre_close, - # ) - - # send channel close msg at SC-prot level - # - # TODO: what should get raised here if anything? - # await stream.aclose() - - # assert 0 - @context async def recv_and_spawn_net_killers( @@ -134,14 +129,16 @@ async def recv_and_spawn_net_killers( async for i in stream: print(f'child echoing {i}') await stream.send(i) + if ( break_ipc_after and - i > break_ipc_after + i >= break_ipc_after ): - '#################################\n' - 'Simulating CHILD-side IPC BREAK!\n' - '#################################\n' + n.start_soon( + iter_ipc_stream, + stream, + ) n.start_soon( partial( break_ipc_then_error, @@ -149,10 +146,23 @@ async def recv_and_spawn_net_killers( pre_close=pre_close, ) ) - n.start_soon( - iter_ipc_stream, - stream, - ) + + +@acm +async def stuff_hangin_ctlc(timeout: float = 1) -> None: + + with trio.move_on_after(timeout) as cs: + yield timeout + + if cs.cancelled_caught: + # pretend to be a user seeing no streaming action + # thinking it's a hang, and then hitting ctl-c.. + print( + f"i'm a user on the PARENT side and thingz hangin " + f'after timeout={timeout} ???\n\n' + 'MASHING CTlR-C..!?\n' + ) + raise KeyboardInterrupt async def main( @@ -169,9 +179,6 @@ async def main( ) -> None: - # from tractor._state import _runtime_vars as rtv - # rtv['_debug_mode'] = debug_mode - async with ( open_nursery( start_method=start_method, @@ -190,10 +197,11 @@ async def main( ) async with ( + stuff_hangin_ctlc(timeout=2) as timeout, _testing.expect_ctxc( yay=( break_parent_ipc_after - or break_child_ipc_after, + or break_child_ipc_after ), # TODO: we CAN'T remove this right? # since we need the ctxc to bubble up from either @@ -205,12 +213,14 @@ async def main( # and KBI in an eg? reraise=True, ), + portal.open_context( recv_and_spawn_net_killers, break_ipc_after=break_child_ipc_after, pre_close=pre_close, ) as (ctx, sent), ): + rx_eoc: bool = False ipc_break_sent: bool = False async with ctx.open_stream() as stream: for i in range(1000): @@ -228,6 +238,7 @@ async def main( '#################################\n' ) + # TODO: other methods? see break func above. # await stream._ctx.chan.send(None) # await stream._ctx.chan.transport.stream.send_eof() await stream._ctx.chan.transport.stream.aclose() @@ -251,10 +262,12 @@ async def main( # TODO: is this needed or no? 
raise - timeout: int = 1 - print(f'Entering `stream.receive()` with timeout={timeout}\n') - with trio.move_on_after(timeout) as cs: - + # timeout: int = 1 + # with trio.move_on_after(timeout) as cs: + async with stuff_hangin_ctlc() as timeout: + print( + f'PARENT `stream.receive()` with timeout={timeout}\n' + ) # NOTE: in the parent side IPC failure case this # will raise an ``EndOfChannel`` after the child # is killed and sends a stop msg back to it's @@ -266,23 +279,30 @@ async def main( f'{rx}\n' ) except trio.EndOfChannel: + rx_eoc: bool = True print('MsgStream got EoC for PARENT') raise - if cs.cancelled_caught: - # pretend to be a user seeing no streaming action - # thinking it's a hang, and then hitting ctl-c.. - print( - f"YOO i'm a PARENT user anddd thingz hangin..\n" - f'after timeout={timeout}\n' - ) + print( + 'Streaming finished and we got Eoc.\n' + 'Canceling `.open_context()` in root with\n' + 'CTlR-C..' + ) + if rx_eoc: + assert stream.closed + try: + await stream.send(i) + pytest.fail('stream not closed?') + except ( + trio.ClosedResourceError, + trio.EndOfChannel, + ) as send_err: + if rx_eoc: + assert send_err is stream._eoc + else: + assert send_err is stream._closed - print( - "YOO i'm mad!\n" - 'The send side is dun but thingz hangin..\n' - 'MASHING CTlR-C Ctl-c..' - ) - raise KeyboardInterrupt + raise KeyboardInterrupt if __name__ == '__main__': diff --git a/tests/test_advanced_faults.py b/tests/test_advanced_faults.py index 8b73b4c2..5f73ac6c 100644 --- a/tests/test_advanced_faults.py +++ b/tests/test_advanced_faults.py @@ -85,8 +85,8 @@ def test_ipc_channel_break_during_stream( ''' if spawn_backend != 'trio': - # if debug_mode: - # pytest.skip('`debug_mode` only supported on `trio` spawner') + if debug_mode: + pytest.skip('`debug_mode` only supported on `trio` spawner') # non-`trio` spawners should never hit the hang condition that # requires the user to do ctl-c to cancel the actor tree. @@ -107,7 +107,10 @@ def test_ipc_channel_break_during_stream( # AND we tell the child to call `MsgStream.aclose()`. and pre_aclose_msgstream ): - expect_final_exc = trio.EndOfChannel + # expect_final_exc = trio.EndOfChannel + # ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this + # gracefully! + expect_final_exc = KeyboardInterrupt # NOTE when ONLY the child breaks or it breaks BEFORE the # parent we expect the parent to get a closed resource error @@ -120,11 +123,25 @@ def test_ipc_channel_break_during_stream( and ipc_break['break_parent_ipc_after'] is False ): - expect_final_exc = trio.ClosedResourceError + # NOTE: we DO NOT expect this any more since + # the child side's channel will be broken silently + # and nothing on the parent side will indicate this! + # expect_final_exc = trio.ClosedResourceError - # if child calls `MsgStream.aclose()` then expect EoC. + # NOTE: child will send a 'stop' msg before it breaks + # the transport channel BUT, that will be absorbed by the + # `ctx.open_stream()` block and thus the `.open_context()` + # should hang, after which the test script simulates + # a user sending ctl-c by raising a KBI. if pre_aclose_msgstream: - expect_final_exc = trio.EndOfChannel + expect_final_exc = KeyboardInterrupt + + # XXX OLD XXX + # if child calls `MsgStream.aclose()` then expect EoC. + # ^ XXX not any more ^ since eoc is always absorbed + # gracefully and NOT bubbled to the `.open_context()` + # block! 
+ # expect_final_exc = trio.EndOfChannel # BOTH but, CHILD breaks FIRST elif ( @@ -134,12 +151,8 @@ def test_ipc_channel_break_during_stream( > ipc_break['break_child_ipc_after'] ) ): - expect_final_exc = trio.ClosedResourceError - - # child will send a 'stop' msg before it breaks - # the transport channel. if pre_aclose_msgstream: - expect_final_exc = trio.EndOfChannel + expect_final_exc = KeyboardInterrupt # NOTE when the parent IPC side dies (even if the child's does as well # but the child fails BEFORE the parent) we always expect the @@ -160,7 +173,8 @@ def test_ipc_channel_break_during_stream( ipc_break['break_parent_ipc_after'] is not False and ( ipc_break['break_child_ipc_after'] - > ipc_break['break_parent_ipc_after'] + > + ipc_break['break_parent_ipc_after'] ) ): expect_final_exc = trio.ClosedResourceError @@ -224,25 +238,29 @@ def test_stream_closed_right_after_ipc_break_and_zombie_lord_engages(): ''' async def main(): - async with tractor.open_nursery() as n: - portal = await n.start_actor( - 'ipc_breaker', - enable_modules=[__name__], - ) + with trio.fail_after(3): + async with tractor.open_nursery() as n: + portal = await n.start_actor( + 'ipc_breaker', + enable_modules=[__name__], + ) - with trio.move_on_after(1): - async with ( - portal.open_context( - break_ipc_after_started - ) as (ctx, sent), - ): - async with ctx.open_stream(): - await trio.sleep(0.5) + with trio.move_on_after(1): + async with ( + portal.open_context( + break_ipc_after_started + ) as (ctx, sent), + ): + async with ctx.open_stream(): + await trio.sleep(0.5) - print('parent waiting on context') + print('parent waiting on context') - print('parent exited context') - raise KeyboardInterrupt + print( + 'parent exited context\n' + 'parent raising KBI..\n' + ) + raise KeyboardInterrupt with pytest.raises(KeyboardInterrupt): trio.run(main) -- 2.34.1 From d5e5174d9759d637c33a3d4e1e176482433a4fd9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 20 Mar 2024 10:29:40 -0400 Subject: [PATCH 177/378] Extend inter-peer cancel tests for "inceptions" Use new `RemoteActorError` fields in various assertions particularly ensuring that an RTE relayed through the spawner from the little_bro shows up at the client with the right number of entries in the `.relay_path` and that the error is raised in the client as desired in the original use case from `modden`'s remote spawn spawn request API (which was kinda the whole original motivation to finally get all this multi-actor error relay stuff workin). Case extensions: - RTE relayed from little_bro through spawner to client when `raise_sub_spawn_error_after` is set; in this case test should raise the relayed and RAE boxed RTE right up to the `trio.run()`. -> ensure the `rae.src_uid`, `.relay_uid` are set correctly. -> ensure ctx cancels are no acked. - use `expect_ctxc()` around root's `tell_little_bro()` usage. - do `debug_mode` assertions when enabled by test harness in each actor layer. - obvi use new `.src_type`/`.boxed_type` for final error propagation assertions. 
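For reference, the client-facing introspection these cases exercise looks
roughly like the following (a sketch only; the real checks live in
`check_inner_rte()` and the surrounding test body):

    try:
        await client_ctx.result()
    except tractor.RemoteActorError as rae:
        # the outer-most box is the relaying hop ('client'),
        # the inner-most source error hails from 'little_bro'
        assert rae.boxed_type is tractor.RemoteActorError
        assert rae.src_type is RuntimeError
        assert rae.src_uid != rae.relay_uid
        assert len(rae.relay_path) >= 1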
--- tests/test_inter_peer_cancellation.py | 235 ++++++++++++++++++++------ 1 file changed, 188 insertions(+), 47 deletions(-) diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index e3c8a7dd..470287fb 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -16,6 +16,11 @@ from tractor import ( # typing Portal, Context, ContextCancelled, + RemoteActorError, +) +from tractor._testing import ( + # tractor_test, + expect_ctxc, ) # XXX TODO cases: @@ -156,10 +161,11 @@ def test_do_not_swallow_error_before_started_by_remote_contextcancelled( ): await trio.sleep_forever() - with pytest.raises(tractor.RemoteActorError) as excinfo: + with pytest.raises(RemoteActorError) as excinfo: trio.run(main) - assert excinfo.value.type == TypeError + rae = excinfo.value + assert rae.boxed_type == TypeError @tractor.context @@ -739,14 +745,16 @@ def test_peer_canceller( with pytest.raises(ContextCancelled) as excinfo: trio.run(main) - assert excinfo.value.type == ContextCancelled + assert excinfo.value.boxed_type == ContextCancelled assert excinfo.value.canceller[0] == 'canceller' @tractor.context async def basic_echo_server( ctx: Context, - peer_name: str = 'stepbro', + peer_name: str = 'wittle_bruv', + + err_after: int|None = None, ) -> None: ''' @@ -774,17 +782,31 @@ async def basic_echo_server( # assert 0 await ipc.send(resp) + if ( + err_after + and i > err_after + ): + raise RuntimeError( + f'Simulated error in `{peer_name}`' + ) + @tractor.context async def serve_subactors( ctx: Context, peer_name: str, + debug_mode: bool, ) -> None: async with open_nursery() as an: + + # sanity + if debug_mode: + assert tractor._state.debug_mode() + await ctx.started(peer_name) - async with ctx.open_stream() as reqs: - async for msg in reqs: + async with ctx.open_stream() as ipc: + async for msg in ipc: peer_name: str = msg peer: Portal = await an.start_actor( name=peer_name, @@ -795,7 +817,7 @@ async def serve_subactors( f'{peer_name}\n' f'|_{peer}\n' ) - await reqs.send(( + await ipc.send(( peer.chan.uid, peer.chan.raddr, )) @@ -807,14 +829,20 @@ async def serve_subactors( async def client_req_subactor( ctx: Context, peer_name: str, + debug_mode: bool, # used to simulate a user causing an error to be raised # directly in thread (like a KBI) to better replicate the # case where a `modden` CLI client would hang afer requesting # a `Context.cancel()` to `bigd`'s wks spawner. reraise_on_cancel: str|None = None, + sub_err_after: int|None = None, ) -> None: + # sanity + if debug_mode: + assert tractor._state.debug_mode() + # TODO: other cases to do with sub lifetimes: # -[ ] test that we can have the server spawn a sub # that lives longer then ctx with this client. 
@@ -836,6 +864,7 @@ async def client_req_subactor( spawner.open_context( serve_subactors, peer_name=peer_name, + debug_mode=debug_mode, ) as (spawner_ctx, first), ): assert first == peer_name @@ -857,6 +886,7 @@ async def client_req_subactor( await tell_little_bro( actor_name=sub_uid[0], caller='client', + err_after=sub_err_after, ) # TODO: test different scope-layers of @@ -868,9 +898,7 @@ async def client_req_subactor( # TODO: would be super nice to have a special injected # cancel type here (maybe just our ctxc) but using # some native mechanism in `trio` :p - except ( - trio.Cancelled - ) as err: + except trio.Cancelled as err: _err = err if reraise_on_cancel: errtype = globals()['__builtins__'][reraise_on_cancel] @@ -897,7 +925,9 @@ async def client_req_subactor( async def tell_little_bro( actor_name: str, - caller: str = '' + + caller: str = '', + err_after: int|None = None, ): # contact target actor, do a stream dialog. async with ( @@ -906,10 +936,12 @@ async def tell_little_bro( ) as lb, lb.open_context( basic_echo_server, + + # XXX proxy any delayed err condition + err_after=err_after, ) as (sub_ctx, first), - sub_ctx.open_stream( - basic_echo_server, - ) as echo_ipc, + + sub_ctx.open_stream() as echo_ipc, ): actor: Actor = current_actor() uid: tuple = actor.uid @@ -936,10 +968,15 @@ async def tell_little_bro( 'raise_client_error', [None, 'KeyboardInterrupt'], ) +@pytest.mark.parametrize( + 'raise_sub_spawn_error_after', + [None, 50], +) def test_peer_spawns_and_cancels_service_subactor( debug_mode: bool, raise_client_error: str, reg_addr: tuple[str, int], + raise_sub_spawn_error_after: int|None, ): # NOTE: this tests for the modden `mod wks open piker` bug # discovered as part of implementing workspace ctx @@ -953,6 +990,16 @@ def test_peer_spawns_and_cancels_service_subactor( # and the server's spawned child should cancel and terminate! peer_name: str = 'little_bro' + def check_inner_rte(rae: RemoteActorError): + ''' + Validate the little_bro's relayed inception! + + ''' + assert rae.boxed_type is RemoteActorError + assert rae.src_type is RuntimeError + assert 'client' in rae.relay_uid + assert peer_name in rae.src_uid + async def main(): async with tractor.open_nursery( # NOTE: to halt the peer tasks on ctxc, uncomment this. @@ -976,14 +1023,24 @@ def test_peer_spawns_and_cancels_service_subactor( server.open_context( serve_subactors, peer_name=peer_name, + debug_mode=debug_mode, + ) as (spawn_ctx, first), client.open_context( client_req_subactor, peer_name=peer_name, + debug_mode=debug_mode, reraise_on_cancel=raise_client_error, + + # trigger for error condition in sub + # during streaming. + sub_err_after=raise_sub_spawn_error_after, + ) as (client_ctx, client_says), ): + root: Actor = current_actor() + spawner_uid: tuple = spawn_ctx.chan.uid print( f'Server says: {first}\n' f'Client says: {client_says}\n' @@ -993,6 +1050,7 @@ def test_peer_spawns_and_cancels_service_subactor( # (grandchild of this root actor) "little_bro" # and ensure we can also use it as an echo # server. 
+ sub: Portal async with tractor.wait_for_actor( name=peer_name, ) as sub: @@ -1004,56 +1062,139 @@ def test_peer_spawns_and_cancels_service_subactor( f'.uid: {sub.actor.uid}\n' f'chan.raddr: {sub.chan.raddr}\n' ) - await tell_little_bro( - actor_name=peer_name, - caller='root', - ) - # signal client to raise a KBI - await client_ctx.cancel() - print('root cancelled client, checking that sub-spawn is down') + async with expect_ctxc( + yay=raise_sub_spawn_error_after, + reraise=False, + ): + await tell_little_bro( + actor_name=peer_name, + caller='root', + ) - async with tractor.find_actor( - name=peer_name, - ) as sub: - assert not sub + if not raise_sub_spawn_error_after: - print('root cancelling server/client sub-actors') + # signal client to cancel and maybe raise a KBI + await client_ctx.cancel() + print( + '-> root cancelling client,\n' + '-> root checking `client_ctx.result()`,\n' + f'-> checking that sub-spawn {peer_name} is down\n' + ) + # else: - # await tractor.pause() - res = await client_ctx.result(hide_tb=False) - assert isinstance(res, ContextCancelled) - assert client_ctx.cancel_acked - assert res.canceller == current_actor().uid + try: + res = await client_ctx.result(hide_tb=False) + + # in remote (relayed inception) error + # case, we should error on the line above! + if raise_sub_spawn_error_after: + pytest.fail( + 'Never rxed proxied `RemoteActorError[RuntimeError]` !?' + ) + + assert isinstance(res, ContextCancelled) + assert client_ctx.cancel_acked + assert res.canceller == root.uid + + except RemoteActorError as rae: + _err = rae + assert raise_sub_spawn_error_after + + # since this is a "relayed error" via the client + # sub-actor, it is expected to be + # a `RemoteActorError` boxing another + # `RemoteActorError` otherwise known as + # an "inception" (from `trio`'s parlance) + # ((or maybe a "Matryoshka" and/or "matron" + # in our own working parlance)) which + # contains the source error from the + # little_bro: a `RuntimeError`. + # + check_inner_rte(rae) + assert rae.relay_uid == client.chan.uid + assert rae.src_uid == sub.chan.uid + + assert not client_ctx.cancel_acked + assert ( + client_ctx.maybe_error + is client_ctx.outcome + is rae + ) + raise + # await tractor.pause() + + else: + assert not raise_sub_spawn_error_after + + # cancelling the spawner sub should + # transitively cancel it's sub, the little + # bruv. + print('root cancelling server/client sub-actors') + await spawn_ctx.cancel() + async with tractor.find_actor( + name=peer_name, + ) as sub: + assert not sub - await spawn_ctx.cancel() # await server.cancel_actor() + except RemoteActorError as rae: + # XXX more-or-less same as above handler + # this is just making sure the error bubbles out + # of the + _err = rae + assert raise_sub_spawn_error_after + raise + # since we called `.cancel_actor()`, `.cancel_ack` # will not be set on the ctx bc `ctx.cancel()` was not # called directly fot this confext. 
except ContextCancelled as ctxc: - print('caught ctxc from contexts!') - assert ctxc.canceller == current_actor().uid + _ctxc = ctxc + print( + f'{root.uid} caught ctxc from ctx with {client_ctx.chan.uid}\n' + f'{repr(ctxc)}\n' + ) + + if not raise_sub_spawn_error_after: + assert ctxc.canceller == root.uid + else: + assert ctxc.canceller == spawner_uid + assert ctxc is spawn_ctx.outcome assert ctxc is spawn_ctx.maybe_error raise - # assert spawn_ctx.cancel_acked - assert spawn_ctx.cancel_acked - assert client_ctx.cancel_acked + if raise_sub_spawn_error_after: + pytest.fail( + 'context block(s) in PARENT never raised?!?' + ) - await client.cancel_actor() - await server.cancel_actor() + if not raise_sub_spawn_error_after: + # assert spawn_ctx.cancel_acked + assert spawn_ctx.cancel_acked + assert client_ctx.cancel_acked - # WOA WOA WOA! we need this to close..!!!?? - # that's super bad XD + await client.cancel_actor() + await server.cancel_actor() - # TODO: why isn't this working!?!? - # we're now outside the `.open_context()` block so - # the internal `Context._scope: CancelScope` should be - # gracefully "closed" ;) + # WOA WOA WOA! we need this to close..!!!?? + # that's super bad XD - # assert spawn_ctx.cancelled_caught + # TODO: why isn't this working!?!? + # we're now outside the `.open_context()` block so + # the internal `Context._scope: CancelScope` should be + # gracefully "closed" ;) - trio.run(main) + # assert spawn_ctx.cancelled_caught + + if raise_sub_spawn_error_after: + with pytest.raises(RemoteActorError) as excinfo: + trio.run(main) + + rae: RemoteActorError = excinfo.value + check_inner_rte(rae) + + else: + trio.run(main) -- 2.34.1 From 290b0a86b1517dad543278e0dd8b834f7445fb92 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 20 Mar 2024 10:42:17 -0400 Subject: [PATCH 178/378] Another cancel-req-invalid log msg fmt tweak --- tractor/_runtime.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index ff929c0b..ff3fb74f 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -715,10 +715,12 @@ class Actor: f'|_{chan}\n' ) try: - # send a msg loop terminate sentinel + # send msg loop terminate sentinel which + # triggers cancellation of all remotely + # started tasks. await chan.send(None) - # XXX: do we want this? + # XXX: do we want this? no right? # causes "[104] connection reset by peer" on other end # await chan.aclose() @@ -1208,10 +1210,10 @@ class Actor: # - callee self raises ctxc before caller send request, # - callee errors prior to cancel req. log.cancel( - 'Cancel request invalid, RPC task already completed?\n' + 'Cancel request invalid, RPC task already completed?\n\n' f'<= canceller: {requesting_uid}\n\n' - f'=>{parent_chan}\n' - f' |_ctx-id: {cid}\n' + f'=> {cid}@{parent_chan.uid}\n' + f' |_{parent_chan}\n' ) return True @@ -1510,7 +1512,6 @@ async def async_main( ): accept_addrs = set_accept_addr_says_rent - # The "root" nursery ensures the channel with the immediate # parent is kept alive as a resilient service until # cancellation steps have (mostly) occurred in -- 2.34.1 From 8e66f45e2386efacb344329a05c341343f9c0659 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 20 Mar 2024 11:36:39 -0400 Subject: [PATCH 179/378] Lul, don't overwrite 'tb_str' with src actor's.. This is what was breaking the nested debugger test (where it was failing on the traceback content matching) and it makes sense.. 
XD => We always want to use the locally boxed `RemoteActorError`'s traceback content NOT overwrite it with that from the src actor.. Also gets rid of setting the `'relay_uid'` since it's pulled from the final element in the `'relay_path'` anyway. --- tractor/_exceptions.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index b28a4a75..0e1d6d10 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -111,7 +111,6 @@ class RemoteActorError(Exception): reprol_fields: list[str] = [ 'src_uid', 'relay_path', - # 'relay_uid', ] def __init__( @@ -487,14 +486,11 @@ def pack_error( else: tb_str = traceback.format_exc() - our_uid: tuple = current_actor().uid - error_msg: dict[ + error_msg: dict[ # for IPC str, str | tuple[str, str] - ] = { - 'tb_str': tb_str, - 'relay_uid': our_uid, - } + ] = {} + our_uid: tuple = current_actor().uid if ( isinstance(exc, RemoteActorError) @@ -535,6 +531,11 @@ def pack_error( [], ).append(our_uid) + # XXX NOTE: always ensure the traceback-str is from the + # locally raised error (**not** the prior relay's boxed + # content's `.msgdata`). + error_msg['tb_str'] = tb_str + pkt: dict = {'error': error_msg} if cid: pkt['cid'] = cid -- 2.34.1 From c04d77a3c9b1347b9f90eb61f827bf0b1c30383d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 20 Mar 2024 19:13:13 -0400 Subject: [PATCH 180/378] First draft workin minus non-main-thread usage! --- examples/debugging/sync_bp.py | 69 +++++++++++++ tractor/devx/_debug.py | 177 ++++++++++++++++++++-------------- 2 files changed, 172 insertions(+), 74 deletions(-) create mode 100644 examples/debugging/sync_bp.py diff --git a/examples/debugging/sync_bp.py b/examples/debugging/sync_bp.py new file mode 100644 index 00000000..49f4d9aa --- /dev/null +++ b/examples/debugging/sync_bp.py @@ -0,0 +1,69 @@ +import trio +import tractor + + +def sync_pause(): + tractor.pause_from_sync() + + +@tractor.context +async def start_n_sync_pause( + ctx: tractor.Context, +): + # sync to requesting peer + await ctx.started() + + actor: tractor.Actor = tractor.current_actor() + print(f'entering SYNC PAUSE in {actor.uid}') + sync_pause() + print(f'back from SYNC PAUSE in {actor.uid}') + + +async def main() -> None: + + from tractor._rpc import maybe_import_gb + + async with tractor.open_nursery( + debug_mode=True, + ) as an: + + # TODO: where to put this? + # => just inside `open_root_actor()` yah? + await maybe_import_gb() + + p: tractor.Portal = await an.start_actor( + 'subactor', + enable_modules=[__name__], + # infect_asyncio=True, + debug_mode=True, + loglevel='cancel', + ) + + # TODO: 3 sub-actor usage cases: + # -[ ] via a `.run_in_actor()` call + # -[ ] via a `.run()` + # -[ ] via a `.open_context()` + # + async with p.open_context( + start_n_sync_pause, + ) as (ctx, first): + assert first is None + + await tractor.pause() + sync_pause() + + # TODO: make this work!! + await trio.to_thread.run_sync( + sync_pause, + abandon_on_cancel=False, + ) + + await ctx.cancel() + + # TODO: case where we cancel from trio-side while asyncio task + # has debugger lock? 
+ await p.cancel_actor() + + +if __name__ == '__main__': + trio.run(main) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 3203af1b..105d2ca4 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -46,7 +46,7 @@ import pdbp import tractor import trio from trio.lowlevel import current_task -from trio_typing import ( +from trio import ( TaskStatus, # Task, ) @@ -400,7 +400,6 @@ async def wait_for_parent_stdin_hijack( # this syncs to child's ``Context.started()`` call. async with portal.open_context( - lock_tty_for_child, subactor_uid=actor_uid, @@ -682,7 +681,10 @@ def _set_trace( async def _pause( debug_func: Callable = _set_trace, - release_lock_signal: trio.Event | None = None, + + # NOTE: must be passed in the `.pause_from_sync()` case! + pdb: MultiActorPdb|None = None, + undo_sigint: Callable|None = None, # TODO: allow caller to pause despite task cancellation, # exactly the same as wrapping with: @@ -691,8 +693,7 @@ async def _pause( # => the REMAINING ISSUE is that the scope's .__exit__() frame # is always show in the debugger on entry.. and there seems to # be no way to override it?.. - # shield: bool = False, - + # shield: bool = False, task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED @@ -707,7 +708,6 @@ async def _pause( ''' __tracebackhide__: bool = True actor = current_actor() - pdb, undo_sigint = mk_mpdb() task_name: str = trio.lowlevel.current_task().name if ( @@ -716,9 +716,14 @@ async def _pause( ): Lock.local_pdb_complete = trio.Event() - debug_func = partial( - debug_func, - ) + if debug_func is not None: + debug_func = partial( + debug_func, + ) + + if pdb is None: + assert undo_sigint is None, 'You must pass both!?!' + pdb, undo_sigint = mk_mpdb() # TODO: need a more robust check for the "root" actor if ( @@ -761,12 +766,14 @@ async def _pause( # ``` # but not entirely sure if that's a sane way to implement it? try: + print("ACQUIRING TTY LOCK from CHILD") with trio.CancelScope(shield=True): await actor._service_n.start( wait_for_parent_stdin_hijack, actor.uid, ) Lock.repl = pdb + except RuntimeError: Lock.release() @@ -779,11 +786,13 @@ async def _pause( raise elif is_root_process(): + print("ROOT TTY LOCK BRANCH") # we also wait in the root-parent for any child that # may have the tty locked prior # TODO: wait, what about multiple root tasks acquiring it though? if Lock.global_actor_in_debug == actor.uid: + print("ROOT ALREADY HAS TTY?") # re-entrant root process already has it: noop. return @@ -797,11 +806,14 @@ async def _pause( # must shield here to avoid hitting a ``Cancelled`` and # a child getting stuck bc we clobbered the tty + print("ACQUIRING TTY LOCK from ROOT") with trio.CancelScope(shield=True): await Lock._debug_lock.acquire() else: # may be cancelled + print("ROOT TRYING LOCK ACQUIRE") await Lock._debug_lock.acquire() + print("ROOT LOCKED TTY") Lock.global_actor_in_debug = actor.uid Lock.local_task_in_debug = task_name @@ -811,32 +823,27 @@ async def _pause( # TODO: do we want to support using this **just** for the # locking / common code (prolly to help address #320)? # - # if debug_func is None: - # assert release_lock_signal, ( - # 'Must pass `release_lock_signal: trio.Event` if no ' - # 'trace func provided!' 
- # ) - # print(f"{actor.uid} ENTERING WAIT") - # with trio.CancelScope(shield=True): - # await release_lock_signal.wait() + if debug_func is None: + task_status.started(Lock) + print("ROOT .started(Lock) now!") - # else: + else: # block here one (at the appropriate frame *up*) where # ``breakpoint()`` was awaited and begin handling stdio. - log.debug('Entering sync world of the `pdb` REPL..') - try: - debug_func( - actor, - pdb, - extra_frames_up_when_async=2, - shield=shield, - ) - except BaseException: - log.exception( - 'Failed to invoke internal `debug_func = ' - f'{debug_func.func.__name__}`\n' - ) - raise + log.debug('Entering sync world of the `pdb` REPL..') + try: + debug_func( + actor, + pdb, + extra_frames_up_when_async=2, + shield=shield, + ) + except BaseException: + log.exception( + 'Failed to invoke internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + raise except bdb.BdbQuit: Lock.release() @@ -862,8 +869,7 @@ async def _pause( async def pause( - debug_func: Callable = _set_trace, - release_lock_signal: trio.Event | None = None, + debug_func: Callable|None = _set_trace, # TODO: allow caller to pause despite task cancellation, # exactly the same as wrapping with: @@ -872,10 +878,11 @@ async def pause( # => the REMAINING ISSUE is that the scope's .__exit__() frame # is always show in the debugger on entry.. and there seems to # be no way to override it?.. - # shield: bool = False, - + # shield: bool = False, - task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED + task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED, + + **_pause_kwargs, ) -> None: ''' @@ -920,16 +927,16 @@ async def pause( task_status.started(cs) return await _pause( debug_func=debug_func, - release_lock_signal=release_lock_signal, shield=True, task_status=task_status, + **_pause_kwargs ) else: return await _pause( debug_func=debug_func, - release_lock_signal=release_lock_signal, shield=False, task_status=task_status, + **_pause_kwargs ) @@ -938,46 +945,64 @@ async def pause( # TODO: allow pausing from sync code. # normally by remapping python's builtin breakpoint() hook to this # runtime aware version which takes care of all . -def pause_from_sync() -> None: - print("ENTER SYNC PAUSE") +def pause_from_sync( + hide_tb: bool = True +) -> None: + + __tracebackhide__: bool = hide_tb actor: tractor.Actor = current_actor( err_on_no_runtime=False, ) - if actor: - try: - import greenback - # __tracebackhide__ = True + print( + f'{actor.uid}: JUST ENTERED `tractor.pause_from_sync()`' + f'|_{actor}\n' + ) + if not actor: + raise RuntimeError( + 'Not inside the `tractor`-runtime?\n' + '`tractor.pause_from_sync()` is not functional without a wrapping\n' + '- `async with tractor.open_nursery()` or,\n' + '- `async with tractor.open_root_actor()`\n' + ) + try: + import greenback + except ModuleNotFoundError: + raise RuntimeError( + 'The `greenback` lib is required to use `tractor.pause_from_sync()`!\n' + 'https://github.com/oremanj/greenback\n' + ) - # task_can_release_tty_lock = trio.Event() - - # spawn bg task which will lock out the TTY, we poll - # just below until the release event is reporting that task as - # waiting.. 
not the most ideal but works for now ;) - greenback.await_( - actor._service_n.start(partial( - pause, - debug_func=None, - # release_lock_signal=task_can_release_tty_lock, - )) - ) - - except ModuleNotFoundError: - log.warning('NO GREENBACK FOUND') - else: - log.warning('Not inside actor-runtime') + # out = greenback.await_( + # actor._service_n.start(partial( + # pause, + # debug_func=None, + # release_lock_signal=task_can_release_tty_lock, + # )) + # ) + # spawn bg task which will lock out the TTY, we poll + # just below until the release event is reporting that task as + # waiting.. not the most ideal but works for now ;) db, undo_sigint = mk_mpdb() - Lock.local_task_in_debug = 'sync' - # db.config.enable_hidden_frames = True + greenback.await_( + pause( + debug_func=None, + pdb=db, + undo_sigint=undo_sigint, + ) + ) - # we entered the global ``breakpoint()`` built-in from sync + Lock.local_task_in_debug = 'sync' + + # TODO: ensure we aggressively make the user aware about + # entering the global ``breakpoint()`` built-in from sync # code? frame: FrameType | None = sys._getframe() - # print(f'FRAME: {str(frame)}') - # assert not db._is_hidden(frame) - frame: FrameType = frame.f_back # type: ignore + + # db.config.enable_hidden_frames = True + # assert not db._is_hidden(frame) # print(f'FRAME: {str(frame)}') # if not db._is_hidden(frame): # pdbp.set_trace() @@ -985,17 +1010,21 @@ def pause_from_sync() -> None: # (frame, frame.f_lineno) # ) db.set_trace(frame=frame) - # NOTE XXX: see the `@pdbp.hideframe` decoration - # on `Lock.unshield_sigint()`.. I have NO CLUE why + + # XXX NOTE XXX no other LOC can be here without it + # showing up in the REPL's last stack frame !?! + # -[ ] tried to use `@pdbp.hideframe` decoration but + # still doesn't work + # + # FROM BEFORE: on `Lock.unshield_sigint()`.. I have NO CLUE why # the next instruction's def frame is being shown # in the tb but it seems to be something wonky with # the way `pdb` core works? + # + # NOTE: not needed any more anyway since it's all in + # `Lock.release()` now! # undo_sigint() - # Lock.global_actor_in_debug = actor.uid - # Lock.release() - # task_can_release_tty_lock.set() - # using the "pause" semantics instead since # that better covers actually somewhat "pausing the runtime" -- 2.34.1 From 4f863a698953c19ad5b1e66c78407cf1193ebe22 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 22 Mar 2024 16:41:49 -0400 Subject: [PATCH 181/378] Refine and test `tractor.pause_from_sync()` Now supports use from any `trio` task, any sync thread started with `trio.to_thread.run_sync()` AND also via `breakpoint()` builtin API! The only bit missing now is support for `asyncio` tasks when in infected mode.. Bo `greenback` setup/API adjustments: - move `._rpc.maybe_import_gb()` to -> `devx._debug` and factor out the cached import checking into a sync func whilst placing the async `.ensure_portal()` bootstrapping into a new async `maybe_init_greenback()`. - use the new init-er func inside `open_root_actor()` with the output predicating whether we override the `breakpoint()` hook. core `devx._debug` implementation deatz: - make `mk_mpdb()` only return the `pdp.Pdb` subtype instance since the sigint unshielding func is now accessible from the `Lock` singleton from anywhere. - add non-main thread support (at least for `trio.to_thread` use cases) to our `Lock` with a new `.is_trio_thread()` predicate that delegates directly to `trio`'s internal version. 
- do `Lock.is_trio_thread()` checks inside any methods which require special provisions when invoked from a non-main `trio` thread: - `.[un]shield_sigint()` methods since `signal.signal` usage is only allowed from cpython's main thread. - `.release()` since `trio.StrictFIFOLock` can only be called from a `trio` task. - rework `.pause_from_sync()` itself to directly call `._set_trace()` and don't bother with `greenback._await()` when we're already calling it from a `.to_thread.run_sync()` thread, oh and try to use the thread/task name when setting `Lock.local_task_in_debug`. - make it an RTE for now if you try to use `.pause_from_sync()` from any infected-`asyncio` task, but support is (hopefully) coming soon! For testing we add a new `test_debugger.py::test_pause_from_sync()` which includes a ctrl-c parametrization around the `examples/debugging/sync_bp.py` script which includes all currently supported/working usages: - `tractor.pause_from_sync()`. - via `breakpoint()` overload. - from a `trio.to_thread.run_sync()` spawn. --- examples/debugging/sync_bp.py | 20 ++- tests/test_debugger.py | 64 +++++++ tractor/_root.py | 18 +- tractor/_rpc.py | 26 +-- tractor/devx/_debug.py | 330 +++++++++++++++++++++++----------- 5 files changed, 322 insertions(+), 136 deletions(-) diff --git a/examples/debugging/sync_bp.py b/examples/debugging/sync_bp.py index 49f4d9aa..23469d6c 100644 --- a/examples/debugging/sync_bp.py +++ b/examples/debugging/sync_bp.py @@ -2,8 +2,18 @@ import trio import tractor -def sync_pause(): - tractor.pause_from_sync() +def sync_pause( + use_builtin: bool = True, + error: bool = False, +): + if use_builtin: + breakpoint() + + else: + tractor.pause_from_sync() + + if error: + raise RuntimeError('yoyo sync code error') @tractor.context @@ -21,16 +31,10 @@ async def start_n_sync_pause( async def main() -> None: - from tractor._rpc import maybe_import_gb - async with tractor.open_nursery( debug_mode=True, ) as an: - # TODO: where to put this? - # => just inside `open_root_actor()` yah? - await maybe_import_gb() - p: tractor.Portal = await an.start_actor( 'subactor', enable_modules=[__name__], diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 20e67aba..38a3bc2c 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -1027,3 +1027,67 @@ def test_different_debug_mode_per_actor( # instead crashed completely assert "tractor._exceptions.RemoteActorError: ('crash_boi'" in before assert "RuntimeError" in before + + + +def test_pause_from_sync( + spawn, + ctlc: bool +): + ''' + Verify we can use the `pdbp` REPL from sync functions AND from + any thread spawned with `trio.to_thread.run_sync()`. + + `examples/debugging/sync_bp.py` + + ''' + child = spawn('sync_bp') + child.expect(PROMPT) + assert_before( + child, + [ + '`greenback` portal opened!', + # pre-prompt line + _pause_msg, "('root'", + ] + ) + if ctlc: + do_ctlc(child) + child.sendline('c') + child.expect(PROMPT) + + # XXX shouldn't see gb loaded again + before = str(child.before.decode()) + assert not in_prompt_msg( + before, + ['`greenback` portal opened!'], + ) + assert_before( + child, + [_pause_msg, "('root'",], + ) + + if ctlc: + do_ctlc(child) + child.sendline('c') + child.expect(PROMPT) + assert_before( + child, + [_pause_msg, "('subactor'",], + ) + + if ctlc: + do_ctlc(child) + child.sendline('c') + child.expect(PROMPT) + # non-main thread case + # TODO: should we agument the pre-prompt msg in this case? 
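# NB: a brief, illustrative sketch of the (PEP 553) `breakpoint()`
# dispatch machinery that the `use_builtin=True` path in `sync_bp.py`
# relies on: the builtin just calls `sys.breakpointhook()`, and the
# *default* hook imports-and-calls whatever dotted path is set in the
# `PYTHONBREAKPOINT` env var - which is how `open_root_actor()` can
# route it to `tractor.devx._debug.pause_from_sync`.
import sys


def my_hook(*args, **kwargs) -> None:
    print('breakpoint() was re-routed here!')


sys.breakpointhook = my_hook
breakpoint()  # prints the msg above instead of entering `pdb`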
+ assert_before( + child, + [_pause_msg, "('root'",], + ) + + if ctlc: + do_ctlc(child) + child.sendline('c') + child.expect(pexpect.EOF) diff --git a/tractor/_root.py b/tractor/_root.py index 54451918..91d7c83f 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -94,12 +94,24 @@ async def open_root_actor( Runtime init entry point for ``tractor``. ''' + # TODO: stick this in a `@cm` defined in `devx._debug`? + # # Override the global debugger hook to make it play nice with # ``trio``, see much discussion in: # https://github.com/python-trio/trio/issues/1155#issuecomment-742964018 - builtin_bp_handler = sys.breakpointhook - orig_bp_path: str | None = os.environ.get('PYTHONBREAKPOINT', None) - os.environ['PYTHONBREAKPOINT'] = 'tractor.devx._debug.pause_from_sync' + if ( + await _debug.maybe_init_greenback( + raise_not_found=False, + ) + ): + builtin_bp_handler = sys.breakpointhook + orig_bp_path: str|None = os.environ.get( + 'PYTHONBREAKPOINT', + None, + ) + os.environ['PYTHONBREAKPOINT'] = ( + 'tractor.devx._debug.pause_from_sync' + ) # attempt to retreive ``trio``'s sigint handler and stash it # on our debugger lock state. diff --git a/tractor/_rpc.py b/tractor/_rpc.py index d369b41c..91482a07 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -26,7 +26,6 @@ from contextlib import ( from functools import partial import inspect from pprint import pformat -from types import ModuleType from typing import ( Any, Callable, @@ -337,27 +336,6 @@ async def _errors_relayed_via_ipc( actor._ongoing_rpc_tasks.set() -_gb_mod: ModuleType|None|False = None - - -async def maybe_import_gb(): - global _gb_mod - if _gb_mod is False: - return - - try: - import greenback - _gb_mod = greenback - await greenback.ensure_portal() - - except ModuleNotFoundError: - log.debug( - '`greenback` is not installed.\n' - 'No sync debug support!\n' - ) - _gb_mod = False - - async def _invoke( actor: Actor, @@ -385,7 +363,9 @@ async def _invoke( treat_as_gen: bool = False if _state.debug_mode(): - await maybe_import_gb() + # XXX for .pause_from_sync()` usage we need to make sure + # `greenback` is boostrapped in the subactor! + await _debug.maybe_init_greenback() # TODO: possibly a specially formatted traceback # (not sure what typing is for this..)? diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 105d2ca4..770995a2 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -33,14 +33,19 @@ from functools import ( import os import signal import sys +import threading import traceback from typing import ( Any, Callable, AsyncIterator, AsyncGenerator, + TYPE_CHECKING, +) +from types import ( + FrameType, + ModuleType, ) -from types import FrameType import pdbp import tractor @@ -51,17 +56,22 @@ from trio import ( # Task, ) -from ..log import get_logger -from .._state import ( +from tractor.log import get_logger +from tractor._state import ( current_actor, is_root_process, debug_mode, ) -from .._exceptions import ( +from tractor._exceptions import ( is_multi_cancelled, ContextCancelled, ) -from .._ipc import Channel +from tractor._ipc import Channel + +if TYPE_CHECKING: + from tractor._runtime import ( + Actor, + ) log = get_logger(__name__) @@ -116,10 +126,36 @@ class Lock: @classmethod def shield_sigint(cls): - cls._orig_sigint_handler = signal.signal( - signal.SIGINT, - shield_sigint_handler, - ) + ''' + Shield out SIGINT handling (which by default triggers + `trio.Task` cancellation) in subactors when the `pdb` REPL + is active. 
+ + Avoids cancellation of the current actor (task) when the + user mistakenly sends ctl-c or a signal is received from + an external request; explicit runtime cancel requests are + allowed until the use exits the REPL session using + 'continue' or 'quit', at which point the orig SIGINT + handler is restored. + + ''' + # + # XXX detect whether we're running from a non-main thread + # in which case schedule the SIGINT shielding override + # to in the main thread. + # https://docs.python.org/3/library/signal.html#signals-and-threads + if not cls.is_trio_thread(): + cls._orig_sigint_handler: Callable = trio.from_thread.run_sync( + signal.signal, + signal.SIGINT, + shield_sigint_handler, + ) + + else: + cls._orig_sigint_handler = signal.signal( + signal.SIGINT, + shield_sigint_handler, + ) @classmethod @pdbp.hideframe # XXX NOTE XXX see below in `.pause_from_sync()` @@ -127,13 +163,46 @@ class Lock: # always restore ``trio``'s sigint handler. see notes below in # the pdb factory about the nightmare that is that code swapping # out the handler when the repl activates... - signal.signal(signal.SIGINT, cls._trio_handler) + if not cls.is_trio_thread(): + trio.from_thread.run_sync( + signal.signal, + signal.SIGINT, + cls._trio_handler, + ) + else: + signal.signal( + signal.SIGINT, + cls._trio_handler, + ) + cls._orig_sigint_handler = None + @classmethod + def is_trio_thread(self) -> bool: + ''' + Check if we're the "main" thread (as in the first one + started by cpython) and presume that it is the thread that + called `trio.run()` and not some thread spawned with + `trio.to_thread.run_sync()`. + + ''' + return trio._util.is_main_thread() + # XXX apparently unreliable..see ^ + # ( + # threading.current_thread() + # is not threading.main_thread() + # ) + @classmethod def release(cls): try: - cls._debug_lock.release() + if not cls.is_trio_thread(): + trio.from_thread.run_sync( + cls._debug_lock.release + ) + else: + cls._debug_lock.release() + except RuntimeError: # uhhh makes no sense but been seeing the non-owner # release error even though this is definitely the task @@ -437,11 +506,31 @@ async def wait_for_parent_stdin_hijack( log.debug('Exiting debugger from child') -def mk_mpdb() -> tuple[MultiActorPdb, Callable]: +def mk_mpdb() -> MultiActorPdb: + ''' + Deliver a new `MultiActorPdb`: a multi-process safe `pdbp` + REPL using the magic of SC! + Our `pdb.Pdb` subtype accomplishes multi-process safe debugging + by: + + - mutexing access to the root process' TTY & stdstreams + via an IPC managed `Lock` singleton per process tree. + + - temporarily overriding any subactor's SIGINT handler to shield during + live REPL sessions in sub-actors such that cancellation is + never (mistakenly) triggered by a ctrl-c and instead only + by either explicit requests in the runtime or + + ''' pdb = MultiActorPdb() - # signal.signal = pdbp.hideframe(signal.signal) + # Always shield out SIGINTs for subactors when REPL is active. + # + # XXX detect whether we're running from a non-main thread + # in which case schedule the SIGINT shielding override + # to in the main thread. + # https://docs.python.org/3/library/signal.html#signals-and-threads Lock.shield_sigint() # XXX: These are the important flags mentioned in @@ -450,7 +539,7 @@ def mk_mpdb() -> tuple[MultiActorPdb, Callable]: pdb.allow_kbdint = True pdb.nosigint = True - return pdb, Lock.unshield_sigint + return pdb def shield_sigint_handler( @@ -463,17 +552,16 @@ def shield_sigint_handler( ''' Specialized, debugger-aware SIGINT handler. 
- In childred we always ignore to avoid deadlocks since cancellation - should always be managed by the parent supervising actor. The root - is always cancelled on ctrl-c. + In childred we always ignore/shield for SIGINT to avoid + deadlocks since cancellation should always be managed by the + supervising parent actor. The root actor-proces is always + cancelled on ctrl-c. ''' - __tracebackhide__ = True + __tracebackhide__: bool = True + uid_in_debug: tuple[str, str]|None = Lock.global_actor_in_debug - uid_in_debug: tuple[str, str] | None = Lock.global_actor_in_debug - - actor = current_actor() - # print(f'{actor.uid} in HANDLER with ') + actor: Actor = current_actor() def do_cancel(): # If we haven't tried to cancel the runtime then do that instead @@ -508,7 +596,7 @@ def shield_sigint_handler( return do_cancel() # only set in the actor actually running the REPL - pdb_obj: MultiActorPdb | None = Lock.repl + pdb_obj: MultiActorPdb|None = Lock.repl # root actor branch that reports whether or not a child # has locked debugger. @@ -615,14 +703,20 @@ _pause_msg: str = 'Attaching to pdb REPL in actor' def _set_trace( - actor: tractor.Actor | None = None, - pdb: MultiActorPdb | None = None, + actor: tractor.Actor|None = None, + pdb: MultiActorPdb|None = None, shield: bool = False, extra_frames_up_when_async: int = 1, + hide_tb: bool = True, ): - __tracebackhide__: bool = True - actor: tractor.Actor = actor or current_actor() + __tracebackhide__: bool = hide_tb + + actor: tractor.Actor = ( + actor + or + current_actor() + ) # always start 1 level up from THIS in user code. frame: FrameType|None @@ -668,13 +762,8 @@ def _set_trace( f'Going up frame {i} -> {frame}\n' ) - else: - pdb, undo_sigint = mk_mpdb() - - # we entered the global ``breakpoint()`` built-in from sync - # code? - Lock.local_task_in_debug = 'sync' - + # engage ze REPL + # B~() pdb.set_trace(frame=frame) @@ -684,7 +773,6 @@ async def _pause( # NOTE: must be passed in the `.pause_from_sync()` case! pdb: MultiActorPdb|None = None, - undo_sigint: Callable|None = None, # TODO: allow caller to pause despite task cancellation, # exactly the same as wrapping with: @@ -695,6 +783,7 @@ async def _pause( # be no way to override it?.. # shield: bool = False, + hide_tb: bool = True, task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED ) -> None: @@ -706,9 +795,16 @@ async def _pause( Hopefully we won't need this in the long run. ''' - __tracebackhide__: bool = True - actor = current_actor() - task_name: str = trio.lowlevel.current_task().name + __tracebackhide__: bool = hide_tb + actor: Actor = current_actor() + try: + task_name: str = trio.lowlevel.current_task().name + except RuntimeError as rte: + if actor.is_infected_aio(): + raise RuntimeError( + '`tractor.pause[_from_sync]()` not yet supported ' + 'for infected `asyncio` mode!' + ) from rte if ( not Lock.local_pdb_complete @@ -722,8 +818,7 @@ async def _pause( ) if pdb is None: - assert undo_sigint is None, 'You must pass both!?!' - pdb, undo_sigint = mk_mpdb() + pdb: MultiActorPdb = mk_mpdb() # TODO: need a more robust check for the "root" actor if ( @@ -766,7 +861,6 @@ async def _pause( # ``` # but not entirely sure if that's a sane way to implement it? 
try: - print("ACQUIRING TTY LOCK from CHILD") with trio.CancelScope(shield=True): await actor._service_n.start( wait_for_parent_stdin_hijack, @@ -786,13 +880,11 @@ async def _pause( raise elif is_root_process(): - print("ROOT TTY LOCK BRANCH") # we also wait in the root-parent for any child that # may have the tty locked prior # TODO: wait, what about multiple root tasks acquiring it though? if Lock.global_actor_in_debug == actor.uid: - print("ROOT ALREADY HAS TTY?") # re-entrant root process already has it: noop. return @@ -806,14 +898,11 @@ async def _pause( # must shield here to avoid hitting a ``Cancelled`` and # a child getting stuck bc we clobbered the tty - print("ACQUIRING TTY LOCK from ROOT") with trio.CancelScope(shield=True): await Lock._debug_lock.acquire() else: # may be cancelled - print("ROOT TRYING LOCK ACQUIRE") await Lock._debug_lock.acquire() - print("ROOT LOCKED TTY") Lock.global_actor_in_debug = actor.uid Lock.local_task_in_debug = task_name @@ -825,7 +914,6 @@ async def _pause( # if debug_func is None: task_status.started(Lock) - print("ROOT .started(Lock) now!") else: # block here one (at the appropriate frame *up*) where @@ -940,20 +1028,74 @@ async def pause( ) +_gb_mod: None|ModuleType|False = None + + +def maybe_import_greenback( + raise_not_found: bool = True, + force_reload: bool = False, + +) -> ModuleType|False: + # be cached-fast on module-already-inited + global _gb_mod + + if _gb_mod is False: + return False + + elif ( + _gb_mod is not None + and not force_reload + ): + return _gb_mod + + try: + import greenback + _gb_mod = greenback + return greenback + + except ModuleNotFoundError as mnf: + log.debug( + '`greenback` is not installed.\n' + 'No sync debug support!\n' + ) + _gb_mod = False + + if raise_not_found: + raise RuntimeError( + 'The `greenback` lib is required to use `tractor.pause_from_sync()`!\n' + 'https://github.com/oremanj/greenback\n' + ) from mnf + + return False + + +async def maybe_init_greenback( + **kwargs, +) -> None|ModuleType: + + if mod := maybe_import_greenback(**kwargs): + await mod.ensure_portal() + log.info( + '`greenback` portal opened!\n' + 'Sync debug support activated!\n' + ) + return mod + + return None # TODO: allow pausing from sync code. # normally by remapping python's builtin breakpoint() hook to this # runtime aware version which takes care of all . def pause_from_sync( - hide_tb: bool = True + hide_tb: bool = False, ) -> None: __tracebackhide__: bool = hide_tb actor: tractor.Actor = current_actor( err_on_no_runtime=False, ) - print( + log.debug( f'{actor.uid}: JUST ENTERED `tractor.pause_from_sync()`' f'|_{actor}\n' ) @@ -965,73 +1107,57 @@ def pause_from_sync( '- `async with tractor.open_root_actor()`\n' ) - try: - import greenback - except ModuleNotFoundError: - raise RuntimeError( - 'The `greenback` lib is required to use `tractor.pause_from_sync()`!\n' - 'https://github.com/oremanj/greenback\n' + # raises on not-found by default + greenback: ModuleType = maybe_import_greenback() + mdb: MultiActorPdb = mk_mpdb() + + # run async task which will lock out the root proc's TTY. + if not Lock.is_trio_thread(): + trio.from_thread.run( + partial( + pause, + debug_func=None, + pdb=mdb, + hide_tb=hide_tb, + ) ) + # TODO: maybe the `trio.current_task()` id/name if avail? 
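# NB: a minimal, stand-alone sketch (no `tractor` involved) of the
# `trio.to_thread`/`trio.from_thread` round-trip which the
# non-main-thread branch above depends on: a thread spawned with
# `trio.to_thread.run_sync()` may call back into the parent `trio.run()`
# via `trio.from_thread.run()` (async fns) or `.run_sync()` (sync fns).
import trio


def sync_work() -> str:
    # re-enter the trio "run" that spawned this thread
    trio.from_thread.run(trio.sleep, 0.1)
    return 'done'


async def demo() -> None:
    result: str = await trio.to_thread.run_sync(sync_work)
    assert result == 'done'


trio.run(demo)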
+ Lock.local_task_in_debug: str = str(threading.current_thread().name) - # out = greenback.await_( - # actor._service_n.start(partial( - # pause, - # debug_func=None, - # release_lock_signal=task_can_release_tty_lock, - # )) - # ) - - # spawn bg task which will lock out the TTY, we poll - # just below until the release event is reporting that task as - # waiting.. not the most ideal but works for now ;) - db, undo_sigint = mk_mpdb() - greenback.await_( - pause( - debug_func=None, - pdb=db, - undo_sigint=undo_sigint, + else: # we are presumably the `trio.run()` + main thread + greenback.await_( + pause( + debug_func=None, + pdb=mdb, + hide_tb=hide_tb, + ) ) - ) - - Lock.local_task_in_debug = 'sync' + Lock.local_task_in_debug: str = current_task().name # TODO: ensure we aggressively make the user aware about # entering the global ``breakpoint()`` built-in from sync # code? - frame: FrameType | None = sys._getframe() - frame: FrameType = frame.f_back # type: ignore - - # db.config.enable_hidden_frames = True - # assert not db._is_hidden(frame) - # print(f'FRAME: {str(frame)}') - # if not db._is_hidden(frame): - # pdbp.set_trace() - # db._hidden_frames.append( - # (frame, frame.f_lineno) - # ) - db.set_trace(frame=frame) + _set_trace( + actor=actor, + pdb=mdb, + hide_tb=hide_tb, + extra_frames_up_when_async=1, + # TODO? will we ever need it? + # -> the gb._await() won't be affected by cancellation? + # shield=shield, + ) + # LEGACY NOTE on next LOC's frame showing weirdness.. + # # XXX NOTE XXX no other LOC can be here without it # showing up in the REPL's last stack frame !?! # -[ ] tried to use `@pdbp.hideframe` decoration but # still doesn't work - # - # FROM BEFORE: on `Lock.unshield_sigint()`.. I have NO CLUE why - # the next instruction's def frame is being shown - # in the tb but it seems to be something wonky with - # the way `pdb` core works? - # - # NOTE: not needed any more anyway since it's all in - # `Lock.release()` now! - # undo_sigint() - - -# using the "pause" semantics instead since -# that better covers actually somewhat "pausing the runtime" -# for this particular paralell task to do debugging B) -# pp = pause # short-hand for "pause point" +# NOTE prefer a new "pause" semantic since it better describes +# "pausing the actor's runtime" for this particular +# paralell task to do debugging in a REPL. async def breakpoint(**kwargs): log.warning( '`tractor.breakpoint()` is deprecated!\n' -- 2.34.1 From 0055c1d954b814848530fda81489d578c8545058 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 24 Mar 2024 16:39:59 -0400 Subject: [PATCH 182/378] Tweak main thread predicate to ensure `trio.run()` Change the name to `Lock.is_main_trio_thread()` indicating that when `True` the thread is both the main one **and** the one that called `trio.run()`. Add a todo for just copying the `trio._util.is_main_thread()` impl (since it's private / may change) and some brief notes about potential usage of `trio.from_thread.check_cancelled()` to detect non-`.to_thread` thread spawns. --- tractor/devx/_debug.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 770995a2..bb5740bc 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -48,6 +48,7 @@ from types import ( ) import pdbp +import sniffio import tractor import trio from trio.lowlevel import current_task @@ -144,7 +145,7 @@ class Lock: # in which case schedule the SIGINT shielding override # to in the main thread. 
# https://docs.python.org/3/library/signal.html#signals-and-threads - if not cls.is_trio_thread(): + if not cls.is_main_trio_thread(): cls._orig_sigint_handler: Callable = trio.from_thread.run_sync( signal.signal, signal.SIGINT, @@ -163,7 +164,7 @@ class Lock: # always restore ``trio``'s sigint handler. see notes below in # the pdb factory about the nightmare that is that code swapping # out the handler when the repl activates... - if not cls.is_trio_thread(): + if not cls.is_main_trio_thread(): trio.from_thread.run_sync( signal.signal, signal.SIGINT, @@ -178,15 +179,21 @@ class Lock: cls._orig_sigint_handler = None @classmethod - def is_trio_thread(self) -> bool: + def is_main_trio_thread(cls) -> bool: ''' Check if we're the "main" thread (as in the first one - started by cpython) and presume that it is the thread that + started by cpython) AND that it is ALSO the thread that called `trio.run()` and not some thread spawned with `trio.to_thread.run_sync()`. ''' - return trio._util.is_main_thread() + return ( + # TODO: since this is private, @oremanj says + # we should just copy the impl for now.. + trio._util.is_main_thread() + and + sniffio.current_async_library() == 'trio' + ) # XXX apparently unreliable..see ^ # ( # threading.current_thread() @@ -196,7 +203,7 @@ class Lock: @classmethod def release(cls): try: - if not cls.is_trio_thread(): + if not cls.is_main_trio_thread(): trio.from_thread.run_sync( cls._debug_lock.release ) @@ -1112,7 +1119,16 @@ def pause_from_sync( mdb: MultiActorPdb = mk_mpdb() # run async task which will lock out the root proc's TTY. - if not Lock.is_trio_thread(): + if not Lock.is_main_trio_thread(): + + # TODO: we could also check for a non-`.to_thread` context + # using `trio.from_thread.check_cancelled()` (says + # oremanj) wherein we get the following outputs: + # + # `RuntimeError`: non-`.to_thread` spawned thread + # noop: non-cancelled `.to_thread` + # `trio.Cancelled`: cancelled `.to_thread` + # trio.from_thread.run( partial( pause, -- 2.34.1 From 2588e54867066953343d63bb80b74653595de1f2 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 24 Mar 2024 16:49:07 -0400 Subject: [PATCH 183/378] Add todo-notes for hiding `@acm` frames In the particular case of the `Portal.open_context().__aexit__()` frame, due to usage of `contextlib.asynccontextmanager`, we can't easily hook into monkeypatching a `__tracebackhide__` set nor catch-n-reraise around the block exit without defining our own `.__aexit__()` impl. Thus, it's prolly most sane to do something with an override of `contextlib._AsyncGeneratorContextManager` or the public exposed `AsyncContextDecorator` (which uses the former internally right?). Also fixup some old `._invoke` mod paths in comments and just show `str(eoc)` in `.open_stream().__aexit__()` terminated-by-EoC log msg since the `repr()` form won't pprint the IPC msg nicely.. --- tractor/_context.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 11975bae..3c2490a3 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -351,7 +351,7 @@ class Context: by the runtime in 2 ways: - by entering ``Portal.open_context()`` which is the primary public API for any "caller" task or, - - by the RPC machinery's `._runtime._invoke()` as a `ctx` arg + - by the RPC machinery's `._rpc._invoke()` as a `ctx` arg to a remotely scheduled "callee" function. AND is always constructed using the below ``mk_context()``. 
@@ -361,10 +361,10 @@ class Context: `trio.Task`s. Contexts are allocated on each side of any task RPC-linked msg dialog, i.e. for every request to a remote actor from a `Portal`. On the "callee" side a context is - always allocated inside ``._runtime._invoke()``. + always allocated inside ``._rpc._invoke()``. - # TODO: more detailed writeup on cancellation, error and - # streaming semantics.. + TODO: more detailed writeup on cancellation, error and + streaming semantics.. A context can be cancelled and (possibly eventually restarted) from either side of the underlying IPC channel, it can also open task @@ -1206,7 +1206,9 @@ class Context: # await pause() log.warning( 'Stream was terminated by EoC\n\n' - f'{repr(eoc)}\n' + # NOTE: won't show the error but + # does show txt followed by IPC msg. + f'{str(eoc)}\n' ) finally: @@ -1303,7 +1305,7 @@ class Context: # `._cancel_called == True`. not raise_overrun_from_self and isinstance(remote_error, RemoteActorError) - and remote_error.msgdata['type_str'] == 'StreamOverrun' + and remote_error.msgdata['boxed_type_str'] == 'StreamOverrun' and tuple(remote_error.msgdata['sender']) == our_uid ): # NOTE: we set the local scope error to any "self @@ -1880,6 +1882,19 @@ class Context: return False +# TODO: exception tb masking by using a manual +# `.__aexit__()`/.__aenter__()` pair on a type? +# => currently this is one of the few places we can't easily +# mask errors - on the exit side of a `Portal.open_context()`.. +# there's # => currently this is one of the few places we can't +# there's 2 ways to approach it: +# - manually write an @acm type as per above +# - use `contextlib.AsyncContextDecorator` to override the default +# impl to suppress traceback frames: +# * https://docs.python.org/3/library/contextlib.html#contextlib.AsyncContextDecorator +# * https://docs.python.org/3/library/contextlib.html#contextlib.ContextDecorator +# - also we could just override directly the underlying +# `contextlib._AsyncGeneratorContextManager`? @acm async def open_context_from_portal( portal: Portal, -- 2.34.1 From 507cd96904138e0ae0b84e6e52d974dc9c7cc013 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 25 Mar 2024 14:15:36 -0400 Subject: [PATCH 184/378] Change all `| None` -> `|None` in `._runtime` --- tractor/_runtime.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index ff3fb74f..e2d78d51 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -140,16 +140,16 @@ class Actor: msg_buffer_size: int = 2**6 # nursery placeholders filled in by `async_main()` after fork - _root_n: Nursery | None = None - _service_n: Nursery | None = None - _server_n: Nursery | None = None + _root_n: Nursery|None = None + _service_n: Nursery|None = None + _server_n: Nursery|None = None # Information about `__main__` from parent _parent_main_data: dict[str, str] - _parent_chan_cs: CancelScope | None = None + _parent_chan_cs: CancelScope|None = None # syncs for setup/teardown sequences - _server_down: trio.Event | None = None + _server_down: trio.Event|None = None # user toggled crash handling (including monkey-patched in # `trio.open_nursery()` via `.trionics._supervisor` B) @@ -178,7 +178,7 @@ class Actor: spawn_method: str|None = None, # TODO: remove! 
- arbiter_addr: tuple[str, int] | None = None, + arbiter_addr: tuple[str, int]|None = None, ) -> None: ''' @@ -193,7 +193,7 @@ class Actor: ) self._cancel_complete = trio.Event() - self._cancel_called_by_remote: tuple[str, tuple] | None = None + self._cancel_called_by_remote: tuple[str, tuple]|None = None self._cancel_called: bool = False # retreive and store parent `__main__` data which @@ -249,11 +249,11 @@ class Actor: ] = {} self._listeners: list[trio.abc.Listener] = [] - self._parent_chan: Channel | None = None - self._forkserver_info: tuple | None = None + self._parent_chan: Channel|None = None + self._forkserver_info: tuple|None = None self._actoruid2nursery: dict[ tuple[str, str], - ActorNursery | None, + ActorNursery|None, ] = {} # type: ignore # noqa # when provided, init the registry addresses property from @@ -779,7 +779,7 @@ class Actor: # # side: str|None = None, - msg_buffer_size: int | None = None, + msg_buffer_size: int|None = None, allow_overruns: bool = False, ) -> Context: @@ -844,7 +844,7 @@ class Actor: kwargs: dict, # IPC channel config - msg_buffer_size: int | None = None, + msg_buffer_size: int|None = None, allow_overruns: bool = False, load_nsf: bool = False, @@ -918,11 +918,11 @@ class Actor: async def _from_parent( self, - parent_addr: tuple[str, int] | None, + parent_addr: tuple[str, int]|None, ) -> tuple[ Channel, - list[tuple[str, int]] | None, + list[tuple[str, int]]|None, ]: ''' Bootstrap this local actor's runtime config from its parent by @@ -943,7 +943,7 @@ class Actor: # Initial handshake: swap names. await self._do_handshake(chan) - accept_addrs: list[tuple[str, int]] | None = None + accept_addrs: list[tuple[str, int]]|None = None if self._spawn_method == "trio": # Receive runtime state from our parent parent_data: dict[str, Any] @@ -1007,7 +1007,7 @@ class Actor: handler_nursery: Nursery, *, # (host, port) to bind for channel server - listen_sockaddrs: list[tuple[str, int]] | None = None, + listen_sockaddrs: list[tuple[str, int]]|None = None, task_status: TaskStatus[Nursery] = trio.TASK_STATUS_IGNORED, ) -> None: @@ -1464,7 +1464,7 @@ class Actor: async def async_main( actor: Actor, - accept_addrs: tuple[str, int] | None = None, + accept_addrs: tuple[str, int]|None = None, # XXX: currently ``parent_addr`` is only needed for the # ``multiprocessing`` backend (which pickles state sent to @@ -1473,7 +1473,7 @@ async def async_main( # change this to a simple ``is_subactor: bool`` which will # be False when running as root actor and True when as # a subactor. 
- parent_addr: tuple[str, int] | None = None, + parent_addr: tuple[str, int]|None = None, task_status: TaskStatus[None] = trio.TASK_STATUS_IGNORED, ) -> None: @@ -1496,7 +1496,7 @@ async def async_main( try: # establish primary connection with immediate parent - actor._parent_chan: Channel | None = None + actor._parent_chan: Channel|None = None if parent_addr is not None: ( @@ -1795,7 +1795,7 @@ class Arbiter(Actor): self, name: str, - ) -> tuple[str, int] | None: + ) -> tuple[str, int]|None: for uid, sockaddr in self._registry.items(): if name in uid: -- 2.34.1 From 90bfdaf58cc674e74c51ae71a627dbaf5d6b3db3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 25 Mar 2024 15:03:33 -0400 Subject: [PATCH 185/378] Drop extra newline from log msg --- tractor/_supervise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 733dd53c..df3d7def 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -583,7 +583,7 @@ async def open_nursery( finally: msg: str = ( 'Actor-nursery exited\n' - f'|_{an}\n\n' + f'|_{an}\n' ) # shutdown runtime if it was started -- 2.34.1 From 72b4dc14616ceb8372d4728ef6d922cd28220507 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 25 Mar 2024 16:09:32 -0400 Subject: [PATCH 186/378] Provision for infected-`asyncio` debug mode support It's **almost** there, we're just missing the final translation code to get from an `asyncio` side task to be able to call `.devx._debug..wait_for_parent_stdin_hijack()` to do root actor TTY locking. Then we just need to ensure internals also do the right thing with `greenback()` for equivalent sync `breakpoint()` style pause points. Since i'm deferring this until later, tossing in some xfail tests to `test_infected_asyncio` with TODOs for the needed implementation as well as eventual test org. By "provision" it means we add: - `greenback` init block to `_run_asyncio_task()` when debug mode is enabled (but which will currently rte when `asyncio` is detected) using `.bestow_portal()` around the `asyncio.Task`. - a call to `_debug.maybe_init_greenback()` in the `run_as_asyncio_guest()` guest-mode entry point. - as part of `._debug.Lock.is_main_trio_thread()` whenever the async-lib is not 'trio' error lock the backend name (which is obvi `'asyncio'` in this use case). --- examples/debugging/asyncio_bp.py | 4 ++- tests/test_infected_asyncio.py | 37 +++++++++++++++++++++- tractor/devx/_debug.py | 17 ++++++++-- tractor/to_asyncio.py | 54 ++++++++++++++++++++++++-------- 4 files changed, 95 insertions(+), 17 deletions(-) diff --git a/examples/debugging/asyncio_bp.py b/examples/debugging/asyncio_bp.py index b32ad1d8..baddfe03 100644 --- a/examples/debugging/asyncio_bp.py +++ b/examples/debugging/asyncio_bp.py @@ -77,7 +77,9 @@ async def main( ) -> None: - async with tractor.open_nursery() as n: + async with tractor.open_nursery( + # debug_mode=True, + ) as n: p = await n.start_actor( 'aio_daemon', diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 5ac463ea..8d34bef4 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -601,7 +601,8 @@ def test_echoserver_detailed_mechanics( pass else: pytest.fail( - "stream wasn't stopped after sentinel?!") + 'stream not stopped after sentinel ?!' 
+ ) # TODO: the case where this blocks and # is cancelled by kbi or out of task cancellation @@ -613,3 +614,37 @@ def test_echoserver_detailed_mechanics( else: trio.run(main) + + +# TODO: debug_mode tests once we get support for `asyncio`! +# +# -[ ] need tests to wrap both scripts: +# - [ ] infected_asyncio_echo_server.py +# - [ ] debugging/asyncio_bp.py +# -[ ] consider moving ^ (some of) these ^ to `test_debugger`? +# +# -[ ] missing impl outstanding includes: +# - [x] for sync pauses we need to ensure we open yet another +# `greenback` portal in the asyncio task +# => completed using `.bestow_portal(task)` inside +# `.to_asyncio._run_asyncio_task()` right? +# -[ ] translation func to get from `asyncio` task calling to +# `._debug.wait_for_parent_stdin_hijack()` which does root +# call to do TTY locking. +# +def test_sync_breakpoint(): + ''' + Verify we can do sync-func/code breakpointing using the + `breakpoint()` builtin inside infected mode actors. + + ''' + pytest.xfail('This support is not implemented yet!') + + +def test_debug_mode_crash_handling(): + ''' + Verify mult-actor crash handling works with a combo of infected-`asyncio`-mode + and normal `trio` actors despite nested process trees. + + ''' + pytest.xfail('This support is not implemented yet!') diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index bb5740bc..75be7a2a 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -187,13 +187,18 @@ class Lock: `trio.to_thread.run_sync()`. ''' - return ( + is_trio_main = ( # TODO: since this is private, @oremanj says # we should just copy the impl for now.. trio._util.is_main_thread() and - sniffio.current_async_library() == 'trio' + (async_lib := sniffio.current_async_library()) == 'trio' ) + if not is_trio_main: + log.warning( + f'Current async-lib detected by `sniffio`: {async_lib}\n' + ) + return is_trio_main # XXX apparently unreliable..see ^ # ( # threading.current_thread() @@ -1114,6 +1119,14 @@ def pause_from_sync( '- `async with tractor.open_root_actor()`\n' ) + # NOTE: once supported, remove this AND the one + # inside `._pause()`! + if actor.is_infected_aio(): + raise RuntimeError( + '`tractor.pause[_from_sync]()` not yet supported ' + 'for infected `asyncio` mode!' 
+ ) + # raises on not-found by default greenback: ModuleType = maybe_import_greenback() mdb: MultiActorPdb = mk_mpdb() diff --git a/tractor/to_asyncio.py b/tractor/to_asyncio.py index 7c88edd2..585b0b00 100644 --- a/tractor/to_asyncio.py +++ b/tractor/to_asyncio.py @@ -33,10 +33,14 @@ from typing import ( import trio from outcome import Error -from .log import get_logger -from ._state import current_actor -from ._exceptions import AsyncioCancelled -from .trionics._broadcast import ( +from tractor.log import get_logger +from tractor._state import ( + current_actor, + debug_mode, +) +from tractor.devx import _debug +from tractor._exceptions import AsyncioCancelled +from tractor.trionics._broadcast import ( broadcast_receiver, BroadcastReceiver, ) @@ -64,9 +68,9 @@ class LinkedTaskChannel(trio.abc.Channel): _trio_exited: bool = False # set after ``asyncio.create_task()`` - _aio_task: asyncio.Task | None = None - _aio_err: BaseException | None = None - _broadcaster: BroadcastReceiver | None = None + _aio_task: asyncio.Task|None = None + _aio_err: BaseException|None = None + _broadcaster: BroadcastReceiver|None = None async def aclose(self) -> None: await self._from_aio.aclose() @@ -158,7 +162,9 @@ def _run_asyncio_task( ''' __tracebackhide__ = True if not current_actor().is_infected_aio(): - raise RuntimeError("`infect_asyncio` mode is not enabled!?") + raise RuntimeError( + "`infect_asyncio` mode is not enabled!?" + ) # ITC (inter task comms), these channel/queue names are mostly from # ``asyncio``'s perspective. @@ -187,7 +193,7 @@ def _run_asyncio_task( cancel_scope = trio.CancelScope() aio_task_complete = trio.Event() - aio_err: BaseException | None = None + aio_err: BaseException|None = None chan = LinkedTaskChannel( aio_q, # asyncio.Queue @@ -253,7 +259,7 @@ def _run_asyncio_task( if not inspect.isawaitable(coro): raise TypeError(f"No support for invoking {coro}") - task = asyncio.create_task( + task: asyncio.Task = asyncio.create_task( wait_on_coro_final_result( to_trio, coro, @@ -262,6 +268,18 @@ def _run_asyncio_task( ) chan._aio_task = task + # XXX TODO XXX get this actually workin.. XD + # maybe setup `greenback` for `asyncio`-side task REPLing + if ( + debug_mode() + and + (greenback := _debug.maybe_import_greenback( + force_reload=True, + raise_not_found=False, + )) + ): + greenback.bestow_portal(task) + def cancel_trio(task: asyncio.Task) -> None: ''' Cancel the calling ``trio`` task on error. @@ -269,7 +287,7 @@ def _run_asyncio_task( ''' nonlocal chan aio_err = chan._aio_err - task_err: BaseException | None = None + task_err: BaseException|None = None # only to avoid ``asyncio`` complaining about uncaptured # task exceptions @@ -349,11 +367,11 @@ async def translate_aio_errors( ''' trio_task = trio.lowlevel.current_task() - aio_err: BaseException | None = None + aio_err: BaseException|None = None # TODO: make thisi a channel method? def maybe_raise_aio_err( - err: Exception | None = None + err: Exception|None = None ) -> None: aio_err = chan._aio_err if ( @@ -531,6 +549,16 @@ def run_as_asyncio_guest( loop = asyncio.get_running_loop() trio_done_fut = asyncio.Future() + if debug_mode(): + # XXX make it obvi we know this isn't supported yet! + log.error( + 'Attempting to enter unsupported `greenback` init ' + 'from `asyncio` task..' 
+ ) + await _debug.maybe_init_greenback( + force_reload=True, + ) + def trio_done_callback(main_outcome): if isinstance(main_outcome, Error): -- 2.34.1 From 496dce57a8940eb65fc0132599e1c96992e03dc8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 25 Mar 2024 16:31:16 -0400 Subject: [PATCH 187/378] Prepare to offer (dynamic) `.msg.Codec` overrides By simply allowing an input `codec: tuple` of funcs for now to the `MsgpackTCPStream` transport but, ideally wrapping this in a `Codec` type with an API for dynamic extension of the interchange lib's msg processing settings. Right now we're tied to `msgspec.msgpack` for this transport but with the right design this can likely extend to other libs in the future. Relates to starting feature work toward #36, #196, #365. --- tractor/_ipc.py | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index f57d3bd8..2b5df698 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -30,6 +30,7 @@ import struct import typing from typing import ( Any, + Callable, runtime_checkable, Protocol, Type, @@ -123,6 +124,16 @@ class MsgpackTCPStream(MsgTransport): stream: trio.SocketStream, prefix_size: int = 4, + # XXX optionally provided codec pair for `msgspec`: + # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types + # + # TODO: define this as a `Codec` struct which can be + # overriden dynamically by the application/runtime. + codec: tuple[ + Callable[[Any], Any]|None, # coder + Callable[[type, Any], Any]|None, # decoder + ]|None = None, + ) -> None: self.stream = stream @@ -138,12 +149,18 @@ class MsgpackTCPStream(MsgTransport): # public i guess? self.drained: list[dict] = [] - self.recv_stream = BufferedReceiveStream(transport_stream=stream) + self.recv_stream = BufferedReceiveStream( + transport_stream=stream + ) self.prefix_size = prefix_size # TODO: struct aware messaging coders - self.encode = msgspec.msgpack.Encoder().encode - self.decode = msgspec.msgpack.Decoder().decode # dict[str, Any]) + self.encode = msgspec.msgpack.Encoder( + enc_hook=codec[0] if codec else None, + ).encode + self.decode = msgspec.msgpack.Decoder( + dec_hook=codec[1] if codec else None, + ).decode async def _iter_packets(self) -> AsyncGenerator[dict, None]: '''Yield packets from the underlying stream. @@ -349,9 +366,25 @@ class Channel: stream: trio.SocketStream, type_key: tuple[str, str]|None = None, + # XXX optionally provided codec pair for `msgspec`: + # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types + codec: tuple[ + Callable[[Any], Any], # coder + Callable[[type, Any], Any], # decoder + ]|None = None, + ) -> MsgTransport: - type_key = type_key or self._transport_key - self._transport = get_msg_transport(type_key)(stream) + type_key = ( + type_key + or + self._transport_key + ) + self._transport = get_msg_transport( + type_key + )( + stream, + codec=codec, + ) return self._transport def __repr__(self) -> str: -- 2.34.1 From 0a69829ec576f1429f52ec9f3f2ca58607f0c5aa Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 26 Mar 2024 15:50:47 -0400 Subject: [PATCH 188/378] Proto `MsgCodec`, an interchange fmt modify API Fitting in line with the issues outstanding: - #36: (msg)spec-ing out our SCIPP (structured-con-inter-proc-prot). 
(https://github.com/goodboy/tractor/issues/36) - #196: adding strictly typed IPC msg dialog schemas, more or less better described as "dialog/transaction scoped message specs" using `msgspec`'s tagged unions and custom codecs. (https://github.com/goodboy/tractor/issues/196) - #365: using modern static type-annots to drive capability based messaging and RPC. (statically https://github.com/goodboy/tractor/issues/365) This is a first draft of a new API for dynamically overriding IPC msg codecs for a given interchange lib from any task in the runtime. Right now we obviously only support `msgspec` but ideally this API holds general enough to be used for other backends eventually (like `capnproto`, and apache arrow). Impl is in a new `tractor.msg._codec` with: - a new `MsgCodec` type for encapsing `msgspec.msgpack.Encoder/Decoder` pairs and configuring any custom enc/dec_hooks or typed decoding. - factory `mk_codec()` for creating new codecs ad-hoc from a task. - `contextvars` support for a new `trio.Task` scoped `_ctxvar_MsgCodec: ContextVar[MsgCodec]` named 'msgspec_codec'. - `apply_codec()` for temporarily modifying the above per task as needed around `.open_context()` / `.open_stream()` operation. A new test (suite) in `test_caps_msging.py`: - verify a parent and its child can enable the same custom codec (in this case to transmit `NamespacePath`s) with tons of pedantic ctx-vars checks. - ToDo: still need to implement #36 msg types in order to be able to get decodes working (as in `MsgStream.receive()` will deliver an already created `NamespacePath` obj) since currently all msgs come packed in `dict`-msg wrapper packets.. -> use the proto from PR #35 to get nested `msgspec.Raw` processing up and running Bo --- tests/test_caps_msging.py | 198 +++++++++++++++++++++++++++++ tractor/_ipc.py | 74 ++++++++--- tractor/msg/__init__.py | 10 ++ tractor/msg/_codec.py | 253 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 519 insertions(+), 16 deletions(-) create mode 100644 tests/test_caps_msging.py create mode 100644 tractor/msg/_codec.py diff --git a/tests/test_caps_msging.py b/tests/test_caps_msging.py new file mode 100644 index 00000000..f659cb13 --- /dev/null +++ b/tests/test_caps_msging.py @@ -0,0 +1,198 @@ +''' +Functional audits for our "capability based messaging (schema)" feats. + +B~) + +''' +from typing import ( + Any, + Type, +) +from contextvars import ( + Context, +) + +import tractor +from tractor.msg import ( + _def_msgspec_codec, + _ctxvar_MsgCodec, + + NamespacePath, + MsgCodec, + mk_codec, + apply_codec, + current_msgspec_codec, +) +import trio + +# TODO: wrap these into `._codec` such that user can just pass +# a type table of some sort? +def enc_hook(obj: Any) -> Any: + if isinstance(obj, NamespacePath): + return str(obj) + else: + raise NotImplementedError( + f'Objects of type {type(obj)} are not supported' + ) + + +def dec_hook(type: Type, obj: Any) -> Any: + print(f'type is: {type}') + if type is NamespacePath: + return NamespacePath(obj) + else: + raise NotImplementedError( + f'Objects of type {type(obj)} are not supported' + ) + + +def ex_func(*args): + print(f'ex_func({args})') + + +def mk_custom_codec() -> MsgCodec: + # apply custom hooks and set a `Decoder` which only + # loads `NamespacePath` types. + nsp_codec: MsgCodec = mk_codec( + dec_types=NamespacePath, + enc_hook=enc_hook, + dec_hook=dec_hook, + ) + + # TODO: validate `MsgCodec` interface/semantics? + # -[ ] simple field tests to ensure caching + reset is workin? 
+ # -[ ] custom / changing `.decoder()` calls? + # + # dec = nsp_codec.decoder( + # types=NamespacePath, + # ) + # assert nsp_codec.dec is dec + return nsp_codec + + +@tractor.context +async def send_back_nsp( + ctx: tractor.Context, + +) -> None: + ''' + Setup up a custom codec to load instances of `NamespacePath` + and ensure we can round trip a func ref with our parent. + + ''' + task: trio.Task = trio.lowlevel.current_task() + task_ctx: Context = task.context + assert _ctxvar_MsgCodec not in task_ctx + + nsp_codec: MsgCodec = mk_custom_codec() + with apply_codec(nsp_codec) as codec: + chk_codec_applied( + custom_codec=nsp_codec, + enter_value=codec, + ) + + nsp = NamespacePath.from_ref(ex_func) + await ctx.started(nsp) + + async with ctx.open_stream() as ipc: + async for msg in ipc: + + assert msg == f'{__name__}:ex_func' + + # TODO: as per below + # assert isinstance(msg, NamespacePath) + assert isinstance(msg, str) + + +def chk_codec_applied( + custom_codec: MsgCodec, + enter_value: MsgCodec, +) -> MsgCodec: + + task: trio.Task = trio.lowlevel.current_task() + task_ctx: Context = task.context + + assert _ctxvar_MsgCodec in task_ctx + curr_codec: MsgCodec = task.context[_ctxvar_MsgCodec] + + assert ( + # returned from `mk_codec()` + custom_codec is + + # yielded value from `apply_codec()` + enter_value is + + # read from current task's `contextvars.Context` + curr_codec is + + # public API for all of the above + current_msgspec_codec() + + # the default `msgspec` settings + is not _def_msgspec_codec + ) + + +def test_codec_hooks_mod(): + ''' + Audit the `.msg.MsgCodec` override apis details given our impl + uses `contextvars` to accomplish per `trio` task codec + application around an inter-proc-task-comms context. + + ''' + async def main(): + task: trio.Task = trio.lowlevel.current_task() + task_ctx: Context = task.context + assert _ctxvar_MsgCodec not in task_ctx + + async with tractor.open_nursery() as an: + p: tractor.Portal = await an.start_actor( + 'sub', + enable_modules=[__name__], + ) + + # TODO: 2 cases: + # - codec not modified -> decode nsp as `str` + # - codec modified with hooks -> decode nsp as + # `NamespacePath` + nsp_codec: MsgCodec = mk_custom_codec() + with apply_codec(nsp_codec) as codec: + chk_codec_applied( + custom_codec=nsp_codec, + enter_value=codec, + ) + + async with ( + p.open_context( + send_back_nsp, + ) as (ctx, first), + ctx.open_stream() as ipc, + ): + # ensure codec is still applied across + # `tractor.Context` + its embedded nursery. 
+ chk_codec_applied( + custom_codec=nsp_codec, + enter_value=codec, + ) + + assert first == f'{__name__}:ex_func' + # TODO: actually get the decoder loading + # to native once we spec our SCIPP msgspec + # (structurred-conc-inter-proc-protocol) + # implemented as per, + # https://github.com/goodboy/tractor/issues/36 + # + # assert isinstance(first, NamespacePath) + assert isinstance(first, str) + await ipc.send(first) + + with trio.move_on_after(1): + async for msg in ipc: + + # TODO: as per above + # assert isinstance(msg, NamespacePath) + assert isinstance(msg, str) + + await p.cancel_actor() + + trio.run(main) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 2b5df698..5aafda3f 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -23,7 +23,10 @@ from collections.abc import ( AsyncGenerator, AsyncIterator, ) -from contextlib import asynccontextmanager as acm +from contextlib import ( + asynccontextmanager as acm, + contextmanager as cm, +) import platform from pprint import pformat import struct @@ -37,12 +40,15 @@ from typing import ( TypeVar, ) -import msgspec from tricycle import BufferedReceiveStream import trio from tractor.log import get_logger from tractor._exceptions import TransportClosed +from tractor.msg import ( + _ctxvar_MsgCodec, + MsgCodec, +) log = get_logger(__name__) @@ -154,13 +160,9 @@ class MsgpackTCPStream(MsgTransport): ) self.prefix_size = prefix_size - # TODO: struct aware messaging coders - self.encode = msgspec.msgpack.Encoder( - enc_hook=codec[0] if codec else None, - ).encode - self.decode = msgspec.msgpack.Decoder( - dec_hook=codec[1] if codec else None, - ).decode + # allow for custom IPC msg interchange format + # dynamic override Bo + self.codec: MsgCodec = codec or MsgCodec() async def _iter_packets(self) -> AsyncGenerator[dict, None]: '''Yield packets from the underlying stream. @@ -199,7 +201,23 @@ class MsgpackTCPStream(MsgTransport): log.transport(f"received {msg_bytes}") # type: ignore try: - yield self.decode(msg_bytes) + # NOTE: lookup the `trio.Task.context`'s var for + # the current `MsgCodec`. + yield _ctxvar_MsgCodec.get().decode(msg_bytes) + + # TODO: remove, was only for orig draft impl + # testing. + # + # curr_codec: MsgCodec = _ctxvar_MsgCodec.get() + # obj = curr_codec.decode(msg_bytes) + # if ( + # curr_codec is not + # _codec._def_msgspec_codec + # ): + # print(f'OBJ: {obj}\n') + # + # yield obj + except ( msgspec.DecodeError, UnicodeDecodeError, @@ -235,7 +253,10 @@ class MsgpackTCPStream(MsgTransport): # __tracebackhide__: bool = hide_tb async with self._send_lock: - bytes_data: bytes = self.encode(msg) + # NOTE: lookup the `trio.Task.context`'s var for + # the current `MsgCodec`. 
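# NB: a minimal sketch of the stdlib `contextvars` semantics backing
# this per-`trio.Task` codec lookup (the var and values here are
# hypothetical, shown only to illustrate the get/set/reset pattern that
# `apply_codec()` presumably builds on); note that every `trio.Task`
# runs in its own copy of the parent's `Context`, so a `.set()` in one
# task never leaks into siblings.
from contextvars import ContextVar, Token

_fmt: ContextVar[str] = ContextVar('fmt', default='default-codec')


def read_fmt() -> str:
    # whatever runs in this (task-scoped) context sees the override
    return _fmt.get()


tok: Token = _fmt.set('custom-codec')
assert read_fmt() == 'custom-codec'
_fmt.reset(tok)  # restore the prior (default) value
assert read_fmt() == 'default-codec'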
+ bytes_data: bytes = _ctxvar_MsgCodec.get().encode(msg) + # bytes_data: bytes = self.codec.encode(msg) # supposedly the fastest says, # https://stackoverflow.com/a/54027962 @@ -335,7 +356,9 @@ class Channel: @property def msgstream(self) -> MsgTransport: - log.info('`Channel.msgstream` is an old name, use `._transport`') + log.info( + '`Channel.msgstream` is an old name, use `._transport`' + ) return self._transport @property @@ -368,10 +391,7 @@ class Channel: # XXX optionally provided codec pair for `msgspec`: # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types - codec: tuple[ - Callable[[Any], Any], # coder - Callable[[type, Any], Any], # decoder - ]|None = None, + codec: MsgCodec|None = None, ) -> MsgTransport: type_key = ( @@ -379,14 +399,36 @@ class Channel: or self._transport_key ) + # get transport type, then self._transport = get_msg_transport( type_key + # instantiate an instance of the msg-transport )( stream, codec=codec, ) return self._transport + # TODO: something simliar at the IPC-`Context` + # level so as to support + @cm + def apply_codec( + self, + codec: MsgCodec, + + ) -> None: + ''' + Temporarily override the underlying IPC msg codec for + dynamic enforcement of messaging schema. + + ''' + orig: MsgCodec = self._transport.codec + try: + self._transport.codec = codec + yield + finally: + self._transport.codec = orig + def __repr__(self) -> str: if not self._transport: return '' diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index 906627cf..e2296788 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -24,3 +24,13 @@ from .ptr import ( from .types import ( Struct as Struct, ) +from ._codec import ( + + _def_msgspec_codec as _def_msgspec_codec, + _ctxvar_MsgCodec as _ctxvar_MsgCodec, + + apply_codec as apply_codec, + mk_codec as mk_codec, + MsgCodec as MsgCodec, + current_msgspec_codec as current_msgspec_codec, +) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py new file mode 100644 index 00000000..0da454ad --- /dev/null +++ b/tractor/msg/_codec.py @@ -0,0 +1,253 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +IPC msg interchange codec management. + +Supported backend libs: +- `msgspec.msgpack` + +ToDo: backends we prolly should offer: + +- see project/lib list throughout GH issue discussion comments: + https://github.com/goodboy/tractor/issues/196 + +- `capnproto`: https://capnproto.org/rpc.html + - https://capnproto.org/language.html#language-reference + +''' +from contextvars import ( + ContextVar, + Token, +) +from contextlib import ( + contextmanager as cm, +) +from typing import ( + Any, + Callable, + Type, + Union, +) +from types import ModuleType + +import msgspec +from msgspec import msgpack + +from .types import Struct + + +# TODO: API changes towards being interchange lib agnostic! 
+# -[ ] capnproto has pre-compiled schema for eg.. +# * https://capnproto.org/language.html +# * http://capnproto.github.io/pycapnp/quickstart.html +# * https://github.com/capnproto/pycapnp/blob/master/examples/addressbook.capnp +class MsgCodec(Struct): + ''' + A IPC msg interchange format lib's encoder + decoder pair. + + ''' + + lib: ModuleType = msgspec + + # ad-hoc type extensions + # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types + enc_hook: Callable[[Any], Any]|None = None # coder + dec_hook: Callable[[type, Any], Any]|None = None # decoder + + # struct type unions + # https://jcristharif.com/msgspec/structs.html#tagged-unions + types: Union[Type[Struct]]|Any = Any + + # post-configure cached props + _enc: msgpack.Encoder|None = None + _dec: msgpack.Decoder|None = None + + + # TODO: use `functools.cached_property` for these ? + # https://docs.python.org/3/library/functools.html#functools.cached_property + @property + def enc(self) -> msgpack.Encoder: + return self._enc or self.encoder() + + def encoder( + self, + enc_hook: Callable|None = None, + reset: bool = False, + + # TODO: what's the default for this? + # write_buffer_size: int + **kwargs, + + ) -> msgpack.Encoder: + ''' + Set or get the maybe-cached `msgspec.msgpack.Encoder` + instance configured for this codec. + + When `reset=True` any previously configured encoder will + be recreated and then cached with the new settings passed + as input. + + ''' + if ( + self._enc is None + or reset + ): + self._enc = self.lib.msgpack.Encoder( + enc_hook=enc_hook or self.enc_hook, + # write_buffer_size=write_buffer_size, + ) + + return self._enc + + def encode( + self, + py_obj: Any, + + ) -> bytes: + ''' + Encode input python objects to `msgpack` bytes for transfer + on a tranport protocol connection. + + ''' + return self.enc.encode(py_obj) + + @property + def dec(self) -> msgpack.Decoder: + return self._dec or self.decoder() + + def decoder( + self, + types: Union[Type[Struct]]|None = None, + dec_hook: Callable|None = None, + reset: bool = False, + **kwargs, + # ext_hook: ext_hook_sig + + ) -> msgpack.Decoder: + ''' + Set or get the maybe-cached `msgspec.msgpack.Decoder` + instance configured for this codec. + + When `reset=True` any previously configured decoder will + be recreated and then cached with the new settings passed + as input. + + ''' + if ( + self._dec is None + or reset + ): + self._dec = self.lib.msgpack.Decoder( + types or self.types, + dec_hook=dec_hook or self.dec_hook, + **kwargs, + ) + + return self._dec + + def decode( + self, + msg: bytes, + ) -> Any: + ''' + Decode received `msgpack` bytes into a local python object + with special `msgspec.Struct` (or other type) handling + determined by the + + ''' + + return self.dec.decode(msg) + + +# TODO: struct aware messaging coders as per: +# - https://github.com/goodboy/tractor/issues/36 +# - https://github.com/goodboy/tractor/issues/196 +# - https://github.com/goodboy/tractor/issues/365 + +def mk_codec( + libname: str = 'msgspec', + + # struct type unions set for `Decoder` + # https://jcristharif.com/msgspec/structs.html#tagged-unions + dec_types: Union[Type[Struct]]|Any = Any, + + cache_now: bool = True, + + # proxy to the `Struct.__init__()` + **kwargs, + +) -> MsgCodec: + ''' + Convenience factory for creating codecs eventually meant + to be interchange lib agnostic (i.e. once we support more then just + `msgspec` ;). 
+ + ''' + codec = MsgCodec( + types=dec_types, + **kwargs, + ) + assert codec.lib.__name__ == libname + + # by default config and cache the codec pair for given + # input settings. + if cache_now: + assert codec.enc + assert codec.dec + + return codec + + +# instance of the default `msgspec.msgpack` codec settings, i.e. +# no custom structs, hooks or other special types. +_def_msgspec_codec: MsgCodec = mk_codec() + +# NOTE: provides for per-`trio.Task` specificity of the +# IPC msging codec used by the transport layer when doing +# `Channel.send()/.recv()` of wire data. +_ctxvar_MsgCodec: ContextVar[MsgCodec] = ContextVar( + 'msgspec_codec', + default=_def_msgspec_codec, +) + + +@cm +def apply_codec( + codec: MsgCodec, + +) -> MsgCodec: + ''' + Dynamically apply a `MsgCodec` to the current task's + runtime context such that all IPC msgs are processed + with it for that task. + + ''' + token: Token = _ctxvar_MsgCodec.set(codec) + try: + yield _ctxvar_MsgCodec.get() + finally: + _ctxvar_MsgCodec.reset(token) + + +def current_msgspec_codec() -> MsgCodec: + ''' + Return the current `trio.Task.context`'s value + for `msgspec_codec` used by `Channel.send/.recv()` + for wire serialization. + + ''' + return _ctxvar_MsgCodec.get() -- 2.34.1 From 2eaef26547863ac5b236198364e4ff9092c04de1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 6 Jul 2022 17:35:09 -0400 Subject: [PATCH 189/378] WIP tagged union message type API XXX NOTE XXX: this is a heavily modified commit from the original (ec226463) which was super out of date when rebased onto the current branch. I went through a manual conflict rework and removed all the legacy segments as well as rename-moved this original mod `tractor.msg.py` -> `tractor.msg/_old_msg.py`. Further the `NamespacePath` type def was discarded from this mod since it was from a super old version which was already moved to a `.msg.ptr` submod. As per original questions and discussion with `msgspec` author: - https://github.com/jcrist/msgspec/issues/25 - https://github.com/jcrist/msgspec/issues/140 this prototypes a new (but very naive) `msgspec.Struct` codec implementation which will be more filled out in the next commit. --- tractor/_ipc.py | 4 +- tractor/msg/_old_msg.py | 121 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 tractor/msg/_old_msg.py diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 5aafda3f..b1c2ccd2 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -165,7 +165,9 @@ class MsgpackTCPStream(MsgTransport): self.codec: MsgCodec = codec or MsgCodec() async def _iter_packets(self) -> AsyncGenerator[dict, None]: - '''Yield packets from the underlying stream. + ''' + Yield `bytes`-blob decoded packets from the underlying TCP + stream using the current task's `MsgCodec`. ''' import msgspec # noqa diff --git a/tractor/msg/_old_msg.py b/tractor/msg/_old_msg.py new file mode 100644 index 00000000..823228a3 --- /dev/null +++ b/tractor/msg/_old_msg.py @@ -0,0 +1,121 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Capability-based messaging specifications: or colloquially as "msgspecs". + +Includes our SCIPP (structured-con-inter-process-protocol) message type defs +and APIs for applying custom msgspec-sets for implementing un-protocol state machines. + +''' + +# TODO: integration with our ``enable_modules: list[str]`` caps sys. + +# ``pkgutil.resolve_name()`` internally uses +# ``importlib.import_module()`` which can be filtered by inserting +# a ``MetaPathFinder`` into ``sys.meta_path`` (which we could do before +# entering the ``Actor._process_messages()`` loop). +# https://github.com/python/cpython/blob/main/Lib/pkgutil.py#L645 +# https://stackoverflow.com/questions/1350466/preventing-python-code-from-importing-certain-modules +# - https://stackoverflow.com/a/63320902 +# - https://docs.python.org/3/library/sys.html#sys.meta_path + +# the new "Implicit Namespace Packages" might be relevant? +# - https://www.python.org/dev/peps/pep-0420/ + +# add implicit serialized message type support so that paths can be +# handed directly to IPC primitives such as streams and `Portal.run()` +# calls: +# - via ``msgspec``: +# - https://jcristharif.com/msgspec/api.html#struct +# - https://jcristharif.com/msgspec/extending.html +# via ``msgpack-python``: +# https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type + +from __future__ import annotations +from contextlib import contextmanager as cm +from typing import ( + Union, + Any, +) + +from msgspec import Struct +from msgspec.msgpack import ( + Encoder, + Decoder, +) + + +# LIFO codec stack that is appended when the user opens the +# ``configure_native_msgs()`` cm below to configure a new codec set +# which will be applied to all new (msgspec relevant) IPC transports +# that are spawned **after** the configure call is made. +_lifo_codecs: list[ + tuple[ + Encoder, + Decoder, + ], +] = [(Encoder(), Decoder())] + + +def get_msg_codecs() -> tuple[ + Encoder, + Decoder, +]: + ''' + Return the currently configured ``msgspec`` codec set. + + The defaults are defined above. + + ''' + global _lifo_codecs + return _lifo_codecs[-1] + + +@cm +def configure_native_msgs( + tagged_structs: list[Struct], +): + ''' + Push a codec set that will natively decode + tagged structs provied in ``tagged_structs`` + in all IPC transports and pop the codec on exit. + + ''' + global _lifo_codecs + + # See "tagged unions" docs: + # https://jcristharif.com/msgspec/structs.html#tagged-unions + + # "The quickest way to enable tagged unions is to set tag=True when + # defining every struct type in the union. In this case tag_field + # defaults to "type", and tag defaults to the struct class name + # (e.g. "Get")." 
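+    # e.g. (sketch only) the kind of user-side struct-set that can
+    # be passed as `tagged_structs` - each gets tagged by its class
+    # name per the above:
+    #
+    #   class Get(Struct, tag=True):
+    #       key: str
+    #
+    #   class Put(Struct, tag=True):
+    #       key: str
+    #       value: Any
+    #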
+ enc = Encoder() + + types_union = Union[tagged_structs[0]] | Any + for struct in tagged_structs[1:]: + types_union |= struct + + dec = Decoder(types_union) + + _lifo_codecs.append((enc, dec)) + try: + print("YOYOYOOYOYOYOY") + yield enc, dec + finally: + print("NONONONONON") + _lifo_codecs.pop() -- 2.34.1 From 336db8425e8e14085ac94778d4e40082499d378c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 7 Jul 2022 15:48:16 -0400 Subject: [PATCH 190/378] Re-think, `msgspec`-multi-typed msg dialogs The greasy details are strewn throughout a `msgspec` issue: https://github.com/jcrist/msgspec/issues/140 and specifically this code was mostly written as part of POC example in this comment: https://github.com/jcrist/msgspec/issues/140#issuecomment-1177850792 This work obviously pertains to our desire and prep for typed messaging and capabilities aware msg-oriented-protocols in #196. I added a "wants to have" method to `Context` showing how I think we could offer a pretty neat msg-type-set-as-capability-for-protocol system. XXX NOTE XXX: this commit was rewritten during a rebase from a very old version as per the prior commit. --- tractor/_streaming.py | 11 +++++ tractor/msg/_old_msg.py | 95 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index e0015fe4..90c33d31 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -533,6 +533,17 @@ class MsgStream(trio.abc.Channel): else: raise + # TODO: msg capability context api1 + # @acm + # async def enable_msg_caps( + # self, + # msg_subtypes: Union[ + # list[list[Struct]], + # Protocol, # hypothetical type that wraps a msg set + # ], + # ) -> tuple[Callable, Callable]: # payload enc, dec pair + # ... + def stream(func: Callable) -> Callable: ''' diff --git a/tractor/msg/_old_msg.py b/tractor/msg/_old_msg.py index 823228a3..240b2eca 100644 --- a/tractor/msg/_old_msg.py +++ b/tractor/msg/_old_msg.py @@ -48,11 +48,12 @@ and APIs for applying custom msgspec-sets for implementing un-protocol state mac from __future__ import annotations from contextlib import contextmanager as cm from typing import ( - Union, Any, + Optional, + Union, ) -from msgspec import Struct +from msgspec import Struct, Raw from msgspec.msgpack import ( Encoder, Decoder, @@ -95,8 +96,6 @@ def configure_native_msgs( in all IPC transports and pop the codec on exit. ''' - global _lifo_codecs - # See "tagged unions" docs: # https://jcristharif.com/msgspec/structs.html#tagged-unions @@ -119,3 +118,91 @@ def configure_native_msgs( finally: print("NONONONONON") _lifo_codecs.pop() + + +class Header(Struct, tag=True): + ''' + A msg header which defines payload properties + + ''' + uid: str + msgtype: Optional[str] = None + + +class Msg(Struct, tag=True): + ''' + The "god" msg type, a box for task level msg types. + + ''' + header: Header + payload: Raw + + +_root_dec = Decoder(Msg) +_root_enc = Encoder() + +# sub-decoders for retreiving embedded +# payload data and decoding to a sender +# side defined (struct) type. +_subdecs: dict[ + Optional[str], + Decoder] = { + None: Decoder(Any), +} + + +@cm +def enable_context( + msg_subtypes: list[list[Struct]] +) -> Decoder: + + for types in msg_subtypes: + first = types[0] + + # register using the default tag_field of "type" + # which seems to map to the class "name". 
+ tags = [first.__name__] + + # create a tagged union decoder for this type set + type_union = Union[first] + for typ in types[1:]: + type_union |= typ + tags.append(typ.__name__) + + dec = Decoder(type_union) + + # register all tags for this union sub-decoder + for tag in tags: + _subdecs[tag] = dec + try: + yield dec + finally: + for tag in tags: + _subdecs.pop(tag) + + +def decmsg(msg: Msg) -> Any: + msg = _root_dec.decode(msg) + tag_field = msg.header.msgtype + dec = _subdecs[tag_field] + return dec.decode(msg.payload) + + +def encmsg( + dialog_id: str | int, + payload: Any, +) -> Msg: + + tag_field = None + + plbytes = _root_enc.encode(payload) + if b'type' in plbytes: + assert isinstance(payload, Struct) + tag_field = type(payload).__name__ + payload = Raw(plbytes) + + msg = Msg( + Header(dialog_id, tag_field), + payload, + ) + return _root_enc.encode(msg) -- 2.34.1 From 79211eab9a55a7d728a1f708e0b93af7b6b0fea3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 26 Mar 2024 17:47:55 -0400 Subject: [PATCH 191/378] Merge original content from PR #311 into `.msg.types` for now --- tractor/msg/_old_msg.py | 208 ---------------------------------------- tractor/msg/types.py | 185 ++++++++++++++++++++++++++++++++++- 2 files changed, 182 insertions(+), 211 deletions(-) delete mode 100644 tractor/msg/_old_msg.py diff --git a/tractor/msg/_old_msg.py b/tractor/msg/_old_msg.py deleted file mode 100644 index 240b2eca..00000000 --- a/tractor/msg/_old_msg.py +++ /dev/null @@ -1,208 +0,0 @@ -# tractor: structured concurrent "actors". -# Copyright 2018-eternity Tyler Goodlet. - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -''' -Capability-based messaging specifications: or colloquially as "msgspecs". - -Includes our SCIPP (structured-con-inter-process-protocol) message type defs -and APIs for applying custom msgspec-sets for implementing un-protocol state machines. - -''' - -# TODO: integration with our ``enable_modules: list[str]`` caps sys. - -# ``pkgutil.resolve_name()`` internally uses -# ``importlib.import_module()`` which can be filtered by inserting -# a ``MetaPathFinder`` into ``sys.meta_path`` (which we could do before -# entering the ``Actor._process_messages()`` loop). -# https://github.com/python/cpython/blob/main/Lib/pkgutil.py#L645 -# https://stackoverflow.com/questions/1350466/preventing-python-code-from-importing-certain-modules -# - https://stackoverflow.com/a/63320902 -# - https://docs.python.org/3/library/sys.html#sys.meta_path - -# the new "Implicit Namespace Packages" might be relevant? 
-# - https://www.python.org/dev/peps/pep-0420/ - -# add implicit serialized message type support so that paths can be -# handed directly to IPC primitives such as streams and `Portal.run()` -# calls: -# - via ``msgspec``: -# - https://jcristharif.com/msgspec/api.html#struct -# - https://jcristharif.com/msgspec/extending.html -# via ``msgpack-python``: -# https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type - -from __future__ import annotations -from contextlib import contextmanager as cm -from typing import ( - Any, - Optional, - Union, -) - -from msgspec import Struct, Raw -from msgspec.msgpack import ( - Encoder, - Decoder, -) - - -# LIFO codec stack that is appended when the user opens the -# ``configure_native_msgs()`` cm below to configure a new codec set -# which will be applied to all new (msgspec relevant) IPC transports -# that are spawned **after** the configure call is made. -_lifo_codecs: list[ - tuple[ - Encoder, - Decoder, - ], -] = [(Encoder(), Decoder())] - - -def get_msg_codecs() -> tuple[ - Encoder, - Decoder, -]: - ''' - Return the currently configured ``msgspec`` codec set. - - The defaults are defined above. - - ''' - global _lifo_codecs - return _lifo_codecs[-1] - - -@cm -def configure_native_msgs( - tagged_structs: list[Struct], -): - ''' - Push a codec set that will natively decode - tagged structs provied in ``tagged_structs`` - in all IPC transports and pop the codec on exit. - - ''' - # See "tagged unions" docs: - # https://jcristharif.com/msgspec/structs.html#tagged-unions - - # "The quickest way to enable tagged unions is to set tag=True when - # defining every struct type in the union. In this case tag_field - # defaults to "type", and tag defaults to the struct class name - # (e.g. "Get")." - enc = Encoder() - - types_union = Union[tagged_structs[0]] | Any - for struct in tagged_structs[1:]: - types_union |= struct - - dec = Decoder(types_union) - - _lifo_codecs.append((enc, dec)) - try: - print("YOYOYOOYOYOYOY") - yield enc, dec - finally: - print("NONONONONON") - _lifo_codecs.pop() - - -class Header(Struct, tag=True): - ''' - A msg header which defines payload properties - - ''' - uid: str - msgtype: Optional[str] = None - - -class Msg(Struct, tag=True): - ''' - The "god" msg type, a box for task level msg types. - - ''' - header: Header - payload: Raw - - -_root_dec = Decoder(Msg) -_root_enc = Encoder() - -# sub-decoders for retreiving embedded -# payload data and decoding to a sender -# side defined (struct) type. -_subdecs: dict[ - Optional[str], - Decoder] = { - None: Decoder(Any), -} - - -@cm -def enable_context( - msg_subtypes: list[list[Struct]] -) -> Decoder: - - for types in msg_subtypes: - first = types[0] - - # register using the default tag_field of "type" - # which seems to map to the class "name". 
- tags = [first.__name__] - - # create a tagged union decoder for this type set - type_union = Union[first] - for typ in types[1:]: - type_union |= typ - tags.append(typ.__name__) - - dec = Decoder(type_union) - - # register all tags for this union sub-decoder - for tag in tags: - _subdecs[tag] = dec - try: - yield dec - finally: - for tag in tags: - _subdecs.pop(tag) - - -def decmsg(msg: Msg) -> Any: - msg = _root_dec.decode(msg) - tag_field = msg.header.msgtype - dec = _subdecs[tag_field] - return dec.decode(msg.payload) - - -def encmsg( - dialog_id: str | int, - payload: Any, -) -> Msg: - - tag_field = None - - plbytes = _root_enc.encode(payload) - if b'type' in plbytes: - assert isinstance(payload, Struct) - tag_field = type(payload).__name__ - payload = Raw(plbytes) - - msg = Msg( - Header(dialog_id, tag_field), - payload, - ) - return _root_enc.encode(msg) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 3ceff845..e457370e 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -21,19 +21,27 @@ types. ''' from __future__ import annotations from collections import UserList -from pprint import ( - saferepr, -) +from contextlib import contextmanager as cm from typing import ( Any, Iterator, + Optional, + Union, ) from msgspec import ( msgpack, + Raw, Struct as _Struct, structs, ) +from msgspec.msgpack import ( + Encoder, + Decoder, +) +from pprint import ( + saferepr, +) # TODO: auto-gen type sig for input func both for # type-msgs and logging of RPC tasks? @@ -268,3 +276,174 @@ class Struct( )) return diffs + +# ------ - ------ +# +# TODO: integration with our ``enable_modules: list[str]`` caps sys. +# +# ``pkgutil.resolve_name()`` internally uses +# ``importlib.import_module()`` which can be filtered by inserting +# a ``MetaPathFinder`` into ``sys.meta_path`` (which we could do before +# entering the ``Actor._process_messages()`` loop). +# https://github.com/python/cpython/blob/main/Lib/pkgutil.py#L645 +# https://stackoverflow.com/questions/1350466/preventing-python-code-from-importing-certain-modules +# - https://stackoverflow.com/a/63320902 +# - https://docs.python.org/3/library/sys.html#sys.meta_path + +# the new "Implicit Namespace Packages" might be relevant? +# - https://www.python.org/dev/peps/pep-0420/ + +# add implicit serialized message type support so that paths can be +# handed directly to IPC primitives such as streams and `Portal.run()` +# calls: +# - via ``msgspec``: +# - https://jcristharif.com/msgspec/api.html#struct +# - https://jcristharif.com/msgspec/extending.html +# via ``msgpack-python``: +# https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type +# LIFO codec stack that is appended when the user opens the +# ``configure_native_msgs()`` cm below to configure a new codec set +# which will be applied to all new (msgspec relevant) IPC transports +# that are spawned **after** the configure call is made. +_lifo_codecs: list[ + tuple[ + Encoder, + Decoder, + ], +] = [(Encoder(), Decoder())] + + +def get_msg_codecs() -> tuple[ + Encoder, + Decoder, +]: + ''' + Return the currently configured ``msgspec`` codec set. + + The defaults are defined above. + + ''' + global _lifo_codecs + return _lifo_codecs[-1] + + +@cm +def configure_native_msgs( + tagged_structs: list[_Struct], +): + ''' + Push a codec set that will natively decode + tagged structs provied in ``tagged_structs`` + in all IPC transports and pop the codec on exit. 
+ + ''' + # See "tagged unions" docs: + # https://jcristharif.com/msgspec/structs.html#tagged-unions + + # "The quickest way to enable tagged unions is to set tag=True when + # defining every struct type in the union. In this case tag_field + # defaults to "type", and tag defaults to the struct class name + # (e.g. "Get")." + enc = Encoder() + + types_union = Union[tagged_structs[0]] | Any + for struct in tagged_structs[1:]: + types_union |= struct + + dec = Decoder(types_union) + + _lifo_codecs.append((enc, dec)) + try: + print("YOYOYOOYOYOYOY") + yield enc, dec + finally: + print("NONONONONON") + _lifo_codecs.pop() + + +class Header(_Struct, tag=True): + ''' + A msg header which defines payload properties + + ''' + uid: str + msgtype: Optional[str] = None + + +class Msg(_Struct, tag=True): + ''' + The "god" msg type, a box for task level msg types. + + ''' + header: Header + payload: Raw + + +_root_dec = Decoder(Msg) +_root_enc = Encoder() + +# sub-decoders for retreiving embedded +# payload data and decoding to a sender +# side defined (struct) type. +_subdecs: dict[ + Optional[str], + Decoder] = { + None: Decoder(Any), +} + + +@cm +def enable_context( + msg_subtypes: list[list[_Struct]] +) -> Decoder: + + for types in msg_subtypes: + first = types[0] + + # register using the default tag_field of "type" + # which seems to map to the class "name". + tags = [first.__name__] + + # create a tagged union decoder for this type set + type_union = Union[first] + for typ in types[1:]: + type_union |= typ + tags.append(typ.__name__) + + dec = Decoder(type_union) + + # register all tags for this union sub-decoder + for tag in tags: + _subdecs[tag] = dec + try: + yield dec + finally: + for tag in tags: + _subdecs.pop(tag) + + +def decmsg(msg: Msg) -> Any: + msg = _root_dec.decode(msg) + tag_field = msg.header.msgtype + dec = _subdecs[tag_field] + return dec.decode(msg.payload) + + +def encmsg( + dialog_id: str | int, + payload: Any, +) -> Msg: + + tag_field = None + + plbytes = _root_enc.encode(payload) + if b'type' in plbytes: + assert isinstance(payload, _Struct) + tag_field = type(payload).__name__ + payload = Raw(plbytes) + + msg = Msg( + Header(dialog_id, tag_field), + payload, + ) + return _root_enc.encode(msg) -- 2.34.1 From d55266f4a29a001a7c5f866d8bce1c69da67a98a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 26 Mar 2024 18:27:55 -0400 Subject: [PATCH 192/378] Move the pretty-`Struct` stuff to a `.pretty_struct` Leave all the proto native struct-msg stuff in `.types` since i'm thinking it's the right name for the mod that will hold all the built-in SCIPP msgspecs longer run. Obvi the naive codec stack stuff needs to be cleaned out/up and anything useful moved into `._codec` ;) --- tractor/msg/__init__.py | 3 +- tractor/msg/_codec.py | 2 +- tractor/msg/pretty_struct.py | 269 ++++++++++++++++++++++++++++++++++ tractor/msg/types.py | 277 ++--------------------------------- 4 files changed, 286 insertions(+), 265 deletions(-) create mode 100644 tractor/msg/pretty_struct.py diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index e2296788..b5c261cc 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -21,11 +21,10 @@ Built-in messaging patterns, types, APIs and helpers. 
from .ptr import ( NamespacePath as NamespacePath, ) -from .types import ( +from .pretty_struct import ( Struct as Struct, ) from ._codec import ( - _def_msgspec_codec as _def_msgspec_codec, _ctxvar_MsgCodec as _ctxvar_MsgCodec, diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 0da454ad..c26de8d4 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -47,7 +47,7 @@ from types import ModuleType import msgspec from msgspec import msgpack -from .types import Struct +from .pretty_struct import Struct # TODO: API changes towards being interchange lib agnostic! diff --git a/tractor/msg/pretty_struct.py b/tractor/msg/pretty_struct.py new file mode 100644 index 00000000..143fc7a4 --- /dev/null +++ b/tractor/msg/pretty_struct.py @@ -0,0 +1,269 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Prettified version of `msgspec.Struct` for easier console grokin. + +''' +from __future__ import annotations +from collections import UserList +from typing import ( + Any, + Iterator, +) + +from msgspec import ( + msgpack, + Struct as _Struct, + structs, +) +from pprint import ( + saferepr, +) + +# TODO: auto-gen type sig for input func both for +# type-msgs and logging of RPC tasks? +# taken and modified from: +# https://stackoverflow.com/a/57110117 +# import inspect +# from typing import List + +# def my_function(input_1: str, input_2: int) -> list[int]: +# pass + +# def types_of(func): +# specs = inspect.getfullargspec(func) +# return_type = specs.annotations['return'] +# input_types = [t.__name__ for s, t in specs.annotations.items() if s != 'return'] +# return f'{func.__name__}({": ".join(input_types)}) -> {return_type}' + +# types_of(my_function) + + +class DiffDump(UserList): + ''' + Very simple list delegator that repr() dumps (presumed) tuple + elements of the form `tuple[str, Any, Any]` in a nice + multi-line readable form for analyzing `Struct` diffs. + + ''' + def __repr__(self) -> str: + if not len(self): + return super().__repr__() + + # format by displaying item pair's ``repr()`` on multiple, + # indented lines such that they are more easily visually + # comparable when printed to console when printed to + # console. + repstr: str = '[\n' + for k, left, right in self: + repstr += ( + f'({k},\n' + f'\t{repr(left)},\n' + f'\t{repr(right)},\n' + ')\n' + ) + repstr += ']\n' + return repstr + + +class Struct( + _Struct, + + # https://jcristharif.com/msgspec/structs.html#tagged-unions + # tag='pikerstruct', + # tag=True, +): + ''' + A "human friendlier" (aka repl buddy) struct subtype. + + ''' + def _sin_props(self) -> Iterator[ + tuple[ + structs.FieldIinfo, + str, + Any, + ] + ]: + ''' + Iterate over all non-@property fields of this struct. 
+ + ''' + fi: structs.FieldInfo + for fi in structs.fields(self): + key: str = fi.name + val: Any = getattr(self, key) + yield fi, key, val + + def to_dict( + self, + include_non_members: bool = True, + + ) -> dict: + ''' + Like it sounds.. direct delegation to: + https://jcristharif.com/msgspec/api.html#msgspec.structs.asdict + + BUT, by default we pop all non-member (aka not defined as + struct fields) fields by default. + + ''' + asdict: dict = structs.asdict(self) + if include_non_members: + return asdict + + # only return a dict of the struct members + # which were provided as input, NOT anything + # added as type-defined `@property` methods! + sin_props: dict = {} + fi: structs.FieldInfo + for fi, k, v in self._sin_props(): + sin_props[k] = asdict[k] + + return sin_props + + def pformat( + self, + field_indent: int = 2, + indent: int = 0, + + ) -> str: + ''' + Recursion-safe `pprint.pformat()` style formatting of + a `msgspec.Struct` for sane reading by a human using a REPL. + + ''' + # global whitespace indent + ws: str = ' '*indent + + # field whitespace indent + field_ws: str = ' '*(field_indent + indent) + + # qtn: str = ws + self.__class__.__qualname__ + qtn: str = self.__class__.__qualname__ + + obj_str: str = '' # accumulator + fi: structs.FieldInfo + k: str + v: Any + for fi, k, v in self._sin_props(): + + # TODO: how can we prefer `Literal['option1', 'option2, + # ..]` over .__name__ == `Literal` but still get only the + # latter for simple types like `str | int | None` etc..? + ft: type = fi.type + typ_name: str = getattr(ft, '__name__', str(ft)) + + # recurse to get sub-struct's `.pformat()` output Bo + if isinstance(v, Struct): + val_str: str = v.pformat( + indent=field_indent + indent, + field_indent=indent + field_indent, + ) + + else: # the `pprint` recursion-safe format: + # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr + val_str: str = saferepr(v) + + # TODO: LOLOL use `textwrap.indent()` instead dawwwwwg! + obj_str += (field_ws + f'{k}: {typ_name} = {val_str},\n') + + return ( + f'{qtn}(\n' + f'{obj_str}' + f'{ws})' + ) + + # TODO: use a pprint.PrettyPrinter instance around ONLY rendering + # inside a known tty? + # def __repr__(self) -> str: + # ... + + # __str__ = __repr__ = pformat + __repr__ = pformat + + def copy( + self, + update: dict | None = None, + + ) -> Struct: + ''' + Validate-typecast all self defined fields, return a copy of + us with all such fields. + + NOTE: This is kinda like the default behaviour in + `pydantic.BaseModel` except a copy of the object is + returned making it compat with `frozen=True`. + + ''' + if update: + for k, v in update.items(): + setattr(self, k, v) + + # NOTE: roundtrip serialize to validate + # - enode to msgpack binary format, + # - decode that back to a struct. + return msgpack.Decoder(type=type(self)).decode( + msgpack.Encoder().encode(self) + ) + + def typecast( + self, + + # TODO: allow only casting a named subset? + # fields: set[str] | None = None, + + ) -> None: + ''' + Cast all fields using their declared type annotations + (kinda like what `pydantic` does by default). + + NOTE: this of course won't work on frozen types, use + ``.copy()`` above in such cases. 
+ + ''' + # https://jcristharif.com/msgspec/api.html#msgspec.structs.fields + fi: structs.FieldInfo + for fi in structs.fields(self): + setattr( + self, + fi.name, + fi.type(getattr(self, fi.name)), + ) + + def __sub__( + self, + other: Struct, + + ) -> DiffDump[tuple[str, Any, Any]]: + ''' + Compare fields/items key-wise and return a ``DiffDump`` + for easy visual REPL comparison B) + + ''' + diffs: DiffDump[tuple[str, Any, Any]] = DiffDump() + for fi in structs.fields(self): + attr_name: str = fi.name + ours: Any = getattr(self, attr_name) + theirs: Any = getattr(other, attr_name) + if ours != theirs: + diffs.append(( + attr_name, + ours, + theirs, + )) + + return diffs diff --git a/tractor/msg/types.py b/tractor/msg/types.py index e457370e..d2fb0877 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -20,12 +20,9 @@ types. ''' from __future__ import annotations -from collections import UserList from contextlib import contextmanager as cm from typing import ( Any, - Iterator, - Optional, Union, ) @@ -33,252 +30,8 @@ from msgspec import ( msgpack, Raw, Struct as _Struct, - structs, -) -from msgspec.msgpack import ( - Encoder, - Decoder, -) -from pprint import ( - saferepr, ) -# TODO: auto-gen type sig for input func both for -# type-msgs and logging of RPC tasks? -# taken and modified from: -# https://stackoverflow.com/a/57110117 -# import inspect -# from typing import List - -# def my_function(input_1: str, input_2: int) -> list[int]: -# pass - -# def types_of(func): -# specs = inspect.getfullargspec(func) -# return_type = specs.annotations['return'] -# input_types = [t.__name__ for s, t in specs.annotations.items() if s != 'return'] -# return f'{func.__name__}({": ".join(input_types)}) -> {return_type}' - -# types_of(my_function) - - -class DiffDump(UserList): - ''' - Very simple list delegator that repr() dumps (presumed) tuple - elements of the form `tuple[str, Any, Any]` in a nice - multi-line readable form for analyzing `Struct` diffs. - - ''' - def __repr__(self) -> str: - if not len(self): - return super().__repr__() - - # format by displaying item pair's ``repr()`` on multiple, - # indented lines such that they are more easily visually - # comparable when printed to console when printed to - # console. - repstr: str = '[\n' - for k, left, right in self: - repstr += ( - f'({k},\n' - f'\t{repr(left)},\n' - f'\t{repr(right)},\n' - ')\n' - ) - repstr += ']\n' - return repstr - - -class Struct( - _Struct, - - # https://jcristharif.com/msgspec/structs.html#tagged-unions - # tag='pikerstruct', - # tag=True, -): - ''' - A "human friendlier" (aka repl buddy) struct subtype. - - ''' - def _sin_props(self) -> Iterator[ - tuple[ - structs.FieldIinfo, - str, - Any, - ] - ]: - ''' - Iterate over all non-@property fields of this struct. - - ''' - fi: structs.FieldInfo - for fi in structs.fields(self): - key: str = fi.name - val: Any = getattr(self, key) - yield fi, key, val - - def to_dict( - self, - include_non_members: bool = True, - - ) -> dict: - ''' - Like it sounds.. direct delegation to: - https://jcristharif.com/msgspec/api.html#msgspec.structs.asdict - - BUT, by default we pop all non-member (aka not defined as - struct fields) fields by default. - - ''' - asdict: dict = structs.asdict(self) - if include_non_members: - return asdict - - # only return a dict of the struct members - # which were provided as input, NOT anything - # added as type-defined `@property` methods! 
- sin_props: dict = {} - fi: structs.FieldInfo - for fi, k, v in self._sin_props(): - sin_props[k] = asdict[k] - - return sin_props - - def pformat( - self, - field_indent: int = 2, - indent: int = 0, - - ) -> str: - ''' - Recursion-safe `pprint.pformat()` style formatting of - a `msgspec.Struct` for sane reading by a human using a REPL. - - ''' - # global whitespace indent - ws: str = ' '*indent - - # field whitespace indent - field_ws: str = ' '*(field_indent + indent) - - # qtn: str = ws + self.__class__.__qualname__ - qtn: str = self.__class__.__qualname__ - - obj_str: str = '' # accumulator - fi: structs.FieldInfo - k: str - v: Any - for fi, k, v in self._sin_props(): - - # TODO: how can we prefer `Literal['option1', 'option2, - # ..]` over .__name__ == `Literal` but still get only the - # latter for simple types like `str | int | None` etc..? - ft: type = fi.type - typ_name: str = getattr(ft, '__name__', str(ft)) - - # recurse to get sub-struct's `.pformat()` output Bo - if isinstance(v, Struct): - val_str: str = v.pformat( - indent=field_indent + indent, - field_indent=indent + field_indent, - ) - - else: # the `pprint` recursion-safe format: - # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr - val_str: str = saferepr(v) - - # TODO: LOLOL use `textwrap.indent()` instead dawwwwwg! - obj_str += (field_ws + f'{k}: {typ_name} = {val_str},\n') - - return ( - f'{qtn}(\n' - f'{obj_str}' - f'{ws})' - ) - - # TODO: use a pprint.PrettyPrinter instance around ONLY rendering - # inside a known tty? - # def __repr__(self) -> str: - # ... - - # __str__ = __repr__ = pformat - __repr__ = pformat - - def copy( - self, - update: dict | None = None, - - ) -> Struct: - ''' - Validate-typecast all self defined fields, return a copy of - us with all such fields. - - NOTE: This is kinda like the default behaviour in - `pydantic.BaseModel` except a copy of the object is - returned making it compat with `frozen=True`. - - ''' - if update: - for k, v in update.items(): - setattr(self, k, v) - - # NOTE: roundtrip serialize to validate - # - enode to msgpack binary format, - # - decode that back to a struct. - return msgpack.Decoder(type=type(self)).decode( - msgpack.Encoder().encode(self) - ) - - def typecast( - self, - - # TODO: allow only casting a named subset? - # fields: set[str] | None = None, - - ) -> None: - ''' - Cast all fields using their declared type annotations - (kinda like what `pydantic` does by default). - - NOTE: this of course won't work on frozen types, use - ``.copy()`` above in such cases. - - ''' - # https://jcristharif.com/msgspec/api.html#msgspec.structs.fields - fi: structs.FieldInfo - for fi in structs.fields(self): - setattr( - self, - fi.name, - fi.type(getattr(self, fi.name)), - ) - - def __sub__( - self, - other: Struct, - - ) -> DiffDump[tuple[str, Any, Any]]: - ''' - Compare fields/items key-wise and return a ``DiffDump`` - for easy visual REPL comparison B) - - ''' - diffs: DiffDump[tuple[str, Any, Any]] = DiffDump() - for fi in structs.fields(self): - attr_name: str = fi.name - ours: Any = getattr(self, attr_name) - theirs: Any = getattr(other, attr_name) - if ours != theirs: - diffs.append(( - attr_name, - ours, - theirs, - )) - - return diffs - -# ------ - ------ -# # TODO: integration with our ``enable_modules: list[str]`` caps sys. # # ``pkgutil.resolve_name()`` internally uses @@ -307,15 +60,15 @@ class Struct( # that are spawned **after** the configure call is made. 
_lifo_codecs: list[ tuple[ - Encoder, - Decoder, + msgpack.Encoder, + msgpack.Decoder, ], -] = [(Encoder(), Decoder())] +] = [(msgpack.Encoder(), msgpack.Decoder())] def get_msg_codecs() -> tuple[ - Encoder, - Decoder, + msgpack.Encoder, + msgpack.Decoder, ]: ''' Return the currently configured ``msgspec`` codec set. @@ -344,13 +97,13 @@ def configure_native_msgs( # defining every struct type in the union. In this case tag_field # defaults to "type", and tag defaults to the struct class name # (e.g. "Get")." - enc = Encoder() + enc = msgpack.Encoder() types_union = Union[tagged_structs[0]] | Any for struct in tagged_structs[1:]: types_union |= struct - dec = Decoder(types_union) + dec = msgpack.Decoder(types_union) _lifo_codecs.append((enc, dec)) try: @@ -367,7 +120,7 @@ class Header(_Struct, tag=True): ''' uid: str - msgtype: Optional[str] = None + msgtype: str|None = None class Msg(_Struct, tag=True): @@ -379,23 +132,23 @@ class Msg(_Struct, tag=True): payload: Raw -_root_dec = Decoder(Msg) -_root_enc = Encoder() +_root_dec = msgpack.Decoder(Msg) +_root_enc = msgpack.Encoder() # sub-decoders for retreiving embedded # payload data and decoding to a sender # side defined (struct) type. _subdecs: dict[ - Optional[str], - Decoder] = { - None: Decoder(Any), + str|None, + msgpack.Decoder] = { + None: msgpack.Decoder(Any), } @cm def enable_context( msg_subtypes: list[list[_Struct]] -) -> Decoder: +) -> msgpack.Decoder: for types in msg_subtypes: first = types[0] @@ -410,7 +163,7 @@ def enable_context( type_union |= typ tags.append(typ.__name__) - dec = Decoder(type_union) + dec = msgpack.Decoder(type_union) # register all tags for this union sub-decoder for tag in tags: -- 2.34.1 From 995af130cf200d7e12731b00eb334f8c44197307 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 28 Mar 2024 10:45:01 -0400 Subject: [PATCH 193/378] Init def of "SC shuttle prot" with "msg-spec-limiting" As per the long outstanding GH issue this starts our rigorous journey into an attempt at a type-safe, cross-actor SC, IPC protocol Bo boop -> https://github.com/goodboy/tractor/issues/36 The idea is to "formally" define our SC "shuttle (dialog) protocol" by specifying a new `.msg.types.Msg` subtype-set which can fully encapsulate all IPC msg schemas needed in order to accomplish cross-process SC! The msg set deviated a little in terms of (type) names from the existing `dict`-msgs currently used in the runtime impl but, I think the name changes are much better in terms of explicitly representing the internal semantics of the actor runtime machinery/subsystems and the IPC-msg-dialog required for SC enforced RPC. ------ - ------ In cursory, the new formal msgs-spec includes the following msg-subtypes of a new top-level `Msg` boxing type (that holds the base field schema for all msgs): - `Start` to request RPC task scheduling by passing a `FuncSpec` payload (to replace the currently used `{'cmd': ... }` dict msg impl) - `StartAck` to allow the RPC task callee-side to report a `IpcCtxSpec` payload immediately back to the caller (currently responded naively via a `{'functype': ... }` msg) - `Started` to deliver the first value from `Context.started()` (instead of the existing `{'started': ... }`) - `Yield` to shuttle `MsgStream.send()`-ed values (instead of our `{'yield': ... }`) - `Stop` to terminate a `Context.open_stream()` session/block (over `{'stop': True }`) - `Return` to deliver the final value from the `Actor.start_remote_task()` (which is a `{'return': ... 
}`) - `Error` to box `RemoteActorError` exceptions via a `.pld: ErrorData` payload, planned to replace/extend the current `RemoteActorError.msgdata` mechanism internal to `._exceptions.pack/unpack_error()` The new `tractor.msg.types` includes all the above msg defs as well an API for rendering a "payload type specification" using a `payload_type_spec: Union[Type]` that can be passed to `msgspec.msgpack.Decoder(type=payload_type_spec)`. This ensures that (for a subset of the above msg set) `Msg.pld: PayloadT` data is type-parameterized using `msgspec`'s new `Generic[PayloadT]` field support and thus enables providing for an API where IPC `Context` dialogs can strictly define the allowed payload-datatype-set via type union! Iow, this is the foundation for supporting `Channel`/`Context`/`MsgStream` IPC primitives which are type checked/safe as desired in GH issue: - https://github.com/goodboy/tractor/issues/365 Misc notes on current impl(s) status: ------ - ------ - add a `.msg.types.mk_msg_spec()` which uses the new `msgspec` support for `class MyStruct[Struct, Generic[T]]` parameterize-able fields and delivers our boxing SC-msg-(sub)set with the desired `payload_types` applied to `.pld`: - https://jcristharif.com/msgspec/supported-types.html#generic-types - as a note this impl seems to need to use `type.new_class()` dynamic subtype generation, though i don't really get *why* still.. but without that the `msgspec.msgpack.Decoder` doesn't seem to reject `.pld` limited `Msg` subtypes as demonstrated in the new test. - around this ^ add a `.msg._codec.limit_msg_spec()` cm which exposes this payload type limiting API such that it can be applied per task via a `MsgCodec` in app code. - the orig approach in https://github.com/goodboy/tractor/pull/311 was the idea of making payload fields `.pld: Raw` wherein we could have per-field/sub-msg decoders dynamically loaded depending on the particular application-layer schema in use. I don't want to lose the idea of this since I think it might be useful for an idea I have about capability-based-fields(-sharing, maybe using field-subset encryption?), and as such i've kept the (ostensibly) working impls in TODO-comments in `.msg._codec` wherein maybe we can add a `MsgCodec._payload_decs: dict` table for this later on. |_ also left in the `.msg.types.enc/decmsg()` impls but renamed as `enc/dec_payload()` (but reworked to not rely on the lifo codec stack tables; now removed) such that we can prolly move them to `MsgCodec` methods in the future. - add an unused `._codec.mk_tagged_union_dec()` helper which was originally factored out the #311 proto-code but didn't end up working as desired with the new parameterized generic fields approach (now in `msg.types.mk_msg_spec()`) Testing/deps work: ------ - ------ - new `test_limit_msgspec()` which ensures all the `.types` content is correct but without using the wrapping APIs in `._codec`; i.e. using a in-line `Decoder` instead of a `MsgCodec`. - pin us to `msgspec>=0.18.5` which has the needed generic-types support (which took me way too long yester to figure out when implementing all this XD)! 
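
Usage sketch:
------ - ------
(mirrors the new `test_limit_msgspec()`; `MyPld` is only a made-up
example payload struct, not part of this changeset)

    from msgspec import Struct, msgpack
    from tractor.msg.types import mk_msg_spec

    class MyPld(Struct):
        field: str

    # parameterize the boxing msg-set's `.pld` to only allow `MyPld`
    spec, msg_types = mk_msg_spec(payload_type=MyPld)
    enc = msgpack.Encoder()
    dec = msgpack.Decoder(type=spec)

    for msgtype in msg_types:
        # round-trips since the payload matches the spec..
        wire: bytes = enc.encode(
            msgtype(cid='666', pld=MyPld(field='yo'))
        )
        assert dec.decode(wire).pld == MyPld(field='yo')
        # ..whereas a mis-typed `.pld` (eg. a plain `str`) raises
        # `msgspec.ValidationError` on decode.

The `limit_msg_spec()` cm is (for now) just sugar around this same
parameterization: it calls `mk_codec(payload_types=...)` and applies
the result per-task via `apply_codec()`.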
--- setup.py | 2 +- tests/test_caps_msging.py | 181 ++++++++++++++- tractor/msg/_codec.py | 144 +++++++++++- tractor/msg/types.py | 474 ++++++++++++++++++++++++++------------ 4 files changed, 645 insertions(+), 156 deletions(-) diff --git a/setup.py b/setup.py index 50ee92ec..a2219372 100755 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ setup( 'wrapt', # IPC serialization - 'msgspec', + 'msgspec>=0.18.5', # debug mode REPL 'pdbp', diff --git a/tests/test_caps_msging.py b/tests/test_caps_msging.py index f659cb13..b101c1e0 100644 --- a/tests/test_caps_msging.py +++ b/tests/test_caps_msging.py @@ -6,12 +6,22 @@ B~) ''' from typing import ( Any, + _GenericAlias, Type, + Union, ) from contextvars import ( Context, ) +# from inspect import Parameter +from msgspec import ( + structs, + msgpack, + # defstruct, + Struct, + ValidationError, +) import tractor from tractor.msg import ( _def_msgspec_codec, @@ -23,6 +33,12 @@ from tractor.msg import ( apply_codec, current_msgspec_codec, ) +from tractor.msg.types import ( + PayloadT, + Msg, + # Started, + mk_msg_spec, +) import trio # TODO: wrap these into `._codec` such that user can just pass @@ -54,7 +70,7 @@ def mk_custom_codec() -> MsgCodec: # apply custom hooks and set a `Decoder` which only # loads `NamespacePath` types. nsp_codec: MsgCodec = mk_codec( - dec_types=NamespacePath, + ipc_msg_spec=NamespacePath, enc_hook=enc_hook, dec_hook=dec_hook, ) @@ -196,3 +212,166 @@ def test_codec_hooks_mod(): await p.cancel_actor() trio.run(main) + + +def chk_pld_type( + generic: Msg|_GenericAlias, + payload_type: Type[Struct]|Any, + pld: Any, + +) -> bool: + + roundtrip: bool = False + pld_val_type: Type = type(pld) + + # gen_paramed: _GenericAlias = generic[payload_type] + # TODO: verify that the overridden subtypes + # DO NOT have modified type-annots from original! + # 'Start', .pld: FuncSpec + # 'StartAck', .pld: IpcCtxSpec + # 'Stop', .pld: UNSEt + # 'Error', .pld: ErrorData + # for typedef in ( + # [gen_paramed] + # + + + # # type-var should always be set for these sub-types + # # as well! + # Msg.__subclasses__() + # ): + # if typedef.__name__ not in [ + # 'Msg', + # 'Started', + # 'Yield', + # 'Return', + # ]: + # continue + # payload_type: Type[Struct] = CustomPayload + + # TODO: can remove all this right!? + # + # when parameterized (like `Msg[Any]`) then + # we expect an alias as input. + # if isinstance(generic, _GenericAlias): + # assert payload_type in generic.__args__ + # else: + # assert PayloadType in generic.__parameters__ + # pld_param: Parameter = generic.__signature__.parameters['pld'] + # assert pld_param.annotation is PayloadType + + type_spec: Union[Type[Struct]] + msg_types: list[Msg[payload_type]] + ( + type_spec, + msg_types, + ) = mk_msg_spec( + payload_type=payload_type, + ) + enc = msgpack.Encoder() + dec = msgpack.Decoder( + type=type_spec, # like `Msg[Any]` + ) + + # verify the boxed-type for all variable payload-type msgs. + for typedef in msg_types: + + pld_field = structs.fields(typedef)[1] + assert pld_field.type in {payload_type, PayloadT} + # TODO: does this need to work to get all subtypes to + # adhere? 
+ assert pld_field.type is payload_type + + kwargs: dict[str, Any] = { + 'cid': '666', + 'pld': pld, + } + enc_msg = typedef(**kwargs) + + wire_bytes: bytes = enc.encode(enc_msg) + + try: + dec_msg = dec.decode(wire_bytes) + assert dec_msg.pld == pld + assert (roundtrip := (dec_msg == enc_msg)) + + except ValidationError as ve: + # breakpoint() + if pld_val_type is payload_type: + raise ValueError( + 'Got `ValidationError` despite type-var match!?\n' + f'pld_val_type: {pld_val_type}\n' + f'payload_type: {payload_type}\n' + ) from ve + + else: + # ow we good cuz the pld spec mismatched. + print( + 'Got expected `ValidationError` since,\n' + f'{pld_val_type} is not {payload_type}\n' + ) + else: + if ( + pld_val_type is not payload_type + and payload_type is not Any + ): + raise ValueError( + 'DID NOT `ValidationError` despite expected type match!?\n' + f'pld_val_type: {pld_val_type}\n' + f'payload_type: {payload_type}\n' + ) + + return roundtrip + + + +def test_limit_msgspec(): + + async def main(): + async with tractor.open_root_actor( + debug_mode=True + ): + + # ensure we can round-trip a boxing `Msg` + assert chk_pld_type( + Msg, + Any, + None, + ) + + # TODO: don't need this any more right since + # `msgspec>=0.15` has the nice generics stuff yah?? + # + # manually override the type annot of the payload + # field and ensure it propagates to all msg-subtypes. + # Msg.__annotations__['pld'] = Any + + # verify that a mis-typed payload value won't decode + assert not chk_pld_type( + Msg, + int, + pld='doggy', + ) + + # parametrize the boxed `.pld` type as a custom-struct + # and ensure that parametrization propagates + # to all payload-msg-spec-able subtypes! + class CustomPayload(Struct): + name: str + value: Any + + assert not chk_pld_type( + Msg, + CustomPayload, + pld='doggy', + ) + + assert chk_pld_type( + Msg, + CustomPayload, + pld=CustomPayload(name='doggy', value='urmom') + ) + + # uhh bc we can `.pause_from_sync()` now! :surfer: + # breakpoint() + + trio.run(main) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index c26de8d4..5ce02055 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -47,20 +47,25 @@ from types import ModuleType import msgspec from msgspec import msgpack -from .pretty_struct import Struct +from tractor.msg.pretty_struct import Struct +from tractor.msg.types import ( + mk_msg_spec, + Msg, +) # TODO: API changes towards being interchange lib agnostic! +# # -[ ] capnproto has pre-compiled schema for eg.. # * https://capnproto.org/language.html # * http://capnproto.github.io/pycapnp/quickstart.html # * https://github.com/capnproto/pycapnp/blob/master/examples/addressbook.capnp +# class MsgCodec(Struct): ''' A IPC msg interchange format lib's encoder + decoder pair. ''' - lib: ModuleType = msgspec # ad-hoc type extensions @@ -70,12 +75,22 @@ class MsgCodec(Struct): # struct type unions # https://jcristharif.com/msgspec/structs.html#tagged-unions - types: Union[Type[Struct]]|Any = Any + ipc_msg_spec: Union[Type[Struct]]|Any = Any + payload_msg_spec: Union[Type[Struct]] = Any # post-configure cached props _enc: msgpack.Encoder|None = None _dec: msgpack.Decoder|None = None + # TODO: a sub-decoder system as well? + # see related comments in `.msg.types` + # _payload_decs: ( + # dict[ + # str, + # msgpack.Decoder, + # ] + # |None + # ) = None # TODO: use `functools.cached_property` for these ? 
# https://docs.python.org/3/library/functools.html#functools.cached_property @@ -88,8 +103,9 @@ class MsgCodec(Struct): enc_hook: Callable|None = None, reset: bool = False, - # TODO: what's the default for this? + # TODO: what's the default for this, and do we care? # write_buffer_size: int + # **kwargs, ) -> msgpack.Encoder: @@ -131,7 +147,7 @@ class MsgCodec(Struct): def decoder( self, - types: Union[Type[Struct]]|None = None, + ipc_msg_spec: Union[Type[Struct]]|None = None, dec_hook: Callable|None = None, reset: bool = False, **kwargs, @@ -152,7 +168,7 @@ class MsgCodec(Struct): or reset ): self._dec = self.lib.msgpack.Decoder( - types or self.types, + type=ipc_msg_spec or self.ipc_msg_spec, dec_hook=dec_hook or self.dec_hook, **kwargs, ) @@ -169,10 +185,39 @@ class MsgCodec(Struct): determined by the ''' - return self.dec.decode(msg) +def mk_tagged_union_dec( + tagged_structs: list[Struct], + +) -> tuple[ + list[str], + msgpack.Decoder, +]: + # See "tagged unions" docs: + # https://jcristharif.com/msgspec/structs.html#tagged-unions + + # "The quickest way to enable tagged unions is to set tag=True when + # defining every struct type in the union. In this case tag_field + # defaults to "type", and tag defaults to the struct class name + # (e.g. "Get")." + first: Struct = tagged_structs[0] + types_union: Union[Type[Struct]] = Union[ + first + ]|Any + tags: list[str] = [first.__name__] + + for struct in tagged_structs[1:]: + types_union |= struct + tags.append(struct.__name__) + + dec = msgpack.Decoder(types_union) + return ( + tags, + dec, + ) + # TODO: struct aware messaging coders as per: # - https://github.com/goodboy/tractor/issues/36 # - https://github.com/goodboy/tractor/issues/196 @@ -181,13 +226,18 @@ class MsgCodec(Struct): def mk_codec( libname: str = 'msgspec', + # for codec-ing boxed `Msg`-with-payload msgs + payload_types: Union[Type[Struct]]|None = None, + + # TODO: do we want to allow NOT/using a diff `Msg`-set? + # # struct type unions set for `Decoder` # https://jcristharif.com/msgspec/structs.html#tagged-unions - dec_types: Union[Type[Struct]]|Any = Any, + ipc_msg_spec: Union[Type[Struct]]|Any = Any, cache_now: bool = True, - # proxy to the `Struct.__init__()` + # proxy as `Struct(**kwargs)` **kwargs, ) -> MsgCodec: @@ -197,14 +247,59 @@ def mk_codec( `msgspec` ;). ''' + # (manually) generate a msg-payload-spec for all relevant + # god-boxing-msg subtypes, parameterizing the `Msg.pld: PayloadT` + # for the decoder such that all sub-type msgs in our SCIPP + # will automatically decode to a type-"limited" payload (`Struct`) + # object (set). + payload_type_spec: Union[Type[Msg]]|None = None + if payload_types: + ( + payload_type_spec, + msg_types, + ) = mk_msg_spec( + payload_type=payload_types, + ) + assert len(payload_type_spec.__args__) == len(msg_types) + + # TODO: sub-decode `.pld: Raw`? + # see similar notes inside `.msg.types`.. + # + # not sure we'll end up wanting/needing this + # though it might have unforeseen advantages in terms + # of enabling encrypted appliciation layer (only) + # payloads? + # + # register sub-payload decoders to load `.pld: Raw` + # decoded `Msg`-packets using a dynamic lookup (table) + # instead of a pre-defined msg-spec via `Generic` + # parameterization. + # + # ( + # tags, + # payload_dec, + # ) = mk_tagged_union_dec( + # tagged_structs=list(payload_types.__args__), + # ) + # _payload_decs: ( + # dict[str, msgpack.Decoder]|None + # ) = { + # # pre-seed decoders for std-py-type-set for use when + # # `Msg.pld == None|Any`. 
+ # None: msgpack.Decoder(Any), + # Any: msgpack.Decoder(Any), + # } + # for name in tags: + # _payload_decs[name] = payload_dec + codec = MsgCodec( - types=dec_types, + ipc_msg_spec=ipc_msg_spec, + payload_msg_spec=payload_type_spec, **kwargs, ) assert codec.lib.__name__ == libname - # by default config and cache the codec pair for given - # input settings. + # by default, config-n-cache the codec pair from input settings. if cache_now: assert codec.enc assert codec.dec @@ -251,3 +346,28 @@ def current_msgspec_codec() -> MsgCodec: ''' return _ctxvar_MsgCodec.get() + + +@cm +def limit_msg_spec( + payload_types: Union[Type[Struct]], + + # TODO: don't need this approach right? + # + # tagged_structs: list[Struct]|None = None, + + **codec_kwargs, +): + ''' + Apply a `MsgCodec` that will natively decode the SC-msg set's + `Msg.pld: Union[Type[Struct]]` payload fields using + tagged-unions of `msgspec.Struct`s from the `payload_types` + for all IPC contexts in use by the current `trio.Task`. + + ''' + msgspec_codec: MsgCodec = mk_codec( + payload_types=payload_types, + **codec_kwargs, + ) + with apply_codec(msgspec_codec): + yield msgspec_codec diff --git a/tractor/msg/types.py b/tractor/msg/types.py index d2fb0877..732a0f5d 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -15,23 +15,315 @@ # along with this program. If not, see . ''' -Extensions to built-in or (heavily used but 3rd party) friend-lib -types. +Define our strictly typed IPC message spec for the SCIPP: + +that is, + +the "Structurred-Concurrency-Inter-Process-(dialog)-(un)Protocol". ''' + from __future__ import annotations -from contextlib import contextmanager as cm +# from contextlib import contextmanager as cm +import types from typing import ( Any, + Generic, + Literal, + Type, + TypeVar, Union, ) from msgspec import ( msgpack, Raw, - Struct as _Struct, + Struct, + UNSET, ) + +# TODO: can also remove yah? +# +# class Header(Struct, tag=True): +# ''' +# A msg header which defines payload properties + +# ''' +# payload_tag: str|None = None + +# type variable for the boxed payload field `.pld` +PayloadT = TypeVar('PayloadT') + + +class Msg( + Struct, + Generic[PayloadT], + tag=True, + tag_field='msg_type', +): + ''' + The "god" boxing msg type. + + Boxes user data-msgs in a `.pld` and uses `msgspec`'s tagged + unions support to enable a spec from a common msg inheritance + tree. + + ''' + # header: Header + # TODO: use UNSET here? + cid: str|None # call/context-id + + # The msgs "payload" (spelled without vowels): + # https://en.wikipedia.org/wiki/Payload_(computing) + # + # NOTE: inherited from any `Msg` (and maybe overriden + # by use of `limit_msg_spec()`), but by default is + # parameterized to be `Any`. + # + # XXX this `Union` must strictly NOT contain `Any` if + # a limited msg-type-spec is intended, such that when + # creating and applying a new `MsgCodec` its + # `.decoder: Decoder` is configured with a `Union[Type[Struct]]` which + # restricts the allowed payload content (this `.pld` field) + # by type system defined loading constraints B) + # + # TODO: could also be set to `msgspec.Raw` if the sub-decoders + # approach is preferred over the generic parameterization + # approach as take by `mk_msg_spec()` below. + pld: PayloadT + + +# TODO: better name, like `Call/TaskInput`? +class FuncSpec(Struct): + # TODO: can we combine these 2 into a `NamespacePath` field? 
+ ns: str + func: str + + kwargs: dict + uid: str # (calling) actor-id + + +class Start( + Msg, +): + ''' + Initial request to remotely schedule an RPC `trio.Task` via + `Actor.start_remote_task()`. + + It is called by all the following public APIs: + + - `ActorNursery.run_in_actor()` + + - `Portal.run()` + `|_.run_from_ns()` + `|_.open_stream_from()` + `|_._submit_for_result()` + + - `Context.open_context()` + + ''' + pld: FuncSpec + + +FuncType: Literal[ + 'asyncfunc', + 'asyncgen', + 'context', # TODO: the only one eventually? +] = 'context' + + +class IpcCtxSpec(Struct): + ''' + An inter-actor-`trio.Task`-comms `Context` spec. + + ''' + functype: FuncType + + # TODO: as part of the reponse we should report our allowed + # msg spec which should be generated from the type-annots as + # desired in # https://github.com/goodboy/tractor/issues/365 + # When this does not match what the starter/caller side + # expects we of course raise a `TypeError` just like if + # a function had been called using an invalid signature. + # + # msgspec: MsgSpec + + +class StartAck( + Msg, + Generic[PayloadT], +): + ''' + Init response to a `Cmd` request indicating the far + end's RPC callable "type". + + ''' + pld: IpcCtxSpec + + +class Started( + Msg, + Generic[PayloadT], +): + ''' + Packet to shuttle the "first value" delivered by + `Context.started(value: Any)` from a `@tractor.context` + decorated IPC endpoint. + + ''' + + +# TODO: instead of using our existing `Start` +# for this (as we did with the original `{'cmd': ..}` style) +# class Cancel(Msg): +# cid: str + + +class Yield( + Msg, + Generic[PayloadT], +): + ''' + Per IPC transmission of a value from `await MsgStream.send()`. + + ''' + + +class Stop(Msg): + ''' + Stream termination signal much like an IPC version + of `StopAsyncIteration`. + + ''' + pld: UNSET + + +class Return( + Msg, + Generic[PayloadT], +): + ''' + Final `return ` from a remotely scheduled + func-as-`trio.Task`. + + ''' + + +class ErrorData(Struct): + ''' + Remote actor error meta-data as needed originally by + `RemoteActorError.msgdata: dict`. + + ''' + src_uid: str + src_type_str: str + boxed_type_str: str + + relay_path: list[str] + tb_str: str + + # `ContextCancelled` + canceller: str|None = None + + # `StreamOverrun` + sender: str|None = None + + +class Error(Msg): + ''' + A pkt that wraps `RemoteActorError`s for relay. + + ''' + pld: ErrorData + + +# TODO: should be make a msg version of `ContextCancelled?` +# and/or with a scope field or a full `ActorCancelled`? +# class Cancelled(Msg): +# cid: str + +# TODO what about overruns? +# class Overrun(Msg): +# cid: str + + +def mk_msg_spec( + payload_type: Union[Type] = Any, + boxing_msg_set: set[Msg] = { + Started, + Yield, + Return, + }, + +) -> tuple[ + Union[Type[Msg]], + list[Type[Msg]], +]: + ''' + Generate a payload-type-parameterized `Msg` specification such + that IPC msgs which can be `Msg.pld` (payload) type + limited/filterd are specified given an input `payload_type: + Union[Type]`. + + ''' + submsg_types: list[Type[Msg]] = Msg.__subclasses__() + + # TODO: see below as well, + # => union building approach with `.__class_getitem__()` + # doesn't seem to work..? + # + # payload_type_spec: Union[Type[Msg]] + # + msg_types: list[Msg] = [] + for msgtype in boxing_msg_set: + + # check inheritance sanity + assert msgtype in submsg_types + + # TODO: wait why do we need the dynamic version here? + # -[ ] paraming the `PayloadT` values via `Generic[T]` + # doesn't seem to work at all? 
+ # -[ ] is there a way to get it to work at module level + # just using inheritance or maybe a metaclass? + # + # index_paramed_msg_type: Msg = msgtype[payload_type] + + # TODO: WHY do we need to dynamically generate the + # subtype-msgs here to ensure the `.pld` parameterization + # propagates as well as works at all in terms of the + # `msgpack.Decoder()`..? + # + # dynamically create the payload type-spec-limited msg set. + manual_paramed_msg_subtype: Type = types.new_class( + msgtype.__name__, + ( + # XXX NOTE XXX this seems to be THE ONLY + # way to get this to work correctly!?! + Msg[payload_type], + Generic[PayloadT], + ), + {}, + ) + + # TODO: grok the diff here better.. + # assert index_paramed_msg_type == manual_paramed_msg_subtype + + # XXX TODO: why does the manual method work but not the + # `.__class_getitem__()` one!?! + paramed_msg_type = manual_paramed_msg_subtype + + # payload_type_spec |= paramed_msg_type + msg_types.append(paramed_msg_type) + + + payload_type_spec: Union[Type[Msg]] = Union[*msg_types] + return ( + payload_type_spec, + msg_types, + ) + + # TODO: integration with our ``enable_modules: list[str]`` caps sys. # # ``pkgutil.resolve_name()`` internally uses @@ -43,160 +335,58 @@ from msgspec import ( # - https://stackoverflow.com/a/63320902 # - https://docs.python.org/3/library/sys.html#sys.meta_path -# the new "Implicit Namespace Packages" might be relevant? -# - https://www.python.org/dev/peps/pep-0420/ - -# add implicit serialized message type support so that paths can be -# handed directly to IPC primitives such as streams and `Portal.run()` -# calls: -# - via ``msgspec``: -# - https://jcristharif.com/msgspec/api.html#struct -# - https://jcristharif.com/msgspec/extending.html -# via ``msgpack-python``: -# https://github.com/msgpack/msgpack-python#packingunpacking-of-custom-data-type -# LIFO codec stack that is appended when the user opens the -# ``configure_native_msgs()`` cm below to configure a new codec set -# which will be applied to all new (msgspec relevant) IPC transports -# that are spawned **after** the configure call is made. -_lifo_codecs: list[ - tuple[ - msgpack.Encoder, - msgpack.Decoder, - ], -] = [(msgpack.Encoder(), msgpack.Decoder())] - - -def get_msg_codecs() -> tuple[ - msgpack.Encoder, - msgpack.Decoder, -]: - ''' - Return the currently configured ``msgspec`` codec set. - - The defaults are defined above. - - ''' - global _lifo_codecs - return _lifo_codecs[-1] - - -@cm -def configure_native_msgs( - tagged_structs: list[_Struct], -): - ''' - Push a codec set that will natively decode - tagged structs provied in ``tagged_structs`` - in all IPC transports and pop the codec on exit. - - ''' - # See "tagged unions" docs: - # https://jcristharif.com/msgspec/structs.html#tagged-unions - - # "The quickest way to enable tagged unions is to set tag=True when - # defining every struct type in the union. In this case tag_field - # defaults to "type", and tag defaults to the struct class name - # (e.g. "Get")." 
- enc = msgpack.Encoder() - - types_union = Union[tagged_structs[0]] | Any - for struct in tagged_structs[1:]: - types_union |= struct - - dec = msgpack.Decoder(types_union) - - _lifo_codecs.append((enc, dec)) - try: - print("YOYOYOOYOYOYOY") - yield enc, dec - finally: - print("NONONONONON") - _lifo_codecs.pop() - - -class Header(_Struct, tag=True): - ''' - A msg header which defines payload properties - - ''' - uid: str - msgtype: str|None = None - - -class Msg(_Struct, tag=True): - ''' - The "god" msg type, a box for task level msg types. - - ''' - header: Header - payload: Raw - - -_root_dec = msgpack.Decoder(Msg) -_root_enc = msgpack.Encoder() - +# TODO: do we still want to try and support the sub-decoder with +# `Raw` technique in the case that the `Generic` approach gives +# future grief? +# # sub-decoders for retreiving embedded # payload data and decoding to a sender # side defined (struct) type. -_subdecs: dict[ +_payload_decs: dict[ str|None, - msgpack.Decoder] = { + msgpack.Decoder, +] = { + # default decoder is used when `Header.payload_tag == None` None: msgpack.Decoder(Any), } -@cm -def enable_context( - msg_subtypes: list[list[_Struct]] -) -> msgpack.Decoder: +def dec_payload( + msg: Msg, + msg_dec: msgpack.Decoder = msgpack.Decoder( + type=Msg[Any] + ), - for types in msg_subtypes: - first = types[0] +) -> Any|Struct: - # register using the default tag_field of "type" - # which seems to map to the class "name". - tags = [first.__name__] - - # create a tagged union decoder for this type set - type_union = Union[first] - for typ in types[1:]: - type_union |= typ - tags.append(typ.__name__) - - dec = msgpack.Decoder(type_union) - - # register all tags for this union sub-decoder - for tag in tags: - _subdecs[tag] = dec - try: - yield dec - finally: - for tag in tags: - _subdecs.pop(tag) + msg: Msg = msg_dec.decode(msg) + payload_tag: str = msg.header.payload_tag + payload_dec: msgpack.Decoder = _payload_decs[payload_tag] + return payload_dec.decode(msg.pld) -def decmsg(msg: Msg) -> Any: - msg = _root_dec.decode(msg) - tag_field = msg.header.msgtype - dec = _subdecs[tag_field] - return dec.decode(msg.payload) - - -def encmsg( - dialog_id: str | int, +def enc_payload( + enc: msgpack.Encoder, payload: Any, -) -> Msg: + cid: str, - tag_field = None +) -> bytes: - plbytes = _root_enc.encode(payload) - if b'type' in plbytes: - assert isinstance(payload, _Struct) - tag_field = type(payload).__name__ + # tag_field: str|None = None + + plbytes = enc.encode(payload) + if b'msg_type' in plbytes: + assert isinstance(payload, Struct) + + # tag_field: str = type(payload).__name__ payload = Raw(plbytes) msg = Msg( - Header(dialog_id, tag_field), - payload, + cid=cid, + pld=payload, + # Header( + # payload_tag=tag_field, + # # dialog_id, + # ), ) - return _root_enc.encode(msg) + return enc.encode(msg) -- 2.34.1 From 456979dd12bae1dc4a5ffb9c4e6a9c6c29416cc8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 28 Mar 2024 13:07:03 -0400 Subject: [PATCH 194/378] Tweak msg-spec test suite mod name --- tests/{test_caps_msging.py => test_caps_based_msging.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{test_caps_msging.py => test_caps_based_msging.py} (100%) diff --git a/tests/test_caps_msging.py b/tests/test_caps_based_msging.py similarity index 100% rename from tests/test_caps_msging.py rename to tests/test_caps_based_msging.py -- 2.34.1 From 8ff18739be1e55deeb46f386f2e5ced6020adc51 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 28 Mar 2024 13:08:18 -0400 
Subject: [PATCH 195/378] Change to multi-line-static-`dict` style msgs Re-arranging such that element-orders are line-arranged to our new IPC `.msg.types.Msg` fields spec in prep for replacing the current `dict`-as-msg impls with the `msgspec.Struct` native versions! --- tractor/_exceptions.py | 4 +++- tractor/_rpc.py | 35 +++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 0e1d6d10..b1a8ee63 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -536,7 +536,9 @@ def pack_error( # content's `.msgdata`). error_msg['tb_str'] = tb_str - pkt: dict = {'error': error_msg} + pkt: dict = { + 'error': error_msg, + } if cid: pkt['cid'] = cid diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 91482a07..310b80af 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -89,7 +89,10 @@ async def _invoke_non_context( # TODO: can we unify this with the `context=True` impl below? if inspect.isasyncgen(coro): - await chan.send({'functype': 'asyncgen', 'cid': cid}) + await chan.send({ + 'cid': cid, + 'functype': 'asyncgen', + }) # XXX: massive gotcha! If the containing scope # is cancelled and we execute the below line, # any ``ActorNursery.__aexit__()`` WON'T be @@ -109,18 +112,27 @@ async def _invoke_non_context( # to_send = await chan.recv_nowait() # if to_send is not None: # to_yield = await coro.asend(to_send) - await chan.send({'yield': item, 'cid': cid}) + await chan.send({ + 'yield': item, + 'cid': cid, + }) log.runtime(f"Finished iterating {coro}") # TODO: we should really support a proper # `StopAsyncIteration` system here for returning a final # value if desired - await chan.send({'stop': True, 'cid': cid}) + await chan.send({ + 'stop': True, + 'cid': cid, + }) # one way @stream func that gets treated like an async gen # TODO: can we unify this with the `context=True` impl below? elif treat_as_gen: - await chan.send({'functype': 'asyncgen', 'cid': cid}) + await chan.send({ + 'cid': cid, + 'functype': 'asyncgen', + }) # XXX: the async-func may spawn further tasks which push # back values like an async-generator would but must # manualy construct the response dict-packet-responses as @@ -133,7 +145,10 @@ async def _invoke_non_context( if not cs.cancelled_caught: # task was not cancelled so we can instruct the # far end async gen to tear down - await chan.send({'stop': True, 'cid': cid}) + await chan.send({ + 'stop': True, + 'cid': cid + }) else: # regular async function/method # XXX: possibly just a scheduled `Actor._cancel_task()` @@ -182,10 +197,10 @@ async def _invoke_non_context( and chan.connected() ): try: - await chan.send( - {'return': result, - 'cid': cid} - ) + await chan.send({ + 'return': result, + 'cid': cid, + }) except ( BrokenPipeError, trio.BrokenResourceError, @@ -479,8 +494,8 @@ async def _invoke( # "least sugary" type of RPC ep with support for # bi-dir streaming B) await chan.send({ + 'cid': cid, 'functype': 'context', - 'cid': cid }) # TODO: should we also use an `.open_context()` equiv -- 2.34.1 From b6ed26589a3e38b45bfc78cf36ab8b3f5d92fe07 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 29 Mar 2024 12:46:59 -0400 Subject: [PATCH 196/378] Drop `MsgCodec.decoder()/.encoder()` design Instead just instantiate `msgpack.Encoder/Decoder` instances inside `mk_codec()` and assign them directly as `._enc/._dec` fields. Explicitly take in named-args to both and proxy to the coder/decoder instantiation calls directly. 
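Roughly the shape this ends up taking (a simplified sketch only, not
the exact `mk_codec()` signature; names are illustrative and the real
factory wraps the pair in a `MsgCodec` rather than returning a tuple):

    from typing import Any, Callable
    from msgspec import msgpack

    def mk_codec_sketch(
        ipc_msg_spec: Any = Any,
        enc_hook: Callable|None = None,
        dec_hook: Callable|None = None,
    ) -> tuple[msgpack.Encoder, msgpack.Decoder]:
        # build the pair eagerly instead of lazily via the old
        # `.encoder()`/`.decoder()` methods
        enc = msgpack.Encoder(enc_hook=enc_hook)
        dec = msgpack.Decoder(
            type=ipc_msg_spec,  # eg. `Msg[Any]` or a tagged union
            dec_hook=dec_hook,
        )
        return enc, dec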
Shuffling some codec internals: - rename `mk_codec()` inputs as `ipc_msg_spec` and `ipc_pld_spec`, make them mutex such that a payload type spec can't be passed if the built-in msg-spec isn't used. => expose `MsgCodec.ipc_pld_spec` directly from `._dec.type` => presume input `ipc_msg_spec` is `Any` by default when no `ipc_pld_spec` is passed since we have no way atm to enable a similar type-restricted-payload feature without a wrapping "shuttle protocol" ;) - move all the payload-sub-decoders stuff prototyped in GH#311 (inside `.types`) to `._codec` as commented-for-later-maybe `MsgCodec` methods including: - `.mk_pld_subdec()` for registering - `.enc/dec_payload()` for sub-codec field loading. - also comment out `._codec.mk_tagged_union_dec()` as the orig tag-to-decoder table factory, now mostly superseded by `.types.mk_msg_spec()` which takes the generic parameterizing approach instead. - change naming to `types.mk_msg_spec(payload_type_union)` input, making it more explicit that it expects a `Union[Type]`. Oh right, and start exposing all the `.types.Msg` subtypes in the `.msg` subpkg in prep for usage throughout the runtime B) --- tractor/msg/__init__.py | 37 ++++ tractor/msg/_codec.py | 394 +++++++++++++++++++++++----------------- tractor/msg/types.py | 113 +++--------- 3 files changed, 297 insertions(+), 247 deletions(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index b5c261cc..a93fa888 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -33,3 +33,40 @@ from ._codec import ( MsgCodec as MsgCodec, current_msgspec_codec as current_msgspec_codec, ) + +from .types import ( + Msg as Msg, + + Start, # with pld + FuncSpec as FuncSpec, + + StartAck, # with pld + IpcCtxSpec as IpcCtxSpec, + + Started, + Yield, + Stop, + Return, + + Error, # with pld + ErrorData as ErrorData +) + + +# built-in SC shuttle protocol msg type set in +# approx order of the IPC txn-state spaces. +__spec__: list[Msg] = [ + + # inter-actor RPC initiation + Start, + StartAck, + + # no-outcome-yet IAC (inter-actor-communication) + Started, + Yield, + Stop, + + # termination outcomes + Return, + Error, +] diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 5ce02055..e6cb4f1f 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -29,6 +29,7 @@ ToDo: backends we prolly should offer: - https://capnproto.org/language.html#language-reference ''' +from __future__ import annotations from contextvars import ( ContextVar, Token, @@ -54,18 +55,36 @@ from tractor.msg.types import ( ) -# TODO: API changes towards being interchange lib agnostic! +# TODO: overall IPC msg-spec features (i.e. in this mod)! # -# -[ ] capnproto has pre-compiled schema for eg.. -# * https://capnproto.org/language.html -# * http://capnproto.github.io/pycapnp/quickstart.html -# * https://github.com/capnproto/pycapnp/blob/master/examples/addressbook.capnp +# -[ ] API changes towards being interchange lib agnostic! +# -[ ] capnproto has pre-compiled schema for eg.. +# * https://capnproto.org/language.html +# * http://capnproto.github.io/pycapnp/quickstart.html +# * https://github.com/capnproto/pycapnp/blob/master/examples/addressbook.capnp +# +# -[ ] struct aware messaging coders as per: +# -[x] https://github.com/goodboy/tractor/issues/36 +# -[ ] https://github.com/goodboy/tractor/issues/196 +# -[ ] https://github.com/goodboy/tractor/issues/365 # class MsgCodec(Struct): ''' A IPC msg interchange format lib's encoder + decoder pair. 
''' + # post-configure-cached when prop-accessed (see `mk_codec()` + # OR can be passed directly as, + # `MsgCodec(_enc=, _dec=)` + _enc: msgpack.Encoder|None = None + _dec: msgpack.Decoder|None = None + + # struct type unions + # https://jcristharif.com/msgspec/structs.html#tagged-unions + @property + def ipc_pld_spec(self) -> Union[Type[Struct]]: + return self._dec.type + lib: ModuleType = msgspec # ad-hoc type extensions @@ -73,16 +92,8 @@ class MsgCodec(Struct): enc_hook: Callable[[Any], Any]|None = None # coder dec_hook: Callable[[type, Any], Any]|None = None # decoder - # struct type unions - # https://jcristharif.com/msgspec/structs.html#tagged-unions - ipc_msg_spec: Union[Type[Struct]]|Any = Any - payload_msg_spec: Union[Type[Struct]] = Any - - # post-configure cached props - _enc: msgpack.Encoder|None = None - _dec: msgpack.Decoder|None = None - # TODO: a sub-decoder system as well? + # payload_msg_specs: Union[Type[Struct]] = Any # see related comments in `.msg.types` # _payload_decs: ( # dict[ @@ -91,42 +102,18 @@ class MsgCodec(Struct): # ] # |None # ) = None + # OR + # ) = { + # # pre-seed decoders for std-py-type-set for use when + # # `Msg.pld == None|Any`. + # None: msgpack.Decoder(Any), + # Any: msgpack.Decoder(Any), + # } # TODO: use `functools.cached_property` for these ? # https://docs.python.org/3/library/functools.html#functools.cached_property @property def enc(self) -> msgpack.Encoder: - return self._enc or self.encoder() - - def encoder( - self, - enc_hook: Callable|None = None, - reset: bool = False, - - # TODO: what's the default for this, and do we care? - # write_buffer_size: int - # - **kwargs, - - ) -> msgpack.Encoder: - ''' - Set or get the maybe-cached `msgspec.msgpack.Encoder` - instance configured for this codec. - - When `reset=True` any previously configured encoder will - be recreated and then cached with the new settings passed - as input. - - ''' - if ( - self._enc is None - or reset - ): - self._enc = self.lib.msgpack.Encoder( - enc_hook=enc_hook or self.enc_hook, - # write_buffer_size=write_buffer_size, - ) - return self._enc def encode( @@ -139,40 +126,10 @@ class MsgCodec(Struct): on a tranport protocol connection. ''' - return self.enc.encode(py_obj) + return self._enc.encode(py_obj) @property def dec(self) -> msgpack.Decoder: - return self._dec or self.decoder() - - def decoder( - self, - ipc_msg_spec: Union[Type[Struct]]|None = None, - dec_hook: Callable|None = None, - reset: bool = False, - **kwargs, - # ext_hook: ext_hook_sig - - ) -> msgpack.Decoder: - ''' - Set or get the maybe-cached `msgspec.msgpack.Decoder` - instance configured for this codec. - - When `reset=True` any previously configured decoder will - be recreated and then cached with the new settings passed - as input. - - ''' - if ( - self._dec is None - or reset - ): - self._dec = self.lib.msgpack.Decoder( - type=ipc_msg_spec or self.ipc_msg_spec, - dec_hook=dec_hook or self.dec_hook, - **kwargs, - ) - return self._dec def decode( @@ -185,60 +142,165 @@ class MsgCodec(Struct): determined by the ''' - return self.dec.decode(msg) + return self._dec.decode(msg) + + # TODO: do we still want to try and support the sub-decoder with + # `.Raw` technique in the case that the `Generic` approach gives + # future grief? + # + # -[ ] + # + #def mk_pld_subdec( + # self, + # payload_types: Union[Type[Struct]], + + #) -> msgpack.Decoder: + # # TODO: sub-decoder suppor for `.pld: Raw`? + # # => see similar notes inside `.msg.types`.. 
+ # # + # # not sure we'll end up needing this though it might have + # # unforeseen advantages in terms of enabling encrypted + # # appliciation layer (only) payloads? + # # + # # register sub-payload decoders to load `.pld: Raw` + # # decoded `Msg`-packets using a dynamic lookup (table) + # # instead of a pre-defined msg-spec via `Generic` + # # parameterization. + # # + # ( + # tags, + # payload_dec, + # ) = mk_tagged_union_dec( + # tagged_structs=list(payload_types.__args__), + # ) + # # register sub-decoders by tag + # subdecs: dict[str, msgpack.Decoder]|None = self._payload_decs + # for name in tags: + # subdecs.setdefault( + # name, + # payload_dec, + # ) + + # return payload_dec + + # sub-decoders for retreiving embedded + # payload data and decoding to a sender + # side defined (struct) type. + # def dec_payload( + # codec: MsgCodec, + # msg: Msg, + + # ) -> Any|Struct: + + # msg: Msg = codec.dec.decode(msg) + # payload_tag: str = msg.header.payload_tag + # payload_dec: msgpack.Decoder = codec._payload_decs[payload_tag] + # return payload_dec.decode(msg.pld) + + # def enc_payload( + # codec: MsgCodec, + # payload: Any, + # cid: str, + + # ) -> bytes: + + # # tag_field: str|None = None + + # plbytes = codec.enc.encode(payload) + # if b'msg_type' in plbytes: + # assert isinstance(payload, Struct) + + # # tag_field: str = type(payload).__name__ + # payload = msgspec.Raw(plbytes) + + # msg = Msg( + # cid=cid, + # pld=payload, + # # Header( + # # payload_tag=tag_field, + # # # dialog_id, + # # ), + # ) + # return codec.enc.encode(msg) -def mk_tagged_union_dec( - tagged_structs: list[Struct], + #def mk_tagged_union_dec( + # tagged_structs: list[Struct], -) -> tuple[ - list[str], - msgpack.Decoder, -]: - # See "tagged unions" docs: - # https://jcristharif.com/msgspec/structs.html#tagged-unions + #) -> tuple[ + # list[str], + # msgpack.Decoder, + #]: + # ''' + # Create a `msgpack.Decoder` for an input `list[msgspec.Struct]` + # and return a `list[str]` of each struct's `tag_field: str` value + # which can be used to "map to" the initialized dec. - # "The quickest way to enable tagged unions is to set tag=True when - # defining every struct type in the union. In this case tag_field - # defaults to "type", and tag defaults to the struct class name - # (e.g. "Get")." - first: Struct = tagged_structs[0] - types_union: Union[Type[Struct]] = Union[ - first - ]|Any - tags: list[str] = [first.__name__] + # ''' + # # See "tagged unions" docs: + # # https://jcristharif.com/msgspec/structs.html#tagged-unions - for struct in tagged_structs[1:]: - types_union |= struct - tags.append(struct.__name__) + # # "The quickest way to enable tagged unions is to set tag=True when + # # defining every struct type in the union. In this case tag_field + # # defaults to "type", and tag defaults to the struct class name + # # (e.g. "Get")." 
+ # first: Struct = tagged_structs[0] + # types_union: Union[Type[Struct]] = Union[ + # first + # ]|Any + # tags: list[str] = [first.__name__] - dec = msgpack.Decoder(types_union) - return ( - tags, - dec, - ) + # for struct in tagged_structs[1:]: + # types_union |= struct + # tags.append( + # getattr( + # struct, + # struct.__struct_config__.tag_field, + # struct.__name__, + # ) + # ) + + # dec = msgpack.Decoder(types_union) + # return ( + # tags, + # dec, + # ) -# TODO: struct aware messaging coders as per: -# - https://github.com/goodboy/tractor/issues/36 -# - https://github.com/goodboy/tractor/issues/196 -# - https://github.com/goodboy/tractor/issues/365 def mk_codec( - libname: str = 'msgspec', - - # for codec-ing boxed `Msg`-with-payload msgs - payload_types: Union[Type[Struct]]|None = None, - - # TODO: do we want to allow NOT/using a diff `Msg`-set? + ipc_msg_spec: Union[Type[Struct]]|Any|None = None, # + # ^TODO^: in the long run, do we want to allow using a diff IPC `Msg`-set? + # it would break the runtime, but maybe say if you wanted + # to add some kinda field-specific or wholesale `.pld` ecryption? + # struct type unions set for `Decoder` # https://jcristharif.com/msgspec/structs.html#tagged-unions - ipc_msg_spec: Union[Type[Struct]]|Any = Any, + ipc_pld_spec: Union[Type[Struct]]|Any|None = None, - cache_now: bool = True, + # TODO: offering a per-msg(-field) type-spec such that + # the fields can be dynamically NOT decoded and left as `Raw` + # values which are later loaded by a sub-decoder specified + # by `tag_field: str` value key? + # payload_msg_specs: dict[ + # str, # tag_field value as sub-decoder key + # Union[Type[Struct]] # `Msg.pld` type spec + # ]|None = None, + + libname: str = 'msgspec', # proxy as `Struct(**kwargs)` + # ------ - ------ + dec_hook: Callable|None = None, + enc_hook: Callable|None = None, + # ------ - ------ **kwargs, + # + # Encoder: + # write_buffer_size=write_buffer_size, + # + # Decoder: + # ext_hook: ext_hook_sig ) -> MsgCodec: ''' @@ -247,75 +309,81 @@ def mk_codec( `msgspec` ;). ''' - # (manually) generate a msg-payload-spec for all relevant - # god-boxing-msg subtypes, parameterizing the `Msg.pld: PayloadT` - # for the decoder such that all sub-type msgs in our SCIPP - # will automatically decode to a type-"limited" payload (`Struct`) - # object (set). - payload_type_spec: Union[Type[Msg]]|None = None - if payload_types: + if ( + ipc_msg_spec is not None + and ipc_pld_spec + ): + raise RuntimeError( + f'If a payload spec is provided,\n' + "the builtin SC-shuttle-protocol's msg set\n" + f'(i.e. `{Msg}`) MUST be used!\n\n' + f'However both values were passed as => mk_codec(\n' + f' ipc_msg_spec={ipc_msg_spec}`\n' + f' ipc_pld_spec={ipc_pld_spec}`\n)\n' + ) + + elif ( + ipc_pld_spec + and + + # XXX required for now (or maybe forever?) until + # we can dream up a way to allow parameterizing and/or + # custom overrides to the `Msg`-spec protocol itself? + ipc_msg_spec is None + ): + # (manually) generate a msg-payload-spec for all relevant + # god-boxing-msg subtypes, parameterizing the `Msg.pld: PayloadT` + # for the decoder such that all sub-type msgs in our SCIPP + # will automatically decode to a type-"limited" payload (`Struct`) + # object (set). 
( - payload_type_spec, + ipc_msg_spec, msg_types, ) = mk_msg_spec( - payload_type=payload_types, + payload_type_union=ipc_pld_spec, ) - assert len(payload_type_spec.__args__) == len(msg_types) + assert len(ipc_msg_spec.__args__) == len(msg_types) + assert ipc_msg_spec - # TODO: sub-decode `.pld: Raw`? - # see similar notes inside `.msg.types`.. - # - # not sure we'll end up wanting/needing this - # though it might have unforeseen advantages in terms - # of enabling encrypted appliciation layer (only) - # payloads? - # - # register sub-payload decoders to load `.pld: Raw` - # decoded `Msg`-packets using a dynamic lookup (table) - # instead of a pre-defined msg-spec via `Generic` - # parameterization. - # - # ( - # tags, - # payload_dec, - # ) = mk_tagged_union_dec( - # tagged_structs=list(payload_types.__args__), - # ) - # _payload_decs: ( - # dict[str, msgpack.Decoder]|None - # ) = { - # # pre-seed decoders for std-py-type-set for use when - # # `Msg.pld == None|Any`. - # None: msgpack.Decoder(Any), - # Any: msgpack.Decoder(Any), - # } - # for name in tags: - # _payload_decs[name] = payload_dec + dec = msgpack.Decoder( + type=ipc_msg_spec, # like `Msg[Any]` + ) + + else: + ipc_msg_spec = ipc_msg_spec or Any + + enc = msgpack.Encoder( + enc_hook=enc_hook, + ) + dec = msgpack.Decoder( + type=ipc_msg_spec, # like `Msg[Any]` + dec_hook=dec_hook, + ) codec = MsgCodec( - ipc_msg_spec=ipc_msg_spec, - payload_msg_spec=payload_type_spec, - **kwargs, + _enc=enc, + _dec=dec, + # payload_msg_specs=payload_msg_specs, + # **kwargs, ) - assert codec.lib.__name__ == libname - # by default, config-n-cache the codec pair from input settings. - if cache_now: - assert codec.enc - assert codec.dec + # sanity on expected backend support + assert codec.lib.__name__ == libname return codec # instance of the default `msgspec.msgpack` codec settings, i.e. # no custom structs, hooks or other special types. -_def_msgspec_codec: MsgCodec = mk_codec() +_def_msgspec_codec: MsgCodec = mk_codec(ipc_msg_spec=Any) # NOTE: provides for per-`trio.Task` specificity of the # IPC msging codec used by the transport layer when doing # `Channel.send()/.recv()` of wire data. _ctxvar_MsgCodec: ContextVar[MsgCodec] = ContextVar( 'msgspec_codec', + + # TODO: move this to our new `Msg`-spec! default=_def_msgspec_codec, ) @@ -353,7 +421,7 @@ def limit_msg_spec( payload_types: Union[Type[Struct]], # TODO: don't need this approach right? - # + # -> related to the `MsgCodec._payload_decs` stuff above.. # tagged_structs: list[Struct]|None = None, **codec_kwargs, diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 732a0f5d..7d64e766 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -22,9 +22,7 @@ that is, the "Structurred-Concurrency-Inter-Process-(dialog)-(un)Protocol". ''' - from __future__ import annotations -# from contextlib import contextmanager as cm import types from typing import ( Any, @@ -36,14 +34,12 @@ from typing import ( ) from msgspec import ( - msgpack, - Raw, Struct, UNSET, ) - -# TODO: can also remove yah? +# TODO: sub-decoded `Raw` fields? +# -[ ] see `MsgCodec._payload_decs` notes # # class Header(Struct, tag=True): # ''' @@ -70,7 +66,6 @@ class Msg( tree. ''' - # header: Header # TODO: use UNSET here? cid: str|None # call/context-id @@ -94,9 +89,24 @@ class Msg( pld: PayloadT -# TODO: better name, like `Call/TaskInput`? +# TODO: caps based RPC support in the payload? +# +# -[ ] integration with our ``enable_modules: list[str]`` caps sys. 
+# ``pkgutil.resolve_name()`` internally uses +# ``importlib.import_module()`` which can be filtered by +# inserting a ``MetaPathFinder`` into ``sys.meta_path`` (which +# we could do before entering the ``Actor._process_messages()`` +# loop)? +# - https://github.com/python/cpython/blob/main/Lib/pkgutil.py#L645 +# - https://stackoverflow.com/questions/1350466/preventing-python-code-from-importing-certain-modules +# - https://stackoverflow.com/a/63320902 +# - https://docs.python.org/3/library/sys.html#sys.meta_path +# +# -[ ] can we combine .ns + .func into a native `NamespacePath` field? +# +# -[ ]better name, like `Call/TaskInput`? +# class FuncSpec(Struct): - # TODO: can we combine these 2 into a `NamespacePath` field? ns: str func: str @@ -249,7 +259,7 @@ class Error(Msg): def mk_msg_spec( - payload_type: Union[Type] = Any, + payload_type_union: Union[Type] = Any, boxing_msg_set: set[Msg] = { Started, Yield, @@ -261,10 +271,13 @@ def mk_msg_spec( list[Type[Msg]], ]: ''' - Generate a payload-type-parameterized `Msg` specification such - that IPC msgs which can be `Msg.pld` (payload) type - limited/filterd are specified given an input `payload_type: - Union[Type]`. + Create a payload-(data-)type-parameterized IPC message specification. + + Allows generating IPC msg types from the above builtin set + with a payload (field) restricted data-type via the `Msg.pld: + PayloadT` type var. This allows runtime-task contexts to use + the python type system to limit/filter payload values as + determined by the input `payload_type_union: Union[Type]`. ''' submsg_types: list[Type[Msg]] = Msg.__subclasses__() @@ -287,7 +300,7 @@ def mk_msg_spec( # -[ ] is there a way to get it to work at module level # just using inheritance or maybe a metaclass? # - # index_paramed_msg_type: Msg = msgtype[payload_type] + # index_paramed_msg_type: Msg = msgtype[payload_type_union] # TODO: WHY do we need to dynamically generate the # subtype-msgs here to ensure the `.pld` parameterization @@ -300,7 +313,7 @@ def mk_msg_spec( ( # XXX NOTE XXX this seems to be THE ONLY # way to get this to work correctly!?! - Msg[payload_type], + Msg[payload_type_union], Generic[PayloadT], ), {}, @@ -322,71 +335,3 @@ def mk_msg_spec( payload_type_spec, msg_types, ) - - -# TODO: integration with our ``enable_modules: list[str]`` caps sys. -# -# ``pkgutil.resolve_name()`` internally uses -# ``importlib.import_module()`` which can be filtered by inserting -# a ``MetaPathFinder`` into ``sys.meta_path`` (which we could do before -# entering the ``Actor._process_messages()`` loop). -# https://github.com/python/cpython/blob/main/Lib/pkgutil.py#L645 -# https://stackoverflow.com/questions/1350466/preventing-python-code-from-importing-certain-modules -# - https://stackoverflow.com/a/63320902 -# - https://docs.python.org/3/library/sys.html#sys.meta_path - -# TODO: do we still want to try and support the sub-decoder with -# `Raw` technique in the case that the `Generic` approach gives -# future grief? -# -# sub-decoders for retreiving embedded -# payload data and decoding to a sender -# side defined (struct) type. 
-_payload_decs: dict[ - str|None, - msgpack.Decoder, -] = { - # default decoder is used when `Header.payload_tag == None` - None: msgpack.Decoder(Any), -} - - -def dec_payload( - msg: Msg, - msg_dec: msgpack.Decoder = msgpack.Decoder( - type=Msg[Any] - ), - -) -> Any|Struct: - - msg: Msg = msg_dec.decode(msg) - payload_tag: str = msg.header.payload_tag - payload_dec: msgpack.Decoder = _payload_decs[payload_tag] - return payload_dec.decode(msg.pld) - - -def enc_payload( - enc: msgpack.Encoder, - payload: Any, - cid: str, - -) -> bytes: - - # tag_field: str|None = None - - plbytes = enc.encode(payload) - if b'msg_type' in plbytes: - assert isinstance(payload, Struct) - - # tag_field: str = type(payload).__name__ - payload = Raw(plbytes) - - msg = Msg( - cid=cid, - pld=payload, - # Header( - # payload_tag=tag_field, - # # dialog_id, - # ), - ) - return enc.encode(msg) -- 2.34.1 From fb8196e354bb819c2e0ed1e306f065466372a4ba Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 29 Mar 2024 13:48:08 -0400 Subject: [PATCH 197/378] Tweak msging tests to match codec api changes Mostly adjusting input args/logic to various spec/codec signatures and new runtime semantics: - `test_msg_spec_xor_pld_spec()` to verify that a shuttle prot spec and payload spec are necessarily mutex and that `mk_codec()` enforces it. - switch to `ipc_msg_spec` input in `mk_custom_codec()` helper. - drop buncha commented cruft from `test_limit_msgspec()` including no longer needed type union instance checks in dunder attributes. --- tests/test_caps_based_msging.py | 100 +++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 34 deletions(-) diff --git a/tests/test_caps_based_msging.py b/tests/test_caps_based_msging.py index b101c1e0..98ab7fa3 100644 --- a/tests/test_caps_based_msging.py +++ b/tests/test_caps_based_msging.py @@ -1,5 +1,6 @@ ''' -Functional audits for our "capability based messaging (schema)" feats. +Low-level functional audits for our +"capability based messaging"-spec feats. B~) @@ -22,6 +23,7 @@ from msgspec import ( Struct, ValidationError, ) +import pytest import tractor from tractor.msg import ( _def_msgspec_codec, @@ -34,13 +36,29 @@ from tractor.msg import ( current_msgspec_codec, ) from tractor.msg.types import ( - PayloadT, + # PayloadT, Msg, # Started, mk_msg_spec, ) import trio + +def test_msg_spec_xor_pld_spec(): + ''' + If the `.msg.types.Msg`-set is overridden, we + can't also support a `Msg.pld` spec. + + ''' + # apply custom hooks and set a `Decoder` which only + # loads `NamespacePath` types. + with pytest.raises(RuntimeError): + mk_codec( + ipc_msg_spec=Any, + ipc_pld_spec=NamespacePath, + ) + + # TODO: wrap these into `._codec` such that user can just pass # a type table of some sort? def enc_hook(obj: Any) -> Any: @@ -66,11 +84,13 @@ def ex_func(*args): print(f'ex_func({args})') -def mk_custom_codec() -> MsgCodec: +def mk_custom_codec( + ipc_msg_spec: Type[Any] = Any, +) -> MsgCodec: # apply custom hooks and set a `Decoder` which only # loads `NamespacePath` types. nsp_codec: MsgCodec = mk_codec( - ipc_msg_spec=NamespacePath, + ipc_msg_spec=ipc_msg_spec, enc_hook=enc_hook, dec_hook=dec_hook, ) @@ -225,16 +245,9 @@ def chk_pld_type( pld_val_type: Type = type(pld) # gen_paramed: _GenericAlias = generic[payload_type] - # TODO: verify that the overridden subtypes - # DO NOT have modified type-annots from original! 
- # 'Start', .pld: FuncSpec - # 'StartAck', .pld: IpcCtxSpec - # 'Stop', .pld: UNSEt - # 'Error', .pld: ErrorData # for typedef in ( # [gen_paramed] # + - # # type-var should always be set for these sub-types # # as well! # Msg.__subclasses__() @@ -246,56 +259,75 @@ def chk_pld_type( # 'Return', # ]: # continue - # payload_type: Type[Struct] = CustomPayload - # TODO: can remove all this right!? - # - # when parameterized (like `Msg[Any]`) then - # we expect an alias as input. - # if isinstance(generic, _GenericAlias): - # assert payload_type in generic.__args__ - # else: - # assert PayloadType in generic.__parameters__ - # pld_param: Parameter = generic.__signature__.parameters['pld'] - # assert pld_param.annotation is PayloadType + # TODO: verify that the overridden subtypes + # DO NOT have modified type-annots from original! + # 'Start', .pld: FuncSpec + # 'StartAck', .pld: IpcCtxSpec + # 'Stop', .pld: UNSEt + # 'Error', .pld: ErrorData - type_spec: Union[Type[Struct]] + + pld_type_spec: Union[Type[Struct]] msg_types: list[Msg[payload_type]] + + # make a one-off dec to compare with our `MsgCodec` instance + # which does the below `mk_msg_spec()` call internally ( - type_spec, + pld_type_spec, msg_types, ) = mk_msg_spec( - payload_type=payload_type, + payload_type_union=payload_type, ) enc = msgpack.Encoder() dec = msgpack.Decoder( - type=type_spec, # like `Msg[Any]` + type=pld_type_spec or Any, # like `Msg[Any]` + ) + + codec: MsgCodec = mk_codec( + # NOTE: this ONLY accepts `Msg.pld` fields of a specified + # type union. + ipc_pld_spec=payload_type, + ) + + # assert codec.dec == dec + # XXX-^ not sure why these aren't "equal" but when cast + # to `str` they seem to match ?? .. kk + assert ( + str(pld_type_spec) + == + str(codec.ipc_pld_spec) + == + str(dec.type) + == + str(codec.dec.type) ) # verify the boxed-type for all variable payload-type msgs. for typedef in msg_types: pld_field = structs.fields(typedef)[1] - assert pld_field.type in {payload_type, PayloadT} - # TODO: does this need to work to get all subtypes to - # adhere? assert pld_field.type is payload_type + # TODO-^ does this need to work to get all subtypes to adhere? kwargs: dict[str, Any] = { 'cid': '666', 'pld': pld, } - enc_msg = typedef(**kwargs) + enc_msg: Msg = typedef(**kwargs) - wire_bytes: bytes = enc.encode(enc_msg) + wire_bytes: bytes = codec.enc.encode(enc_msg) + _wire_bytes: bytes = enc.encode(enc_msg) try: - dec_msg = dec.decode(wire_bytes) + _dec_msg = dec.decode(wire_bytes) + dec_msg = codec.dec.decode(wire_bytes) + assert dec_msg.pld == pld - assert (roundtrip := (dec_msg == enc_msg)) + assert _dec_msg.pld == pld + assert (roundtrip := (_dec_msg == enc_msg)) except ValidationError as ve: - # breakpoint() if pld_val_type is payload_type: raise ValueError( 'Got `ValidationError` despite type-var match!?\n' -- 2.34.1 From 3ba46362a94175d8b3a1650c14b9104fac8bd7e9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 29 Mar 2024 18:46:37 -0400 Subject: [PATCH 198/378] Be mega pedantic with msg-spec building Turns out the generics based payload speccing API, as in https://jcristharif.com/msgspec/supported-types.html#generic-types, DOES WORK properly as long as we don't rely on inheritance from `Msg` a parent `Generic`.. So let's get real pedantic in the `mk_msg_spec()` internals as well as verification in the test suite! 
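As a quick standalone repro of that behaviour (outside the `tractor`
code base; assumes a `msgspec` release with generic-`Struct` support
and uses made-up names):

    from typing import Generic, TypeVar
    from msgspec import Struct, ValidationError, msgpack

    PldT = TypeVar('PldT')

    class Box(Struct, Generic[PldT]):
        cid: str
        pld: PldT

    enc = msgpack.Encoder()
    dec = msgpack.Decoder(type=Box[int])

    # a payload matching the parameterized type round-trips,
    assert dec.decode(enc.encode(Box(cid='1', pld=10))).pld == 10

    # .. while a mis-typed payload is rejected at decode time.
    try:
        dec.decode(enc.encode(Box(cid='1', pld='doggy')))
    except ValidationError:
        pass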
Fixes in `.msg.types`: - implement (as part of tinker testing) multiple spec union building methods via a `spec_build_method: str` to `mk_msg_spec()` and leave a buncha notes around what did and didn't work: - 'indexed_generics' is the only method THAT WORKS and the one that you'd expect being closest to the `msgspec` docs (link above). - 'defstruct' using dynamically defined msgs => doesn't work! - 'types_new_class' using dynamically defined msgs but with `types.new_clas()` => ALSO doesn't work.. - explicitly separate the `.pld` type-constrainable by user code msg set into `types._payload_spec_msgs` putting the others in a `types._runtime_spec_msgs` and the full set defined as `.__spec__` (moving it out of the pkg-mod and back to `.types` as well). - for the `_payload_spec_msgs` msgs manually make them inherit `Generic[PayloadT]` and (redunantly) define a `.pld: PayloadT` field. - make `IpcCtxSpec.functype` an in line `Literal`. - toss in some TODO notes about choosing a better `Msg.cid` type. Fixes/tweaks around `.msg._codec`: - rename `MsgCodec.ipc/pld_msg_spec` -> `.msg/pld_spec` - make `._enc/._dec` non optional fields - wow, ^facepalm^ , make sure `._ipc.MsgpackTCPStream.__init__()` uses `mk_codec()` since `MsgCodec` can't be (easily) constructed directly. Get more detailed in testing: - inside the `chk_pld_type()` helper ensure `roundtrip` is always set to some value, `None` by default but a bool depending on legit outcome. - drop input `generic`; no longer used. - drop the masked `typedef` loop from `Msg.__subclasses__()`. - for add an `expect_roundtrip: bool` and use to jump into debugger when any expectation doesn't match the outcome. - use new `MsgCodec` field names (as per first section above). - ensure the encoded msg matches the decoded one from both the ad-hoc decoder and codec loaded values. - ensure the pld checking is only applied to msgs in the `types._payload_spec_msgs` set by `typef.__name__` filtering since `mk_msg_spec()` now returns the full `.types.Msg` set. --- tests/test_caps_based_msging.py | 150 +++++++++++++--------- tractor/_ipc.py | 3 +- tractor/msg/__init__.py | 38 ++---- tractor/msg/_codec.py | 32 +++-- tractor/msg/types.py | 220 ++++++++++++++++++++++++-------- 5 files changed, 289 insertions(+), 154 deletions(-) diff --git a/tests/test_caps_based_msging.py b/tests/test_caps_based_msging.py index 98ab7fa3..abdda0a5 100644 --- a/tests/test_caps_based_msging.py +++ b/tests/test_caps_based_msging.py @@ -35,6 +35,7 @@ from tractor.msg import ( apply_codec, current_msgspec_codec, ) +from tractor.msg import types from tractor.msg.types import ( # PayloadT, Msg, @@ -235,31 +236,15 @@ def test_codec_hooks_mod(): def chk_pld_type( - generic: Msg|_GenericAlias, - payload_type: Type[Struct]|Any, + payload_spec: Type[Struct]|Any, pld: Any, + expect_roundtrip: bool|None = None, + ) -> bool: - roundtrip: bool = False pld_val_type: Type = type(pld) - # gen_paramed: _GenericAlias = generic[payload_type] - # for typedef in ( - # [gen_paramed] - # + - # # type-var should always be set for these sub-types - # # as well! - # Msg.__subclasses__() - # ): - # if typedef.__name__ not in [ - # 'Msg', - # 'Started', - # 'Yield', - # 'Return', - # ]: - # continue - # TODO: verify that the overridden subtypes # DO NOT have modified type-annots from original! 
# 'Start', .pld: FuncSpec @@ -267,48 +252,64 @@ def chk_pld_type( # 'Stop', .pld: UNSEt # 'Error', .pld: ErrorData - - pld_type_spec: Union[Type[Struct]] - msg_types: list[Msg[payload_type]] - - # make a one-off dec to compare with our `MsgCodec` instance - # which does the below `mk_msg_spec()` call internally - ( - pld_type_spec, - msg_types, - ) = mk_msg_spec( - payload_type_union=payload_type, - ) - enc = msgpack.Encoder() - dec = msgpack.Decoder( - type=pld_type_spec or Any, # like `Msg[Any]` - ) - codec: MsgCodec = mk_codec( # NOTE: this ONLY accepts `Msg.pld` fields of a specified # type union. - ipc_pld_spec=payload_type, + ipc_pld_spec=payload_spec, + ) + + # make a one-off dec to compare with our `MsgCodec` instance + # which does the below `mk_msg_spec()` call internally + ipc_msg_spec: Union[Type[Struct]] + msg_types: list[Msg[payload_spec]] + ( + ipc_msg_spec, + msg_types, + ) = mk_msg_spec( + payload_type_union=payload_spec, + ) + _enc = msgpack.Encoder() + _dec = msgpack.Decoder( + type=ipc_msg_spec or Any, # like `Msg[Any]` + ) + + assert ( + payload_spec + == + codec.pld_spec ) # assert codec.dec == dec - # XXX-^ not sure why these aren't "equal" but when cast + # + # ^-XXX-^ not sure why these aren't "equal" but when cast # to `str` they seem to match ?? .. kk + assert ( - str(pld_type_spec) + str(ipc_msg_spec) == - str(codec.ipc_pld_spec) + str(codec.msg_spec) == - str(dec.type) + str(_dec.type) == str(codec.dec.type) ) # verify the boxed-type for all variable payload-type msgs. + if not msg_types: + breakpoint() + + roundtrip: bool|None = None + pld_spec_msg_names: list[str] = [ + td.__name__ for td in types._payload_spec_msgs + ] for typedef in msg_types: + skip_runtime_msg: bool = typedef.__name__ not in pld_spec_msg_names + if skip_runtime_msg: + continue + pld_field = structs.fields(typedef)[1] - assert pld_field.type is payload_type - # TODO-^ does this need to work to get all subtypes to adhere? + assert pld_field.type is payload_spec # TODO-^ does this need to work to get all subtypes to adhere? kwargs: dict[str, Any] = { 'cid': '666', @@ -316,44 +317,72 @@ def chk_pld_type( } enc_msg: Msg = typedef(**kwargs) + _wire_bytes: bytes = _enc.encode(enc_msg) wire_bytes: bytes = codec.enc.encode(enc_msg) - _wire_bytes: bytes = enc.encode(enc_msg) + assert _wire_bytes == wire_bytes + ve: ValidationError|None = None try: - _dec_msg = dec.decode(wire_bytes) dec_msg = codec.dec.decode(wire_bytes) + _dec_msg = _dec.decode(wire_bytes) - assert dec_msg.pld == pld - assert _dec_msg.pld == pld - assert (roundtrip := (_dec_msg == enc_msg)) + # decoded msg and thus payload should be exactly same! + assert (roundtrip := ( + _dec_msg + == + dec_msg + == + enc_msg + )) - except ValidationError as ve: - if pld_val_type is payload_type: + if ( + expect_roundtrip is not None + and expect_roundtrip != roundtrip + ): + breakpoint() + + assert ( + pld + == + dec_msg.pld + == + enc_msg.pld + ) + # assert (roundtrip := (_dec_msg == enc_msg)) + + except ValidationError as _ve: + ve = _ve + roundtrip: bool = False + if pld_val_type is payload_spec: raise ValueError( 'Got `ValidationError` despite type-var match!?\n' f'pld_val_type: {pld_val_type}\n' - f'payload_type: {payload_type}\n' + f'payload_type: {payload_spec}\n' ) from ve else: # ow we good cuz the pld spec mismatched. 
print( 'Got expected `ValidationError` since,\n' - f'{pld_val_type} is not {payload_type}\n' + f'{pld_val_type} is not {payload_spec}\n' ) else: if ( - pld_val_type is not payload_type - and payload_type is not Any + payload_spec is not Any + and + pld_val_type is not payload_spec ): raise ValueError( 'DID NOT `ValidationError` despite expected type match!?\n' f'pld_val_type: {pld_val_type}\n' - f'payload_type: {payload_type}\n' + f'payload_type: {payload_spec}\n' ) - return roundtrip + # full code decode should always be attempted! + if roundtrip is None: + breakpoint() + return roundtrip def test_limit_msgspec(): @@ -365,9 +394,10 @@ def test_limit_msgspec(): # ensure we can round-trip a boxing `Msg` assert chk_pld_type( - Msg, + # Msg, Any, None, + expect_roundtrip=True, ) # TODO: don't need this any more right since @@ -379,7 +409,7 @@ def test_limit_msgspec(): # verify that a mis-typed payload value won't decode assert not chk_pld_type( - Msg, + # Msg, int, pld='doggy', ) @@ -392,13 +422,13 @@ def test_limit_msgspec(): value: Any assert not chk_pld_type( - Msg, + # Msg, CustomPayload, pld='doggy', ) assert chk_pld_type( - Msg, + # Msg, CustomPayload, pld=CustomPayload(name='doggy', value='urmom') ) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index b1c2ccd2..5f71c38c 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -48,6 +48,7 @@ from tractor._exceptions import TransportClosed from tractor.msg import ( _ctxvar_MsgCodec, MsgCodec, + mk_codec, ) log = get_logger(__name__) @@ -162,7 +163,7 @@ class MsgpackTCPStream(MsgTransport): # allow for custom IPC msg interchange format # dynamic override Bo - self.codec: MsgCodec = codec or MsgCodec() + self.codec: MsgCodec = codec or mk_codec() async def _iter_packets(self) -> AsyncGenerator[dict, None]: ''' diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index a93fa888..0c8809a9 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -37,36 +37,20 @@ from ._codec import ( from .types import ( Msg as Msg, - Start, # with pld + Start as Start, # with pld FuncSpec as FuncSpec, - StartAck, # with pld + StartAck as StartAck, # with pld IpcCtxSpec as IpcCtxSpec, - Started, - Yield, - Stop, - Return, + Started as Started, + Yield as Yield, + Stop as Stop, + Return as Return, - Error, # with pld - ErrorData as ErrorData + Error as Error, # with pld + ErrorData as ErrorData, + + # full msg spec set + __spec__ as __spec__, ) - - -# built-in SC shuttle protocol msg type set in -# approx order of the IPC txn-state spaces. -__spec__: list[Msg] = [ - - # inter-actor RPC initiation - Start, - StartAck, - - # no-outcome-yet IAC (inter-actor-communication) - Started, - Yield, - Stop, - - # termination outcomes - Return, - Error, -] diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index e6cb4f1f..4477d393 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -73,16 +73,15 @@ class MsgCodec(Struct): A IPC msg interchange format lib's encoder + decoder pair. 
''' - # post-configure-cached when prop-accessed (see `mk_codec()` - # OR can be passed directly as, - # `MsgCodec(_enc=, _dec=)` - _enc: msgpack.Encoder|None = None - _dec: msgpack.Decoder|None = None + _enc: msgpack.Encoder + _dec: msgpack.Decoder + + pld_spec: Union[Type[Struct]]|None # struct type unions # https://jcristharif.com/msgspec/structs.html#tagged-unions @property - def ipc_pld_spec(self) -> Union[Type[Struct]]: + def msg_spec(self) -> Union[Type[Struct]]: return self._dec.type lib: ModuleType = msgspec @@ -142,6 +141,7 @@ class MsgCodec(Struct): determined by the ''' + # https://jcristharif.com/msgspec/usage.html#typed-decoding return self._dec.decode(msg) # TODO: do we still want to try and support the sub-decoder with @@ -149,6 +149,7 @@ class MsgCodec(Struct): # future grief? # # -[ ] + # -> https://jcristharif.com/msgspec/api.html#raw # #def mk_pld_subdec( # self, @@ -224,6 +225,20 @@ class MsgCodec(Struct): # return codec.enc.encode(msg) + +# TODO: sub-decoded `Raw` fields? +# -[ ] see `MsgCodec._payload_decs` notes +# +# XXX if we wanted something more complex then field name str-keys +# we might need a header field type to describe the lookup sys? +# class Header(Struct, tag=True): +# ''' +# A msg header which defines payload properties + +# ''' +# payload_tag: str|None = None + + #def mk_tagged_union_dec( # tagged_structs: list[Struct], @@ -345,10 +360,6 @@ def mk_codec( assert len(ipc_msg_spec.__args__) == len(msg_types) assert ipc_msg_spec - dec = msgpack.Decoder( - type=ipc_msg_spec, # like `Msg[Any]` - ) - else: ipc_msg_spec = ipc_msg_spec or Any @@ -363,6 +374,7 @@ def mk_codec( codec = MsgCodec( _enc=enc, _dec=dec, + pld_spec=ipc_pld_spec, # payload_msg_specs=payload_msg_specs, # **kwargs, ) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 7d64e766..2411f0f9 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -34,20 +34,13 @@ from typing import ( ) from msgspec import ( + defstruct, + # field, Struct, UNSET, + UnsetType, ) -# TODO: sub-decoded `Raw` fields? -# -[ ] see `MsgCodec._payload_decs` notes -# -# class Header(Struct, tag=True): -# ''' -# A msg header which defines payload properties - -# ''' -# payload_tag: str|None = None - # type variable for the boxed payload field `.pld` PayloadT = TypeVar('PayloadT') @@ -57,6 +50,9 @@ class Msg( Generic[PayloadT], tag=True, tag_field='msg_type', + + # eq=True, + # order=True, ): ''' The "god" boxing msg type. @@ -66,8 +62,13 @@ class Msg( tree. ''' - # TODO: use UNSET here? cid: str|None # call/context-id + # ^-TODO-^: more explicit type? + # -[ ] use UNSET here? + # https://jcristharif.com/msgspec/supported-types.html#unset + # + # -[ ] `uuid.UUID` which has multi-protocol support + # https://jcristharif.com/msgspec/supported-types.html#uuid # The msgs "payload" (spelled without vowels): # https://en.wikipedia.org/wiki/Payload_(computing) @@ -136,19 +137,18 @@ class Start( pld: FuncSpec -FuncType: Literal[ - 'asyncfunc', - 'asyncgen', - 'context', # TODO: the only one eventually? -] = 'context' - - class IpcCtxSpec(Struct): ''' An inter-actor-`trio.Task`-comms `Context` spec. ''' - functype: FuncType + # TODO: maybe better names for all these? + # -[ ] obvi ^ would need sync with `._rpc` + functype: Literal[ + 'asyncfunc', + 'asyncgen', + 'context', # TODO: the only one eventually? + ] # TODO: as part of the reponse we should report our allowed # msg spec which should be generated from the type-annots as @@ -182,6 +182,7 @@ class Started( decorated IPC endpoint. 
''' + pld: PayloadT # TODO: instead of using our existing `Start` @@ -198,6 +199,7 @@ class Yield( Per IPC transmission of a value from `await MsgStream.send()`. ''' + pld: PayloadT class Stop(Msg): @@ -206,7 +208,7 @@ class Stop(Msg): of `StopAsyncIteration`. ''' - pld: UNSET + pld: UnsetType = UNSET class Return( @@ -218,6 +220,7 @@ class Return( func-as-`trio.Task`. ''' + pld: PayloadT class ErrorData(Struct): @@ -258,13 +261,47 @@ class Error(Msg): # cid: str +# built-in SC shuttle protocol msg type set in +# approx order of the IPC txn-state spaces. +__spec__: list[Msg] = [ + + # inter-actor RPC initiation + Start, + StartAck, + + # no-outcome-yet IAC (inter-actor-communication) + Started, + Yield, + Stop, + + # termination outcomes + Return, + Error, +] + +_runtime_spec_msgs: list[Msg] = [ + Start, + StartAck, + Stop, + Error, +] +_payload_spec_msgs: list[Msg] = [ + Started, + Yield, + Return, +] + + def mk_msg_spec( payload_type_union: Union[Type] = Any, - boxing_msg_set: set[Msg] = { - Started, - Yield, - Return, - }, + + # boxing_msg_set: list[Msg] = _payload_spec_msgs, + spec_build_method: Literal[ + 'indexed_generics', # works + 'defstruct', + 'types_new_class', + + ] = 'indexed_generics', ) -> tuple[ Union[Type[Msg]], @@ -281,26 +318,58 @@ def mk_msg_spec( ''' submsg_types: list[Type[Msg]] = Msg.__subclasses__() + bases: tuple = ( + # XXX NOTE XXX the below generic-parameterization seems to + # be THE ONLY way to get this to work correctly in terms + # of getting ValidationError on a roundtrip? + Msg[payload_type_union], + Generic[PayloadT], + ) + defstruct_bases: tuple = ( + Msg, # [payload_type_union], + # Generic[PayloadT], + # ^-XXX-^: not allowed? lul.. + ) + ipc_msg_types: list[Msg] = [] - # TODO: see below as well, - # => union building approach with `.__class_getitem__()` - # doesn't seem to work..? - # - # payload_type_spec: Union[Type[Msg]] - # - msg_types: list[Msg] = [] - for msgtype in boxing_msg_set: + idx_msg_types: list[Msg] = [] + defs_msg_types: list[Msg] = [] + nc_msg_types: list[Msg] = [] + + for msgtype in __spec__: + + # for the NON-payload (user api) type specify-able + # msgs types, we simply aggregate the def as is + # for inclusion in the output type `Union`. + if msgtype not in _payload_spec_msgs: + ipc_msg_types.append(msgtype) + continue # check inheritance sanity assert msgtype in submsg_types # TODO: wait why do we need the dynamic version here? - # -[ ] paraming the `PayloadT` values via `Generic[T]` - # doesn't seem to work at all? - # -[ ] is there a way to get it to work at module level - # just using inheritance or maybe a metaclass? + # XXX ANSWER XXX -> BC INHERITANCE.. don't work w generics.. # - # index_paramed_msg_type: Msg = msgtype[payload_type_union] + # NOTE previously bc msgtypes WERE NOT inheritting + # directly the `Generic[PayloadT]` type, the manual method + # of generic-paraming with `.__class_getitem__()` wasn't + # working.. + # + # XXX but bc i changed that to make every subtype inherit + # it, this manual "indexed parameterization" method seems + # to work? + # + # -[x] paraming the `PayloadT` values via `Generic[T]` + # does work it seems but WITHOUT inheritance of generics + # + # -[-] is there a way to get it to work at module level + # just using inheritance or maybe a metaclass? + # => thot that `defstruct` might work, but NOPE, see + # below.. 
+ # + idxed_msg_type: Msg = msgtype[payload_type_union] + idx_msg_types.append(idxed_msg_type) # TODO: WHY do we need to dynamically generate the # subtype-msgs here to ensure the `.pld` parameterization @@ -308,30 +377,69 @@ def mk_msg_spec( # `msgpack.Decoder()`..? # # dynamically create the payload type-spec-limited msg set. - manual_paramed_msg_subtype: Type = types.new_class( - msgtype.__name__, - ( - # XXX NOTE XXX this seems to be THE ONLY - # way to get this to work correctly!?! - Msg[payload_type_union], - Generic[PayloadT], - ), - {}, + newclass_msgtype: Type = types.new_class( + name=msgtype.__name__, + bases=bases, + kwds={}, + ) + nc_msg_types.append( + newclass_msgtype[payload_type_union] ) - # TODO: grok the diff here better.. + # with `msgspec.structs.defstruct` + # XXX ALSO DOESN'T WORK + defstruct_msgtype = defstruct( + name=msgtype.__name__, + fields=[ + ('cid', str), + + # XXX doesn't seem to work.. + # ('pld', PayloadT), + + ('pld', payload_type_union), + ], + bases=defstruct_bases, + ) + defs_msg_types.append(defstruct_msgtype) + # assert index_paramed_msg_type == manual_paramed_msg_subtype - # XXX TODO: why does the manual method work but not the - # `.__class_getitem__()` one!?! - paramed_msg_type = manual_paramed_msg_subtype + # paramed_msg_type = manual_paramed_msg_subtype - # payload_type_spec |= paramed_msg_type - msg_types.append(paramed_msg_type) + # ipc_payload_msgs_type_union |= index_paramed_msg_type + idx_spec: Union[Type[Msg]] = Union[*idx_msg_types] + def_spec: Union[Type[Msg]] = Union[*defs_msg_types] + nc_spec: Union[Type[Msg]] = Union[*nc_msg_types] + + specs: dict[str, Union[Type[Msg]]] = { + 'indexed_generics': idx_spec, + 'defstruct': def_spec, + 'types_new_class': nc_spec, + } + msgtypes_table: dict[str, list[Msg]] = { + 'indexed_generics': idx_msg_types, + 'defstruct': defs_msg_types, + 'types_new_class': nc_msg_types, + } + + # XXX lol apparently type unions can't ever + # be equal eh? + # TODO: grok the diff here better.. + # + # assert ( + # idx_spec + # == + # nc_spec + # == + # def_spec + # ) + # breakpoint() + + pld_spec: Union[Type] = specs[spec_build_method] + runtime_spec: Union[Type] = Union[*ipc_msg_types] - payload_type_spec: Union[Type[Msg]] = Union[*msg_types] return ( - payload_type_spec, - msg_types, + pld_spec | runtime_spec, + msgtypes_table[spec_build_method] + ipc_msg_types, ) -- 2.34.1 From 25ffdedc0682edf112029ccfe4249a4aa342be7a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 29 Mar 2024 19:15:50 -0400 Subject: [PATCH 199/378] Oof, fix walrus assign causes name-error edge case Only warn log on a non-`trio` async lib when in the main thread to avoid a name error when in the non-`asyncio` non-main-thread case. => To cherry into the `.pause_from_sync()` feature branch. --- tractor/devx/_debug.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 75be7a2a..255b1dbd 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -190,11 +190,14 @@ class Lock: is_trio_main = ( # TODO: since this is private, @oremanj says # we should just copy the impl for now.. 
- trio._util.is_main_thread() + (is_main_thread := trio._util.is_main_thread()) and (async_lib := sniffio.current_async_library()) == 'trio' ) - if not is_trio_main: + if ( + not is_trio_main + and is_main_thread + ): log.warning( f'Current async-lib detected by `sniffio`: {async_lib}\n' ) -- 2.34.1 From f3ca8608d5e86285f01aa696a8abaf0e6cf6ae07 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Apr 2024 09:21:30 -0400 Subject: [PATCH 200/378] Get msg spec type limiting working with a `RunVar` Since `contextvars.ContextVar` seems to reset to the default in every new task, switching to using `trio.lowlevel.RunVar` kinda gets close to what we'd like where a child scope can override what's in the rent but ideally without modifying the rent's. I tried `tricycle.TreeVar` as well but it also seems to reset across (embedded) nurseries in our runtime; need to try it again bc apparently that's not how it's suppose to work? NOTE that for now i'm keeping the `.msg.types._ctxvar_MsgCodec` set to the `msgspec` default (`Any` types) so that the test suite will still pass until the runtime is ported to the new msg-spec + codec. Surrounding and in support of all this the `Msg`-set impl deats changed a bit as well as various stuff in `.msg` sub-mods: - drop the `.pld` struct types for `Error`, `Start`, `StartAck` since we don't really need the `.pld` payload field in those cases since they're runtime control msgs for starting RPC tasks and handling remote errors; we can just put the fields directly on each msg since the user will never want/need to override the `.pld` field type. - add a couple new runtime msgs and include them in `msg.__spec__` and make them NOT inherit from `Msg` since they are runtime-specific and thus have no need for `.pld` type constraints: - `Aid` the actor-id identity handshake msg. - `SpawnSpec`: the spawn data passed from a parent actor down to a a child in `Actor._from_parent()` for which we need a shuttle protocol msg, so might as well make it a pendatic one ;) - fix some `Actor.uid` field types that were type-borked on `Error` - add notes about how we need built-in `debug_mode` msgs in order to avoid msg-type errors when using the TTY lock machinery and a different `.pld` spec then the default `Any` is in use.. -> since `devx._debug.lock_tty_for_child()` and it's client side `wait_for_parent_stdin_hijack()` use `Context.started('Locked')` and `MsgStream.send('pdb_unlock')` string values as their `.pld` contents we'd need to either always do a `ipc_pld_spec | str` or pre-define some dedicated `Msg` types which get `Union`-ed in for this? - break out `msg.pretty_struct.Struct._sin_props()` into a helper func `iter_fields()` since the impl doesn't require a struct instance. - as mentioned above since `ContextVar` didn't work as anticipated I next tried `tricycle.TreeVar` but that too didn't seem to keep the `apply_codec()` setting intact across `Portal.open_context()`/`Context.open_stream()` (it kept reverting to the default `.pld: Any` default setting) so I finalized on a trio.lowlevel.RunVar` for now despite it basically being a `global`.. -> will probably come back to test this with `TreeVar` and some hot tips i picked up from @mikenerone in the `trio` gitter, which i put in comments surrounding proto-code. 
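For reference, a minimal standalone sketch of the `RunVar` set/reset
pattern being leaned on here (all names below are made up purely for
illustration; this is NOT the actual `.msg._codec` impl):

    from contextlib import contextmanager
    import trio

    # run-scoped override point, analogous in spirit to
    # `_ctxvar_MsgCodec` but holding a plain str for demo purposes.
    _current_spec = trio.lowlevel.RunVar('demo_msg_spec', default='Any')

    @contextmanager
    def apply_spec(spec: str):
        token: trio.lowlevel.RunVarToken = _current_spec.set(spec)
        try:
            yield _current_spec.get()
        finally:
            # being run-scoped this resets for the WHOLE `trio.run()`,
            # not just the calling task-tree -> hence the "basically
            # a global" caveat above.
            _current_spec.reset(token)

    async def main():
        assert _current_spec.get() == 'Any'
        with apply_spec('NamespacePath|None') as spec:
            assert _current_spec.get() == spec
        assert _current_spec.get() == 'Any'

    trio.run(main)

The same `@cm` shape should presumably hold if/when this gets swapped
back to a `tricycle.TreeVar` via its `.being()` api.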
--- tractor/msg/__init__.py | 13 +- tractor/msg/_codec.py | 95 ++++++++++-- tractor/msg/pretty_struct.py | 43 +++--- tractor/msg/types.py | 270 +++++++++++++++++++++++++++++------ 4 files changed, 337 insertions(+), 84 deletions(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index 0c8809a9..d8f37477 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -31,25 +31,24 @@ from ._codec import ( apply_codec as apply_codec, mk_codec as mk_codec, MsgCodec as MsgCodec, - current_msgspec_codec as current_msgspec_codec, + current_codec as current_codec, ) from .types import ( Msg as Msg, - Start as Start, # with pld - FuncSpec as FuncSpec, + Aid as Aid, + SpawnSpec as SpawnSpec, - StartAck as StartAck, # with pld - IpcCtxSpec as IpcCtxSpec, + Start as Start, + StartAck as StartAck, Started as Started, Yield as Yield, Stop as Stop, Return as Return, - Error as Error, # with pld - ErrorData as ErrorData, + Error as Error, # full msg spec set __spec__ as __spec__, diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 4477d393..32a58a56 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -30,13 +30,13 @@ ToDo: backends we prolly should offer: ''' from __future__ import annotations -from contextvars import ( - ContextVar, - Token, -) from contextlib import ( contextmanager as cm, ) +# from contextvars import ( +# ContextVar, +# Token, +# ) from typing import ( Any, Callable, @@ -47,6 +47,12 @@ from types import ModuleType import msgspec from msgspec import msgpack +from trio.lowlevel import ( + RunVar, + RunVarToken, +) +# TODO: see notes below from @mikenerone.. +# from tricycle import TreeVar from tractor.msg.pretty_struct import Struct from tractor.msg.types import ( @@ -72,6 +78,9 @@ class MsgCodec(Struct): ''' A IPC msg interchange format lib's encoder + decoder pair. + Pretty much nothing more then delegation to underlying + `msgspec..Encoder/Decoder`s for now. + ''' _enc: msgpack.Encoder _dec: msgpack.Decoder @@ -86,11 +95,6 @@ class MsgCodec(Struct): lib: ModuleType = msgspec - # ad-hoc type extensions - # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types - enc_hook: Callable[[Any], Any]|None = None # coder - dec_hook: Callable[[type, Any], Any]|None = None # decoder - # TODO: a sub-decoder system as well? # payload_msg_specs: Union[Type[Struct]] = Any # see related comments in `.msg.types` @@ -304,7 +308,8 @@ def mk_codec( libname: str = 'msgspec', - # proxy as `Struct(**kwargs)` + # proxy as `Struct(**kwargs)` for ad-hoc type extensions + # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types # ------ - ------ dec_hook: Callable|None = None, enc_hook: Callable|None = None, @@ -389,14 +394,52 @@ def mk_codec( # no custom structs, hooks or other special types. _def_msgspec_codec: MsgCodec = mk_codec(ipc_msg_spec=Any) -# NOTE: provides for per-`trio.Task` specificity of the +# The built-in IPC `Msg` spec. +# Our composing "shuttle" protocol which allows `tractor`-app code +# to use any `msgspec` supported type as the `Msg.pld` payload, +# https://jcristharif.com/msgspec/supported-types.html +# +_def_tractor_codec: MsgCodec = mk_codec( + ipc_pld_spec=Any, +) +# TODO: IDEALLY provides for per-`trio.Task` specificity of the # IPC msging codec used by the transport layer when doing # `Channel.send()/.recv()` of wire data. -_ctxvar_MsgCodec: ContextVar[MsgCodec] = ContextVar( + +# ContextVar-TODO: DIDN'T WORK, kept resetting in every new task to default!? 
+# _ctxvar_MsgCodec: ContextVar[MsgCodec] = ContextVar( + +# TreeVar-TODO: DIDN'T WORK, kept resetting in every new embedded nursery +# even though it's supposed to inherit from a parent context ??? +# +# _ctxvar_MsgCodec: TreeVar[MsgCodec] = TreeVar( +# +# ^-NOTE-^: for this to work see the mods by @mikenerone from `trio` gitter: +# +# 22:02:54 even for regular contextvars, all you have to do is: +# `task: Task = trio.lowlevel.current_task()` +# `task.parent_nursery.parent_task.context.run(my_ctx_var.set, new_value)` +# +# From a comment in his prop code he couldn't share outright: +# 1. For every TreeVar set in the current task (which covers what +# we need from SynchronizerFacade), walk up the tree until the +# root or finding one where the TreeVar is already set, setting +# it in all of the contexts along the way. +# 2. For each of those, we also forcibly set the values that are +# pending for child nurseries that have not yet accessed the +# TreeVar. +# 3. We similarly set the pending values for the child nurseries +# of the *current* task. +# + +# TODO: STOP USING THIS, since it's basically a global and won't +# allow sub-IPC-ctxs to limit the msg-spec however desired.. +_ctxvar_MsgCodec: MsgCodec = RunVar( 'msgspec_codec', # TODO: move this to our new `Msg`-spec! default=_def_msgspec_codec, + # default=_def_tractor_codec, ) @@ -410,15 +453,36 @@ def apply_codec( runtime context such that all IPC msgs are processed with it for that task. + Uses a `tricycle.TreeVar` to ensure the scope of the codec + matches the `@cm` block and DOES NOT change to the original + (default) value in new tasks (as it does for `ContextVar`). + + See the docs: + - https://tricycle.readthedocs.io/en/latest/reference.html#tree-variables + - https://github.com/oremanj/tricycle/blob/master/tricycle/_tests/test_tree_var.py + ''' - token: Token = _ctxvar_MsgCodec.set(codec) + orig: MsgCodec = _ctxvar_MsgCodec.get() + assert orig is not codec + token: RunVarToken = _ctxvar_MsgCodec.set(codec) + + # TODO: for TreeVar approach, see docs for @cm `.being()` API: + # https://tricycle.readthedocs.io/en/latest/reference.html#tree-variables + # try: + # with _ctxvar_MsgCodec.being(codec): + # new = _ctxvar_MsgCodec.get() + # assert new is codec + # yield codec + try: yield _ctxvar_MsgCodec.get() finally: _ctxvar_MsgCodec.reset(token) + assert _ctxvar_MsgCodec.get() is orig -def current_msgspec_codec() -> MsgCodec: + +def current_codec() -> MsgCodec: ''' Return the current `trio.Task.context`'s value for `msgspec_codec` used by `Channel.send/.recv()` @@ -449,5 +513,6 @@ def limit_msg_spec( payload_types=payload_types, **codec_kwargs, ) - with apply_codec(msgspec_codec): + with apply_codec(msgspec_codec) as applied_codec: + assert applied_codec is msgspec_codec yield msgspec_codec diff --git a/tractor/msg/pretty_struct.py b/tractor/msg/pretty_struct.py index 143fc7a4..412b6ed6 100644 --- a/tractor/msg/pretty_struct.py +++ b/tractor/msg/pretty_struct.py @@ -80,6 +80,28 @@ class DiffDump(UserList): return repstr +def iter_fields(struct: Struct) -> Iterator[ + tuple[ + structs.FieldIinfo, + str, + Any, + ] +]: + ''' + Iterate over all non-@property fields of this struct. + + ''' + fi: structs.FieldInfo + for fi in structs.fields(struct): + key: str = fi.name + val: Any = getattr(struct, key) + yield ( + fi, + key, + val, + ) + + class Struct( _Struct, @@ -91,23 +113,6 @@ class Struct( A "human friendlier" (aka repl buddy) struct subtype. 
''' - def _sin_props(self) -> Iterator[ - tuple[ - structs.FieldIinfo, - str, - Any, - ] - ]: - ''' - Iterate over all non-@property fields of this struct. - - ''' - fi: structs.FieldInfo - for fi in structs.fields(self): - key: str = fi.name - val: Any = getattr(self, key) - yield fi, key, val - def to_dict( self, include_non_members: bool = True, @@ -130,7 +135,7 @@ class Struct( # added as type-defined `@property` methods! sin_props: dict = {} fi: structs.FieldInfo - for fi, k, v in self._sin_props(): + for fi, k, v in iter_fields(self): sin_props[k] = asdict[k] return sin_props @@ -159,7 +164,7 @@ class Struct( fi: structs.FieldInfo k: str v: Any - for fi, k, v in self._sin_props(): + for fi, k, v in iter_fields(self): # TODO: how can we prefer `Literal['option1', 'option2, # ..]` over .__name__ == `Literal` but still get only the diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 2411f0f9..a81473d7 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -26,6 +26,7 @@ from __future__ import annotations import types from typing import ( Any, + Callable, Generic, Literal, Type, @@ -37,8 +38,12 @@ from msgspec import ( defstruct, # field, Struct, - UNSET, - UnsetType, + # UNSET, + # UnsetType, +) + +from tractor.msg import ( + pretty_struct, ) # type variable for the boxed payload field `.pld` @@ -48,11 +53,19 @@ PayloadT = TypeVar('PayloadT') class Msg( Struct, Generic[PayloadT], + + # https://jcristharif.com/msgspec/structs.html#tagged-unions tag=True, tag_field='msg_type', - # eq=True, + # https://jcristharif.com/msgspec/structs.html#field-ordering + # kw_only=True, + + # https://jcristharif.com/msgspec/structs.html#equality-and-order # order=True, + + # https://jcristharif.com/msgspec/structs.html#encoding-decoding-as-arrays + # as_array=True, ): ''' The "god" boxing msg type. @@ -90,6 +103,53 @@ class Msg( pld: PayloadT +class Aid( + Struct, + tag=True, + tag_field='msg_type', +): + ''' + Actor-identity msg. + + Initial contact exchange enabling an actor "mailbox handshake" + delivering the peer identity (and maybe eventually contact) + info. + + Used by discovery protocol to register actors as well as + conduct the initial comms (capability) filtering. + + ''' + name: str + uuid: str + # TODO: use built-in support for UUIDs? + # -[ ] `uuid.UUID` which has multi-protocol support + # https://jcristharif.com/msgspec/supported-types.html#uuid + + +class SpawnSpec( + pretty_struct.Struct, + tag=True, + tag_field='msg_type', +): + ''' + Initial runtime spec handed down from a spawning parent to its + child subactor immediately following first contact via an + `Aid` msg. + + ''' + _parent_main_data: dict + _runtime_vars: dict[str, Any] + + # module import capability + enable_modules: dict[str, str] + + # TODO: not just sockaddr pairs? + # -[ ] abstract into a `TransportAddr` type? + reg_addrs: list[tuple[str, int]] + bind_addrs: list[tuple[str, int]] + + + # TODO: caps based RPC support in the payload? # # -[ ] integration with our ``enable_modules: list[str]`` caps sys. @@ -105,18 +165,31 @@ class Msg( # # -[ ] can we combine .ns + .func into a native `NamespacePath` field? # -# -[ ]better name, like `Call/TaskInput`? +# -[ ] better name, like `Call/TaskInput`? 
# -class FuncSpec(Struct): - ns: str - func: str - - kwargs: dict - uid: str # (calling) actor-id +# -[ ] XXX a debugger lock msg transaction with payloads like, +# child -> `.pld: DebugLock` -> root +# child <- `.pld: DebugLocked` <- root +# child -> `.pld: DebugRelease` -> root +# +# WHY => when a pld spec is provided it might not allow for +# debug mode msgs as they currently are (using plain old `pld. +# str` payloads) so we only when debug_mode=True we need to +# union in this debugger payload set? +# +# mk_msg_spec( +# MyPldSpec, +# debug_mode=True, +# ) -> ( +# Union[MyPldSpec] +# | Union[DebugLock, DebugLocked, DebugRelease] +# ) class Start( - Msg, + Struct, + tag=True, + tag_field='msg_type', ): ''' Initial request to remotely schedule an RPC `trio.Task` via @@ -134,14 +207,26 @@ class Start( - `Context.open_context()` ''' - pld: FuncSpec + cid: str + + ns: str + func: str + + kwargs: dict + uid: tuple[str, str] # (calling) actor-id -class IpcCtxSpec(Struct): +class StartAck( + Struct, + tag=True, + tag_field='msg_type', +): ''' - An inter-actor-`trio.Task`-comms `Context` spec. + Init response to a `Cmd` request indicating the far + end's RPC spec, namely its callable "type". ''' + cid: str # TODO: maybe better names for all these? # -[ ] obvi ^ would need sync with `._rpc` functype: Literal[ @@ -160,18 +245,6 @@ class IpcCtxSpec(Struct): # msgspec: MsgSpec -class StartAck( - Msg, - Generic[PayloadT], -): - ''' - Init response to a `Cmd` request indicating the far - end's RPC callable "type". - - ''' - pld: IpcCtxSpec - - class Started( Msg, Generic[PayloadT], @@ -202,13 +275,19 @@ class Yield( pld: PayloadT -class Stop(Msg): +class Stop( + Struct, + tag=True, + tag_field='msg_type', +): ''' Stream termination signal much like an IPC version of `StopAsyncIteration`. ''' - pld: UnsetType = UNSET + cid: str + # TODO: do we want to support a payload on stop? + # pld: UnsetType = UNSET class Return( @@ -223,32 +302,33 @@ class Return( pld: PayloadT -class ErrorData(Struct): +class Error( + Struct, + tag=True, + tag_field='msg_type', +): ''' - Remote actor error meta-data as needed originally by + A pkt that wraps `RemoteActorError`s for relay and raising. + + Fields are 1-to-1 meta-data as needed originally by `RemoteActorError.msgdata: dict`. ''' - src_uid: str + src_uid: tuple[str, str] src_type_str: str boxed_type_str: str - - relay_path: list[str] + relay_path: list[tuple[str, str]] tb_str: str + cid: str|None = None + + # TODO: use UNSET or don't include them via + # # `ContextCancelled` - canceller: str|None = None + canceller: tuple[str, str]|None = None # `StreamOverrun` - sender: str|None = None - - -class Error(Msg): - ''' - A pkt that wraps `RemoteActorError`s for relay. - - ''' - pld: ErrorData + sender: tuple[str, str]|None = None # TODO: should be make a msg version of `ContextCancelled?` @@ -265,6 +345,12 @@ class Error(Msg): # approx order of the IPC txn-state spaces. __spec__: list[Msg] = [ + # identity handshake + Aid, + + # spawn specification from parent + SpawnSpec, + # inter-actor RPC initiation Start, StartAck, @@ -280,6 +366,8 @@ __spec__: list[Msg] = [ ] _runtime_spec_msgs: list[Msg] = [ + Aid, + SpawnSpec, Start, StartAck, Stop, @@ -443,3 +531,99 @@ def mk_msg_spec( pld_spec | runtime_spec, msgtypes_table[spec_build_method] + ipc_msg_types, ) + + +# TODO: make something similar to this inside `._codec` such that +# user can just pass a type table of some sort? 
+# def mk_dict_msg_codec_hooks() -> tuple[Callable, Callable]: +# ''' +# Deliver a `enc_hook()`/`dec_hook()` pair which does +# manual convertion from our above native `Msg` set +# to `dict` equivalent (wire msgs) in order to keep legacy compat +# with the original runtime implementation. + +# Note: this is is/was primarly used while moving the core +# runtime over to using native `Msg`-struct types wherein we +# start with the send side emitting without loading +# a typed-decoder and then later flipping the switch over to +# load to the native struct types once all runtime usage has +# been adjusted appropriately. + +# ''' +# def enc_to_dict(msg: Any) -> Any: +# ''' +# Encode `Msg`-structs to `dict` msgs instead +# of using `msgspec.msgpack.Decoder.type`-ed +# features. + +# ''' +# match msg: +# case Start(): +# dctmsg: dict = pretty_struct.Struct.to_dict( +# msg +# )['pld'] + +# case Error(): +# dctmsg: dict = pretty_struct.Struct.to_dict( +# msg +# )['pld'] +# return {'error': dctmsg} + + +# def dec_from_dict( +# type: Type, +# obj: Any, +# ) -> Any: +# ''' +# Decode to `Msg`-structs from `dict` msgs instead +# of using `msgspec.msgpack.Decoder.type`-ed +# features. + +# ''' +# cid: str = obj.get('cid') +# match obj: +# case {'cmd': pld}: +# return Start( +# cid=cid, +# pld=pld, +# ) +# case {'functype': pld}: +# return StartAck( +# cid=cid, +# functype=pld, +# # pld=IpcCtxSpec( +# # functype=pld, +# # ), +# ) +# case {'started': pld}: +# return Started( +# cid=cid, +# pld=pld, +# ) +# case {'yield': pld}: +# return Yield( +# cid=obj['cid'], +# pld=pld, +# ) +# case {'stop': pld}: +# return Stop( +# cid=cid, +# ) +# case {'return': pld}: +# return Return( +# cid=cid, +# pld=pld, +# ) + +# case {'error': pld}: +# return Error( +# cid=cid, +# pld=ErrorData( +# **pld +# ), +# ) + +# return ( +# # enc_to_dict, +# dec_from_dict, +# ) -- 2.34.1 From 3aa964315a054c5d6ae6acd9e632aaa566728fc0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Apr 2024 11:14:43 -0400 Subject: [PATCH 201/378] Get `test_codec_hooks_mod` working with `Msg`s Though the runtime hasn't been changed over in this patch (it was in the local index at the time however), the test does now demonstrate that using a `Started` the correctly typed `.pld` will codec correctly when passed manually to `MsgCodec.encode/decode()`. Despite not having the runtime ported to the new shuttle msg set (meaning the mentioned test will fail without the runtime port patch), I was able to get this first original test working that limits payload packets as a `Msg.pld: NamespacePath`this as long as we spec `enc/dec_hook()`s then the `Msg.pld` will be processed correctly as per: https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types in both the `Any` and `NamespacePath|None` spec cases. ^- turns out in this case -^ that the codec hooks only get invoked on the unknown-fields NOT the entire `Struct`-msg. A further gotcha was merging a `|None` into the `pld_spec` since this test spawns a subactor and opens a context via `send_back_nsp()` and that func has no explicit `return` - so of course it delivers a `Return(pld=None)` which will fail if we only spec `NamespacePath`. 
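A bare-bones sketch of that hook behaviour outside the runtime (the
`Ref`/`DemoStarted` types below are stand-ins invented purely for
illustration, not `tractor` or test code):

    from typing import Any, Type
    import msgspec
    from msgspec import msgpack

    class Ref:
        # toy stand-in for a non-native payload type (a la
        # `NamespacePath`).
        def __init__(self, val: str):
            self.val = val

    class DemoStarted(msgspec.Struct, tag=True, tag_field='msg_type'):
        cid: str
        pld: Ref

    def enc_hook(obj: Any) -> Any:
        # only ever invoked for field values `msgspec` can't natively
        # encode, never for the boxing `Struct` itself.
        if isinstance(obj, Ref):
            return obj.val
        raise NotImplementedError(type(obj))

    def dec_hook(type_: Type, obj: Any) -> Any:
        # likewise only invoked for the unknown *field* type.
        if type_ is Ref:
            return Ref(obj)
        raise NotImplementedError(type_)

    enc = msgpack.Encoder(enc_hook=enc_hook)
    dec = msgpack.Decoder(DemoStarted, dec_hook=dec_hook)

    wire: bytes = enc.encode(DemoStarted(cid='1', pld=Ref('mod:func')))
    assert dec.decode(wire).pld.val == 'mod:func'

Note the `pld: Ref` field here would need the same `|None` widening
mentioned above the moment the far end can deliver a `Return(pld=None)`,
which is exactly why the spec used in the test is `NamespacePath|None`
and not just `NamespacePath`.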
--- tests/test_caps_based_msging.py | 305 ++++++++++++++++++++++++-------- 1 file changed, 236 insertions(+), 69 deletions(-) diff --git a/tests/test_caps_based_msging.py b/tests/test_caps_based_msging.py index abdda0a5..b42d9e35 100644 --- a/tests/test_caps_based_msging.py +++ b/tests/test_caps_based_msging.py @@ -7,7 +7,6 @@ B~) ''' from typing import ( Any, - _GenericAlias, Type, Union, ) @@ -26,20 +25,23 @@ from msgspec import ( import pytest import tractor from tractor.msg import ( - _def_msgspec_codec, + _codec, _ctxvar_MsgCodec, NamespacePath, MsgCodec, mk_codec, apply_codec, - current_msgspec_codec, + current_codec, ) -from tractor.msg import types +from tractor.msg import ( + types, +) +from tractor import _state from tractor.msg.types import ( # PayloadT, Msg, - # Started, + Started, mk_msg_spec, ) import trio @@ -60,56 +62,110 @@ def test_msg_spec_xor_pld_spec(): ) -# TODO: wrap these into `._codec` such that user can just pass -# a type table of some sort? -def enc_hook(obj: Any) -> Any: - if isinstance(obj, NamespacePath): - return str(obj) - else: - raise NotImplementedError( - f'Objects of type {type(obj)} are not supported' - ) - - -def dec_hook(type: Type, obj: Any) -> Any: - print(f'type is: {type}') - if type is NamespacePath: - return NamespacePath(obj) - else: - raise NotImplementedError( - f'Objects of type {type(obj)} are not supported' - ) - - def ex_func(*args): print(f'ex_func({args})') def mk_custom_codec( - ipc_msg_spec: Type[Any] = Any, -) -> MsgCodec: - # apply custom hooks and set a `Decoder` which only - # loads `NamespacePath` types. - nsp_codec: MsgCodec = mk_codec( - ipc_msg_spec=ipc_msg_spec, - enc_hook=enc_hook, - dec_hook=dec_hook, - ) + pld_spec: Union[Type]|Any, - # TODO: validate `MsgCodec` interface/semantics? - # -[ ] simple field tests to ensure caching + reset is workin? - # -[ ] custom / changing `.decoder()` calls? - # - # dec = nsp_codec.decoder( - # types=NamespacePath, - # ) - # assert nsp_codec.dec is dec +) -> MsgCodec: + ''' + Create custom `msgpack` enc/dec-hooks and set a `Decoder` + which only loads `NamespacePath` types. + + ''' + uid: tuple[str, str] = tractor.current_actor().uid + + # XXX NOTE XXX: despite defining `NamespacePath` as a type + # field on our `Msg.pld`, we still need a enc/dec_hook() pair + # to cast to/from that type on the wire. See the docs: + # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types + + def enc_nsp(obj: Any) -> Any: + match obj: + case NamespacePath(): + print( + f'{uid}: `NamespacePath`-Only ENCODE?\n' + f'type: {type(obj)}\n' + f'obj: {obj}\n' + ) + + return str(obj) + + logmsg: str = ( + f'{uid}: Encoding `{obj}: <{type(obj)}>` not supported' + f'type: {type(obj)}\n' + f'obj: {obj}\n' + ) + print(logmsg) + raise NotImplementedError(logmsg) + + def dec_nsp( + type: Type, + obj: Any, + + ) -> Any: + print( + f'{uid}: CUSTOM DECODE\n' + f'input type: {type}\n' + f'obj: {obj}\n' + f'type(obj): `{type(obj).__class__}`\n' + ) + nsp = None + + # This never seems to hit? 
+ if isinstance(obj, Msg): + print(f'Msg type: {obj}') + + if ( + type is NamespacePath + and isinstance(obj, str) + and ':' in obj + ): + nsp = NamespacePath(obj) + + if nsp: + print(f'Returning NSP instance: {nsp}') + return nsp + + logmsg: str = ( + f'{uid}: Decoding `{obj}: <{type(obj)}>` not supported' + f'input type: {type(obj)}\n' + f'obj: {obj}\n' + f'type(obj): `{type(obj).__class__}`\n' + ) + print(logmsg) + raise NotImplementedError(logmsg) + + + nsp_codec: MsgCodec = mk_codec( + ipc_pld_spec=pld_spec, + + # NOTE XXX: the encode hook MUST be used no matter what since + # our `NamespacePath` is not any of a `Any` native type nor + # a `msgspec.Struct` subtype - so `msgspec` has no way to know + # how to encode it unless we provide the custom hook. + # + # AGAIN that is, regardless of whether we spec an + # `Any`-decoded-pld the enc has no knowledge (by default) + # how to enc `NamespacePath` (nsp), so we add a custom + # hook to do that ALWAYS. + enc_hook=enc_nsp, + + # XXX NOTE: pretty sure this is mutex with the `type=` to + # `Decoder`? so it won't work in tandem with the + # `ipc_pld_spec` passed above? + dec_hook=dec_nsp, + ) return nsp_codec @tractor.context async def send_back_nsp( - ctx: tractor.Context, + ctx: Context, + expect_debug: bool, + use_any_spec: bool, ) -> None: ''' @@ -117,28 +173,65 @@ async def send_back_nsp( and ensure we can round trip a func ref with our parent. ''' - task: trio.Task = trio.lowlevel.current_task() - task_ctx: Context = task.context - assert _ctxvar_MsgCodec not in task_ctx + # debug mode sanity check + assert expect_debug == _state.debug_mode() - nsp_codec: MsgCodec = mk_custom_codec() + # task: trio.Task = trio.lowlevel.current_task() + + # TreeVar + # curr_codec = _ctxvar_MsgCodec.get_in(task) + + # ContextVar + # task_ctx: Context = task.context + # assert _ctxvar_MsgCodec not in task_ctx + + curr_codec = _ctxvar_MsgCodec.get() + assert curr_codec is _codec._def_tractor_codec + + if use_any_spec: + pld_spec = Any + else: + # NOTE: don't need the |None here since + # the parent side will never send `None` like + # we do here in the implicit return at the end of this + # `@context` body. 
+ pld_spec = NamespacePath # |None + + nsp_codec: MsgCodec = mk_custom_codec( + pld_spec=pld_spec, + ) with apply_codec(nsp_codec) as codec: chk_codec_applied( custom_codec=nsp_codec, enter_value=codec, ) + # ensure roundtripping works locally nsp = NamespacePath.from_ref(ex_func) - await ctx.started(nsp) + wire_bytes: bytes = nsp_codec.encode( + Started( + cid=ctx.cid, + pld=nsp + ) + ) + msg: Started = nsp_codec.decode(wire_bytes) + pld = msg.pld + assert pld == nsp + await ctx.started(nsp) async with ctx.open_stream() as ipc: async for msg in ipc: - assert msg == f'{__name__}:ex_func' + if use_any_spec: + assert msg == f'{__name__}:ex_func' - # TODO: as per below - # assert isinstance(msg, NamespacePath) - assert isinstance(msg, str) + # TODO: as per below + # assert isinstance(msg, NamespacePath) + assert isinstance(msg, str) + else: + assert isinstance(msg, NamespacePath) + + await ipc.send(msg) def chk_codec_applied( @@ -146,11 +239,20 @@ def chk_codec_applied( enter_value: MsgCodec, ) -> MsgCodec: - task: trio.Task = trio.lowlevel.current_task() - task_ctx: Context = task.context + # task: trio.Task = trio.lowlevel.current_task() - assert _ctxvar_MsgCodec in task_ctx - curr_codec: MsgCodec = task.context[_ctxvar_MsgCodec] + # TreeVar + # curr_codec = _ctxvar_MsgCodec.get_in(task) + + # ContextVar + # task_ctx: Context = task.context + # assert _ctxvar_MsgCodec in task_ctx + # curr_codec: MsgCodec = task.context[_ctxvar_MsgCodec] + + # RunVar + curr_codec: MsgCodec = _ctxvar_MsgCodec.get() + last_read_codec = _ctxvar_MsgCodec.get() + assert curr_codec is last_read_codec assert ( # returned from `mk_codec()` @@ -163,14 +265,31 @@ def chk_codec_applied( curr_codec is # public API for all of the above - current_msgspec_codec() + current_codec() # the default `msgspec` settings - is not _def_msgspec_codec + is not _codec._def_msgspec_codec + is not _codec._def_tractor_codec ) -def test_codec_hooks_mod(): +@pytest.mark.parametrize( + 'ipc_pld_spec', + [ + # _codec._def_msgspec_codec, + Any, + # _codec._def_tractor_codec, + NamespacePath|None, + ], + ids=[ + 'any_type', + 'nsp_type', + ] +) +def test_codec_hooks_mod( + debug_mode: bool, + ipc_pld_spec: Union[Type]|Any, +): ''' Audit the `.msg.MsgCodec` override apis details given our impl uses `contextvars` to accomplish per `trio` task codec @@ -178,11 +297,21 @@ def test_codec_hooks_mod(): ''' async def main(): - task: trio.Task = trio.lowlevel.current_task() - task_ctx: Context = task.context - assert _ctxvar_MsgCodec not in task_ctx - async with tractor.open_nursery() as an: + # task: trio.Task = trio.lowlevel.current_task() + + # ContextVar + # task_ctx: Context = task.context + # assert _ctxvar_MsgCodec not in task_ctx + + # TreeVar + # def_codec: MsgCodec = _ctxvar_MsgCodec.get_in(task) + def_codec = _ctxvar_MsgCodec.get() + assert def_codec is _codec._def_tractor_codec + + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as an: p: tractor.Portal = await an.start_actor( 'sub', enable_modules=[__name__], @@ -192,7 +321,9 @@ def test_codec_hooks_mod(): # - codec not modified -> decode nsp as `str` # - codec modified with hooks -> decode nsp as # `NamespacePath` - nsp_codec: MsgCodec = mk_custom_codec() + nsp_codec: MsgCodec = mk_custom_codec( + pld_spec=ipc_pld_spec, + ) with apply_codec(nsp_codec) as codec: chk_codec_applied( custom_codec=nsp_codec, @@ -202,9 +333,22 @@ def test_codec_hooks_mod(): async with ( p.open_context( send_back_nsp, + # TODO: send the original nsp here and + # test with `limit_msg_spec()` above? 
+ expect_debug=debug_mode, + use_any_spec=(ipc_pld_spec==Any), + ) as (ctx, first), ctx.open_stream() as ipc, ): + if ipc_pld_spec is NamespacePath: + assert isinstance(first, NamespacePath) + + print( + 'root: ENTERING CONTEXT BLOCK\n' + f'type(first): {type(first)}\n' + f'first: {first}\n' + ) # ensure codec is still applied across # `tractor.Context` + its embedded nursery. chk_codec_applied( @@ -212,23 +356,46 @@ def test_codec_hooks_mod(): enter_value=codec, ) - assert first == f'{__name__}:ex_func' + first_nsp = NamespacePath(first) + + # ensure roundtripping works + wire_bytes: bytes = nsp_codec.encode( + Started( + cid=ctx.cid, + pld=first_nsp + ) + ) + msg: Started = nsp_codec.decode(wire_bytes) + pld = msg.pld + assert pld == first_nsp + + # try a manual decode of the started msg+pld + # TODO: actually get the decoder loading # to native once we spec our SCIPP msgspec # (structurred-conc-inter-proc-protocol) # implemented as per, # https://github.com/goodboy/tractor/issues/36 # - # assert isinstance(first, NamespacePath) - assert isinstance(first, str) + if ipc_pld_spec is NamespacePath: + assert isinstance(first, NamespacePath) + + # `Any`-payload-spec case + else: + assert isinstance(first, str) + assert first == f'{__name__}:ex_func' + await ipc.send(first) - with trio.move_on_after(1): + with trio.move_on_after(.6): async for msg in ipc: + print(msg) # TODO: as per above # assert isinstance(msg, NamespacePath) assert isinstance(msg, str) + await ipc.send(msg) + await trio.sleep(0.1) await p.cancel_actor() -- 2.34.1 From f2ce4a3469d7cd89670f679118c7b2b0963ee257 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Apr 2024 13:33:06 -0400 Subject: [PATCH 202/378] Add timeouts around some context test bodies Since with my in-index runtime-port to our native msg-spec it seems these ones are hanging B( - `test_one_end_stream_not_opened()` - `test_maybe_allow_overruns_stream()` Tossing in some `trio.fail_after()`s seems to at least gnab them as failures B) --- tests/test_context_stream_semantics.py | 134 ++++++++++++++----------- 1 file changed, 74 insertions(+), 60 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 1f5e3dbb..cc7f402f 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -6,6 +6,7 @@ sync-opening a ``tractor.Context`` beforehand. ''' from itertools import count +import math import platform from pprint import pformat from typing import ( @@ -845,7 +846,10 @@ async def keep_sending_from_callee( ('caller', 1, never_open_stream), ('callee', 0, keep_sending_from_callee), ], - ids='overrun_condition={}'.format, + ids=[ + ('caller_1buf_never_open_stream'), + ('callee_0buf_keep_sending_from_callee'), + ] ) def test_one_end_stream_not_opened( overrun_by: tuple[str, int, Callable], @@ -869,29 +873,30 @@ def test_one_end_stream_not_opened( enable_modules=[__name__], ) - async with portal.open_context( - entrypoint, - ) as (ctx, sent): - assert sent is None + with trio.fail_after(0.8): + async with portal.open_context( + entrypoint, + ) as (ctx, sent): + assert sent is None - if 'caller' in overrunner: + if 'caller' in overrunner: - async with ctx.open_stream() as stream: + async with ctx.open_stream() as stream: - # itersend +1 msg more then the buffer size - # to cause the most basic overrun. - for i in range(buf_size): - print(f'sending {i}') - await stream.send(i) + # itersend +1 msg more then the buffer size + # to cause the most basic overrun. 
+ for i in range(buf_size): + print(f'sending {i}') + await stream.send(i) - else: - # expect overrun error to be relayed back - # and this sleep interrupted - await trio.sleep_forever() + else: + # expect overrun error to be relayed back + # and this sleep interrupted + await trio.sleep_forever() - else: - # callee overruns caller case so we do nothing here - await trio.sleep_forever() + else: + # callee overruns caller case so we do nothing here + await trio.sleep_forever() await portal.cancel_actor() @@ -1055,54 +1060,63 @@ def test_maybe_allow_overruns_stream( loglevel=loglevel, debug_mode=debug_mode, ) - seq = list(range(10)) - async with portal.open_context( - echo_back_sequence, - seq=seq, - wait_for_cancel=cancel_ctx, - be_slow=(slow_side == 'child'), - allow_overruns_side=allow_overruns_side, - ) as (ctx, sent): - assert sent is None + # stream-sequence batch info with send delay to determine + # approx timeout determining whether test has hung. + total_batches: int = 2 + num_items: int = 10 + seq = list(range(num_items)) + parent_send_delay: float = 0.16 + timeout: float = math.ceil( + total_batches * num_items * parent_send_delay + ) + with trio.fail_after(timeout): + async with portal.open_context( + echo_back_sequence, + seq=seq, + wait_for_cancel=cancel_ctx, + be_slow=(slow_side == 'child'), + allow_overruns_side=allow_overruns_side, - async with ctx.open_stream( - msg_buffer_size=1 if slow_side == 'parent' else None, - allow_overruns=(allow_overruns_side in {'parent', 'both'}), - ) as stream: + ) as (ctx, sent): + assert sent is None - total_batches: int = 2 - for _ in range(total_batches): - for msg in seq: - # print(f'root tx {msg}') - await stream.send(msg) - if slow_side == 'parent': - # NOTE: we make the parent slightly - # slower, when it is slow, to make sure - # that in the overruns everywhere case - await trio.sleep(0.16) + async with ctx.open_stream( + msg_buffer_size=1 if slow_side == 'parent' else None, + allow_overruns=(allow_overruns_side in {'parent', 'both'}), + ) as stream: - batch = [] - async for msg in stream: - print(f'root rx {msg}') - batch.append(msg) - if batch == seq: - break + for _ in range(total_batches): + for msg in seq: + # print(f'root tx {msg}') + await stream.send(msg) + if slow_side == 'parent': + # NOTE: we make the parent slightly + # slower, when it is slow, to make sure + # that in the overruns everywhere case + await trio.sleep(parent_send_delay) + + batch = [] + async for msg in stream: + print(f'root rx {msg}') + batch.append(msg) + if batch == seq: + break + + if cancel_ctx: + # cancel the remote task + print('Requesting `ctx.cancel()` in parent!') + await ctx.cancel() + + res: str|ContextCancelled = await ctx.result() if cancel_ctx: - # cancel the remote task - print('Requesting `ctx.cancel()` in parent!') - await ctx.cancel() + assert isinstance(res, ContextCancelled) + assert tuple(res.canceller) == current_actor().uid - res: str|ContextCancelled = await ctx.result() - - if cancel_ctx: - assert isinstance(res, ContextCancelled) - assert tuple(res.canceller) == current_actor().uid - - else: - print(f'RX ROOT SIDE RESULT {res}') - assert res == 'yo' + else: + print(f'RX ROOT SIDE RESULT {res}') + assert res == 'yo' # cancel the daemon await portal.cancel_actor() -- 2.34.1 From e153cc0187de7bb37a76809b16d917cf124468b3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Apr 2024 13:41:52 -0400 Subject: [PATCH 203/378] WIP porting runtime to use `Msg`-spec --- tractor/_context.py | 245 +++++++++------- tractor/_entry.py | 1 + 
tractor/_exceptions.py | 94 ++++-- tractor/_ipc.py | 99 ++++++- tractor/_portal.py | 19 +- tractor/_rpc.py | 646 ++++++++++++++++++++++++----------------- tractor/_runtime.py | 174 +++++++---- tractor/_spawn.py | 30 +- tractor/_streaming.py | 46 ++- tractor/devx/_debug.py | 3 + 10 files changed, 879 insertions(+), 478 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 3c2490a3..38b4431d 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -53,7 +53,14 @@ from ._exceptions import ( _raise_from_no_key_in_msg, ) from .log import get_logger -from .msg import NamespacePath +from .msg import ( + NamespacePath, + Msg, + Return, + Started, + Stop, + Yield, +) from ._ipc import Channel from ._streaming import MsgStream from ._state import ( @@ -96,7 +103,8 @@ async def _drain_to_final_msg( # wait for a final context result by collecting (but # basically ignoring) any bi-dir-stream msgs still in transit # from the far end. - pre_result_drained: list[dict] = [] + # pre_result_drained: list[dict] = [] + pre_result_drained: list[Msg] = [] while not ( ctx.maybe_error and not ctx._final_result_is_set() @@ -155,7 +163,10 @@ async def _drain_to_final_msg( # await pause() # pray to the `trio` gawds that we're corrent with this - msg: dict = await ctx._recv_chan.receive() + # msg: dict = await ctx._recv_chan.receive() + msg: Msg = await ctx._recv_chan.receive() + # always capture unexpected/non-result msgs + pre_result_drained.append(msg) # NOTE: we get here if the far end was # `ContextCancelled` in 2 cases: @@ -175,24 +186,31 @@ async def _drain_to_final_msg( # continue to bubble up as normal. raise - try: - ctx._result: Any = msg['return'] - log.runtime( - 'Context delivered final draining msg:\n' - f'{pformat(msg)}' - ) - # XXX: only close the rx mem chan AFTER - # a final result is retreived. - # if ctx._recv_chan: - # await ctx._recv_chan.aclose() - # TODO: ^ we don't need it right? - break + match msg: + case Return( + cid=cid, + pld=res, + ): + # try: + # ctx._result: Any = msg['return'] + # ctx._result: Any = msg.pld + ctx._result: Any = res + log.runtime( + 'Context delivered final draining msg:\n' + f'{pformat(msg)}' + ) + # XXX: only close the rx mem chan AFTER + # a final result is retreived. + # if ctx._recv_chan: + # await ctx._recv_chan.aclose() + # TODO: ^ we don't need it right? + break - except KeyError: - # always capture unexpected/non-result msgs - pre_result_drained.append(msg) + # except KeyError: + # except AttributeError: + case Yield(): + # if 'yield' in msg: - if 'yield' in msg: # far end task is still streaming to us so discard # and report per local context state. if ( @@ -238,9 +256,10 @@ async def _drain_to_final_msg( # TODO: work out edge cases here where # a stream is open but the task also calls # this? - # -[ ] should be a runtime error if a stream is open - # right? - elif 'stop' in msg: + # -[ ] should be a runtime error if a stream is open right? + # Stop() + case Stop(): + # elif 'stop' in msg: log.cancel( 'Remote stream terminated due to "stop" msg:\n\n' f'{pformat(msg)}\n' @@ -249,78 +268,80 @@ async def _drain_to_final_msg( # It's an internal error if any other msg type without # a`'cid'` field arrives here! 
- if not msg.get('cid'): - raise InternalError( - 'Unexpected cid-missing msg?\n\n' - f'{msg}\n' - ) + case _: + # if not msg.get('cid'): + if not msg.cid: + raise InternalError( + 'Unexpected cid-missing msg?\n\n' + f'{msg}\n' + ) - # XXX fallthrough to handle expected error XXX - # TODO: replace this with `ctx.maybe_raise()` - # - # TODO: would this be handier for this case maybe? - # async with maybe_raise_on_exit() as raises: - # if raises: - # log.error('some msg about raising..') + # XXX fallthrough to handle expected error XXX + # TODO: replace this with `ctx.maybe_raise()` + # + # TODO: would this be handier for this case maybe? + # async with maybe_raise_on_exit() as raises: + # if raises: + # log.error('some msg about raising..') - re: Exception|None = ctx._remote_error - if re: - log.critical( - 'Remote ctx terminated due to "error" msg:\n' - f'{re}' - ) - assert msg is ctx._cancel_msg - # NOTE: this solved a super dupe edge case XD - # this was THE super duper edge case of: - # - local task opens a remote task, - # - requests remote cancellation of far end - # ctx/tasks, - # - needs to wait for the cancel ack msg - # (ctxc) or some result in the race case - # where the other side's task returns - # before the cancel request msg is ever - # rxed and processed, - # - here this surrounding drain loop (which - # iterates all ipc msgs until the ack or - # an early result arrives) was NOT exiting - # since we are the edge case: local task - # does not re-raise any ctxc it receives - # IFF **it** was the cancellation - # requester.. - # will raise if necessary, ow break from - # loop presuming any error terminates the - # context! - ctx._maybe_raise_remote_err( - re, - # NOTE: obvi we don't care if we - # overran the far end if we're already - # waiting on a final result (msg). - # raise_overrun_from_self=False, - raise_overrun_from_self=raise_overrun, - ) + re: Exception|None = ctx._remote_error + if re: + log.critical( + 'Remote ctx terminated due to "error" msg:\n' + f'{re}' + ) + assert msg is ctx._cancel_msg + # NOTE: this solved a super dupe edge case XD + # this was THE super duper edge case of: + # - local task opens a remote task, + # - requests remote cancellation of far end + # ctx/tasks, + # - needs to wait for the cancel ack msg + # (ctxc) or some result in the race case + # where the other side's task returns + # before the cancel request msg is ever + # rxed and processed, + # - here this surrounding drain loop (which + # iterates all ipc msgs until the ack or + # an early result arrives) was NOT exiting + # since we are the edge case: local task + # does not re-raise any ctxc it receives + # IFF **it** was the cancellation + # requester.. + # will raise if necessary, ow break from + # loop presuming any error terminates the + # context! + ctx._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). + # raise_overrun_from_self=False, + raise_overrun_from_self=raise_overrun, + ) - break # OOOOOF, yeah obvi we need this.. + break # OOOOOF, yeah obvi we need this.. - # XXX we should never really get here - # right! since `._deliver_msg()` should - # always have detected an {'error': ..} - # msg and already called this right!?! 
- elif error := unpack_error( - msg=msg, - chan=ctx._portal.channel, - hide_tb=False, - ): - log.critical('SHOULD NEVER GET HERE!?') - assert msg is ctx._cancel_msg - assert error.msgdata == ctx._remote_error.msgdata - from .devx._debug import pause - await pause() - ctx._maybe_cancel_and_set_remote_error(error) - ctx._maybe_raise_remote_err(error) + # XXX we should never really get here + # right! since `._deliver_msg()` should + # always have detected an {'error': ..} + # msg and already called this right!?! + elif error := unpack_error( + msg=msg, + chan=ctx._portal.channel, + hide_tb=False, + ): + log.critical('SHOULD NEVER GET HERE!?') + assert msg is ctx._cancel_msg + assert error.msgdata == ctx._remote_error.msgdata + from .devx._debug import pause + await pause() + ctx._maybe_cancel_and_set_remote_error(error) + ctx._maybe_raise_remote_err(error) - else: - # bubble the original src key error - raise + else: + # bubble the original src key error + raise else: log.cancel( 'Skipping `MsgStream` drain since final outcome is set\n\n' @@ -710,10 +731,14 @@ class Context: async def send_stop(self) -> None: # await pause() - await self.chan.send({ - 'stop': True, - 'cid': self.cid - }) + # await self.chan.send({ + # # Stop( + # 'stop': True, + # 'cid': self.cid + # }) + await self.chan.send( + Stop(cid=self.cid) + ) def _maybe_cancel_and_set_remote_error( self, @@ -1395,17 +1420,19 @@ class Context: for msg in drained_msgs: # TODO: mask this by default.. - if 'return' in msg: + # if 'return' in msg: + if isinstance(msg, Return): # from .devx import pause # await pause() - raise InternalError( + # raise InternalError( + log.warning( 'Final `return` msg should never be drained !?!?\n\n' f'{msg}\n' ) log.cancel( 'Ctx drained pre-result msgs:\n' - f'{drained_msgs}' + f'{pformat(drained_msgs)}' ) self.maybe_raise( @@ -1613,7 +1640,18 @@ class Context: f'called `.started()` twice on context with {self.chan.uid}' ) - await self.chan.send({'started': value, 'cid': self.cid}) + # await self.chan.send( + # { + # 'started': value, + # 'cid': self.cid, + # } + # ) + await self.chan.send( + Started( + cid=self.cid, + pld=value, + ) + ) self._started_called = True async def _drain_overflows( @@ -1668,7 +1706,8 @@ class Context: async def _deliver_msg( self, - msg: dict, + # msg: dict, + msg: Msg, ) -> bool: ''' @@ -1852,7 +1891,7 @@ class Context: # anything different. return False else: - txt += f'\n{msg}\n' + # txt += f'\n{msg}\n' # raise local overrun and immediately pack as IPC # msg for far end. try: @@ -1983,15 +2022,17 @@ async def open_context_from_portal( ) assert ctx._remote_func_type == 'context' - msg: dict = await ctx._recv_chan.receive() + msg: Started = await ctx._recv_chan.receive() try: # the "first" value here is delivered by the callee's # ``Context.started()`` call. 
- first: Any = msg['started'] + # first: Any = msg['started'] + first: Any = msg.pld ctx._started_called: bool = True - except KeyError as src_error: + # except KeyError as src_error: + except AttributeError as src_error: _raise_from_no_key_in_msg( ctx=ctx, msg=msg, diff --git a/tractor/_entry.py b/tractor/_entry.py index 0ac0dc47..b2aae2e5 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -135,6 +135,7 @@ def _trio_main( run_as_asyncio_guest(trio_main) else: trio.run(trio_main) + except KeyboardInterrupt: log.cancel( 'Actor received KBI\n' diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index b1a8ee63..7deda9d2 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -31,9 +31,16 @@ import textwrap import traceback import trio +from msgspec import structs from tractor._state import current_actor from tractor.log import get_logger +from tractor.msg import ( + Error, + Msg, + Stop, + Yield, +) if TYPE_CHECKING: from ._context import Context @@ -135,6 +142,8 @@ class RemoteActorError(Exception): # and instead render if from `.boxed_type_str`? self._boxed_type: BaseException = boxed_type self._src_type: BaseException|None = None + + # TODO: make this a `.errmsg: Error` throughout? self.msgdata: dict[str, Any] = msgdata # TODO: mask out eventually or place in `pack_error()` @@ -464,7 +473,23 @@ class AsyncioCancelled(Exception): ''' class MessagingError(Exception): - 'Some kind of unexpected SC messaging dialog issue' + ''' + IPC related msg (typing), transaction (ordering) or dialog + handling error. + + ''' + + +class MsgTypeError(MessagingError): + ''' + Equivalent of a `TypeError` for an IPC wire-message + due to an invalid field value (type). + + Normally this is re-raised from some `.msg._codec` + decode error raised by a backend interchange lib + like `msgspec` or `pycapnproto`. + + ''' def pack_error( @@ -473,7 +498,7 @@ def pack_error( tb: str|None = None, cid: str|None = None, -) -> dict[str, dict]: +) -> Error|dict[str, dict]: ''' Create an "error message" which boxes a locally caught exception's meta-data and encodes it for wire transport via an @@ -536,17 +561,23 @@ def pack_error( # content's `.msgdata`). error_msg['tb_str'] = tb_str - pkt: dict = { - 'error': error_msg, - } - if cid: - pkt['cid'] = cid + # Error() + # pkt: dict = { + # 'error': error_msg, + # } + pkt: Error = Error( + cid=cid, + **error_msg, + # TODO: just get rid of `.pld` on this msg? + ) + # if cid: + # pkt['cid'] = cid return pkt def unpack_error( - msg: dict[str, Any], + msg: dict[str, Any]|Error, chan: Channel|None = None, box_type: RemoteActorError = RemoteActorError, @@ -564,15 +595,17 @@ def unpack_error( ''' __tracebackhide__: bool = hide_tb - error_dict: dict[str, dict] | None - if ( - error_dict := msg.get('error') - ) is None: + error_dict: dict[str, dict]|None + if not isinstance(msg, Error): + # if ( + # error_dict := msg.get('error') + # ) is None: # no error field, nothing to unpack. return None # retrieve the remote error's msg encoded details - tb_str: str = error_dict.get('tb_str', '') + # tb_str: str = error_dict.get('tb_str', '') + tb_str: str = msg.tb_str message: str = ( f'{chan.uid}\n' + @@ -581,7 +614,8 @@ def unpack_error( # try to lookup a suitable error type from the local runtime # env then use it to construct a local instance. 
- boxed_type_str: str = error_dict['boxed_type_str'] + # boxed_type_str: str = error_dict['boxed_type_str'] + boxed_type_str: str = msg.boxed_type_str boxed_type: Type[BaseException] = get_err_type(boxed_type_str) if boxed_type_str == 'ContextCancelled': @@ -595,7 +629,11 @@ def unpack_error( # original source error. elif boxed_type_str == 'RemoteActorError': assert boxed_type is RemoteActorError - assert len(error_dict['relay_path']) >= 1 + # assert len(error_dict['relay_path']) >= 1 + assert len(msg.relay_path) >= 1 + + # TODO: mk RAE just take the `Error` instance directly? + error_dict: dict = structs.asdict(msg) exc = box_type( message, @@ -623,11 +661,12 @@ def is_multi_cancelled(exc: BaseException) -> bool: def _raise_from_no_key_in_msg( ctx: Context, - msg: dict, + msg: Msg, src_err: KeyError, log: StackLevelAdapter, # caller specific `log` obj expect_key: str = 'yield', + expect_msg: str = Yield, stream: MsgStream | None = None, # allow "deeper" tbs when debugging B^o @@ -660,8 +699,10 @@ def _raise_from_no_key_in_msg( # an internal error should never get here try: - cid: str = msg['cid'] - except KeyError as src_err: + cid: str = msg.cid + # cid: str = msg['cid'] + # except KeyError as src_err: + except AttributeError as src_err: raise MessagingError( f'IPC `Context` rx-ed msg without a ctx-id (cid)!?\n' f'cid: {cid}\n\n' @@ -672,7 +713,10 @@ def _raise_from_no_key_in_msg( # TODO: test that shows stream raising an expected error!!! # raise the error message in a boxed exception type! - if msg.get('error'): + # if msg.get('error'): + if isinstance(msg, Error): + # match msg: + # case Error(): raise unpack_error( msg, ctx.chan, @@ -683,8 +727,10 @@ def _raise_from_no_key_in_msg( # `MsgStream` termination msg. # TODO: does it make more sense to pack # the stream._eoc outside this in the calleer always? + # case Stop(): elif ( - msg.get('stop') + # msg.get('stop') + isinstance(msg, Stop) or ( stream and stream._eoc @@ -725,14 +771,16 @@ def _raise_from_no_key_in_msg( stream and stream._closed ): - raise trio.ClosedResourceError('This stream was closed') - + # TODO: our own error subtype? + raise trio.ClosedResourceError( + 'This stream was closed' + ) # always re-raise the source error if no translation error case # is activated above. 
_type: str = 'Stream' if stream else 'Context' raise MessagingError( - f"{_type} was expecting a '{expect_key}' message" + f"{_type} was expecting a '{expect_key.upper()}' message" " BUT received a non-error msg:\n" f'{pformat(msg)}' ) from src_err diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 5f71c38c..6168c77c 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -38,17 +38,23 @@ from typing import ( Protocol, Type, TypeVar, + Union, ) +import msgspec from tricycle import BufferedReceiveStream import trio from tractor.log import get_logger -from tractor._exceptions import TransportClosed +from tractor._exceptions import ( + TransportClosed, + MsgTypeError, +) from tractor.msg import ( _ctxvar_MsgCodec, + _codec, MsgCodec, - mk_codec, + types, ) log = get_logger(__name__) @@ -163,7 +169,16 @@ class MsgpackTCPStream(MsgTransport): # allow for custom IPC msg interchange format # dynamic override Bo - self.codec: MsgCodec = codec or mk_codec() + self._task = trio.lowlevel.current_task() + self._codec: MsgCodec = ( + codec + or + _codec._ctxvar_MsgCodec.get() + ) + log.critical( + '!?!: USING STD `tractor` CODEC !?!?\n' + f'{self._codec}\n' + ) async def _iter_packets(self) -> AsyncGenerator[dict, None]: ''' @@ -171,7 +186,6 @@ class MsgpackTCPStream(MsgTransport): stream using the current task's `MsgCodec`. ''' - import msgspec # noqa decodes_failed: int = 0 while True: @@ -206,7 +220,19 @@ class MsgpackTCPStream(MsgTransport): try: # NOTE: lookup the `trio.Task.context`'s var for # the current `MsgCodec`. - yield _ctxvar_MsgCodec.get().decode(msg_bytes) + codec: MsgCodec = _ctxvar_MsgCodec.get() + if self._codec.pld_spec != codec.pld_spec: + # assert ( + # task := trio.lowlevel.current_task() + # ) is not self._task + # self._task = task + self._codec = codec + log.critical( + '.recv() USING NEW CODEC !?!?\n' + f'{self._codec}\n\n' + f'msg_bytes -> {msg_bytes}\n' + ) + yield codec.decode(msg_bytes) # TODO: remove, was only for orig draft impl # testing. @@ -221,6 +247,41 @@ class MsgpackTCPStream(MsgTransport): # # yield obj + # XXX NOTE: since the below error derives from + # `DecodeError` we need to catch is specially + # and always raise such that spec violations + # are never allowed to be caught silently! + except msgspec.ValidationError as verr: + + # decode the msg-bytes using the std msgpack + # interchange-prot (i.e. without any + # `msgspec.Struct` handling) so that we can + # determine what `.msg.types.Msg` is the culprit + # by reporting the received value. + msg_dict: dict = msgspec.msgpack.decode(msg_bytes) + msg_type_name: str = msg_dict['msg_type'] + msg_type = getattr(types, msg_type_name) + errmsg: str = ( + f'Received invalid IPC `{msg_type_name}` msg\n\n' + ) + + # XXX see if we can determine the exact invalid field + # such that we can comprehensively report the + # specific field's type problem + msgspec_msg: str = verr.args[0].rstrip('`') + msg, _, maybe_field = msgspec_msg.rpartition('$.') + if field_val := msg_dict.get(maybe_field): + field_type: Union[Type] = msg_type.__signature__.parameters[ + maybe_field + ].annotation + errmsg += ( + f'{msg.rstrip("`")}\n\n' + f'{msg_type}\n' + f' |_.{maybe_field}: {field_type} = {field_val}\n' + ) + + raise MsgTypeError(errmsg) from verr + except ( msgspec.DecodeError, UnicodeDecodeError, @@ -230,14 +291,15 @@ class MsgpackTCPStream(MsgTransport): # do with a channel drop - hope that receiving from the # channel will raise an expected error and bubble up. 
try: - msg_str: str | bytes = msg_bytes.decode() + msg_str: str|bytes = msg_bytes.decode() except UnicodeDecodeError: msg_str = msg_bytes - log.error( - '`msgspec` failed to decode!?\n' - 'dumping bytes:\n' - f'{msg_str!r}' + log.exception( + 'Failed to decode msg?\n' + f'{codec}\n\n' + 'Rxed bytes from wire:\n\n' + f'{msg_str!r}\n' ) decodes_failed += 1 else: @@ -258,8 +320,21 @@ class MsgpackTCPStream(MsgTransport): # NOTE: lookup the `trio.Task.context`'s var for # the current `MsgCodec`. - bytes_data: bytes = _ctxvar_MsgCodec.get().encode(msg) - # bytes_data: bytes = self.codec.encode(msg) + codec: MsgCodec = _ctxvar_MsgCodec.get() + # if self._codec != codec: + if self._codec.pld_spec != codec.pld_spec: + self._codec = codec + log.critical( + '.send() using NEW CODEC !?!?\n' + f'{self._codec}\n\n' + f'OBJ -> {msg}\n' + ) + if type(msg) not in types.__spec__: + log.warning( + 'Sending non-`Msg`-spec msg?\n\n' + f'{msg}\n' + ) + bytes_data: bytes = codec.encode(msg) # supposedly the fastest says, # https://stackoverflow.com/a/54027962 diff --git a/tractor/_portal.py b/tractor/_portal.py index ac602dd5..cc9052ba 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -45,7 +45,10 @@ from ._state import ( ) from ._ipc import Channel from .log import get_logger -from .msg import NamespacePath +from .msg import ( + NamespacePath, + Return, +) from ._exceptions import ( unpack_error, NoResult, @@ -66,7 +69,8 @@ log = get_logger(__name__) # `._raise_from_no_key_in_msg()` (after tweak to # accept a `chan: Channel` arg) in key block! def _unwrap_msg( - msg: dict[str, Any], + # msg: dict[str, Any], + msg: Return, channel: Channel, hide_tb: bool = True, @@ -79,18 +83,21 @@ def _unwrap_msg( __tracebackhide__: bool = hide_tb try: - return msg['return'] - except KeyError as ke: + return msg.pld + # return msg['return'] + # except KeyError as ke: + except AttributeError as err: # internal error should never get here - assert msg.get('cid'), ( + # assert msg.get('cid'), ( + assert msg.cid, ( "Received internal error at portal?" ) raise unpack_error( msg, channel - ) from ke + ) from err class Portal: diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 310b80af..0549b0cb 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -55,12 +55,21 @@ from ._exceptions import ( TransportClosed, ) from .devx import ( - # pause, + pause, maybe_wait_for_debugger, _debug, ) from . import _state from .log import get_logger +from tractor.msg.types import ( + Start, + StartAck, + Started, + Stop, + Yield, + Return, + Error, +) if TYPE_CHECKING: @@ -89,10 +98,13 @@ async def _invoke_non_context( # TODO: can we unify this with the `context=True` impl below? if inspect.isasyncgen(coro): - await chan.send({ - 'cid': cid, - 'functype': 'asyncgen', - }) + # await chan.send({ + await chan.send( + StartAck( + cid=cid, + functype='asyncgen', + ) + ) # XXX: massive gotcha! 
If the containing scope # is cancelled and we execute the below line, # any ``ActorNursery.__aexit__()`` WON'T be @@ -112,27 +124,45 @@ async def _invoke_non_context( # to_send = await chan.recv_nowait() # if to_send is not None: # to_yield = await coro.asend(to_send) - await chan.send({ - 'yield': item, - 'cid': cid, - }) + # await chan.send({ + # # Yield() + # 'cid': cid, + # 'yield': item, + # }) + await chan.send( + Yield( + cid=cid, + pld=item, + ) + ) log.runtime(f"Finished iterating {coro}") # TODO: we should really support a proper # `StopAsyncIteration` system here for returning a final # value if desired - await chan.send({ - 'stop': True, - 'cid': cid, - }) + await chan.send( + Stop(cid=cid) + ) + # await chan.send({ + # # Stop( + # 'cid': cid, + # 'stop': True, + # }) # one way @stream func that gets treated like an async gen # TODO: can we unify this with the `context=True` impl below? elif treat_as_gen: - await chan.send({ - 'cid': cid, - 'functype': 'asyncgen', - }) + await chan.send( + StartAck( + cid=cid, + functype='asyncgen', + ) + ) + # await chan.send({ + # # StartAck() + # 'cid': cid, + # 'functype': 'asyncgen', + # }) # XXX: the async-func may spawn further tasks which push # back values like an async-generator would but must # manualy construct the response dict-packet-responses as @@ -145,10 +175,14 @@ async def _invoke_non_context( if not cs.cancelled_caught: # task was not cancelled so we can instruct the # far end async gen to tear down - await chan.send({ - 'stop': True, - 'cid': cid - }) + await chan.send( + Stop(cid=cid) + ) + # await chan.send({ + # # Stop( + # 'cid': cid, + # 'stop': True, + # }) else: # regular async function/method # XXX: possibly just a scheduled `Actor._cancel_task()` @@ -160,10 +194,17 @@ async def _invoke_non_context( # way: using the linked IPC context machinery. failed_resp: bool = False try: - await chan.send({ - 'functype': 'asyncfunc', - 'cid': cid - }) + await chan.send( + StartAck( + cid=cid, + functype='asyncfunc', + ) + ) + # await chan.send({ + # # StartAck() + # 'cid': cid, + # 'functype': 'asyncfunc', + # }) except ( trio.ClosedResourceError, trio.BrokenResourceError, @@ -197,10 +238,17 @@ async def _invoke_non_context( and chan.connected() ): try: - await chan.send({ - 'return': result, - 'cid': cid, - }) + # await chan.send({ + # # Return() + # 'cid': cid, + # 'return': result, + # }) + await chan.send( + Return( + cid=cid, + pld=result, + ) + ) except ( BrokenPipeError, trio.BrokenResourceError, @@ -381,6 +429,8 @@ async def _invoke( # XXX for .pause_from_sync()` usage we need to make sure # `greenback` is boostrapped in the subactor! await _debug.maybe_init_greenback() + # else: + # await pause() # TODO: possibly a specially formatted traceback # (not sure what typing is for this..)? @@ -493,10 +543,18 @@ async def _invoke( # a "context" endpoint type is the most general and # "least sugary" type of RPC ep with support for # bi-dir streaming B) - await chan.send({ - 'cid': cid, - 'functype': 'context', - }) + # StartAck + await chan.send( + StartAck( + cid=cid, + functype='context', + ) + ) + # await chan.send({ + # # StartAck() + # 'cid': cid, + # 'functype': 'context', + # }) # TODO: should we also use an `.open_context()` equiv # for this callee side by factoring the impl from @@ -520,10 +578,17 @@ async def _invoke( ctx._result = res # deliver final result to caller side. 
- await chan.send({ - 'return': res, - 'cid': cid - }) + await chan.send( + Return( + cid=cid, + pld=res, + ) + ) + # await chan.send({ + # # Return() + # 'cid': cid, + # 'return': res, + # }) # NOTE: this happens IFF `ctx._scope.cancel()` is # called by any of, @@ -696,7 +761,8 @@ async def try_ship_error_to_remote( try: # NOTE: normally only used for internal runtime errors # so ship to peer actor without a cid. - msg: dict = pack_error( + # msg: dict = pack_error( + msg: Error = pack_error( err, cid=cid, @@ -712,12 +778,13 @@ async def try_ship_error_to_remote( trio.BrokenResourceError, BrokenPipeError, ): - err_msg: dict = msg['error']['tb_str'] + # err_msg: dict = msg['error']['tb_str'] log.critical( 'IPC transport failure -> ' f'failed to ship error to {remote_descr}!\n\n' f'X=> {channel.uid}\n\n' - f'{err_msg}\n' + # f'{err_msg}\n' + f'{msg}\n' ) @@ -777,31 +844,6 @@ async def process_messages( with CancelScope(shield=shield) as loop_cs: task_status.started(loop_cs) async for msg in chan: - - # dedicated loop terminate sentinel - if msg is None: - - tasks: dict[ - tuple[Channel, str], - tuple[Context, Callable, trio.Event] - ] = actor._rpc_tasks.copy() - log.cancel( - f'Peer IPC channel terminated via `None` setinel msg?\n' - f'=> Cancelling all {len(tasks)} local RPC tasks..\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n' - ) - for (channel, cid) in tasks: - if channel is chan: - await actor._cancel_task( - cid, - channel, - requesting_uid=channel.uid, - - ipc_msg=msg, - ) - break - log.transport( # type: ignore f'<= IPC msg from peer: {chan.uid}\n\n' @@ -811,216 +853,294 @@ async def process_messages( f'{pformat(msg)}\n' ) - cid = msg.get('cid') - if cid: - # deliver response to local caller/waiter - # via its per-remote-context memory channel. - await actor._push_result( - chan, - cid, - msg, - ) + match msg: - log.runtime( - 'Waiting on next IPC msg from\n' - f'peer: {chan.uid}:\n' - f'|_{chan}\n' + # if msg is None: + # dedicated loop terminate sentinel + case None: - # f'last msg: {msg}\n' - ) - continue - - # process a 'cmd' request-msg upack - # TODO: impl with native `msgspec.Struct` support !! - # -[ ] implement with ``match:`` syntax? - # -[ ] discard un-authed msgs as per, - # - try: - ( - ns, - funcname, - kwargs, - actorid, - cid, - ) = msg['cmd'] - - except KeyError: - # This is the non-rpc error case, that is, an - # error **not** raised inside a call to ``_invoke()`` - # (i.e. no cid was provided in the msg - see above). - # Push this error to all local channel consumers - # (normally portals) by marking the channel as errored - assert chan.uid - exc = unpack_error(msg, chan=chan) - chan._exc = exc - raise exc - - log.runtime( - 'Handling RPC cmd from\n' - f'peer: {actorid}\n' - '\n' - f'=> {ns}.{funcname}({kwargs})\n' - ) - if ns == 'self': - if funcname == 'cancel': - func: Callable = actor.cancel - kwargs |= { - 'req_chan': chan, - } - - # don't start entire actor runtime cancellation - # if this actor is currently in debug mode! - pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete - if pdb_complete: - await pdb_complete.wait() - - # Either of `Actor.cancel()`/`.cancel_soon()` - # was called, so terminate this IPC msg - # loop, exit back out into `async_main()`, - # and immediately start the core runtime - # machinery shutdown! 
- with CancelScope(shield=True): - await _invoke( - actor, - cid, - chan, - func, - kwargs, - is_rpc=False, - ) - - log.runtime( - 'Cancelling IPC transport msg-loop with peer:\n' + tasks: dict[ + tuple[Channel, str], + tuple[Context, Callable, trio.Event] + ] = actor._rpc_tasks.copy() + log.cancel( + f'Peer IPC channel terminated via `None` setinel msg?\n' + f'=> Cancelling all {len(tasks)} local RPC tasks..\n' + f'peer: {chan.uid}\n' f'|_{chan}\n' ) - loop_cs.cancel() + for (channel, cid) in tasks: + if channel is chan: + await actor._cancel_task( + cid, + channel, + requesting_uid=channel.uid, + + ipc_msg=msg, + ) break - if funcname == '_cancel_task': - func: Callable = actor._cancel_task - - # we immediately start the runtime machinery - # shutdown - # with CancelScope(shield=True): - target_cid: str = kwargs['cid'] - kwargs |= { - # NOTE: ONLY the rpc-task-owning - # parent IPC channel should be able to - # cancel it! - 'parent_chan': chan, - 'requesting_uid': chan.uid, - 'ipc_msg': msg, - } - # TODO: remove? already have emit in meth. - # log.runtime( - # f'Rx RPC task cancel request\n' - # f'<= canceller: {chan.uid}\n' - # f' |_{chan}\n\n' - # f'=> {actor}\n' - # f' |_cid: {target_cid}\n' - # ) - try: - await _invoke( - actor, - cid, - chan, - func, - kwargs, - is_rpc=False, - ) - except BaseException: - log.exception( - 'Failed to cancel task?\n' - f'<= canceller: {chan.uid}\n' - f' |_{chan}\n\n' - f'=> {actor}\n' - f' |_cid: {target_cid}\n' - ) - continue - else: - # normally registry methods, eg. - # ``.register_actor()`` etc. - func: Callable = getattr(actor, funcname) - - else: - # complain to client about restricted modules - try: - func = actor._get_rpc_func(ns, funcname) - except ( - ModuleNotExposed, - AttributeError, - ) as err: - err_msg: dict[str, dict] = pack_error( - err, - cid=cid, - ) - await chan.send(err_msg) - continue - - # schedule a task for the requested RPC function - # in the actor's main "service nursery". - # TODO: possibly a service-tn per IPC channel for - # supervision isolation? would avoid having to - # manage RPC tasks individually in `._rpc_tasks` - # table? - log.runtime( - f'Spawning task for RPC request\n' - f'<= caller: {chan.uid}\n' - f' |_{chan}\n\n' - # TODO: maddr style repr? - # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' - # f'cid="{cid[-16:]} .."\n\n' - - f'=> {actor}\n' - f' |_cid: {cid}\n' - f' |>> {func}()\n' - ) - assert actor._service_n # wait why? do it at top? - try: - ctx: Context = await actor._service_n.start( - partial( - _invoke, - actor, - cid, + # cid = msg.get('cid') + # if cid: + case ( + StartAck(cid=cid) + | Started(cid=cid) + | Yield(cid=cid) + | Stop(cid=cid) + | Return(cid=cid) + | Error(cid=cid) + ): + # deliver response to local caller/waiter + # via its per-remote-context memory channel. + await actor._push_result( chan, - func, - kwargs, - ), - name=funcname, - ) + cid, + msg, + ) - except ( - RuntimeError, - BaseExceptionGroup, - ): - # avoid reporting a benign race condition - # during actor runtime teardown. - nursery_cancelled_before_task: bool = True - break + log.runtime( + 'Waiting on next IPC msg from\n' + f'peer: {chan.uid}:\n' + f'|_{chan}\n' - # in the lone case where a ``Context`` is not - # delivered, it's likely going to be a locally - # scoped exception from ``_invoke()`` itself. - if isinstance(err := ctx, Exception): - log.warning( - 'Task for RPC failed?' 
- f'|_ {func}()\n\n' + # f'last msg: {msg}\n' + ) + continue - f'{err}' - ) - continue + # process a 'cmd' request-msg upack + # TODO: impl with native `msgspec.Struct` support !! + # -[ ] implement with ``match:`` syntax? + # -[ ] discard un-authed msgs as per, + # + case Start( + cid=cid, + ns=ns, + func=funcname, + kwargs=kwargs, + uid=actorid, + ): + # try: + # ( + # ns, + # funcname, + # kwargs, + # actorid, + # cid, + # ) = msg['cmd'] - else: - # mark that we have ongoing rpc tasks - actor._ongoing_rpc_tasks = trio.Event() + # # TODO: put in `case Error():` right? + # except KeyError: + # # This is the non-rpc error case, that is, an + # # error **not** raised inside a call to ``_invoke()`` + # # (i.e. no cid was provided in the msg - see above). + # # Push this error to all local channel consumers + # # (normally portals) by marking the channel as errored + # assert chan.uid + # exc = unpack_error(msg, chan=chan) + # chan._exc = exc + # raise exc - # store cancel scope such that the rpc task can be - # cancelled gracefully if requested - actor._rpc_tasks[(chan, cid)] = ( - ctx, - func, - trio.Event(), - ) + log.runtime( + 'Handling RPC `Start` request from\n' + f'peer: {actorid}\n' + '\n' + f'=> {ns}.{funcname}({kwargs})\n' + ) + # case Start( + # ns='self', + # funcname='cancel', + # ): + if ns == 'self': + if funcname == 'cancel': + func: Callable = actor.cancel + kwargs |= { + 'req_chan': chan, + } + + # don't start entire actor runtime cancellation + # if this actor is currently in debug mode! + pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete + if pdb_complete: + await pdb_complete.wait() + + # Either of `Actor.cancel()`/`.cancel_soon()` + # was called, so terminate this IPC msg + # loop, exit back out into `async_main()`, + # and immediately start the core runtime + # machinery shutdown! + with CancelScope(shield=True): + await _invoke( + actor, + cid, + chan, + func, + kwargs, + is_rpc=False, + ) + + log.runtime( + 'Cancelling IPC transport msg-loop with peer:\n' + f'|_{chan}\n' + ) + loop_cs.cancel() + break + + # case Start( + # ns='self', + # funcname='_cancel_task', + # ): + if funcname == '_cancel_task': + func: Callable = actor._cancel_task + + # we immediately start the runtime machinery + # shutdown + # with CancelScope(shield=True): + target_cid: str = kwargs['cid'] + kwargs |= { + # NOTE: ONLY the rpc-task-owning + # parent IPC channel should be able to + # cancel it! + 'parent_chan': chan, + 'requesting_uid': chan.uid, + 'ipc_msg': msg, + } + # TODO: remove? already have emit in meth. + # log.runtime( + # f'Rx RPC task cancel request\n' + # f'<= canceller: {chan.uid}\n' + # f' |_{chan}\n\n' + # f'=> {actor}\n' + # f' |_cid: {target_cid}\n' + # ) + try: + await _invoke( + actor, + cid, + chan, + func, + kwargs, + is_rpc=False, + ) + except BaseException: + log.exception( + 'Failed to cancel task?\n' + f'<= canceller: {chan.uid}\n' + f' |_{chan}\n\n' + f'=> {actor}\n' + f' |_cid: {target_cid}\n' + ) + continue + + # case Start( + # ns='self', + # funcname='register_actor', + # ): + else: + # normally registry methods, eg. + # ``.register_actor()`` etc. 
+ func: Callable = getattr(actor, funcname) + + # case Start( + # ns=str(), + # funcname=funcname, + # ): + else: + # complain to client about restricted modules + try: + func = actor._get_rpc_func(ns, funcname) + except ( + ModuleNotExposed, + AttributeError, + ) as err: + err_msg: dict[str, dict] = pack_error( + err, + cid=cid, + ) + await chan.send(err_msg) + continue + + # schedule a task for the requested RPC function + # in the actor's main "service nursery". + # TODO: possibly a service-tn per IPC channel for + # supervision isolation? would avoid having to + # manage RPC tasks individually in `._rpc_tasks` + # table? + log.runtime( + f'Spawning task for RPC request\n' + f'<= caller: {chan.uid}\n' + f' |_{chan}\n\n' + # TODO: maddr style repr? + # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' + # f'cid="{cid[-16:]} .."\n\n' + + f'=> {actor}\n' + f' |_cid: {cid}\n' + f' |>> {func}()\n' + ) + assert actor._service_n # wait why? do it at top? + try: + ctx: Context = await actor._service_n.start( + partial( + _invoke, + actor, + cid, + chan, + func, + kwargs, + ), + name=funcname, + ) + + except ( + RuntimeError, + BaseExceptionGroup, + ): + # avoid reporting a benign race condition + # during actor runtime teardown. + nursery_cancelled_before_task: bool = True + break + + # in the lone case where a ``Context`` is not + # delivered, it's likely going to be a locally + # scoped exception from ``_invoke()`` itself. + if isinstance(err := ctx, Exception): + log.warning( + 'Task for RPC failed?' + f'|_ {func}()\n\n' + + f'{err}' + ) + continue + + else: + # mark that we have ongoing rpc tasks + actor._ongoing_rpc_tasks = trio.Event() + + # store cancel scope such that the rpc task can be + # cancelled gracefully if requested + actor._rpc_tasks[(chan, cid)] = ( + ctx, + func, + trio.Event(), + ) + + case Error()|_: + # This is the non-rpc error case, that is, an + # error **not** raised inside a call to ``_invoke()`` + # (i.e. no cid was provided in the msg - see above). 
+ # Push this error to all local channel consumers + # (normally portals) by marking the channel as errored + log.exception( + f'Unhandled IPC msg:\n\n' + f'{msg}\n' + ) + assert chan.uid + exc = unpack_error( + msg, + chan=chan, + ) + chan._exc = exc + raise exc log.runtime( 'Waiting on next IPC msg from\n' diff --git a/tractor/_runtime.py b/tractor/_runtime.py index e2d78d51..3bafada1 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -91,6 +91,23 @@ from ._rpc import ( process_messages, try_ship_error_to_remote, ) +from tractor.msg import ( + types as msgtypes, + pretty_struct, +) +# from tractor.msg.types import ( +# Aid, +# SpawnSpec, +# Start, +# StartAck, +# Started, +# Yield, +# Stop, +# Return, +# Error, +# ) + + if TYPE_CHECKING: @@ -147,6 +164,7 @@ class Actor: # Information about `__main__` from parent _parent_main_data: dict[str, str] _parent_chan_cs: CancelScope|None = None + _spawn_spec: SpawnSpec|None = None # syncs for setup/teardown sequences _server_down: trio.Event|None = None @@ -537,7 +555,8 @@ class Actor: f'{pformat(msg)}\n' ) - cid = msg.get('cid') + # cid: str|None = msg.get('cid') + cid: str|None = msg.cid if cid: # deliver response to local caller/waiter await self._push_result( @@ -889,29 +908,44 @@ class Actor: f'=> {ns}.{func}({kwargs})\n' ) await chan.send( - {'cmd': ( - ns, - func, - kwargs, - self.uid, - cid, - )} + msgtypes.Start( + ns=ns, + func=func, + kwargs=kwargs, + uid=self.uid, + cid=cid, + ) ) + # {'cmd': ( + # ns, + # func, + # kwargs, + # self.uid, + # cid, + # )} + # ) # Wait on first response msg and validate; this should be # immediate. - first_msg: dict = await ctx._recv_chan.receive() - functype: str = first_msg.get('functype') + # first_msg: dict = await ctx._recv_chan.receive() + # functype: str = first_msg.get('functype') - if 'error' in first_msg: + first_msg: msgtypes.StartAck = await ctx._recv_chan.receive() + try: + functype: str = first_msg.functype + except AttributeError: raise unpack_error(first_msg, chan) + # if 'error' in first_msg: + # raise unpack_error(first_msg, chan) - elif functype not in ( + if functype not in ( 'asyncfunc', 'asyncgen', 'context', ): - raise ValueError(f"{first_msg} is an invalid response packet?") + raise ValueError( + f'{first_msg} is an invalid response packet?' + ) ctx._remote_func_type = functype return ctx @@ -944,24 +978,36 @@ class Actor: await self._do_handshake(chan) accept_addrs: list[tuple[str, int]]|None = None - if self._spawn_method == "trio": - # Receive runtime state from our parent - parent_data: dict[str, Any] - parent_data = await chan.recv() - log.runtime( - 'Received state from parent:\n\n' - # TODO: eventually all these msgs as - # `msgspec.Struct` with a special mode that - # pformats them in multi-line mode, BUT only - # if "trace"/"util" mode is enabled? - f'{pformat(parent_data)}\n' - ) - accept_addrs: list[tuple[str, int]] = parent_data.pop('bind_addrs') - rvs = parent_data.pop('_runtime_vars') + if self._spawn_method == "trio": + + # Receive runtime state from our parent + # parent_data: dict[str, Any] + # parent_data = await chan.recv() + + # TODO: maybe we should just wrap this directly + # in a `Actor.spawn_info: SpawnInfo` struct? + spawnspec: msgtypes.SpawnSpec = await chan.recv() + self._spawn_spec = spawnspec + + # TODO: eventually all these msgs as + # `msgspec.Struct` with a special mode that + # pformats them in multi-line mode, BUT only + # if "trace"/"util" mode is enabled? 
+ log.runtime( + 'Received runtime spec from parent:\n\n' + f'{pformat(spawnspec)}\n' + ) + # accept_addrs: list[tuple[str, int]] = parent_data.pop('bind_addrs') + accept_addrs: list[tuple[str, int]] = spawnspec.bind_addrs + + # rvs = parent_data.pop('_runtime_vars') + rvs = spawnspec._runtime_vars if rvs['_debug_mode']: try: - log.info('Enabling `stackscope` traces on SIGUSR1') + log.info( + 'Enabling `stackscope` traces on SIGUSR1' + ) from .devx import enable_stack_on_sig enable_stack_on_sig() except ImportError: @@ -969,28 +1015,40 @@ class Actor: '`stackscope` not installed for use in debug mode!' ) - log.runtime(f"Runtime vars are: {rvs}") + log.runtime(f'Runtime vars are: {rvs}') rvs['_is_root'] = False _state._runtime_vars.update(rvs) - for attr, value in parent_data.items(): - if ( - attr == 'reg_addrs' - and value - ): - # XXX: ``msgspec`` doesn't support serializing tuples - # so just cash manually here since it's what our - # internals expect. - # TODO: we don't really NEED these as - # tuples so we can probably drop this - # casting since apparently in python lists - # are "more efficient"? - self.reg_addrs = [tuple(val) for val in value] + # XXX: ``msgspec`` doesn't support serializing tuples + # so just cash manually here since it's what our + # internals expect. + # + self.reg_addrs = [ + # TODO: we don't really NEED these as tuples? + # so we can probably drop this casting since + # apparently in python lists are "more + # efficient"? + tuple(val) + for val in spawnspec.reg_addrs + ] - else: - setattr(self, attr, value) + # for attr, value in parent_data.items(): + for _, attr, value in pretty_struct.iter_fields( + spawnspec, + ): + setattr(self, attr, value) + # if ( + # attr == 'reg_addrs' + # and value + # ): + # self.reg_addrs = [tuple(val) for val in value] + # else: + # setattr(self, attr, value) - return chan, accept_addrs + return ( + chan, + accept_addrs, + ) except OSError: # failed to connect log.warning( @@ -1432,7 +1490,7 @@ class Actor: self, chan: Channel - ) -> tuple[str, str]: + ) -> msgtypes.Aid: ''' Exchange `(name, UUIDs)` identifiers as the first communication step with any (peer) remote `Actor`. @@ -1441,14 +1499,27 @@ class Actor: "actor model" parlance. 
''' - await chan.send(self.uid) - value: tuple = await chan.recv() - uid: tuple[str, str] = (str(value[0]), str(value[1])) + name, uuid = self.uid + await chan.send( + msgtypes.Aid( + name=name, + uuid=uuid, + ) + ) + aid: msgtypes.Aid = await chan.recv() + chan.aid = aid + + uid: tuple[str, str] = ( + # str(value[0]), + # str(value[1]) + aid.name, + aid.uuid, + ) if not isinstance(uid, tuple): raise ValueError(f"{uid} is not a valid uid?!") - chan.uid = str(uid[0]), str(uid[1]) + chan.uid = uid return uid def is_infected_aio(self) -> bool: @@ -1508,7 +1579,8 @@ async def async_main( # because we're running in mp mode if ( set_accept_addr_says_rent - and set_accept_addr_says_rent is not None + and + set_accept_addr_says_rent is not None ): accept_addrs = set_accept_addr_says_rent diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 741a2f87..4715bd1a 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -49,6 +49,9 @@ from tractor._portal import Portal from tractor._runtime import Actor from tractor._entry import _mp_main from tractor._exceptions import ActorFailure +from tractor.msg.types import ( + SpawnSpec, +) if TYPE_CHECKING: @@ -489,14 +492,25 @@ async def trio_proc( portal, ) - # send additional init params - await chan.send({ - '_parent_main_data': subactor._parent_main_data, - 'enable_modules': subactor.enable_modules, - 'reg_addrs': subactor.reg_addrs, - 'bind_addrs': bind_addrs, - '_runtime_vars': _runtime_vars, - }) + # send a "spawning specification" which configures the + # initial runtime state of the child. + await chan.send( + SpawnSpec( + _parent_main_data=subactor._parent_main_data, + enable_modules=subactor.enable_modules, + reg_addrs=subactor.reg_addrs, + bind_addrs=bind_addrs, + _runtime_vars=_runtime_vars, + ) + ) + + # await chan.send({ + # '_parent_main_data': subactor._parent_main_data, + # 'enable_modules': subactor.enable_modules, + # 'reg_addrs': subactor.reg_addrs, + # 'bind_addrs': bind_addrs, + # '_runtime_vars': _runtime_vars, + # }) # track subactor in current nursery curr_actor = current_actor() diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 90c33d31..941cfe8d 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -43,6 +43,11 @@ from .trionics import ( broadcast_receiver, BroadcastReceiver, ) +from tractor.msg import ( + Stop, + Yield, + Error, +) if TYPE_CHECKING: from ._context import Context @@ -94,21 +99,25 @@ class MsgStream(trio.abc.Channel): self, allow_msg_keys: list[str] = ['yield'], ): - msg: dict = self._rx_chan.receive_nowait() + # msg: dict = self._rx_chan.receive_nowait() + msg: Yield|Stop = self._rx_chan.receive_nowait() for ( i, key, ) in enumerate(allow_msg_keys): try: - return msg[key] - except KeyError as kerr: + # return msg[key] + return msg.pld + # except KeyError as kerr: + except AttributeError as attrerr: if i < (len(allow_msg_keys) - 1): continue _raise_from_no_key_in_msg( ctx=self._ctx, msg=msg, - src_err=kerr, + # src_err=kerr, + src_err=attrerr, log=log, expect_key=key, stream=self, @@ -148,18 +157,22 @@ class MsgStream(trio.abc.Channel): src_err: Exception|None = None # orig tb try: try: - msg = await self._rx_chan.receive() - return msg['yield'] + msg: Yield = await self._rx_chan.receive() + # return msg['yield'] + return msg.pld - except KeyError as kerr: - src_err = kerr + # except KeyError as kerr: + except AttributeError as attrerr: + # src_err = kerr + src_err = attrerr # NOTE: may raise any of the below error types # includg EoC when a 'stop' msg is found. 
_raise_from_no_key_in_msg( ctx=self._ctx, msg=msg, - src_err=kerr, + # src_err=kerr, + src_err=attrerr, log=log, expect_key='yield', stream=self, @@ -514,11 +527,18 @@ class MsgStream(trio.abc.Channel): raise self._closed try: + # await self._ctx.chan.send( + # payload={ + # 'yield': data, + # 'cid': self._ctx.cid, + # }, + # # hide_tb=hide_tb, + # ) await self._ctx.chan.send( - payload={ - 'yield': data, - 'cid': self._ctx.cid, - }, + payload=Yield( + cid=self._ctx.cid, + pld=data, + ), # hide_tb=hide_tb, ) except ( diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 255b1dbd..26155b22 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -935,6 +935,9 @@ async def _pause( # ``breakpoint()`` was awaited and begin handling stdio. log.debug('Entering sync world of the `pdb` REPL..') try: + # log.critical( + # f'stack len: {len(pdb.stack)}\n' + # ) debug_func( actor, pdb, -- 2.34.1 From 40cba519092d5ad28ccec380219d70e92d8452de Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Apr 2024 14:32:16 -0400 Subject: [PATCH 204/378] Woops, only pack `Error(cid=cid)` if input is not `None` --- tractor/_exceptions.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 7deda9d2..9c1dc36d 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -498,7 +498,7 @@ def pack_error( tb: str|None = None, cid: str|None = None, -) -> Error|dict[str, dict]: +) -> Error: ''' Create an "error message" which boxes a locally caught exception's meta-data and encodes it for wire transport via an @@ -561,19 +561,10 @@ def pack_error( # content's `.msgdata`). error_msg['tb_str'] = tb_str - # Error() - # pkt: dict = { - # 'error': error_msg, - # } - pkt: Error = Error( - cid=cid, - **error_msg, - # TODO: just get rid of `.pld` on this msg? - ) - # if cid: - # pkt['cid'] = cid + if cid is not None: + error_msg['cid'] = cid - return pkt + return Error(**error_msg) def unpack_error( -- 2.34.1 From a65e1e7a88aacc5e6ed6adc62302c65a0ebd11f3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 2 Apr 2024 14:34:08 -0400 Subject: [PATCH 205/378] TOSQUASH f2ce4a3, timeout bump --- tests/test_context_stream_semantics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index cc7f402f..06a7f8c9 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -873,7 +873,7 @@ def test_one_end_stream_not_opened( enable_modules=[__name__], ) - with trio.fail_after(0.8): + with trio.fail_after(1): async with portal.open_context( entrypoint, ) as (ctx, sent): -- 2.34.1 From 70ab60ce7cc9e367d5832364be07369a28e30e3b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 3 Apr 2024 09:45:03 -0400 Subject: [PATCH 206/378] Flip default codec to our `Msg`-spec Yes, this is "the switch" and will likely cause the test suite to bail until a few more fixes some in. Tweaked a couple `.msg` pkg exports: - remove `__spec__` (used by modules) and change it to `__msg_types: lists[Msg]` as well as add a new `__msg_spec__: TypeAlias`, being the default `Any` paramed spec. - tweak the naming of `msg.types` lists of runtime vs payload msgs to: `._runtime_msgs` and `._payload_msgs`. - just build `__msg_types__` out of the above 2 lists. 
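As a rough aside-sketch of the idea being switched on here (using simplified stand-in structs rather than the real `tractor.msg.types` defs, and assuming py3.11+ star-unpacking in subscripts plus `msgspec`'s tagged-union support): aggregate the msg classes into a list, splat that into a `Union` type-alias, and hand the union to a `msgpack` decoder so that only in-spec msgs validate off the wire; anything else raises a `ValidationError`, which is the hook point the transport layer uses to raise `MsgTypeError`s.

    from typing import TypeAlias, Union
    import msgspec

    # simplified stand-ins, NOT the real `tractor.msg.types` defs
    class Start(msgspec.Struct, tag=True):
        cid: str
        ns: str
        func: str

    class Stop(msgspec.Struct, tag=True):
        cid: str

    # aggregate the class set, then splat it into a type-alias union
    # (same shape as `__msg_types__` -> `__msg_spec__`)
    _msg_types: list[type[msgspec.Struct]] = [Start, Stop]
    MsgSpec: TypeAlias = Union[*_msg_types]

    enc = msgspec.msgpack.Encoder()
    dec = msgspec.msgpack.Decoder(type=MsgSpec)

    # in-spec msgs round-trip and come back as the right struct type,
    wire: bytes = enc.encode(Stop(cid='1337'))
    assert isinstance(dec.decode(wire), Stop)

    # anything outside the spec is rejected at decode time.
    try:
        dec.decode(enc.encode({'not': 'a msg'}))
    except msgspec.ValidationError:
        print('non-`Msg`-spec payload rejected B)')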
--- tractor/msg/__init__.py | 12 ++++++-- tractor/msg/_codec.py | 4 +-- tractor/msg/types.py | 61 +++++++++++++++++++++++------------------ 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index d8f37477..fe965e0b 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -18,6 +18,10 @@ Built-in messaging patterns, types, APIs and helpers. ''' +from typing import ( + Union, + TypeAlias, +) from .ptr import ( NamespacePath as NamespacePath, ) @@ -50,6 +54,10 @@ from .types import ( Error as Error, - # full msg spec set - __spec__ as __spec__, + # full msg class set from above as list + __msg_types__ as __msg_types__, ) +# TODO: use new type declaration syntax for msg-type-spec +# https://docs.python.org/3/library/typing.html#type-aliases +# https://docs.python.org/3/reference/simple_stmts.html#type +__msg_spec__: TypeAlias = Union[*__msg_types__] diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 32a58a56..56f24d62 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -438,8 +438,8 @@ _ctxvar_MsgCodec: MsgCodec = RunVar( 'msgspec_codec', # TODO: move this to our new `Msg`-spec! - default=_def_msgspec_codec, - # default=_def_tractor_codec, + # default=_def_msgspec_codec, + default=_def_tractor_codec, ) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index a81473d7..b246cb61 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -26,7 +26,7 @@ from __future__ import annotations import types from typing import ( Any, - Callable, + # Callable, Generic, Literal, Type, @@ -340,50 +340,54 @@ class Error( # class Overrun(Msg): # cid: str +_runtime_msgs: list[Msg] = [ -# built-in SC shuttle protocol msg type set in -# approx order of the IPC txn-state spaces. -__spec__: list[Msg] = [ - - # identity handshake + # identity handshake on first IPC `Channel` contact. Aid, - # spawn specification from parent + # parent-to-child spawn specification passed as 2nd msg after + # handshake ONLY after child connects back to parent. SpawnSpec, # inter-actor RPC initiation - Start, - StartAck, + Start, # schedule remote task-as-func + StartAck, # ack the schedule request - # no-outcome-yet IAC (inter-actor-communication) - Started, - Yield, + # emission from `MsgStream.aclose()` Stop, - # termination outcomes - Return, + # box remote errors, normally subtypes + # of `RemoteActorError`. Error, ] -_runtime_spec_msgs: list[Msg] = [ - Aid, - SpawnSpec, - Start, - StartAck, - Stop, - Error, -] -_payload_spec_msgs: list[Msg] = [ +# the no-outcome-yet IAC (inter-actor-communication) sub-set which +# can be `Msg.pld` payload field type-limited by application code +# using `apply_codec()` and `limit_msg_spec()`. +_payload_msgs: list[Msg] = [ + # first from `Context.started()` Started, + + # any sent via `MsgStream.send()` Yield, + + # the final value returned from a `@context` decorated + # IPC endpoint. Return, ] +# built-in SC shuttle protocol msg type set in +# approx order of the IPC txn-state spaces. 
+__msg_types__: list[Msg] = ( + _runtime_msgs + + + _payload_msgs +) + def mk_msg_spec( payload_type_union: Union[Type] = Any, - # boxing_msg_set: list[Msg] = _payload_spec_msgs, spec_build_method: Literal[ 'indexed_generics', # works 'defstruct', @@ -424,12 +428,12 @@ def mk_msg_spec( defs_msg_types: list[Msg] = [] nc_msg_types: list[Msg] = [] - for msgtype in __spec__: + for msgtype in __msg_types__: # for the NON-payload (user api) type specify-able # msgs types, we simply aggregate the def as is # for inclusion in the output type `Union`. - if msgtype not in _payload_spec_msgs: + if msgtype not in _payload_msgs: ipc_msg_types.append(msgtype) continue @@ -535,6 +539,11 @@ def mk_msg_spec( # TODO: make something similar to this inside `._codec` such that # user can just pass a type table of some sort? +# -[ ] we would need to decode all msgs to `pretty_struct.Struct` +# and then call `.to_dict()` on them? +# -[ ] we're going to need to re-impl all the stuff changed in the +# runtime port such that it can handle dicts or `Msg`s? +# # def mk_dict_msg_codec_hooks() -> tuple[Callable, Callable]: # ''' # Deliver a `enc_hook()`/`dec_hook()` pair which does -- 2.34.1 From 0fcd424d57518dff420c374c8ee3fc54c985f11a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 3 Apr 2024 09:50:22 -0400 Subject: [PATCH 207/378] Start a new `._testing.fault_simulation` Since I needed the `break_ipc()` helper from the `examples/advanced_faults/ipc_failure_during_stream.py` used in the `test_advanced_faults` suite, might as well move it into a pkg-wide importable module. Also changed the default break method to be `socket_close` which just calls `Stream.socket.close()` underneath in `trio`. Also tweak that example to not keep sending after the stream has been broken since with new `trio` that will raise `ClosedResourceError` and in the wrapping test we generally speaking want to see a hang and then cancel via simulated user sent SIGINT/ctl-c. --- .../ipc_failure_during_stream.py | 86 ++++------------- tractor/_testing/__init__.py | 3 + tractor/_testing/fault_simulation.py | 92 +++++++++++++++++++ 3 files changed, 112 insertions(+), 69 deletions(-) create mode 100644 tractor/_testing/fault_simulation.py diff --git a/examples/advanced_faults/ipc_failure_during_stream.py b/examples/advanced_faults/ipc_failure_during_stream.py index 9dca92b1..60b28c3e 100644 --- a/examples/advanced_faults/ipc_failure_during_stream.py +++ b/examples/advanced_faults/ipc_failure_during_stream.py @@ -21,75 +21,12 @@ import trio import pytest -async def break_ipc( - stream: MsgStream, - method: str|None = None, - pre_close: bool = False, - - def_method: str = 'eof', - -) -> None: - ''' - XXX: close the channel right after an error is raised - purposely breaking the IPC transport to make sure the parent - doesn't get stuck in debug or hang on the connection join. - this more or less simulates an infinite msg-receive hang on - the other end. 
- - ''' - # close channel via IPC prot msging before - # any transport breakage - if pre_close: - await stream.aclose() - - method: str = method or def_method - print( - '#################################\n' - 'Simulating CHILD-side IPC BREAK!\n' - f'method: {method}\n' - f'pre `.aclose()`: {pre_close}\n' - '#################################\n' - ) - - match method: - case 'trans_aclose': - await stream._ctx.chan.transport.stream.aclose() - - case 'eof': - await stream._ctx.chan.transport.stream.send_eof() - - case 'msg': - await stream._ctx.chan.send(None) - - # TODO: the actual real-world simulated cases like - # transport layer hangs and/or lower layer 2-gens type - # scenarios.. - # - # -[ ] already have some issues for this general testing - # area: - # - https://github.com/goodboy/tractor/issues/97 - # - https://github.com/goodboy/tractor/issues/124 - # - PR from @guille: - # https://github.com/goodboy/tractor/pull/149 - # case 'hang': - # TODO: framework research: - # - # - https://github.com/GuoTengda1993/pynetem - # - https://github.com/shopify/toxiproxy - # - https://manpages.ubuntu.com/manpages/trusty/man1/wirefilter.1.html - - case _: - raise RuntimeError( - f'IPC break method unsupported: {method}' - ) - - async def break_ipc_then_error( stream: MsgStream, break_ipc_with: str|None = None, pre_close: bool = False, ): - await break_ipc( + await _testing.break_ipc( stream=stream, method=break_ipc_with, pre_close=pre_close, @@ -121,6 +58,7 @@ async def recv_and_spawn_net_killers( Receive stream msgs and spawn some IPC killers mid-stream. ''' + broke_ipc: bool = False await ctx.started() async with ( ctx.open_stream() as stream, @@ -128,13 +66,17 @@ async def recv_and_spawn_net_killers( ): async for i in stream: print(f'child echoing {i}') - await stream.send(i) + if not broke_ipc: + await stream.send(i) + else: + await trio.sleep(0.01) if ( break_ipc_after and i >= break_ipc_after ): + broke_ipc = True n.start_soon( iter_ipc_stream, stream, @@ -242,14 +184,13 @@ async def main( # await stream._ctx.chan.send(None) # await stream._ctx.chan.transport.stream.send_eof() await stream._ctx.chan.transport.stream.aclose() - ipc_break_sent = True # it actually breaks right here in the - # mp_spawn/forkserver backends and thus the zombie - # reaper never even kicks in? - print(f'parent sending {i}') + # mp_spawn/forkserver backends and thus the + # zombie reaper never even kicks in? try: + print(f'parent sending {i}') await stream.send(i) except ContextCancelled as ctxc: print( @@ -262,6 +203,13 @@ async def main( # TODO: is this needed or no? raise + except trio.ClosedResourceError: + # NOTE: don't send if we already broke the + # connection to avoid raising a closed-error + # such that we drop through to the ctl-c + # mashing by user. + await trio.sleep(0.01) + # timeout: int = 1 # with trio.move_on_after(timeout) as cs: async with stuff_hangin_ctlc() as timeout: diff --git a/tractor/_testing/__init__.py b/tractor/_testing/__init__.py index 876c87e8..fd79fe20 100644 --- a/tractor/_testing/__init__.py +++ b/tractor/_testing/__init__.py @@ -26,6 +26,9 @@ import tractor from .pytest import ( tractor_test as tractor_test ) +from .fault_simulation import ( + break_ipc as break_ipc, +) def repodir() -> pathlib.Path: diff --git a/tractor/_testing/fault_simulation.py b/tractor/_testing/fault_simulation.py new file mode 100644 index 00000000..fbd97bf5 --- /dev/null +++ b/tractor/_testing/fault_simulation.py @@ -0,0 +1,92 @@ +# tractor: structured concurrent "actors". 
+# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +`pytest` utils helpers and plugins for testing `tractor`'s runtime +and applications. + +''' + +from tractor import ( + MsgStream, +) + +async def break_ipc( + stream: MsgStream, + method: str|None = None, + pre_close: bool = False, + + def_method: str = 'socket_close', + +) -> None: + ''' + XXX: close the channel right after an error is raised + purposely breaking the IPC transport to make sure the parent + doesn't get stuck in debug or hang on the connection join. + this more or less simulates an infinite msg-receive hang on + the other end. + + ''' + # close channel via IPC prot msging before + # any transport breakage + if pre_close: + await stream.aclose() + + method: str = method or def_method + print( + '#################################\n' + 'Simulating CHILD-side IPC BREAK!\n' + f'method: {method}\n' + f'pre `.aclose()`: {pre_close}\n' + '#################################\n' + ) + + match method: + case 'socket_close': + await stream._ctx.chan.transport.stream.aclose() + + case 'socket_eof': + # NOTE: `trio` does the following underneath this + # call in `src/trio/_highlevel_socket.py`: + # `Stream.socket.shutdown(tsocket.SHUT_WR)` + await stream._ctx.chan.transport.stream.send_eof() + + # TODO: remove since now this will be invalid with our + # new typed msg spec? + # case 'msg': + # await stream._ctx.chan.send(None) + + # TODO: the actual real-world simulated cases like + # transport layer hangs and/or lower layer 2-gens type + # scenarios.. 
+ # + # -[ ] already have some issues for this general testing + # area: + # - https://github.com/goodboy/tractor/issues/97 + # - https://github.com/goodboy/tractor/issues/124 + # - PR from @guille: + # https://github.com/goodboy/tractor/pull/149 + # case 'hang': + # TODO: framework research: + # + # - https://github.com/GuoTengda1993/pynetem + # - https://github.com/shopify/toxiproxy + # - https://manpages.ubuntu.com/manpages/trusty/man1/wirefilter.1.html + + case _: + raise RuntimeError( + f'IPC break method unsupported: {method}' + ) -- 2.34.1 From 5b551dd9fabf235d7adae44376b00d9de65f4cc6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 10:53:07 -0400 Subject: [PATCH 208/378] Use `._testing.break_ipc()` in final advanced fault test child ctx --- tests/test_advanced_faults.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_advanced_faults.py b/tests/test_advanced_faults.py index 5f73ac6c..45c0aa36 100644 --- a/tests/test_advanced_faults.py +++ b/tests/test_advanced_faults.py @@ -13,6 +13,7 @@ import trio import tractor from tractor._testing import ( examples_dir, + break_ipc, ) @@ -93,7 +94,8 @@ def test_ipc_channel_break_during_stream( expect_final_exc = trio.ClosedResourceError mod: ModuleType = import_path( - examples_dir() / 'advanced_faults' / 'ipc_failure_during_stream.py', + examples_dir() / 'advanced_faults' + / 'ipc_failure_during_stream.py', root=examples_dir(), ) @@ -224,9 +226,15 @@ async def break_ipc_after_started( ) -> None: await ctx.started() async with ctx.open_stream() as stream: - await stream.aclose() - await trio.sleep(0.2) - await ctx.chan.send(None) + + # TODO: make a test which verifies the error + # for this, i.e. raises a `MsgTypeError` + # await ctx.chan.send(None) + + await break_ipc( + stream=stream, + pre_close=True, + ) print('child broke IPC and terminating') -- 2.34.1 From 10c98946bd31a15a73da33b84ec33d249a902eb4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 11:36:09 -0400 Subject: [PATCH 209/378] Extend codec test to for msg-spec parameterizing Set a diff `Msg.pld` spec per test and then send multiple types to a child actor making sure the child can only send certain types over a stream and fails with validation or decode errors ow. The test is also param-ed both with and without hooks demonstrating how a custom type, `NamespacePath`, needs them for effective use. The subactor IPC context child is passed a `expect_ipc_send: dict` which relays the values along with their expected `.send()`-ability. Deats on technical refinements: ------ - ------ - added a `iter_maybe_sends()` send-value-as-msg-auditor and predicate generator (literally) so as to be able to pre-determine if given the current codec and `send_values` which values are expected to be IPC transmittable. - as per ^, the diff value-msgs are first round-tripped inside a `Started` msg using the configured codec in the parent/root actor before bothering with using IPC primitives + a subactor; this is how the `expect_ipc_send` table is generated initially. - for serializing the specs (`Union[Type]`s as required by `msgspec`), added a pair of codec hooks: `enc/dec_type_union()` (that ideally we move into a `.msg` submod eventually) which code the type-values as a `list[str]` of names. 
- the `dec_` hook had to be modified to NOT raise an error when an invalid/unhandled value arrives, this is because we do NOT want the RPC msg handling loop to raise on the `async for msg in chan:` and instead prefer to ignore and warn (for now, but eventually respond with error msg - see notes in hook body) these msgs when sent during a streaming phase; `Context.started()` will however error on a bad input for the current msg-spec since it is part of the "cheap" dialog (again see notes in `._context`) wherein the `Started` msg is always roundtripped prior to `Channel.send()` to guarantee the child adheres to its own spec. - tossed in lotsa `print()`s for console groking of the run progress. Further notes on typed-msging breaking cancellation: ------ - ------ - turns out since the runtime's cancellation implementation, being done with `Actor.cancel()` methods and friends will actually break when a stringent spec is applied (eg. a single type-spec) since the return values from said methods are generally `bool`s.. - this means we do indeed need special handling of "runtime RPC method invocations" since ideally a user's msg-spec choices do not break core functionality on them XD => The obvi solution is to add a/some special sub-`Msg` types for such cases, possibly just a `RuntimeReturn(Return)` type that will always include a `.pld: bool` for these cancel methods such that their results are always handled without msg type errors. More to come on a (hopefully) elegant solution to that last bit! --- tests/test_caps_based_msging.py | 648 +++++++++++++++++++++++--------- 1 file changed, 462 insertions(+), 186 deletions(-) diff --git a/tests/test_caps_based_msging.py b/tests/test_caps_based_msging.py index b42d9e35..acc1f307 100644 --- a/tests/test_caps_based_msging.py +++ b/tests/test_caps_based_msging.py @@ -5,6 +5,7 @@ Low-level functional audits for our B~) ''' +import typing from typing import ( Any, Type, @@ -23,7 +24,9 @@ from msgspec import ( ValidationError, ) import pytest + import tractor +from tractor import _state from tractor.msg import ( _codec, _ctxvar_MsgCodec, @@ -34,12 +37,9 @@ from tractor.msg import ( apply_codec, current_codec, ) -from tractor.msg import ( - types, -) -from tractor import _state from tractor.msg.types import ( - # PayloadT, + _payload_msgs, + log, Msg, Started, mk_msg_spec, @@ -62,17 +62,14 @@ def test_msg_spec_xor_pld_spec(): ) -def ex_func(*args): - print(f'ex_func({args})') - - def mk_custom_codec( pld_spec: Union[Type]|Any, + add_hooks: bool, ) -> MsgCodec: ''' Create custom `msgpack` enc/dec-hooks and set a `Decoder` - which only loads `NamespacePath` types. + which only loads `pld_spec` (like `NamespacePath`) types. 
''' uid: tuple[str, str] = tractor.current_actor().uid @@ -83,61 +80,75 @@ def mk_custom_codec( # https://jcristharif.com/msgspec/extending.html#mapping-to-from-native-types def enc_nsp(obj: Any) -> Any: + print(f'{uid} ENC HOOK') match obj: case NamespacePath(): print( f'{uid}: `NamespacePath`-Only ENCODE?\n' - f'type: {type(obj)}\n' - f'obj: {obj}\n' + f'obj-> `{obj}`: {type(obj)}\n' ) - + # if type(obj) != NamespacePath: + # breakpoint() return str(obj) - logmsg: str = ( - f'{uid}: Encoding `{obj}: <{type(obj)}>` not supported' - f'type: {type(obj)}\n' - f'obj: {obj}\n' + print( + f'{uid}\n' + 'CUSTOM ENCODE\n' + f'obj-arg-> `{obj}`: {type(obj)}\n' + ) + logmsg: str = ( + f'{uid}\n' + 'FAILED ENCODE\n' + f'obj-> `{obj}: {type(obj)}`\n' ) - print(logmsg) raise NotImplementedError(logmsg) def dec_nsp( - type: Type, + obj_type: Type, obj: Any, ) -> Any: print( - f'{uid}: CUSTOM DECODE\n' - f'input type: {type}\n' - f'obj: {obj}\n' - f'type(obj): `{type(obj).__class__}`\n' + f'{uid}\n' + 'CUSTOM DECODE\n' + f'type-arg-> {obj_type}\n' + f'obj-arg-> `{obj}`: {type(obj)}\n' ) nsp = None - # This never seems to hit? - if isinstance(obj, Msg): - print(f'Msg type: {obj}') - if ( - type is NamespacePath + obj_type is NamespacePath and isinstance(obj, str) and ':' in obj ): nsp = NamespacePath(obj) + # TODO: we could built a generic handler using + # JUST matching the obj_type part? + # nsp = obj_type(obj) if nsp: print(f'Returning NSP instance: {nsp}') return nsp logmsg: str = ( - f'{uid}: Decoding `{obj}: <{type(obj)}>` not supported' - f'input type: {type(obj)}\n' - f'obj: {obj}\n' - f'type(obj): `{type(obj).__class__}`\n' + f'{uid}\n' + 'FAILED DECODE\n' + f'type-> {obj_type}\n' + f'obj-arg-> `{obj}`: {type(obj)}\n' ) - print(logmsg) - raise NotImplementedError(logmsg) - + # TODO: figure out the ignore subsys for this! + # -[ ] option whether to defense-relay backc the msg + # inside an `Invalid`/`Ignore` + # -[ ] how to make this handling pluggable such that a + # `Channel`/`MsgTransport` can intercept and process + # back msgs either via exception handling or some other + # signal? + log.warning(logmsg) + # NOTE: this delivers the invalid + # value up to `msgspec`'s decoding + # machinery for error raising. + return obj + # raise NotImplementedError(logmsg) nsp_codec: MsgCodec = mk_codec( ipc_pld_spec=pld_spec, @@ -151,97 +162,32 @@ def mk_custom_codec( # `Any`-decoded-pld the enc has no knowledge (by default) # how to enc `NamespacePath` (nsp), so we add a custom # hook to do that ALWAYS. - enc_hook=enc_nsp, + enc_hook=enc_nsp if add_hooks else None, # XXX NOTE: pretty sure this is mutex with the `type=` to # `Decoder`? so it won't work in tandem with the # `ipc_pld_spec` passed above? - dec_hook=dec_nsp, + dec_hook=dec_nsp if add_hooks else None, ) return nsp_codec -@tractor.context -async def send_back_nsp( - ctx: Context, - expect_debug: bool, - use_any_spec: bool, - -) -> None: - ''' - Setup up a custom codec to load instances of `NamespacePath` - and ensure we can round trip a func ref with our parent. 
- - ''' - # debug mode sanity check - assert expect_debug == _state.debug_mode() - - # task: trio.Task = trio.lowlevel.current_task() - - # TreeVar - # curr_codec = _ctxvar_MsgCodec.get_in(task) - - # ContextVar - # task_ctx: Context = task.context - # assert _ctxvar_MsgCodec not in task_ctx - - curr_codec = _ctxvar_MsgCodec.get() - assert curr_codec is _codec._def_tractor_codec - - if use_any_spec: - pld_spec = Any - else: - # NOTE: don't need the |None here since - # the parent side will never send `None` like - # we do here in the implicit return at the end of this - # `@context` body. - pld_spec = NamespacePath # |None - - nsp_codec: MsgCodec = mk_custom_codec( - pld_spec=pld_spec, - ) - with apply_codec(nsp_codec) as codec: - chk_codec_applied( - custom_codec=nsp_codec, - enter_value=codec, - ) - - # ensure roundtripping works locally - nsp = NamespacePath.from_ref(ex_func) - wire_bytes: bytes = nsp_codec.encode( - Started( - cid=ctx.cid, - pld=nsp - ) - ) - msg: Started = nsp_codec.decode(wire_bytes) - pld = msg.pld - assert pld == nsp - - await ctx.started(nsp) - async with ctx.open_stream() as ipc: - async for msg in ipc: - - if use_any_spec: - assert msg == f'{__name__}:ex_func' - - # TODO: as per below - # assert isinstance(msg, NamespacePath) - assert isinstance(msg, str) - else: - assert isinstance(msg, NamespacePath) - - await ipc.send(msg) - - def chk_codec_applied( - custom_codec: MsgCodec, - enter_value: MsgCodec, + expect_codec: MsgCodec, + enter_value: MsgCodec|None = None, + ) -> MsgCodec: + ''' + buncha sanity checks ensuring that the IPC channel's + context-vars are set to the expected codec and that are + ctx-var wrapper APIs match the same. - # task: trio.Task = trio.lowlevel.current_task() - + ''' + # TODO: play with tricyle again, bc this is supposed to work + # the way we want? + # # TreeVar + # task: trio.Task = trio.lowlevel.current_task() # curr_codec = _ctxvar_MsgCodec.get_in(task) # ContextVar @@ -249,46 +195,358 @@ def chk_codec_applied( # assert _ctxvar_MsgCodec in task_ctx # curr_codec: MsgCodec = task.context[_ctxvar_MsgCodec] + # NOTE: currently we use this! 
# RunVar - curr_codec: MsgCodec = _ctxvar_MsgCodec.get() + curr_codec: MsgCodec = current_codec() last_read_codec = _ctxvar_MsgCodec.get() - assert curr_codec is last_read_codec + # assert curr_codec is last_read_codec assert ( + (same_codec := expect_codec) is # returned from `mk_codec()` - custom_codec is # yielded value from `apply_codec()` - enter_value is # read from current task's `contextvars.Context` curr_codec is - - # public API for all of the above - current_codec() + last_read_codec # the default `msgspec` settings is not _codec._def_msgspec_codec is not _codec._def_tractor_codec ) + if enter_value: + enter_value is same_codec + + +def iter_maybe_sends( + send_items: dict[Union[Type], Any] | list[tuple], + ipc_pld_spec: Union[Type] | Any, + add_codec_hooks: bool, + + codec: MsgCodec|None = None, + +) -> tuple[Any, bool]: + + if isinstance(send_items, dict): + send_items = send_items.items() + + for ( + send_type_spec, + send_value, + ) in send_items: + + expect_roundtrip: bool = False + + # values-to-typespec santiy + send_type = type(send_value) + assert send_type == send_type_spec or ( + (subtypes := getattr(send_type_spec, '__args__', None)) + and send_type in subtypes + ) + + spec_subtypes: set[Union[Type]] = ( + getattr( + ipc_pld_spec, + '__args__', + {ipc_pld_spec,}, + ) + ) + send_in_spec: bool = ( + send_type == ipc_pld_spec + or ( + ipc_pld_spec != Any + and # presume `Union` of types + send_type in spec_subtypes + ) + or ( + ipc_pld_spec == Any + and + send_type != NamespacePath + ) + ) + expect_roundtrip = ( + send_in_spec + # any spec should support all other + # builtin py values that we send + # except our custom nsp type which + # we should be able to send as long + # as we provide the custom codec hooks. + or ( + ipc_pld_spec == Any + and + send_type == NamespacePath + and + add_codec_hooks + ) + ) + + if codec is not None: + # XXX FIRST XXX ensure roundtripping works + # before touching any IPC primitives/APIs. + wire_bytes: bytes = codec.encode( + Started( + cid='blahblah', + pld=send_value, + ) + ) + # NOTE: demonstrates the decoder loading + # to via our native SCIPP msg-spec + # (structurred-conc-inter-proc-protocol) + # implemented as per, + try: + msg: Started = codec.decode(wire_bytes) + if not expect_roundtrip: + pytest.fail( + f'NOT-EXPECTED able to roundtrip value given spec:\n' + f'ipc_pld_spec -> {ipc_pld_spec}\n' + f'value -> {send_value}: {send_type}\n' + ) + + pld = msg.pld + assert pld == send_value + + except ValidationError: + if expect_roundtrip: + pytest.fail( + f'EXPECTED to roundtrip value given spec:\n' + f'ipc_pld_spec -> {ipc_pld_spec}\n' + f'value -> {send_value}: {send_type}\n' + ) + + yield ( + str(send_type), + send_value, + expect_roundtrip, + ) + + +def dec_type_union( + type_names: list[str], +) -> Type: + ''' + Look up types by name, compile into a list and then create and + return a `typing.Union` from the full set. + + ''' + import importlib + types: list[Type] = [] + for type_name in type_names: + for ns in [ + typing, + importlib.import_module(__name__), + ]: + if type_ref := getattr( + ns, + type_name, + False, + ): + types.append(type_ref) + + # special case handling only.. + # ipc_pld_spec: Union[Type] = eval( + # pld_spec_str, + # {}, # globals + # {'typing': typing}, # locals + # ) + + return Union[*types] + + +def enc_type_union( + union_or_type: Union[Type]|Type, +) -> list[str]: + ''' + Encode a type-union or single type to a list of type-name-strings + ready for IPC interchange. 
+ + ''' + type_strs: list[str] = [] + for typ in getattr( + union_or_type, + '__args__', + {union_or_type,}, + ): + type_strs.append(typ.__qualname__) + + return type_strs + + +@tractor.context +async def send_back_nsp( + ctx: Context, + expect_debug: bool, + pld_spec_type_strs: list[str], + add_hooks: bool, + started_msg_bytes: bytes, + expect_ipc_send: dict[str, tuple[Any, bool]], + +) -> None: + ''' + Setup up a custom codec to load instances of `NamespacePath` + and ensure we can round trip a func ref with our parent. + + ''' + # debug mode sanity check (prolly superfluous but, meh) + assert expect_debug == _state.debug_mode() + + # init state in sub-actor should be default + chk_codec_applied( + expect_codec=_codec._def_tractor_codec, + ) + + # load pld spec from input str + ipc_pld_spec = dec_type_union( + pld_spec_type_strs, + ) + pld_spec_str = str(ipc_pld_spec) + + # same as on parent side config. + nsp_codec: MsgCodec = mk_custom_codec( + pld_spec=ipc_pld_spec, + add_hooks=add_hooks, + ) + with apply_codec(nsp_codec) as codec: + chk_codec_applied( + expect_codec=nsp_codec, + enter_value=codec, + ) + + print( + 'CHILD attempting `Started`-bytes DECODE..\n' + ) + try: + msg: Started = nsp_codec.decode(started_msg_bytes) + expected_pld_spec_str: str = msg.pld + assert pld_spec_str == expected_pld_spec_str + + # TODO: maybe we should add our own wrapper error so as to + # be interchange-lib agnostic? + # -[ ] the error type is wtv is raised from the hook so we + # could also require a type-class of errors for + # indicating whether the hook-failure can be handled by + # a nasty-dialog-unprot sub-sys? + except ValidationError: + + # NOTE: only in the `Any` spec case do we expect this to + # work since otherwise no spec covers a plain-ol' + # `.pld: str` + if pld_spec_str == 'Any': + raise + else: + print( + 'CHILD (correctly) unable to DECODE `Started`-bytes\n' + f'{started_msg_bytes}\n' + ) + + iter_send_val_items = iter(expect_ipc_send.values()) + sent: list[Any] = [] + for send_value, expect_send in iter_send_val_items: + try: + print( + f'CHILD attempting to `.started({send_value})`\n' + f'=> expect_send: {expect_send}\n' + f'SINCE, ipc_pld_spec: {ipc_pld_spec}\n' + f'AND, codec: {codec}\n' + ) + await ctx.started(send_value) + sent.append(send_value) + if not expect_send: + + # XXX NOTE XXX THIS WON'T WORK WITHOUT SPECIAL + # `str` handling! or special debug mode IPC + # msgs! + # await tractor.pause() + + raise RuntimeError( + # pytest.fail( + f'NOT-EXPECTED able to roundtrip value given spec:\n' + f'ipc_pld_spec -> {ipc_pld_spec}\n' + f'value -> {send_value}: {type(send_value)}\n' + ) + + break # move on to streaming block.. 
+ + except NotImplementedError: + print('FAILED ENCODE!') + + except tractor.MsgTypeError: + # await tractor.pause() + if expect_send: + pytest.fail( + f'EXPECTED to `.started()` value given spec:\n' + f'ipc_pld_spec -> {ipc_pld_spec}\n' + f'value -> {send_value}: {type(send_value)}\n' + ) + + async with ctx.open_stream() as ipc: + for send_value, expect_send in iter_send_val_items: + send_type: Type = type(send_value) + print( + 'CHILD report on send value\n' + f'ipc_pld_spec: {ipc_pld_spec}\n' + f'expect_send: {expect_send}\n' + f'val: {send_value}\n' + ) + try: + await ipc.send(send_value) + sent.append(send_value) + if not expect_send: + pytest.fail( + f'NOT-EXPECTED able to roundtrip value given spec:\n' + f'ipc_pld_spec -> {ipc_pld_spec}\n' + f'value -> {send_value}: {send_type}\n' + ) + except ValidationError: + if expect_send: + pytest.fail( + f'EXPECTED to roundtrip value given spec:\n' + f'ipc_pld_spec -> {ipc_pld_spec}\n' + f'value -> {send_value}: {send_type}\n' + ) + continue + + assert ( + len(sent) + == + len([val + for val, expect in + expect_ipc_send.values() + if expect is True]) + ) + + +def ex_func(*args): + print(f'ex_func({args})') + @pytest.mark.parametrize( 'ipc_pld_spec', [ - # _codec._def_msgspec_codec, Any, - # _codec._def_tractor_codec, - NamespacePath|None, + NamespacePath, + NamespacePath|None, # the "maybe" spec Bo ], ids=[ 'any_type', 'nsp_type', + 'maybe_nsp_type', ] ) +@pytest.mark.parametrize( + 'add_codec_hooks', + [ + True, + False, + ], + ids=['use_codec_hooks', 'no_codec_hooks'], +) def test_codec_hooks_mod( debug_mode: bool, ipc_pld_spec: Union[Type]|Any, + # send_value: None|str|NamespacePath, + add_codec_hooks: bool, ): ''' Audit the `.msg.MsgCodec` override apis details given our impl @@ -297,17 +555,17 @@ def test_codec_hooks_mod( ''' async def main(): + nsp = NamespacePath.from_ref(ex_func) + send_items: dict[Union, Any] = { + Union[None]: None, + Union[NamespacePath]: nsp, + Union[str]: str(nsp), + } - # task: trio.Task = trio.lowlevel.current_task() - - # ContextVar - # task_ctx: Context = task.context - # assert _ctxvar_MsgCodec not in task_ctx - - # TreeVar - # def_codec: MsgCodec = _ctxvar_MsgCodec.get_in(task) - def_codec = _ctxvar_MsgCodec.get() - assert def_codec is _codec._def_tractor_codec + # init default state for actor + chk_codec_applied( + expect_codec=_codec._def_tractor_codec, + ) async with tractor.open_nursery( debug_mode=debug_mode, @@ -323,79 +581,97 @@ def test_codec_hooks_mod( # `NamespacePath` nsp_codec: MsgCodec = mk_custom_codec( pld_spec=ipc_pld_spec, + add_hooks=add_codec_hooks, ) with apply_codec(nsp_codec) as codec: chk_codec_applied( - custom_codec=nsp_codec, + expect_codec=nsp_codec, enter_value=codec, ) + expect_ipc_send: dict[str, tuple[Any, bool]] = {} + + report: str = ( + 'Parent report on send values with\n' + f'ipc_pld_spec: {ipc_pld_spec}\n' + ' ------ - ------\n' + ) + for val_type_str, val, expect_send in iter_maybe_sends( + send_items, + ipc_pld_spec, + add_codec_hooks=add_codec_hooks, + ): + report += ( + f'send_value: {val}: {type(val)} ' + f'=> expect_send: {expect_send}\n' + ) + expect_ipc_send[val_type_str] = (val, expect_send) + + print( + report + + ' ------ - ------\n' + ) + assert len(expect_ipc_send) == len(send_items) + # now try over real IPC with a the subactor + # expect_ipc_rountrip: bool = True + expected_started = Started( + cid='cid', + pld=str(ipc_pld_spec), + ) + # build list of values we expect to receive from + # the subactor. 
+ expect_to_send: list[Any] = [ + val + for val, expect_send in expect_ipc_send.values() + if expect_send + ] + + pld_spec_type_strs: list[str] = enc_type_union(ipc_pld_spec) + + # TODO: send the original nsp here and + # test with `limit_msg_spec()` above? + # await tractor.pause() + print('PARENT opening IPC ctx!\n') async with ( + p.open_context( send_back_nsp, - # TODO: send the original nsp here and - # test with `limit_msg_spec()` above? expect_debug=debug_mode, - use_any_spec=(ipc_pld_spec==Any), - + pld_spec_type_strs=pld_spec_type_strs, + add_hooks=add_codec_hooks, + started_msg_bytes=nsp_codec.encode(expected_started), + expect_ipc_send=expect_ipc_send, ) as (ctx, first), + ctx.open_stream() as ipc, ): - if ipc_pld_spec is NamespacePath: - assert isinstance(first, NamespacePath) - + # ensure codec is still applied across + # `tractor.Context` + its embedded nursery. + chk_codec_applied( + expect_codec=nsp_codec, + enter_value=codec, + ) print( 'root: ENTERING CONTEXT BLOCK\n' f'type(first): {type(first)}\n' f'first: {first}\n' ) - # ensure codec is still applied across - # `tractor.Context` + its embedded nursery. - chk_codec_applied( - custom_codec=nsp_codec, - enter_value=codec, - ) + expect_to_send.remove(first) - first_nsp = NamespacePath(first) + # TODO: explicit values we expect depending on + # codec config! + # assert first == first_val + # assert first == f'{__name__}:ex_func' - # ensure roundtripping works - wire_bytes: bytes = nsp_codec.encode( - Started( - cid=ctx.cid, - pld=first_nsp + async for next_sent in ipc: + print( + 'Child sent next value\n' + f'{next_sent}: {type(next_sent)}\n' ) - ) - msg: Started = nsp_codec.decode(wire_bytes) - pld = msg.pld - assert pld == first_nsp + expect_to_send.remove(next_sent) - # try a manual decode of the started msg+pld - - # TODO: actually get the decoder loading - # to native once we spec our SCIPP msgspec - # (structurred-conc-inter-proc-protocol) - # implemented as per, - # https://github.com/goodboy/tractor/issues/36 - # - if ipc_pld_spec is NamespacePath: - assert isinstance(first, NamespacePath) - - # `Any`-payload-spec case - else: - assert isinstance(first, str) - assert first == f'{__name__}:ex_func' - - await ipc.send(first) - - with trio.move_on_after(.6): - async for msg in ipc: - print(msg) - - # TODO: as per above - # assert isinstance(msg, NamespacePath) - assert isinstance(msg, str) - await ipc.send(msg) - await trio.sleep(0.1) + # all sent values should have arrived! + assert not expect_to_send await p.cancel_actor() @@ -467,7 +743,7 @@ def chk_pld_type( roundtrip: bool|None = None pld_spec_msg_names: list[str] = [ - td.__name__ for td in types._payload_spec_msgs + td.__name__ for td in _payload_msgs ] for typedef in msg_types: -- 2.34.1 From 7f1c2b8ecf225ba1241db971cba4a03b5e65c343 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 13:59:43 -0400 Subject: [PATCH 210/378] Add buncha notes on `Start` field for "params" Such that the current `kwargs: dict` field can eventually be strictly msg-typed (eventually directly from a `@context` def) using modern typed python's hippest syntactical approach B) Also proto a new `CancelAck(Return)` subtype msg for supporting msg-spec agnostic `Actor.cancel_xx()` method calls in the runtime such that a user can't break cancellation (and thus SC) by dynamically setting a codec that doesn't allow `bool` results (as an eg. in this case). Note that the msg isn't used yet in `._rpc` but that's a comin! 
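As a quick aside, a rough stand-alone sketch of the rationale (plain `msgspec`, deliberately *not* the actual `tractor.msg.types` defs; the `str`-only payload spec below is just an assumed example of a restrictive user codec): a dedicated `bool`-payload ack type keeps cancel results decodable no matter what payload spec the app applied.

    from typing import Union
    import msgspec

    class Return(msgspec.Struct, tag=True):
        cid: str
        pld: str    # pretend the applied app spec only allows `str` payloads

    class CancelAck(msgspec.Struct, tag=True):
        cid: str
        pld: bool   # always part of the runtime spec, so cancels can't break

    enc = msgspec.msgpack.Encoder()
    dec = msgspec.msgpack.Decoder(Union[Return, CancelAck])

    # a cancel result shipped as `CancelAck` always round-trips..
    assert dec.decode(enc.encode(CancelAck(cid='1', pld=True))).pld is True

    # ..while the same `bool` smuggled through `Return.pld: str` is rejected
    try:
        dec.decode(enc.encode(Return(cid='1', pld=True)))  # type: ignore
    except msgspec.ValidationError as verr:
        print(f'rejected by the applied spec: {verr}')
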
--- tractor/msg/types.py | 124 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 113 insertions(+), 11 deletions(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index b246cb61..3e7a2d7a 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -45,6 +45,10 @@ from msgspec import ( from tractor.msg import ( pretty_struct, ) +from tractor.log import get_logger + + +log = get_logger('tractor.msgspec') # type variable for the boxed payload field `.pld` PayloadT = TypeVar('PayloadT') @@ -185,7 +189,47 @@ class SpawnSpec( # | Union[DebugLock, DebugLocked, DebugRelease] # ) +# class Params( +# Struct, +# Generic[PayloadT], +# ): +# spec: PayloadT|ParamSpec +# inputs: InputsT|dict[str, Any] + # TODO: for eg. we could stringently check the target + # task-func's type sig and enforce it? + # as an example for an IPTC, + # @tractor.context + # async def send_back_nsp( + # ctx: Context, + # expect_debug: bool, + # pld_spec_str: str, + # add_hooks: bool, + # started_msg_dict: dict, + # ) -> : + + # TODO: figure out which of the `typing` feats we want to + # support: + # - plain ol `ParamSpec`: + # https://docs.python.org/3/library/typing.html#typing.ParamSpec + # - new in 3.12 type parameter lists Bo + # |_ https://docs.python.org/3/reference/compound_stmts.html#type-params + # |_ historical pep 695: https://peps.python.org/pep-0695/ + # |_ full lang spec: https://typing.readthedocs.io/en/latest/spec/ + # |_ on annotation scopes: + # https://docs.python.org/3/reference/executionmodel.html#annotation-scopes + # spec: ParamSpec[ + # expect_debug: bool, + # pld_spec_str: str, + # add_hooks: bool, + # started_msg_dict: dict, + # ] + + +# TODO: possibly sub-type for runtime method requests? +# -[ ] `Runtime(Start)` with a `.ns: str = 'self' or +# we can just enforce any such method as having a strict +# ns for calling funcs, namely the `Actor` instance? class Start( Struct, tag=True, @@ -212,9 +256,45 @@ class Start( ns: str func: str - kwargs: dict + # TODO: make this a sub-struct which can be further + # type-limited, maybe `Inputs`? + # => SEE ABOVE <= + kwargs: dict[str, Any] uid: tuple[str, str] # (calling) actor-id + # TODO: enforcing a msg-spec in terms `Msg.pld` + # parameterizable msgs to be used in the appls IPC dialog. + # + # -[ ] both as part of the `.open_context()` call AND as part of the + # immediate ack-reponse (see similar below) + # we should do spec matching and fail if anything is awry? + # + # -[ ] eventually spec should be generated/parsed from the + # type-annots as # desired in GH issue: + # https://github.com/goodboy/tractor/issues/365 + # + # -[ ] semantics of the mismatch case + # - when caller-callee specs we should raise + # a `MsgTypeError` or `MsgSpecError` or similar? + # + # -[ ] wrapper types for both spec types such that we can easily + # IPC transport them? + # - `TypeSpec: Union[Type]` + # * also a `.__contains__()` for doing `None in + # TypeSpec[None|int]` since rn you need to do it on + # `.__args__` for unions.. 
+ # - `MsgSpec: Union[Type[Msg]] + # + # -[ ] auto-genning this from new (in 3.12) type parameter lists Bo + # |_ https://docs.python.org/3/reference/compound_stmts.html#type-params + # |_ historical pep 695: https://peps.python.org/pep-0695/ + # |_ full lang spec: https://typing.readthedocs.io/en/latest/spec/ + # |_ on annotation scopes: + # https://docs.python.org/3/reference/executionmodel.html#annotation-scopes + # |_ 3.13 will have subscriptable funcs Bo + # https://peps.python.org/pep-0718/ + pld_spec: str = str(Any) + class StartAck( Struct, @@ -235,14 +315,10 @@ class StartAck( 'context', # TODO: the only one eventually? ] - # TODO: as part of the reponse we should report our allowed - # msg spec which should be generated from the type-annots as - # desired in # https://github.com/goodboy/tractor/issues/365 - # When this does not match what the starter/caller side - # expects we of course raise a `TypeError` just like if - # a function had been called using an invalid signature. - # - # msgspec: MsgSpec + # import typing + # eval(str(Any), {}, {'typing': typing}) + # started_spec: str = str(Any) + # return_spec class Started( @@ -290,6 +366,7 @@ class Stop( # pld: UnsetType = UNSET +# TODO: is `Result` or `Out[come]` a better name? class Return( Msg, Generic[PayloadT], @@ -302,6 +379,27 @@ class Return( pld: PayloadT +class CancelAck( + Return, +): + ''' + Deliver the `bool` return-value from a cancellation `Actor` + method scheduled via and prior RPC request. + + - `Actor.cancel()` + `|_.cancel_soon()` + `|_.cancel_rpc_tasks()` + `|_._cancel_task()` + `|_.cancel_server()` + + RPCs to these methods must **always** be able to deliver a result + despite the currently configured IPC msg spec such that graceful + cancellation is always functional in the runtime. 
+ + ''' + pld: bool + + class Error( Struct, tag=True, @@ -530,9 +628,13 @@ def mk_msg_spec( pld_spec: Union[Type] = specs[spec_build_method] runtime_spec: Union[Type] = Union[*ipc_msg_types] - + ipc_spec = pld_spec | runtime_spec + log.runtime( + 'Generating new IPC msg-spec\n' + f'{ipc_spec}\n' + ) return ( - pld_spec | runtime_spec, + ipc_spec, msgtypes_table[spec_build_method] + ipc_msg_types, ) -- 2.34.1 From 5c1401bf813097dc210f75b0b530318dd70ba2f9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 14:04:32 -0400 Subject: [PATCH 211/378] Factor boxed-err formatting into new `pformat_boxed_tb()` helper for use elsewhere --- tractor/_exceptions.py | 78 +++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 9c1dc36d..28c61628 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -104,6 +104,57 @@ def get_err_type(type_name: str) -> BaseException|None: return type_ref +def pformat_boxed_tb( + tb_str: str, + fields_str: str|None = None, + field_prefix: str = ' |_', + indent: str = ' '*2 +) -> str: + if ( + fields_str + and + field_prefix + ): + fields: str = textwrap.indent( + fields_str, + # prefix=' '*2, + # prefix=' |_', + prefix=field_prefix, + ) + else: + fields = fields_str or '' + + # body_indent: str = len(field_prefix) * ' ' + body: str = ( + + # orig + # f' |\n' + # f' ------ - ------\n\n' + # f'{tb_str}\n' + # f' ------ - ------\n' + # f' _|\n' + + f'|\n' + f' ------ - ------\n\n' + f'{tb_str}\n' + f' ------ - ------\n' + f'_|\n' + ) + if len(indent): + body: str = textwrap.indent( + body, + # prefix=body_indent, + prefix=indent, + ) + + return ( + fields + + + body + ) + # return body + + # TODO: rename to just `RemoteError`? class RemoteActorError(Exception): ''' @@ -117,7 +168,7 @@ class RemoteActorError(Exception): ''' reprol_fields: list[str] = [ 'src_uid', - 'relay_path', + # 'relay_path', ] def __init__( @@ -249,7 +300,7 @@ class RemoteActorError(Exception): @property def tb_str( self, - indent: str = ' '*3, + indent: str = ' ', ) -> str: if remote_tb := self.msgdata.get('tb_str'): return textwrap.indent( @@ -309,25 +360,12 @@ class RemoteActorError(Exception): fields: str = self._mk_fields_str( _body_fields, ) - fields: str = textwrap.indent( - fields, - # prefix=' '*2, - prefix=' |_', + body: str = pformat_boxed_tb( + tb_str=self.tb_str, + fields_str=fields, + field_prefix=' |_', + indent=' ', # no indent? ) - indent: str = ''*1 - body: str = ( - f'{fields}' - f' |\n' - f' ------ - ------\n\n' - f'{self.tb_str}\n' - f' ------ - ------\n' - f' _|\n' - ) - if indent: - body: str = textwrap.indent( - body, - prefix=indent, - ) return ( f'<{type(self).__name__}(\n' f'{body}' -- 2.34.1 From b1fd8b2ec36d12cedc79d39e8da27207b964340f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 16:00:32 -0400 Subject: [PATCH 212/378] Make `Context.started()` a type checked IPC send As detailed in the surrounding notes, it's pretty advantageous to always have the child context task ensure the first msg it relays back is msg-type checked against the current spec and thus `MsgCodec`. Implement the check via a simple codec-roundtrip of the `Started` msg such that the `.pld` payload is always validated before transit. This ensures the child will fail early and notify the parent before any streaming takes place (i.e. the "nasty" dialog protocol phase). 
The main motivation here is to avoid inter-actor task syncing bugs that are hard(er) to recover from and/or such as if an invalid typed msg is sent to the parent, who then ignores it (depending on config), and then the child thinks the parent is in some presumed state while the parent is still thinking a first msg has yet to arrive. Doing the stringent check on the sender side (i.e. the child is sending the "first" application msg via `.started()`) avoids/sidesteps dealing with such syncing/coordinated-state problems by keeping the entire IPC dialog in a "cheap" or "control" style transaction up until a stream is opened. Iow, the parent task's `.open_context()` block entry can't occur until the child side is definitely (as much as is possible with IPC msg type checking) in a correct state spec wise. During any streaming phase in the dialog the msg-type-checking is NOT done for performance (the "nasty" protocol phase) and instead any type errors are relayed back from the receiving side. I'm still unsure whether to take the same approach on the `Return` msg, since at that point erroring early doesn't benefit the parent task if/when a msg-type error occurs? Definitely more to ponder and tinker out here.. Impl notes: - a gotcha with the roundtrip-codec-ed msg is that it often won't match the input `value` bc in the `msgpack` case many native python sequence/collection types will map to a common array type due to the surjection that `msgpack`'s type-sys imposes. - so we can't assert that `started == rt_started` but it may be useful to at least report the diff of the type-reduced payload so that the caller can at least be notified how the input `value` might be better type-casted prior to call, for ex. pre-casting to `list`s. - added a `._strict_started: bool` that could provide the stringent checking if desired in the future. - on any validation error raise our `MsgTypeError` from it. - ALSO change over the lingering `.send_yield()` deprecated meth body to use a `Yield()`. --- tractor/_context.py | 79 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 38b4431d..6e55c3c9 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -32,6 +32,7 @@ from dataclasses import ( ) from functools import partial import inspect +import msgspec from pprint import pformat from typing import ( Any, @@ -60,6 +61,9 @@ from .msg import ( Started, Stop, Yield, + current_codec, + MsgCodec, + pretty_struct, ) from ._ipc import Channel from ._streaming import MsgStream @@ -505,6 +509,8 @@ class Context: _in_overrun: bool = False _allow_overruns: bool = False + # TODO: figure out how we can enforce this without losing our minds.. 
+ _strict_started: bool = False def __str__(self) -> str: ds: str = '=' @@ -727,7 +733,13 @@ class Context: DeprecationWarning, stacklevel=2, ) - await self.chan.send({'yield': data, 'cid': self.cid}) + # await self.chan.send({'yield': data, 'cid': self.cid}) + await self.chan.send( + Yield( + cid=self.cid, + pld=data, + ) + ) async def send_stop(self) -> None: # await pause() @@ -1640,18 +1652,61 @@ class Context: f'called `.started()` twice on context with {self.chan.uid}' ) - # await self.chan.send( - # { - # 'started': value, - # 'cid': self.cid, - # } - # ) - await self.chan.send( - Started( - cid=self.cid, - pld=value, - ) + started = Started( + cid=self.cid, + pld=value, ) + # XXX MEGA NOTE XXX: ONLY on the first msg sent with + # `Context.started()` do we STRINGENTLY roundtrip-check + # the first payload such that the child side can't send an + # incorrect value according to the currently applied + # msg-spec! + # + # HOWEVER, once a stream is opened via + # `Context.open_stream()` then this check is NEVER done on + # `MsgStream.send()` and instead both the parent and child + # sides are expected to relay back msg-type errors when + # decode failures exhibit on `MsgStream.receive()` calls thus + # enabling a so-called (by the holy 0mq lords) + # "cheap-or-nasty pattern" un-protocol design Bo + # + # https://zguide.zeromq.org/docs/chapter7/#The-Cheap-or-Nasty-Pattern + # + codec: MsgCodec = current_codec() + msg_bytes: bytes = codec.encode(started) + try: + # be a "cheap" dialog (see above!) + rt_started = codec.decode(msg_bytes) + if rt_started != started: + + # TODO: break these methods out from the struct subtype? + diff = pretty_struct.Struct.__sub__(rt_started, started) + + complaint: str = ( + 'Started value does not match after codec rountrip?\n\n' + f'{diff}' + ) + # TODO: rn this will pretty much always fail with + # any other sequence type embeded in the + # payload... + if self._strict_started: + raise ValueError(complaint) + else: + log.warning(complaint) + + await self.chan.send(rt_started) + + # raise any msg type error NO MATTER WHAT! 
+ except msgspec.ValidationError as verr: + from tractor._ipc import _raise_msg_type_err + _raise_msg_type_err( + msg=msg_bytes, + codec=codec, + validation_err=verr, + verb_header='Trying to send payload' + # > 'invalid `Started IPC msgs\n' + ) + self._started_called = True async def _drain_overflows( -- 2.34.1 From 97bfbdbc1c1979a2cd875d53494b1cc51a89a49a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 16:32:15 -0400 Subject: [PATCH 213/378] Expose `MsgTypeError` from pkg --- tractor/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tractor/__init__.py b/tractor/__init__.py index c7d21c9d..0f2bdd65 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -44,9 +44,10 @@ from ._state import ( is_root_process as is_root_process, ) from ._exceptions import ( - RemoteActorError as RemoteActorError, - ModuleNotExposed as ModuleNotExposed, ContextCancelled as ContextCancelled, + ModuleNotExposed as ModuleNotExposed, + MsgTypeError as MsgTypeError, + RemoteActorError as RemoteActorError, ) from .devx import ( breakpoint as breakpoint, -- 2.34.1 From 4cfe4979ff555bcff2d0257603aa44de38c1de96 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 16:34:07 -0400 Subject: [PATCH 214/378] Factor `MsgpackTCPStream` msg-type checks Add both the `.send()` and `.recv()` handling blocks to a common `_raise_msg_type_err()` which includes detailed error msg formatting: - the `.recv()` side case does introspection of the `Msg` fields and attempting to report the exact (field type related) issue - `.send()` side does some boxed-error style tb formatting like `RemoteActorError`. - add a `strict_types: bool` to `.send()` to allow for just warning on bad inputs versus raising, but always raise from any `Encoder` type error. --- tractor/_ipc.py | 174 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 123 insertions(+), 51 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 6168c77c..9af28e5a 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -54,7 +54,8 @@ from tractor.msg import ( _ctxvar_MsgCodec, _codec, MsgCodec, - types, + types as msgtypes, + pretty_struct, ) log = get_logger(__name__) @@ -72,6 +73,7 @@ def get_stream_addrs(stream: trio.SocketStream) -> tuple: ) +# TODO: this should be our `Union[*msgtypes.__spec__]` now right? MsgType = TypeVar("MsgType") # TODO: consider using a generic def and indexing with our eventual @@ -116,6 +118,73 @@ class MsgTransport(Protocol[MsgType]): ... +def _raise_msg_type_err( + msg: Any|bytes, + codec: MsgCodec, + validation_err: msgspec.ValidationError|None = None, + verb_header: str = '', + +) -> None: + + # if side == 'send': + if validation_err is None: # send-side + + import traceback + from tractor._exceptions import pformat_boxed_tb + + fmt_spec: str = '\n'.join( + map(str, codec.msg_spec.__args__) + ) + fmt_stack: str = ( + '\n'.join(traceback.format_stack(limit=3)) + ) + tb_fmt: str = pformat_boxed_tb( + tb_str=fmt_stack, + # fields_str=header, + field_prefix=' ', + indent='', + ) + raise MsgTypeError( + f'invalid msg -> {msg}: {type(msg)}\n\n' + f'{tb_fmt}\n' + f'Valid IPC msgs are:\n\n' + # f' ------ - ------\n' + f'{fmt_spec}\n' + ) + + else: + # decode the msg-bytes using the std msgpack + # interchange-prot (i.e. without any + # `msgspec.Struct` handling) so that we can + # determine what `.msg.types.Msg` is the culprit + # by reporting the received value. 
+ msg_dict: dict = msgspec.msgpack.decode(msg) + msg_type_name: str = msg_dict['msg_type'] + msg_type = getattr(msgtypes, msg_type_name) + errmsg: str = ( + f'invalid `{msg_type_name}` IPC msg\n\n' + ) + if verb_header: + errmsg = f'{verb_header} ' + errmsg + + # XXX see if we can determine the exact invalid field + # such that we can comprehensively report the + # specific field's type problem + msgspec_msg: str = validation_err.args[0].rstrip('`') + msg, _, maybe_field = msgspec_msg.rpartition('$.') + if field_val := msg_dict.get(maybe_field): + field_type: Union[Type] = msg_type.__signature__.parameters[ + maybe_field + ].annotation + errmsg += ( + f'{msg.rstrip("`")}\n\n' + f'{msg_type}\n' + f' |_.{maybe_field}: {field_type} = {field_val!r}\n' + ) + + raise MsgTypeError(errmsg) from validation_err + + # TODO: not sure why we have to inherit here, but it seems to be an # issue with ``get_msg_transport()`` returning a ``Type[Protocol]``; # probably should make a `mypy` issue? @@ -175,9 +244,10 @@ class MsgpackTCPStream(MsgTransport): or _codec._ctxvar_MsgCodec.get() ) - log.critical( - '!?!: USING STD `tractor` CODEC !?!?\n' - f'{self._codec}\n' + # TODO: mask out before release? + log.runtime( + f'New {self} created with codec\n' + f'codec: {self._codec}\n' ) async def _iter_packets(self) -> AsyncGenerator[dict, None]: @@ -221,16 +291,18 @@ class MsgpackTCPStream(MsgTransport): # NOTE: lookup the `trio.Task.context`'s var for # the current `MsgCodec`. codec: MsgCodec = _ctxvar_MsgCodec.get() + + # TODO: mask out before release? if self._codec.pld_spec != codec.pld_spec: # assert ( # task := trio.lowlevel.current_task() # ) is not self._task # self._task = task self._codec = codec - log.critical( - '.recv() USING NEW CODEC !?!?\n' - f'{self._codec}\n\n' - f'msg_bytes -> {msg_bytes}\n' + log.runtime( + 'Using new codec in {self}.recv()\n' + f'codec: {self._codec}\n\n' + f'msg_bytes: {msg_bytes}\n' ) yield codec.decode(msg_bytes) @@ -252,36 +324,13 @@ class MsgpackTCPStream(MsgTransport): # and always raise such that spec violations # are never allowed to be caught silently! except msgspec.ValidationError as verr: - - # decode the msg-bytes using the std msgpack - # interchange-prot (i.e. without any - # `msgspec.Struct` handling) so that we can - # determine what `.msg.types.Msg` is the culprit - # by reporting the received value. - msg_dict: dict = msgspec.msgpack.decode(msg_bytes) - msg_type_name: str = msg_dict['msg_type'] - msg_type = getattr(types, msg_type_name) - errmsg: str = ( - f'Received invalid IPC `{msg_type_name}` msg\n\n' + # re-raise as type error + _raise_msg_type_err( + msg=msg_bytes, + codec=codec, + validation_err=verr, ) - # XXX see if we can determine the exact invalid field - # such that we can comprehensively report the - # specific field's type problem - msgspec_msg: str = verr.args[0].rstrip('`') - msg, _, maybe_field = msgspec_msg.rpartition('$.') - if field_val := msg_dict.get(maybe_field): - field_type: Union[Type] = msg_type.__signature__.parameters[ - maybe_field - ].annotation - errmsg += ( - f'{msg.rstrip("`")}\n\n' - f'{msg_type}\n' - f' |_.{maybe_field}: {field_type} = {field_val}\n' - ) - - raise MsgTypeError(errmsg) from verr - except ( msgspec.DecodeError, UnicodeDecodeError, @@ -307,12 +356,16 @@ class MsgpackTCPStream(MsgTransport): async def send( self, - msg: Any, + msg: msgtypes.Msg, + strict_types: bool = True, # hide_tb: bool = False, ) -> None: ''' - Send a msgpack coded blob-as-msg over TCP. 
+ Send a msgpack encoded py-object-blob-as-msg over TCP. + + If `strict_types == True` then a `MsgTypeError` will be raised on any + invalid msg type ''' # __tracebackhide__: bool = hide_tb @@ -321,25 +374,40 @@ class MsgpackTCPStream(MsgTransport): # NOTE: lookup the `trio.Task.context`'s var for # the current `MsgCodec`. codec: MsgCodec = _ctxvar_MsgCodec.get() - # if self._codec != codec: + + # TODO: mask out before release? if self._codec.pld_spec != codec.pld_spec: self._codec = codec - log.critical( - '.send() using NEW CODEC !?!?\n' - f'{self._codec}\n\n' - f'OBJ -> {msg}\n' + log.runtime( + 'Using new codec in {self}.send()\n' + f'codec: {self._codec}\n\n' + f'msg: {msg}\n' ) - if type(msg) not in types.__spec__: - log.warning( - 'Sending non-`Msg`-spec msg?\n\n' - f'{msg}\n' - ) - bytes_data: bytes = codec.encode(msg) + + if type(msg) not in msgtypes.__msg_types__: + if strict_types: + _raise_msg_type_err( + msg, + codec=codec, + ) + else: + log.warning( + 'Sending non-`Msg`-spec msg?\n\n' + f'{msg}\n' + ) + + try: + bytes_data: bytes = codec.encode(msg) + except TypeError as typerr: + raise MsgTypeError( + 'A msg field violates the current spec\n' + f'{codec.pld_spec}\n\n' + f'{pretty_struct.Struct.pformat(msg)}' + ) from typerr # supposedly the fastest says, # https://stackoverflow.com/a/54027962 size: bytes = struct.pack(" seems like that might be re-inventing scalability + # prots tho no? # try: # return await self._transport.recv() # except trio.BrokenResourceError: -- 2.34.1 From b9a61ded0ae28132ed69a4c497fcbeb621e5f48a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 5 Apr 2024 19:07:12 -0400 Subject: [PATCH 215/378] Drop `None`-sentinel cancels RPC loop mechanism Pretty sure we haven't *needed it* for a while, it was always generally hazardous in terms of IPC msg types, AND it's definitely incompatible with a dynamically applied typed msg spec: you can't just expect a `None` to be willy nilly handled all the time XD For now I'm masking out all the code and leaving very detailed surrounding notes but am not removing it quite yet in case for strange reason it is needed by some edge case (though I haven't found according to the test suite). Backstory: ------ - ------ Originally (i'm pretty sure anyway) it was added as a super naive "remote cancellation" mechanism (back before there were specific `Actor` methods for such things) that was mostly (only?) used before IPC `Channel` closures to "more gracefully cancel" the connection's parented RPC tasks. Since we now have explicit runtime-RPC endpoints for conducting remote cancellation of both tasks and full actors, it should really be removed anyway, because: - a `None`-msg setinel is inconsistent with other RPC endpoint handling input patterns which (even prior to typed msging) had specific msg-value triggers. - the IPC endpoint's (block) implementation should use `Actor.cancel_rpc_tasks(parent_chan=chan)` instead of a manual loop through a `Actor._rpc_tasks.copy()`.. Deats: - mask the `Channel.send(None)` calls from both the `Actor._stream_handler()` tail as well as from the `._portal.open_portal()` was connected block. - mask the msg loop endpoint block and toss in lotsa notes. Unrelated tweaks: - drop `Actor._debug_mode`; unused. - make `Actor.cancel_server()` return a `bool`. - use `.msg.pretty_struct.Struct.pformat()` to show any msg that is ignored (bc invalid) in `._push_result()`. 
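For reference, a throw-away mock-up of that replacement idea (a plain dataclass and a `match:` block, *not* the real `Start` msg nor the `process_messages()` loop): an explicit, typed cancel request is something a strict msg-spec can actually validate, unlike a bare `None`.

    from dataclasses import dataclass

    @dataclass
    class Start:
        ns: str
        func: str
        cid: str

    def dispatch(msg: object) -> str:
        match msg:
            case Start(ns='self', func='cancel'):
                return 'cancel the whole actor runtime'
            case Start(ns='self', func='cancel_rpc_tasks'):
                return 'cancel all RPC tasks parented by this IPC channel'
            case Start(ns=ns, func=func):
                return f'schedule RPC task {ns}.{func}()'
            case None:
                # the legacy graceful-terminate sentinel; no typed msg-spec
                # can validate a bare `None`, hence dropping it here.
                return 'legacy `None` sentinel (removed)'
            case _:
                return 'unknown msg'

    for msg in (
        Start(ns='self', func='cancel_rpc_tasks', cid='abc'),
        Start(ns='mymod', func='stream_data', cid='abc'),
        None,
    ):
        print(dispatch(msg))
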
--- tractor/_portal.py | 6 ++-- tractor/_rpc.py | 71 +++++++++++++++++++++++------------- tractor/_runtime.py | 88 ++++++++++++++++++++++++++------------------- 3 files changed, 100 insertions(+), 65 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index cc9052ba..957eae59 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -502,7 +502,7 @@ async def open_portal( ''' actor = current_actor() assert actor - was_connected = False + was_connected: bool = False async with maybe_open_nursery(nursery, shield=shield) as nursery: @@ -533,9 +533,7 @@ async def open_portal( await portal.aclose() if was_connected: - # gracefully signal remote channel-msg loop - await channel.send(None) - # await channel.aclose() + await channel.aclose() # cancel background msg loop task if msg_loop_cs: diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 0549b0cb..5559702b 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -55,7 +55,6 @@ from ._exceptions import ( TransportClosed, ) from .devx import ( - pause, maybe_wait_for_debugger, _debug, ) @@ -429,8 +428,6 @@ async def _invoke( # XXX for .pause_from_sync()` usage we need to make sure # `greenback` is boostrapped in the subactor! await _debug.maybe_init_greenback() - # else: - # await pause() # TODO: possibly a specially formatted traceback # (not sure what typing is for this..)? @@ -855,30 +852,54 @@ async def process_messages( match msg: + # NOTE: this *was a dedicated + # "graceful-terminate-loop" mechanism using + # a `None`-msg-sentinel which would cancel all RPC + # tasks parented by this loop's IPC channel; that + # is all rpc-scheduled-tasks started over the + # connection were explicitly per-task cancelled + # normally prior to the `Channel`'s underlying + # transport being later closed. + # + # * all `.send(None)`s were # removed as part of + # typed-msging requirements + # + # TODO: if this mechanism is still desired going + # forward it should be implemented as part of the + # normal runtime-cancel-RPC endpoints with either, + # - a special `msg.types.Msg` to trigger the loop endpoint + # (like `None` was used prior) or, + # - it should just be accomplished using A + # `Start(ns='self', func='cancel_rpc_tasks())` + # request instead? + # # if msg is None: - # dedicated loop terminate sentinel - case None: + # case None: + # tasks: dict[ + # tuple[Channel, str], + # tuple[Context, Callable, trio.Event] + # ] = actor._rpc_tasks.copy() + # log.cancel( + # f'Peer IPC channel terminated via `None` setinel msg?\n' + # f'=> Cancelling all {len(tasks)} local RPC tasks..\n' + # f'peer: {chan.uid}\n' + # f'|_{chan}\n' + # ) + # # TODO: why aren't we just calling + # # `.cancel_rpc_tasks()` with the parent + # # chan as input instead? + # for (channel, cid) in tasks: + # if channel is chan: + # await actor._cancel_task( + # cid, + # channel, + # requesting_uid=channel.uid, - tasks: dict[ - tuple[Channel, str], - tuple[Context, Callable, trio.Event] - ] = actor._rpc_tasks.copy() - log.cancel( - f'Peer IPC channel terminated via `None` setinel msg?\n' - f'=> Cancelling all {len(tasks)} local RPC tasks..\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n' - ) - for (channel, cid) in tasks: - if channel is chan: - await actor._cancel_task( - cid, - channel, - requesting_uid=channel.uid, + # ipc_msg=msg, + # ) - ipc_msg=msg, - ) - break + # # immediately break out of this loop! + # break # cid = msg.get('cid') # if cid: @@ -916,7 +937,7 @@ async def process_messages( cid=cid, ns=ns, func=funcname, - kwargs=kwargs, + kwargs=kwargs, # type-spec this? 
see `msg.types` uid=actorid, ): # try: diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 3bafada1..e08d074a 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -65,7 +65,11 @@ from trio import ( TaskStatus, ) -from .msg import NamespacePath +from tractor.msg import ( + pretty_struct, + NamespacePath, + types as msgtypes, +) from ._ipc import Channel from ._context import ( mk_context, @@ -91,10 +95,6 @@ from ._rpc import ( process_messages, try_ship_error_to_remote, ) -from tractor.msg import ( - types as msgtypes, - pretty_struct, -) # from tractor.msg.types import ( # Aid, # SpawnSpec, @@ -164,18 +164,15 @@ class Actor: # Information about `__main__` from parent _parent_main_data: dict[str, str] _parent_chan_cs: CancelScope|None = None - _spawn_spec: SpawnSpec|None = None + _spawn_spec: msgtypes.SpawnSpec|None = None # syncs for setup/teardown sequences _server_down: trio.Event|None = None - # user toggled crash handling (including monkey-patched in - # `trio.open_nursery()` via `.trionics._supervisor` B) - _debug_mode: bool = False - # if started on ``asycio`` running ``trio`` in guest mode _infected_aio: bool = False + # TODO: nursery tracking like `trio` does? # _ans: dict[ # tuple[str, str], # list[ActorNursery], @@ -716,35 +713,50 @@ class Actor: # TODO: figure out why this breaks tests.. db_cs.cancel() - # XXX: is this necessary (GC should do it)? + # XXX TODO XXX: DO WE NEED THIS? + # -[ ] is it necessary any more (GC should do it) now + # that we have strict(er) graceful cancellation + # semantics? # XXX WARNING XXX # Be AWARE OF THE INDENT LEVEL HERE # -> ONLY ENTER THIS BLOCK WHEN ._peers IS # EMPTY!!!! - if ( - not self._peers - and chan.connected() - ): - # if the channel is still connected it may mean the far - # end has not closed and we may have gotten here due to - # an error and so we should at least try to terminate - # the channel from this end gracefully. - log.runtime( - 'Terminating channel with `None` setinel msg\n' - f'|_{chan}\n' - ) - try: - # send msg loop terminate sentinel which - # triggers cancellation of all remotely - # started tasks. - await chan.send(None) + # + # if the channel is still connected it may mean the far + # end has not closed and we may have gotten here due to + # an error and so we should at least try to terminate + # the channel from this end gracefully. + #if ( + # not self._peers + # and chan.connected() + #): + # log.runtime( + # 'Terminating channel with `None` setinel msg\n' + # f'|_{chan}\n' + # ) + # try: + # # ORIGINALLY we sent a msg loop terminate + # # sentinel (`None`) which triggers + # # cancellation of all remotely started + # # tasks. + # # + # # HOWEVER, after we added typed msging, + # # you can't just willy nilly send `None` + # # wherever since it might be invalid given + # # the currently configured msg-spec. + # # + # # SO, this was all removed and I'm pretty + # # confident we don't need it replaced with + # # a manual RPC to + # # a `Actor.cancel_rpc_tasks()` right? + # await chan.send(None) - # XXX: do we want this? no right? - # causes "[104] connection reset by peer" on other end - # await chan.aclose() + # # XXX: do we want this? NO RIGHT? 
+ # # causes "[104] connection reset by peer" on other end + # # await chan.aclose() - except trio.BrokenResourceError: - log.runtime(f"Channel {chan.uid} was already closed") + # except trio.BrokenResourceError: + # log.runtime(f"Channel {chan.uid} was already closed") # TODO: rename to `._deliver_payload()` since this handles # more then just `result` msgs now obvi XD @@ -774,9 +786,10 @@ class Actor: log.warning( 'Ignoring invalid IPC ctx msg!\n\n' f'<= sender: {uid}\n' - f'=> cid: {cid}\n\n' + # XXX don't need right since it's always in msg? + # f'=> cid: {cid}\n\n' - f'{msg}\n' + f'{pretty_struct.Struct.pformat(msg)}\n' ) return @@ -1437,7 +1450,7 @@ class Actor: ) await self._ongoing_rpc_tasks.wait() - def cancel_server(self) -> None: + def cancel_server(self) -> bool: ''' Cancel the internal IPC transport server nursery thereby preventing any new inbound IPC connections establishing. @@ -1446,6 +1459,9 @@ class Actor: if self._server_n: log.runtime("Shutting down channel server") self._server_n.cancel_scope.cancel() + return True + + return False @property def accept_addrs(self) -> list[tuple[str, int]]: -- 2.34.1 From aca6503fcd98077938fb9f518cfc6de4e669b33a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 7 Apr 2024 10:40:01 -0400 Subject: [PATCH 216/378] Flatten out RPC loop with `match:`/`case:` Mainly expanding out the runtime endpoints for cancellation to separate cases and flattening them with the main RPC-request-invoke block, moving the non-cancel runtime case (where we call `getattr(actor, funcname)`) inside the main `Start` case (for now) which branches on `ns=="self"`. Also, add a new IPC msg `class CancelAck(Return):` which is always included in the default msg-spec such that runtime cancellation (and eventually all) endpoints return that msg (instead of a `Return`) and thus sidestep any currently applied `MsgCodec` such that the results (`bool`s for most cancel methods) are never violating the current type limit(s) on `Msg.pld`. To support this expose a new variable `return_msg: Return|CancelAck` param from `_invoke()`/`_invoke_non_context)()` and set it to `CancelAck` in the appropriate endpoint case-blocks of the msg loop. Clean out all the lingering legacy `chan.send()` commented codez from the invoker funcs, with more cleaning likely to come B) --- tractor/_rpc.py | 308 ++++++++++++++++--------------------------- tractor/msg/types.py | 4 + 2 files changed, 119 insertions(+), 193 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 5559702b..9da8690b 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -61,13 +61,15 @@ from .devx import ( from . import _state from .log import get_logger from tractor.msg.types import ( + CancelAck, + Error, + Msg, + Return, Start, StartAck, Started, Stop, Yield, - Return, - Error, ) @@ -89,6 +91,7 @@ async def _invoke_non_context( treat_as_gen: bool, is_rpc: bool, + return_msg: Return|CancelAck = Return, task_status: TaskStatus[ Context | BaseException @@ -97,7 +100,6 @@ async def _invoke_non_context( # TODO: can we unify this with the `context=True` impl below? 
if inspect.isasyncgen(coro): - # await chan.send({ await chan.send( StartAck( cid=cid, @@ -123,11 +125,6 @@ async def _invoke_non_context( # to_send = await chan.recv_nowait() # if to_send is not None: # to_yield = await coro.asend(to_send) - # await chan.send({ - # # Yield() - # 'cid': cid, - # 'yield': item, - # }) await chan.send( Yield( cid=cid, @@ -142,11 +139,6 @@ async def _invoke_non_context( await chan.send( Stop(cid=cid) ) - # await chan.send({ - # # Stop( - # 'cid': cid, - # 'stop': True, - # }) # one way @stream func that gets treated like an async gen # TODO: can we unify this with the `context=True` impl below? @@ -157,11 +149,6 @@ async def _invoke_non_context( functype='asyncgen', ) ) - # await chan.send({ - # # StartAck() - # 'cid': cid, - # 'functype': 'asyncgen', - # }) # XXX: the async-func may spawn further tasks which push # back values like an async-generator would but must # manualy construct the response dict-packet-responses as @@ -177,11 +164,6 @@ async def _invoke_non_context( await chan.send( Stop(cid=cid) ) - # await chan.send({ - # # Stop( - # 'cid': cid, - # 'stop': True, - # }) else: # regular async function/method # XXX: possibly just a scheduled `Actor._cancel_task()` @@ -199,11 +181,6 @@ async def _invoke_non_context( functype='asyncfunc', ) ) - # await chan.send({ - # # StartAck() - # 'cid': cid, - # 'functype': 'asyncfunc', - # }) except ( trio.ClosedResourceError, trio.BrokenResourceError, @@ -237,13 +214,8 @@ async def _invoke_non_context( and chan.connected() ): try: - # await chan.send({ - # # Return() - # 'cid': cid, - # 'return': result, - # }) await chan.send( - Return( + return_msg( cid=cid, pld=result, ) @@ -408,6 +380,7 @@ async def _invoke( is_rpc: bool = True, hide_tb: bool = True, + return_msg: Return|CancelAck = Return, task_status: TaskStatus[ Context | BaseException @@ -517,6 +490,7 @@ async def _invoke( kwargs, treat_as_gen, is_rpc, + return_msg, task_status, ) # below is only for `@context` funcs @@ -547,11 +521,6 @@ async def _invoke( functype='context', ) ) - # await chan.send({ - # # StartAck() - # 'cid': cid, - # 'functype': 'context', - # }) # TODO: should we also use an `.open_context()` equiv # for this callee side by factoring the impl from @@ -576,16 +545,11 @@ async def _invoke( # deliver final result to caller side. await chan.send( - Return( + return_msg( cid=cid, pld=res, ) ) - # await chan.send({ - # # Return() - # 'cid': cid, - # 'return': res, - # }) # NOTE: this happens IFF `ctx._scope.cancel()` is # called by any of, @@ -674,7 +638,6 @@ async def _invoke( ctxc = ContextCancelled( msg, boxed_type=trio.Cancelled, - # boxed_type_str='Cancelled', canceller=canceller, ) # assign local error so that the `.outcome` @@ -775,12 +738,12 @@ async def try_ship_error_to_remote( trio.BrokenResourceError, BrokenPipeError, ): - # err_msg: dict = msg['error']['tb_str'] log.critical( 'IPC transport failure -> ' f'failed to ship error to {remote_descr}!\n\n' f'X=> {channel.uid}\n\n' - # f'{err_msg}\n' + + # TODO: use `.msg.preetty_struct` for this! f'{msg}\n' ) @@ -822,6 +785,8 @@ async def process_messages( ''' + assert actor._service_n # state sanity + # TODO: once `trio` get's an "obvious way" for req/resp we # should use it? 
# https://github.com/python-trio/trio/issues/467 @@ -831,7 +796,7 @@ async def process_messages( f'|_{chan}\n' ) nursery_cancelled_before_task: bool = False - msg: dict | None = None + msg: Msg|None = None try: # NOTE: this internal scope allows for keeping this # message loop running despite the current task having @@ -840,6 +805,7 @@ async def process_messages( # using ``scope = Nursery.start()`` with CancelScope(shield=shield) as loop_cs: task_status.started(loop_cs) + async for msg in chan: log.transport( # type: ignore f'<= IPC msg from peer: {chan.uid}\n\n' @@ -894,21 +860,18 @@ async def process_messages( # cid, # channel, # requesting_uid=channel.uid, - # ipc_msg=msg, # ) - # # immediately break out of this loop! # break - # cid = msg.get('cid') - # if cid: case ( StartAck(cid=cid) | Started(cid=cid) | Yield(cid=cid) | Stop(cid=cid) | Return(cid=cid) + | CancelAck(cid=cid) | Error(cid=cid) ): # deliver response to local caller/waiter @@ -918,17 +881,85 @@ async def process_messages( cid, msg, ) + # TODO: can remove right? + # continue + + # runtime-internal cancellation endpoints + case Start( + ns='self', + func='cancel', + cid=cid, + kwargs=kwargs, + ): + kwargs |= {'req_chan': chan} + + # XXX NOTE XXX don't start entire actor + # runtime cancellation if this actor is + # currently in debug mode! + pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete + if pdb_complete: + await pdb_complete.wait() + + # Either of `Actor.cancel()`/`.cancel_soon()` + # was called, so terminate this IPC msg + # loop, exit back out into `async_main()`, + # and immediately start the core runtime + # machinery shutdown! + with CancelScope(shield=True): + await _invoke( + actor, + cid, + chan, + actor.cancel, + kwargs, + is_rpc=False, + return_msg=CancelAck, + ) log.runtime( - 'Waiting on next IPC msg from\n' - f'peer: {chan.uid}:\n' + 'Cancelling IPC transport msg-loop with peer:\n' f'|_{chan}\n' - - # f'last msg: {msg}\n' ) - continue + loop_cs.cancel() + break - # process a 'cmd' request-msg upack + case Start( + ns='self', + func='_cancel_task', + cid=cid, + kwargs=kwargs, + ): + target_cid: str = kwargs['cid'] + kwargs |= { + 'requesting_uid': chan.uid, + 'ipc_msg': msg, + + # XXX NOTE! ONLY the rpc-task-owning + # parent IPC channel should be able to + # cancel it! + 'parent_chan': chan, + } + try: + await _invoke( + actor, + cid, + chan, + actor._cancel_task, + kwargs, + is_rpc=False, + return_msg=CancelAck, + ) + except BaseException: + log.exception( + 'Failed to cancel task?\n' + f'<= canceller: {chan.uid}\n' + f' |_{chan}\n\n' + f'=> {actor}\n' + f' |_cid: {target_cid}\n' + ) + + # the "MAIN" RPC endpoint to schedule-a-`trio.Task` + # # TODO: impl with native `msgspec.Struct` support !! # -[ ] implement with ``match:`` syntax? # -[ ] discard un-authed msgs as per, @@ -940,139 +971,29 @@ async def process_messages( kwargs=kwargs, # type-spec this? see `msg.types` uid=actorid, ): - # try: - # ( - # ns, - # funcname, - # kwargs, - # actorid, - # cid, - # ) = msg['cmd'] - - # # TODO: put in `case Error():` right? - # except KeyError: - # # This is the non-rpc error case, that is, an - # # error **not** raised inside a call to ``_invoke()`` - # # (i.e. no cid was provided in the msg - see above). 
- # # Push this error to all local channel consumers - # # (normally portals) by marking the channel as errored - # assert chan.uid - # exc = unpack_error(msg, chan=chan) - # chan._exc = exc - # raise exc - log.runtime( 'Handling RPC `Start` request from\n' f'peer: {actorid}\n' '\n' f'=> {ns}.{funcname}({kwargs})\n' ) - # case Start( - # ns='self', - # funcname='cancel', - # ): + + # runtime-internal endpoint: `Actor.` + # only registry methods exist now yah, + # like ``.register_actor()`` etc. ? if ns == 'self': - if funcname == 'cancel': - func: Callable = actor.cancel - kwargs |= { - 'req_chan': chan, - } + func: Callable = getattr(actor, funcname) - # don't start entire actor runtime cancellation - # if this actor is currently in debug mode! - pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete - if pdb_complete: - await pdb_complete.wait() - - # Either of `Actor.cancel()`/`.cancel_soon()` - # was called, so terminate this IPC msg - # loop, exit back out into `async_main()`, - # and immediately start the core runtime - # machinery shutdown! - with CancelScope(shield=True): - await _invoke( - actor, - cid, - chan, - func, - kwargs, - is_rpc=False, - ) - - log.runtime( - 'Cancelling IPC transport msg-loop with peer:\n' - f'|_{chan}\n' - ) - loop_cs.cancel() - break - - # case Start( - # ns='self', - # funcname='_cancel_task', - # ): - if funcname == '_cancel_task': - func: Callable = actor._cancel_task - - # we immediately start the runtime machinery - # shutdown - # with CancelScope(shield=True): - target_cid: str = kwargs['cid'] - kwargs |= { - # NOTE: ONLY the rpc-task-owning - # parent IPC channel should be able to - # cancel it! - 'parent_chan': chan, - 'requesting_uid': chan.uid, - 'ipc_msg': msg, - } - # TODO: remove? already have emit in meth. - # log.runtime( - # f'Rx RPC task cancel request\n' - # f'<= canceller: {chan.uid}\n' - # f' |_{chan}\n\n' - # f'=> {actor}\n' - # f' |_cid: {target_cid}\n' - # ) - try: - await _invoke( - actor, - cid, - chan, - func, - kwargs, - is_rpc=False, - ) - except BaseException: - log.exception( - 'Failed to cancel task?\n' - f'<= canceller: {chan.uid}\n' - f' |_{chan}\n\n' - f'=> {actor}\n' - f' |_cid: {target_cid}\n' - ) - continue - - # case Start( - # ns='self', - # funcname='register_actor', - # ): - else: - # normally registry methods, eg. - # ``.register_actor()`` etc. - func: Callable = getattr(actor, funcname) - - # case Start( - # ns=str(), - # funcname=funcname, - # ): + # application RPC endpoint else: - # complain to client about restricted modules try: - func = actor._get_rpc_func(ns, funcname) + func: Callable = actor._get_rpc_func(ns, funcname) except ( ModuleNotExposed, AttributeError, ) as err: + # always complain to requester + # client about un-enabled modules err_msg: dict[str, dict] = pack_error( err, cid=cid, @@ -1082,6 +1003,7 @@ async def process_messages( # schedule a task for the requested RPC function # in the actor's main "service nursery". + # # TODO: possibly a service-tn per IPC channel for # supervision isolation? would avoid having to # manage RPC tasks individually in `._rpc_tasks` @@ -1090,7 +1012,7 @@ async def process_messages( f'Spawning task for RPC request\n' f'<= caller: {chan.uid}\n' f' |_{chan}\n\n' - # TODO: maddr style repr? + # ^-TODO-^ maddr style repr? # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' # f'cid="{cid[-16:]} .."\n\n' @@ -1098,7 +1020,6 @@ async def process_messages( f' |_cid: {cid}\n' f' |>> {func}()\n' ) - assert actor._service_n # wait why? do it at top? 
try: ctx: Context = await actor._service_n.start( partial( @@ -1128,13 +1049,12 @@ async def process_messages( log.warning( 'Task for RPC failed?' f'|_ {func}()\n\n' - f'{err}' ) continue else: - # mark that we have ongoing rpc tasks + # mark our global state with ongoing rpc tasks actor._ongoing_rpc_tasks = trio.Event() # store cancel scope such that the rpc task can be @@ -1145,23 +1065,24 @@ async def process_messages( trio.Event(), ) - case Error()|_: - # This is the non-rpc error case, that is, an - # error **not** raised inside a call to ``_invoke()`` - # (i.e. no cid was provided in the msg - see above). - # Push this error to all local channel consumers - # (normally portals) by marking the channel as errored + case Error() | _: + # NOTE: this is the non-rpc error case, + # that is, an error **not** raised inside + # a call to ``_invoke()`` (i.e. no cid was + # provided in the msg - see above). Push + # this error to all local channel + # consumers (normally portals) by marking + # the channel as errored log.exception( f'Unhandled IPC msg:\n\n' f'{msg}\n' ) - assert chan.uid - exc = unpack_error( + # assert chan.uid + chan._exc: Exception = unpack_error( msg, chan=chan, ) - chan._exc = exc - raise exc + raise chan._exc log.runtime( 'Waiting on next IPC msg from\n' @@ -1172,7 +1093,8 @@ async def process_messages( # end of async for, channel disconnect vis # ``trio.EndOfChannel`` log.runtime( - f"{chan} for {chan.uid} disconnected, cancelling tasks" + f'channel for {chan.uid} disconnected, cancelling RPC tasks\n' + f'|_{chan}\n' ) await actor.cancel_rpc_tasks( req_uid=actor.uid, diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 3e7a2d7a..7355a610 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -454,6 +454,10 @@ _runtime_msgs: list[Msg] = [ # emission from `MsgStream.aclose()` Stop, + # `Return` sub-type that we always accept from + # runtime-internal cancel endpoints + CancelAck, + # box remote errors, normally subtypes # of `RemoteActorError`. Error, -- 2.34.1 From aea5abdd70c215ead48613405bb8c83f499db341 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 7 Apr 2024 16:29:21 -0400 Subject: [PATCH 217/378] Use `object()` when checking for error field value Since the field value could be `None` or some other type with truthy-ness evaluating to `False`.. 
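The same idiom in isolation (a generic snippet, not the actual `_ipc.py` lines): a unique `object()` default lets `dict.get()` distinguish "key missing" from a present-but-falsey value.

    msg_dict = {'pld': None}

    sentinel = object()
    if (field_val := msg_dict.get('pld', sentinel)) is not sentinel:
        print(f'field present: {field_val!r}')  # runs even though the value is None
    else:
        print('field actually missing')
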
--- tractor/_ipc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 9af28e5a..694eaf9e 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -172,7 +172,8 @@ def _raise_msg_type_err( # specific field's type problem msgspec_msg: str = validation_err.args[0].rstrip('`') msg, _, maybe_field = msgspec_msg.rpartition('$.') - if field_val := msg_dict.get(maybe_field): + obj = object() + if (field_val := msg_dict.get(maybe_field, obj)) is not obj: field_type: Union[Type] = msg_type.__signature__.parameters[ maybe_field ].annotation -- 2.34.1 From 38111e8d538831742360878dee9c345e0ac8d1d7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 7 Apr 2024 16:35:00 -0400 Subject: [PATCH 218/378] Detail out EoC-by-self log msg --- tractor/_streaming.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 941cfe8d..dc30ac6e 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -46,7 +46,6 @@ from .trionics import ( from tractor.msg import ( Stop, Yield, - Error, ) if TYPE_CHECKING: @@ -391,11 +390,11 @@ class MsgStream(trio.abc.Channel): if not self._eoc: log.cancel( - 'Stream closed before it received an EoC?\n' + 'Stream closed by self before it received an EoC?\n' 'Setting eoc manually..\n..' ) self._eoc: bool = trio.EndOfChannel( - f'Context stream closed by {self._ctx.side}\n' + f'Context stream closed by self({self._ctx.side})\n' f'|_{self}\n' ) # ?XXX WAIT, why do we not close the local mem chan `._rx_chan` XXX? -- 2.34.1 From 8e83455a78e1ed2e3cc63179f49041d4faa0bf4c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 7 Apr 2024 18:54:03 -0400 Subject: [PATCH 219/378] Finally drop masked `chan.send(None)` related code blocks --- tractor/_rpc.py | 89 ++++++++++++--------------------------------- tractor/_runtime.py | 48 +----------------------- 2 files changed, 25 insertions(+), 112 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 9da8690b..75e59519 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -810,61 +810,15 @@ async def process_messages( log.transport( # type: ignore f'<= IPC msg from peer: {chan.uid}\n\n' - # TODO: conditionally avoid fmting depending - # on log level (for perf)? - # => specifically `pformat()` sub-call..? + # TODO: avoid fmting depending on loglevel for perf? + # -[ ] specifically `pformat()` sub-call..? + # -[ ] use `.msg.pretty_struct` here now instead! f'{pformat(msg)}\n' ) match msg: - - # NOTE: this *was a dedicated - # "graceful-terminate-loop" mechanism using - # a `None`-msg-sentinel which would cancel all RPC - # tasks parented by this loop's IPC channel; that - # is all rpc-scheduled-tasks started over the - # connection were explicitly per-task cancelled - # normally prior to the `Channel`'s underlying - # transport being later closed. - # - # * all `.send(None)`s were # removed as part of - # typed-msging requirements - # - # TODO: if this mechanism is still desired going - # forward it should be implemented as part of the - # normal runtime-cancel-RPC endpoints with either, - # - a special `msg.types.Msg` to trigger the loop endpoint - # (like `None` was used prior) or, - # - it should just be accomplished using A - # `Start(ns='self', func='cancel_rpc_tasks())` - # request instead? 
- # - # if msg is None: - # case None: - # tasks: dict[ - # tuple[Channel, str], - # tuple[Context, Callable, trio.Event] - # ] = actor._rpc_tasks.copy() - # log.cancel( - # f'Peer IPC channel terminated via `None` setinel msg?\n' - # f'=> Cancelling all {len(tasks)} local RPC tasks..\n' - # f'peer: {chan.uid}\n' - # f'|_{chan}\n' - # ) - # # TODO: why aren't we just calling - # # `.cancel_rpc_tasks()` with the parent - # # chan as input instead? - # for (channel, cid) in tasks: - # if channel is chan: - # await actor._cancel_task( - # cid, - # channel, - # requesting_uid=channel.uid, - # ipc_msg=msg, - # ) - # # immediately break out of this loop! - # break - + # msg for an ongoing IPC ctx session, deliver msg to + # local task. case ( StartAck(cid=cid) | Started(cid=cid) @@ -872,7 +826,7 @@ async def process_messages( | Stop(cid=cid) | Return(cid=cid) | CancelAck(cid=cid) - | Error(cid=cid) + | Error(cid=cid) # RPC-task ctx specific ): # deliver response to local caller/waiter # via its per-remote-context memory channel. @@ -881,10 +835,8 @@ async def process_messages( cid, msg, ) - # TODO: can remove right? - # continue - # runtime-internal cancellation endpoints + # `Actor`(-internal) runtime cancel requests case Start( ns='self', func='cancel', @@ -959,11 +911,9 @@ async def process_messages( ) # the "MAIN" RPC endpoint to schedule-a-`trio.Task` - # - # TODO: impl with native `msgspec.Struct` support !! - # -[ ] implement with ``match:`` syntax? - # -[ ] discard un-authed msgs as per, - # + # ------ - ------ + # -[x] discard un-authed msgs as per, + # case Start( cid=cid, ns=ns, @@ -987,7 +937,10 @@ async def process_messages( # application RPC endpoint else: try: - func: Callable = actor._get_rpc_func(ns, funcname) + func: Callable = actor._get_rpc_func( + ns, + funcname, + ) except ( ModuleNotExposed, AttributeError, @@ -1065,6 +1018,8 @@ async def process_messages( trio.Event(), ) + # XXX remote (runtime scoped) error or uknown + # msg (type). case Error() | _: # NOTE: this is the non-rpc error case, # that is, an error **not** raised inside @@ -1090,8 +1045,9 @@ async def process_messages( f'|_{chan}\n' ) - # end of async for, channel disconnect vis - # ``trio.EndOfChannel`` + # END-OF `async for`: + # IPC disconnected via `trio.EndOfChannel`, likely + # due to a (graceful) `Channel.aclose()`. log.runtime( f'channel for {chan.uid} disconnected, cancelling RPC tasks\n' f'|_{chan}\n' @@ -1111,9 +1067,10 @@ async def process_messages( # connection-reset) is ok since we don't have a teardown # handshake for them (yet) and instead we simply bail out of # the message loop and expect the teardown sequence to clean - # up. - # TODO: don't show this msg if it's an emphemeral - # discovery ep call? + # up.. + # TODO: add a teardown handshake? and, + # -[ ] don't show this msg if it's an ephemeral discovery ep call? + # -[ ] figure out how this will break with other transports? log.runtime( f'channel closed abruptly with\n' f'peer: {chan.uid}\n' diff --git a/tractor/_runtime.py b/tractor/_runtime.py index e08d074a..854db3a3 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -393,8 +393,9 @@ class Actor: raise mne + # TODO: maybe change to mod-func and rename for implied + # multi-transport semantics? async def _stream_handler( - self, stream: trio.SocketStream, @@ -713,51 +714,6 @@ class Actor: # TODO: figure out why this breaks tests.. db_cs.cancel() - # XXX TODO XXX: DO WE NEED THIS? 
- # -[ ] is it necessary any more (GC should do it) now - # that we have strict(er) graceful cancellation - # semantics? - # XXX WARNING XXX - # Be AWARE OF THE INDENT LEVEL HERE - # -> ONLY ENTER THIS BLOCK WHEN ._peers IS - # EMPTY!!!! - # - # if the channel is still connected it may mean the far - # end has not closed and we may have gotten here due to - # an error and so we should at least try to terminate - # the channel from this end gracefully. - #if ( - # not self._peers - # and chan.connected() - #): - # log.runtime( - # 'Terminating channel with `None` setinel msg\n' - # f'|_{chan}\n' - # ) - # try: - # # ORIGINALLY we sent a msg loop terminate - # # sentinel (`None`) which triggers - # # cancellation of all remotely started - # # tasks. - # # - # # HOWEVER, after we added typed msging, - # # you can't just willy nilly send `None` - # # wherever since it might be invalid given - # # the currently configured msg-spec. - # # - # # SO, this was all removed and I'm pretty - # # confident we don't need it replaced with - # # a manual RPC to - # # a `Actor.cancel_rpc_tasks()` right? - # await chan.send(None) - - # # XXX: do we want this? NO RIGHT? - # # causes "[104] connection reset by peer" on other end - # # await chan.aclose() - - # except trio.BrokenResourceError: - # log.runtime(f"Channel {chan.uid} was already closed") - # TODO: rename to `._deliver_payload()` since this handles # more then just `result` msgs now obvi XD async def _push_result( -- 2.34.1 From 2f451ab9a37f527a0bfca6f90b684365b98d7ad8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 8 Apr 2024 10:13:14 -0400 Subject: [PATCH 220/378] Caps-msging test tweaks to get correct failures These are likely temporary changes but still needed to actually see the desired/correct failures (of which 5 of 6 tests are supposed to fail rn) mostly to do with `Start` and `Return` msgs which are invalid under each test's applied msg-spec. Tweak set here: - bit more `print()`s in root and sub for grokin test flow. - never use `pytes.fail()` in subactor.. should know this by now XD - comment out some bits that can't ever pass rn and make the underlying expected failues harder to grok: - the sub's child-side-of-ctx task doing sends should only fail for certain msg types like `Started` + `Return`, `Yield`s are processed receiver/parent side. - don't expect `sent` list to match predicate set for the same reason as last bullet. The outstanding msg-type-semantic validation questions are: - how to handle `.open_context()` with an input `kwargs` set that doesn't adhere to the currently applied msg-spec? - should the initial `@acm` entry fail before sending to the child side? - where should received `MsgTypeError`s be raised, at the `MsgStream` `.receive()` or lower in the stack? - i'm thinking we should mk `MsgTypeError` derive from `RemoteActorError` and then have it be delivered as an error to the `Context`/`MsgStream` for per-ctx-task handling; would lead to more flexible/modular policy overrides in user code outside any defaults we provide. 
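As a rough sketch of the per-ctx-task handling alluded to in that last
bullet (purely hypothetical usage; it assumes `MsgTypeError` does end up
being relayed to and raised from `MsgStream.receive()`, which is exactly
the still-open question above):

    import trio
    import tractor

    async def drop_invalid(stream: tractor.MsgStream):
        '''
        Wrap a stream and discard any msg which violates the applied
        msg-spec instead of tearing down the whole ctx.

        '''
        while True:
            try:
                yield await stream.receive()
            except tractor.MsgTypeError as mte:
                # user-defined policy: log-and-continue
                print(f'ignoring spec-invalid msg:\n{mte}')
            except trio.EndOfChannel:
                break

    # usage: `async for msg in drop_invalid(stream): ...`

The point being that such policies stay plain-old user code wrapping the
`Context`/`MsgStream` APIs, no special registry hooks needed.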
--- tests/test_caps_based_msging.py | 85 ++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/tests/test_caps_based_msging.py b/tests/test_caps_based_msging.py index acc1f307..f7cab2a5 100644 --- a/tests/test_caps_based_msging.py +++ b/tests/test_caps_based_msging.py @@ -374,7 +374,7 @@ def enc_type_union( @tractor.context -async def send_back_nsp( +async def send_back_values( ctx: Context, expect_debug: bool, pld_spec_type_strs: list[str], @@ -388,6 +388,8 @@ async def send_back_nsp( and ensure we can round trip a func ref with our parent. ''' + uid: tuple = tractor.current_actor().uid + # debug mode sanity check (prolly superfluous but, meh) assert expect_debug == _state.debug_mode() @@ -414,7 +416,7 @@ async def send_back_nsp( ) print( - 'CHILD attempting `Started`-bytes DECODE..\n' + f'{uid}: attempting `Started`-bytes DECODE..\n' ) try: msg: Started = nsp_codec.decode(started_msg_bytes) @@ -436,7 +438,7 @@ async def send_back_nsp( raise else: print( - 'CHILD (correctly) unable to DECODE `Started`-bytes\n' + f'{uid}: (correctly) unable to DECODE `Started`-bytes\n' f'{started_msg_bytes}\n' ) @@ -445,7 +447,7 @@ async def send_back_nsp( for send_value, expect_send in iter_send_val_items: try: print( - f'CHILD attempting to `.started({send_value})`\n' + f'{uid}: attempting to `.started({send_value})`\n' f'=> expect_send: {expect_send}\n' f'SINCE, ipc_pld_spec: {ipc_pld_spec}\n' f'AND, codec: {codec}\n' @@ -460,7 +462,6 @@ async def send_back_nsp( # await tractor.pause() raise RuntimeError( - # pytest.fail( f'NOT-EXPECTED able to roundtrip value given spec:\n' f'ipc_pld_spec -> {ipc_pld_spec}\n' f'value -> {send_value}: {type(send_value)}\n' @@ -468,53 +469,76 @@ async def send_back_nsp( break # move on to streaming block.. - except NotImplementedError: - print('FAILED ENCODE!') - except tractor.MsgTypeError: # await tractor.pause() if expect_send: - pytest.fail( + raise RuntimeError( f'EXPECTED to `.started()` value given spec:\n' f'ipc_pld_spec -> {ipc_pld_spec}\n' f'value -> {send_value}: {type(send_value)}\n' ) async with ctx.open_stream() as ipc: + print( + f'{uid}: Entering streaming block to send remaining values..' 
+ ) + for send_value, expect_send in iter_send_val_items: send_type: Type = type(send_value) print( - 'CHILD report on send value\n' + '------ - ------\n' + f'{uid}: SENDING NEXT VALUE\n' f'ipc_pld_spec: {ipc_pld_spec}\n' f'expect_send: {expect_send}\n' f'val: {send_value}\n' + '------ - ------\n' ) try: await ipc.send(send_value) + print(f'***\n{uid}-CHILD sent {send_value!r}\n***\n') sent.append(send_value) - if not expect_send: - pytest.fail( - f'NOT-EXPECTED able to roundtrip value given spec:\n' - f'ipc_pld_spec -> {ipc_pld_spec}\n' - f'value -> {send_value}: {send_type}\n' - ) + + # NOTE: should only raise above on + # `.started()` or a `Return` + # if not expect_send: + # raise RuntimeError( + # f'NOT-EXPECTED able to roundtrip value given spec:\n' + # f'ipc_pld_spec -> {ipc_pld_spec}\n' + # f'value -> {send_value}: {send_type}\n' + # ) + except ValidationError: + print(f'{uid} FAILED TO SEND {send_value}!') + + # await tractor.pause() if expect_send: - pytest.fail( + raise RuntimeError( f'EXPECTED to roundtrip value given spec:\n' f'ipc_pld_spec -> {ipc_pld_spec}\n' f'value -> {send_value}: {send_type}\n' ) - continue + # continue - assert ( - len(sent) - == - len([val - for val, expect in - expect_ipc_send.values() - if expect is True]) - ) + else: + print( + f'{uid}: finished sending all values\n' + 'Should be exiting stream block!\n' + ) + + print(f'{uid}: exited streaming block!') + + # TODO: this won't be true bc in streaming phase we DO NOT + # msgspec check outbound msgs! + # -[ ] once we implement the receiver side `InvalidMsg` + # then we can expect it here? + # assert ( + # len(sent) + # == + # len([val + # for val, expect in + # expect_ipc_send.values() + # if expect is True]) + # ) def ex_func(*args): @@ -635,7 +659,7 @@ def test_codec_hooks_mod( async with ( p.open_context( - send_back_nsp, + send_back_values, expect_debug=debug_mode, pld_spec_type_strs=pld_spec_type_strs, add_hooks=add_codec_hooks, @@ -665,10 +689,13 @@ def test_codec_hooks_mod( async for next_sent in ipc: print( - 'Child sent next value\n' + 'Parent: child sent next value\n' f'{next_sent}: {type(next_sent)}\n' ) - expect_to_send.remove(next_sent) + if expect_to_send: + expect_to_send.remove(next_sent) + else: + print('PARENT should terminate stream loop + block!') # all sent values should have arrived! assert not expect_to_send -- 2.34.1 From b341146bd1bd3df6c9fad2b97c11760c32083db1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 8 Apr 2024 10:25:57 -0400 Subject: [PATCH 221/378] Rename `Actor._push_result()` -> `._deliver_ctx_payload()` Better describes the internal RPC impl/latest-architecture with the msgs delivered being those which either define a `.pld: PayloadT` that gets passed up to user code, or the error-msg subset that similarly is raised in a ctx-linked task. 
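Loosely, the delivery dichotomy the new name points at, as a sketch only
(not the runtime code): payload-bearing msgs surface their `.pld` to the
user task while an `Error` gets raised in that same ctx-linked task.

    from tractor.msg import Error, Return, Yield

    def describe_delivery(msg) -> str:
        # mirror of the two outcomes handled by the renamed method
        match msg:
            case Yield(pld=pld) | Return(pld=pld):
                return f'payload passed up to user code: {pld!r}'
            case Error():
                return 'boxed remote error, raised in the ctx-linked task'
            case _:
                return 'runtime/ctl msg handled internally'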
--- tractor/_context.py | 10 +++++----- tractor/_rpc.py | 2 +- tractor/_runtime.py | 15 ++++++++++----- tractor/_streaming.py | 2 +- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 6e55c3c9..6a634166 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1207,7 +1207,7 @@ class Context: # XXX: (MEGA IMPORTANT) if this is a root opened process we # wait for any immediate child in debug before popping the # context from the runtime msg loop otherwise inside - # ``Actor._push_result()`` the msg will be discarded and in + # ``Actor._deliver_ctx_payload()`` the msg will be discarded and in # the case where that msg is global debugger unlock (via # a "stop" msg for a stream), this can result in a deadlock # where the root is waiting on the lock to clear but the @@ -1698,11 +1698,11 @@ class Context: # raise any msg type error NO MATTER WHAT! except msgspec.ValidationError as verr: - from tractor._ipc import _raise_msg_type_err - _raise_msg_type_err( + from tractor._ipc import _mk_msg_type_err + raise _mk_msg_type_err( msg=msg_bytes, codec=codec, - validation_err=verr, + src_validation_error=verr, verb_header='Trying to send payload' # > 'invalid `Started IPC msgs\n' ) @@ -2415,7 +2415,7 @@ async def open_context_from_portal( # XXX: (MEGA IMPORTANT) if this is a root opened process we # wait for any immediate child in debug before popping the # context from the runtime msg loop otherwise inside - # ``Actor._push_result()`` the msg will be discarded and in + # ``Actor._deliver_ctx_payload()`` the msg will be discarded and in # the case where that msg is global debugger unlock (via # a "stop" msg for a stream), this can result in a deadlock # where the root is waiting on the lock to clear but the diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 75e59519..d935909f 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -830,7 +830,7 @@ async def process_messages( ): # deliver response to local caller/waiter # via its per-remote-context memory channel. 
- await actor._push_result( + await actor._deliver_ctx_payload( chan, cid, msg, diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 854db3a3..4be5ea1f 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -69,6 +69,7 @@ from tractor.msg import ( pretty_struct, NamespacePath, types as msgtypes, + Msg, ) from ._ipc import Channel from ._context import ( @@ -77,9 +78,10 @@ from ._context import ( ) from .log import get_logger from ._exceptions import ( - unpack_error, - ModuleNotExposed, ContextCancelled, + ModuleNotExposed, + MsgTypeError, + unpack_error, TransportClosed, ) from .devx import ( @@ -557,7 +559,7 @@ class Actor: cid: str|None = msg.cid if cid: # deliver response to local caller/waiter - await self._push_result( + await self._deliver_ctx_payload( chan, cid, msg, @@ -716,11 +718,11 @@ class Actor: # TODO: rename to `._deliver_payload()` since this handles # more then just `result` msgs now obvi XD - async def _push_result( + async def _deliver_ctx_payload( self, chan: Channel, cid: str, - msg: dict[str, Any], + msg: Msg|MsgTypeError, ) -> None|bool: ''' @@ -749,6 +751,9 @@ class Actor: ) return + # if isinstance(msg, MsgTypeError): + # return await ctx._deliver_bad_msg() + return await ctx._deliver_msg(msg) def get_context( diff --git a/tractor/_streaming.py b/tractor/_streaming.py index dc30ac6e..fcf8dafc 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -183,7 +183,7 @@ class MsgStream(trio.abc.Channel): # - via a received `{'stop': ...}` msg from remote side. # |_ NOTE: previously this was triggered by calling # ``._rx_chan.aclose()`` on the send side of the channel inside - # `Actor._push_result()`, but now the 'stop' message handling + # `Actor._deliver_ctx_payload()`, but now the 'stop' message handling # has been put just above inside `_raise_from_no_key_in_msg()`. except ( trio.EndOfChannel, -- 2.34.1 From cf48fdecfeb76cdf67a3f779d98c086e75659dd6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 9 Apr 2024 08:44:06 -0400 Subject: [PATCH 222/378] Unify `MsgTypeError` as a `RemoteActorError` subtype Since in the receive-side error case the source of the exception is the sender side (normally causing a local `TypeError` at decode time), might as well bundle the error in remote-capture-style using boxing semantics around the causing local type error raised from the `msgspec.msgpack.Decoder.decode()` and with a traceback packed from `msgspec`-specific knowledge of any field-type spec matching failure. Deats on new `MsgTypeError` interface: - includes a `.msg_dict` to get access to any `Decoder.type`-applied load of the original (underlying and offending) IPC msg into a `dict` form using a vanilla decoder which is normally packed into the instance as a `._msg_dict`. - a public getter to the "supposed offending msg" via `.payload_msg` which attempts to take the above `.msg_dict` and load it manually into the corresponding `.msg.types.MsgType` struct. - a constructor `.from_decode()` to make it simple to build out error instances from a failed decode scope where the aforementioned `msgdict: dict` from the vanilla decode can be provided directly. - ALSO, we now pack into `MsgTypeError` directly just like ctxc in `unpack_error()` This also completes the while-standing todo for `RemoteActorError` to contain a ref to the underlying `Error` msg as `._ipc_msg` with public `@property` access that `defstruct()`-creates a pretty struct version via `.ipc_msg`. 
Internal tweaks for this include:
- `._ipc_msg` is the internal literal `Error`-msg instance if provided,
  with `.ipc_msg` being the dynamic wrapper as mentioned above.
- `.__init__()` can still take variable `**extra_msgdata` (similar to
  the `dict`-msgdata as before) to maintain support for subtypes which
  are constructed manually (not only by `pack_error()`) and insert
  their own attrs, which get placed in a `._extra_msgdata: dict` if no
  `ipc_msg: Error` is provided as input.
- the `.msgdata` is now a merge of any `._extra_msgdata` and
  a `dict`-casted form of any `._ipc_msg`.
- adjust all previous `.msgdata` field lookups to try equivalent field
  reads on `._ipc_msg: Error`.
- drop the default single-ws indent from `.tb_str` and do a failover
  lookup to `.msgdata` when `._ipc_msg is None` for the manually
  constructed subtype-instance case.
- add a new class attr `.extra_body_fields: list[str]` to allow
  subtypes to declare attrs they want shown in the `.__repr__()`
  output, eg. `ContextCancelled.canceller`, `StreamOverrun.sender` and
  `MsgTypeError.payload_msg`.
- rework the related defaults with a rename from `_msgdata_keys` ->
  `_ipcmsg_keys`, with the latter now loading directly from the `Error`
  fields def and `_body_fields: list[str]` just taking that value and
  removing the not-so-useful-in-REPL or already-shown (i.e. `.tb_str:
  str`) field names.
- add a new mod-level `.pack_from_raise()` helper for auto-boxing RAE
  subtypes constructed manually into `Error`s, which is normally how
  `StreamOverrun` and `MsgTypeError` get created in the runtime.
- in support of the above, expose a `src_uid: tuple` override to
  `pack_error()` such that the runtime can provide any remote actor id
  when packing a locally-created yet remotely-caused RAE subtype.
- adjust all typing to expect `Error`s over `dict`-msgs.

Adjust some tests to match these changes:
- context and inter-peer-cancel tests now make their `.msgdata` related
  checks against the new `.ipc_msg` as well as `.tb_str` directly.
- toss in an extra sleep to `sleep_a_bit_then_cancel_peer()` to keep
  the 'canceller' ctx child task cancelled by its parent in the 'root'
  for the rte-raised-during-ctxc-handling case (apparently it's now
  returning too fast, cool?).
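Loosely, the accessor surface the adjusted tests now exercise, shown as
a sketch operating on a presumed already-caught, remotely-boxed error
instance (the helper name is made up):

    import tractor

    def inspect_boxed(rae: tractor.RemoteActorError) -> None:
        # tb-str now reads straight off the error (no default ws indent)
        print(rae.tb_str)

        # `.msgdata` is a merge of `._extra_msgdata` and the dict-casted
        # underlying `Error` msg, so both views agree for a received error
        assert rae.tb_str == rae.msgdata['tb_str']

        # `.ipc_msg` renders a read-only pretty-struct copy of `._ipc_msg`;
        # a fresh instance is built on every property access, so compare
        # the private literal msg when identity matters.
        print(rae.ipc_msg)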
--- tests/test_context_stream_semantics.py | 7 +- tests/test_inter_peer_cancellation.py | 10 + tractor/_exceptions.py | 418 +++++++++++++++++++------ 3 files changed, 333 insertions(+), 102 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 06a7f8c9..5df133d8 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -796,10 +796,12 @@ async def test_callee_cancels_before_started( # raises a special cancel signal except tractor.ContextCancelled as ce: + _ce = ce # for debug on crash ce.boxed_type == trio.Cancelled # the traceback should be informative - assert 'itself' in ce.msgdata['tb_str'] + assert 'itself' in ce.tb_str + assert ce.tb_str == ce.msgdata['tb_str'] # teardown the actor await portal.cancel_actor() @@ -1157,7 +1159,8 @@ def test_maybe_allow_overruns_stream( elif slow_side == 'parent': assert err.boxed_type == tractor.RemoteActorError - assert 'StreamOverrun' in err.msgdata['tb_str'] + assert 'StreamOverrun' in err.tb_str + assert err.tb_str == err.msgdata['tb_str'] else: # if this hits the logic blocks from above are not diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index 470287fb..aa05e3c8 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -185,6 +185,10 @@ async def sleep_a_bit_then_cancel_peer( await trio.sleep(cancel_after) await peer.cancel_actor() + # such that we're cancelled by our rent ctx-task + await trio.sleep(3) + print('CANCELLER RETURNING!') + @tractor.context async def stream_ints( @@ -245,6 +249,12 @@ async def stream_from_peer( assert peer_ctx._remote_error is ctxerr assert peer_ctx._remote_error.msgdata == ctxerr.msgdata + # XXX YES, bc exact same msg instances + assert peer_ctx._remote_error._ipc_msg is ctxerr._ipc_msg + + # XXX NO, bc new one always created for property accesss + assert peer_ctx._remote_error.ipc_msg != ctxerr.ipc_msg + # the peer ctx is the canceller even though it's canceller # is the "canceller" XD assert peer_name in peer_ctx.canceller diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 28c61628..a31aa11e 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -31,7 +31,10 @@ import textwrap import traceback import trio -from msgspec import structs +from msgspec import ( + structs, + defstruct, +) from tractor._state import current_actor from tractor.log import get_logger @@ -40,6 +43,8 @@ from tractor.msg import ( Msg, Stop, Yield, + pretty_struct, + types as msgtypes, ) if TYPE_CHECKING: @@ -64,21 +69,38 @@ class InternalError(RuntimeError): ''' -_body_fields: list[str] = [ - 'boxed_type', - 'src_type', - # TODO: format this better if we're going to include it. - # 'relay_path', - 'src_uid', - # only in sub-types - 'canceller', - 'sender', +# NOTE: more or less should be close to these: +# 'boxed_type', +# 'src_type', +# 'src_uid', +# 'canceller', +# 'sender', +# TODO: format this better if we're going to include it. +# 'relay_path', +# +_ipcmsg_keys: list[str] = [ + fi.name + for fi, k, v + in pretty_struct.iter_fields(Error) + ] -_msgdata_keys: list[str] = [ - 'boxed_type_str', -] + _body_fields +_body_fields: list[str] = list( + set(_ipcmsg_keys) + + # NOTE: don't show fields that either don't provide + # any extra useful info or that are already shown + # as part of `.__repr__()` output. 
+ - { + 'src_type_str', + 'boxed_type_str', + 'tb_str', + 'relay_path', + '_msg_dict', + 'cid', + } +) def get_err_type(type_name: str) -> BaseException|None: @@ -137,7 +159,7 @@ def pformat_boxed_tb( f'|\n' f' ------ - ------\n\n' f'{tb_str}\n' - f' ------ - ------\n' + f' ------ - ------\n' f'_|\n' ) if len(indent): @@ -152,10 +174,40 @@ def pformat_boxed_tb( + body ) - # return body -# TODO: rename to just `RemoteError`? +def pack_from_raise( + local_err: ( + ContextCancelled + |StreamOverrun + |MsgTypeError + ), + cid: str, + + **rae_fields, + +) -> Error: + ''' + Raise the provided `RemoteActorError` subtype exception + instance locally to get a traceback and pack it into an IPC + `Error`-msg using `pack_error()` to extract the tb info. + + ''' + try: + raise local_err + except type(local_err) as local_err: + err_msg: dict[str, dict] = pack_error( + local_err, + cid=cid, + **rae_fields, + ) + return err_msg + + +# TODO: better compat with IPC msg structs? +# -[ ] rename to just `RemoteError` like in `mp.manager`? +# -[ ] make a `Struct`-subtype by using the .__post_init__()`? +# https://jcristharif.com/msgspec/structs.html#post-init-processing class RemoteActorError(Exception): ''' A box(ing) type which bundles a remote actor `BaseException` for @@ -170,12 +222,28 @@ class RemoteActorError(Exception): 'src_uid', # 'relay_path', ] + extra_body_fields: list[str] = [ + 'cid', + 'boxed_type', + ] def __init__( self, message: str, + ipc_msg: Error|None = None, boxed_type: Type[BaseException]|None = None, - **msgdata + + # NOTE: only provided by subtypes (ctxc and overruns) + # wishing to both manually instantiate and add field + # values defined on `Error` without having to construct an + # `Error()` before the exception is processed by + # `pack_error()`. + # + # TODO: a better way to support this without the extra + # private `._extra_msgdata`? + # -[ ] ctxc constructed inside `._rpc._invoke()` L:638 + # -[ ] overrun @ `._context.Context._deliver_msg()` L:1958 + **extra_msgdata, ) -> None: super().__init__(message) @@ -188,14 +256,24 @@ class RemoteActorError(Exception): # - .remote_type # also pertains to our long long oustanding issue XD # https://github.com/goodboy/tractor/issues/5 - # - # TODO: always set ._boxed_type` as `None` by default - # and instead render if from `.boxed_type_str`? self._boxed_type: BaseException = boxed_type self._src_type: BaseException|None = None + self._ipc_msg: Error|None = ipc_msg - # TODO: make this a `.errmsg: Error` throughout? - self.msgdata: dict[str, Any] = msgdata + if ( + extra_msgdata + and ipc_msg + ): + # XXX mutate the orig msg directly from + # manually provided input params. + for k, v in extra_msgdata.items(): + setattr( + self._ipc_msg, + k, + v, + ) + else: + self._extra_msgdata = extra_msgdata # TODO: mask out eventually or place in `pack_error()` # pre-`return` lines? @@ -214,14 +292,56 @@ class RemoteActorError(Exception): # either by customizing `ContextCancelled.__init__()` or # through a special factor func? 
elif boxed_type: - if not self.msgdata.get('boxed_type_str'): - self.msgdata['boxed_type_str'] = str( - type(boxed_type).__name__ - ) + boxed_type_str: str = type(boxed_type).__name__ + if ( + ipc_msg + and not self._ipc_msg.boxed_type_str + ): + self._ipc_msg.boxed_type_str = boxed_type_str + assert self.boxed_type_str == self._ipc_msg.boxed_type_str + + else: + self._extra_msgdata['boxed_type_str'] = boxed_type_str - assert self.boxed_type_str == self.msgdata['boxed_type_str'] assert self.boxed_type is boxed_type + @property + def ipc_msg(self) -> pretty_struct.Struct: + ''' + Re-render the underlying `._ipc_msg: Msg` as + a `pretty_struct.Struct` for introspection such that the + returned value is a read-only copy of the original. + + ''' + if self._ipc_msg is None: + return None + + msg_type: Msg = type(self._ipc_msg) + fields: dict[str, Any] = { + k: v for _, k, v in + pretty_struct.iter_fields(self._ipc_msg) + } + return defstruct( + msg_type.__name__, + fields=fields.keys(), + bases=(msg_type, pretty_struct.Struct), + )(**fields) + + @property + def msgdata(self) -> dict[str, Any]: + ''' + The (remote) error data provided by a merge of the + `._ipc_msg: Error` msg and any input `._extra_msgdata: dict` + (provided by subtypes via `.__init__()`). + + ''' + msgdata: dict = ( + structs.asdict(self._ipc_msg) + if self._ipc_msg + else {} + ) + return self._extra_msgdata | msgdata + @property def src_type_str(self) -> str: ''' @@ -231,7 +351,7 @@ class RemoteActorError(Exception): at the first relay/hop's receiving actor. ''' - return self.msgdata['src_type_str'] + return self._ipc_msg.src_type_str @property def src_type(self) -> str: @@ -241,7 +361,7 @@ class RemoteActorError(Exception): ''' if self._src_type is None: self._src_type = get_err_type( - self.msgdata['src_type_str'] + self._ipc_msg.src_type_str ) return self._src_type @@ -252,7 +372,7 @@ class RemoteActorError(Exception): String-name of the (last hop's) boxed error type. ''' - return self.msgdata['boxed_type_str'] + return self._ipc_msg.boxed_type_str @property def boxed_type(self) -> str: @@ -262,7 +382,7 @@ class RemoteActorError(Exception): ''' if self._boxed_type is None: self._boxed_type = get_err_type( - self.msgdata['boxed_type_str'] + self._ipc_msg.boxed_type_str ) return self._boxed_type @@ -275,40 +395,44 @@ class RemoteActorError(Exception): actor's hop. NOTE: a `list` field with the same name is expected to be - passed/updated in `.msgdata`. + passed/updated in `.ipc_msg`. ''' - return self.msgdata['relay_path'] + return self._ipc_msg.relay_path @property def relay_uid(self) -> tuple[str, str]|None: return tuple( - self.msgdata['relay_path'][-1] + self._ipc_msg.relay_path[-1] ) @property def src_uid(self) -> tuple[str, str]|None: if src_uid := ( - self.msgdata.get('src_uid') + self._ipc_msg.src_uid ): return tuple(src_uid) # TODO: use path lookup instead? 
# return tuple( - # self.msgdata['relay_path'][0] + # self._ipc_msg.relay_path[0] # ) @property def tb_str( self, - indent: str = ' ', + indent: str = '', ) -> str: - if remote_tb := self.msgdata.get('tb_str'): - return textwrap.indent( - remote_tb, - prefix=indent, - ) + remote_tb: str = '' - return '' + if self._ipc_msg: + remote_tb: str = self._ipc_msg.tb_str + else: + remote_tb = self.msgdata.get('tb_str') + + return textwrap.indent( + remote_tb or '', + prefix=indent, + ) def _mk_fields_str( self, @@ -320,14 +444,17 @@ class RemoteActorError(Exception): val: Any|None = ( getattr(self, key, None) or - self.msgdata.get(key) + getattr( + self._ipc_msg, + key, + None, + ) ) # TODO: for `.relay_path` on multiline? # if not isinstance(val, str): # val_str = pformat(val) # else: val_str: str = repr(val) - if val: _repr += f'{key}={val_str}{end_char}' @@ -358,7 +485,9 @@ class RemoteActorError(Exception): ''' fields: str = self._mk_fields_str( - _body_fields, + _body_fields + + + self.extra_body_fields, ) body: str = pformat_boxed_tb( tb_str=self.tb_str, @@ -415,15 +544,6 @@ class RemoteActorError(Exception): # raise NotImplementedError -class InternalActorError(RemoteActorError): - ''' - (Remote) internal `tractor` error indicating failure of some - primitive, machinery state or lowlevel task that should never - occur. - - ''' - - class ContextCancelled(RemoteActorError): ''' Inter-actor task context was cancelled by either a call to @@ -433,6 +553,10 @@ class ContextCancelled(RemoteActorError): reprol_fields: list[str] = [ 'canceller', ] + extra_body_fields: list[str] = [ + 'cid', + 'canceller', + ] @property def canceller(self) -> tuple[str, str]|None: ''' @@ -454,7 +578,7 @@ class ContextCancelled(RemoteActorError): |_`._cancel_task()` ''' - value = self.msgdata.get('canceller') + value: tuple[str, str]|None = self._ipc_msg.canceller if value: return tuple(value) @@ -468,6 +592,132 @@ class ContextCancelled(RemoteActorError): # src_actor_uid = canceller +class MsgTypeError( + RemoteActorError, +): + ''' + Equivalent of a runtime `TypeError` for IPC dialogs. + + Raise when any IPC wire-message is decoded to have invalid + field values (due to type) or for other `MsgCodec` related + violations such as having no extension-type for a field with + a custom type but no `enc/dec_hook()` support. + + Can be raised on the send or recv side of an IPC `Channel` + depending on the particular msg. + + Msgs which cause this to be raised on the `.send()` side (aka + in the "ctl" dialog phase) include: + - `Start` + - `Started` + - `Return` + + Those which cause it on on the `.recv()` side (aka the "nasty + streaming" dialog phase) are: + - `Yield` + - TODO: any embedded `.pld` type defined by user code? + + Normally the source of an error is re-raised from some `.msg._codec` + decode which itself raises in a backend interchange + lib (eg. a `msgspec.ValidationError`). + + ''' + reprol_fields: list[str] = [ + 'ipc_msg', + ] + extra_body_fields: list[str] = [ + 'cid', + 'payload_msg', + ] + + @property + def msg_dict(self) -> dict[str, Any]: + ''' + If the underlying IPC `Msg` was received from a remote + actor but was unable to be decoded to a native + `Yield`|`Started`|`Return` struct, the interchange backend + native format decoder can be used to stash a `dict` + version for introspection by the invalidating RPC task. 
+ + ''' + return self.msgdata.get('_msg_dict') + + @property + def payload_msg(self) -> Msg|None: + ''' + Attempt to construct what would have been the original + `Msg`-with-payload subtype (i.e. an instance from the set + of msgs in `.msg.types._payload_msgs`) which failed + validation. + + ''' + msg_dict: dict = self.msg_dict.copy() + name: str = msg_dict.pop('msg_type') + msg_type: Msg = getattr( + msgtypes, + name, + Msg, + ) + return msg_type(**msg_dict) + + @property + def cid(self) -> str: + # pre-packed using `.from_decode()` constructor + return self.msgdata.get('cid') + + @classmethod + def from_decode( + cls, + message: str, + msgdict: dict, + + ) -> MsgTypeError: + return cls( + message=message, + + # NOTE: original "vanilla decode" of the msg-bytes + # is placed inside a value readable from + # `.msgdata['_msg_dict']` + _msg_dict=msgdict, + + # expand and pack all RAE compat fields + # into the `._extra_msgdata` aux `dict`. + **{ + k: v + for k, v in msgdict.items() + if k in _ipcmsg_keys + }, + ) + + +class StreamOverrun( + RemoteActorError, + trio.TooSlowError, +): + reprol_fields: list[str] = [ + 'sender', + ] + ''' + This stream was overrun by its sender and can be optionally + handled by app code using `MsgStream.send()/.receive()`. + + ''' + @property + def sender(self) -> tuple[str, str] | None: + value = self._ipc_msg.sender + if value: + return tuple(value) + + +# class InternalActorError(RemoteActorError): +# ''' +# Boxed (Remote) internal `tractor` error indicating failure of some +# primitive, machinery state or lowlevel task that should never +# occur. + +# ''' + + class TransportClosed(trio.ClosedResourceError): "Underlying channel transport was closed prior to use" @@ -484,23 +734,6 @@ class NoRuntime(RuntimeError): "The root actor has not been initialized yet" -class StreamOverrun( - RemoteActorError, - trio.TooSlowError, -): - reprol_fields: list[str] = [ - 'sender', - ] - ''' - This stream was overrun by sender - - ''' - @property - def sender(self) -> tuple[str, str] | None: - value = self.msgdata.get('sender') - if value: - return tuple(value) - class AsyncioCancelled(Exception): ''' @@ -518,23 +751,12 @@ class MessagingError(Exception): ''' -class MsgTypeError(MessagingError): - ''' - Equivalent of a `TypeError` for an IPC wire-message - due to an invalid field value (type). - - Normally this is re-raised from some `.msg._codec` - decode error raised by a backend interchange lib - like `msgspec` or `pycapnproto`. - - ''' - - def pack_error( exc: BaseException|RemoteActorError, tb: str|None = None, cid: str|None = None, + src_uid: tuple[str, str]|None = None, ) -> Error: ''' @@ -560,7 +782,8 @@ def pack_error( ): error_msg.update(exc.msgdata) - # an onion/inception we need to pack + # an onion/inception we need to pack as a nested and relayed + # remotely boxed error. if ( type(exc) is RemoteActorError and (boxed := exc.boxed_type) @@ -584,7 +807,7 @@ def pack_error( error_msg['boxed_type_str'] = 'RemoteActorError' else: - error_msg['src_uid'] = our_uid + error_msg['src_uid'] = src_uid or our_uid error_msg['src_type_str'] = type(exc).__name__ error_msg['boxed_type_str'] = type(exc).__name__ @@ -596,7 +819,7 @@ def pack_error( # XXX NOTE: always ensure the traceback-str is from the # locally raised error (**not** the prior relay's boxed - # content's `.msgdata`). + # content's in `._ipc_msg.tb_str`). 
error_msg['tb_str'] = tb_str if cid is not None: @@ -606,7 +829,7 @@ def pack_error( def unpack_error( - msg: dict[str, Any]|Error, + msg: Error, chan: Channel|None = None, box_type: RemoteActorError = RemoteActorError, @@ -624,16 +847,10 @@ def unpack_error( ''' __tracebackhide__: bool = hide_tb - error_dict: dict[str, dict]|None if not isinstance(msg, Error): - # if ( - # error_dict := msg.get('error') - # ) is None: - # no error field, nothing to unpack. return None - # retrieve the remote error's msg encoded details - # tb_str: str = error_dict.get('tb_str', '') + # retrieve the remote error's encoded details from fields tb_str: str = msg.tb_str message: str = ( f'{chan.uid}\n' @@ -651,6 +868,10 @@ def unpack_error( box_type = ContextCancelled assert boxed_type is box_type + elif boxed_type_str == 'MsgTypeError': + box_type = MsgTypeError + assert boxed_type is box_type + # TODO: already included by `_this_mod` in else loop right? # # we have an inception/onion-error so ensure @@ -661,12 +882,9 @@ def unpack_error( # assert len(error_dict['relay_path']) >= 1 assert len(msg.relay_path) >= 1 - # TODO: mk RAE just take the `Error` instance directly? - error_dict: dict = structs.asdict(msg) - exc = box_type( message, - **error_dict, + ipc_msg=msg, ) return exc -- 2.34.1 From 15549f7c2654292be259f4268c7e2abb4915ede4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 9 Apr 2024 10:09:05 -0400 Subject: [PATCH 223/378] Expose `MsgType` and extend `MsgCodec` API a bit Make a new `MsgType: TypeAlias` for the union of all msg types such that it can be used in annots throughout the code base; just make `.msg.__msg_spec__` delegate to it. Add some new codec methods: - `pld_spec_str`: for the `str`-casted value of the payload spec, generally useful in logging content. - `msg_spec_items()`: to render a `dict` of msg types to their `str()`-casted values with support for singling out a specific `MsgType`, type by input `msg` instance. - `pformat_msg_spec()`: for rendering the (partial) `.msg_spec` as a formatted `str` useful in logging. Oh right, add a `Error._msg_dict: dict` in support of the previous commit (for `MsgTypeError` packing as RAEs) such that our error msg type can house a non-type-spec decoded wire-bytes for error reporting/analysis purposes. --- tractor/msg/__init__.py | 10 ++++----- tractor/msg/_codec.py | 46 +++++++++++++++++++++++++++++++++++++---- tractor/msg/types.py | 25 +++++++++++++++++++++- 3 files changed, 71 insertions(+), 10 deletions(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index fe965e0b..443b781b 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -19,7 +19,6 @@ Built-in messaging patterns, types, APIs and helpers. 
''' from typing import ( - Union, TypeAlias, ) from .ptr import ( @@ -56,8 +55,9 @@ from .types import ( # full msg class set from above as list __msg_types__ as __msg_types__, + + # type-alias for union of all msgs + MsgType as MsgType, ) -# TODO: use new type declaration syntax for msg-type-spec -# https://docs.python.org/3/library/typing.html#type-aliases -# https://docs.python.org/3/reference/simple_stmts.html#type -__msg_spec__: TypeAlias = Union[*__msg_types__] + +__msg_spec__: TypeAlias = MsgType diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 56f24d62..de3316c8 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -57,7 +57,7 @@ from trio.lowlevel import ( from tractor.msg.pretty_struct import Struct from tractor.msg.types import ( mk_msg_spec, - Msg, + MsgType, ) @@ -87,12 +87,50 @@ class MsgCodec(Struct): pld_spec: Union[Type[Struct]]|None + @property + def pld_spec_str(self) -> str: + spec: Union[Type]|Type = self.pld_spec + + # TODO: could also use match: instead? + if getattr(spec, '__args__', False): + # `typing.Union` case + return str(spec) + else: + return spec.__name__ + # struct type unions # https://jcristharif.com/msgspec/structs.html#tagged-unions @property def msg_spec(self) -> Union[Type[Struct]]: return self._dec.type + def msg_spec_items( + self, + msg: MsgType|None = None, + + ) -> dict[str, MsgType]|str: + + msgt_table: dict[str, MsgType] = { + msgt: str(msgt) + for msgt in self.msg_spec.__args__ + } + if msg: + msgt: MsgType = type(msg) + str_repr: str = msgt_table[msgt] + return {msgt: str_repr} + + return msgt_table + + # TODO: some way to make `pretty_struct.Struct` use this + # wrapped field over the `.msg_spec` one? + def pformat_msg_spec( + self, + msg: MsgType|None = None, + ) -> str: + return '\n'.join( + self.msg_spec_items(msg=msg).values() + ) + lib: ModuleType = msgspec # TODO: a sub-decoder system as well? @@ -108,7 +146,7 @@ class MsgCodec(Struct): # OR # ) = { # # pre-seed decoders for std-py-type-set for use when - # # `Msg.pld == None|Any`. + # # `MsgType.pld == None|Any`. # None: msgpack.Decoder(Any), # Any: msgpack.Decoder(Any), # } @@ -303,7 +341,7 @@ def mk_codec( # by `tag_field: str` value key? # payload_msg_specs: dict[ # str, # tag_field value as sub-decoder key - # Union[Type[Struct]] # `Msg.pld` type spec + # Union[Type[Struct]] # `MsgType.pld` type spec # ]|None = None, libname: str = 'msgspec', @@ -336,7 +374,7 @@ def mk_codec( raise RuntimeError( f'If a payload spec is provided,\n' "the builtin SC-shuttle-protocol's msg set\n" - f'(i.e. `{Msg}`) MUST be used!\n\n' + f'(i.e. a `{MsgType}`) MUST be used!\n\n' f'However both values were passed as => mk_codec(\n' f' ipc_msg_spec={ipc_msg_spec}`\n' f' ipc_pld_spec={ipc_pld_spec}`\n)\n' diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 7355a610..14db09cd 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -31,6 +31,7 @@ from typing import ( Literal, Type, TypeVar, + TypeAlias, Union, ) @@ -400,16 +401,29 @@ class CancelAck( pld: bool +# TODO: unify this with `._exceptions.RemoteActorError` +# such that we can have a msg which is both raisable and +# IPC-wire ready? +# B~o class Error( Struct, tag=True, tag_field='msg_type', + + # TODO may omit defaults? + # https://jcristharif.com/msgspec/structs.html#omitting-default-values + # omit_defaults=True, ): ''' A pkt that wraps `RemoteActorError`s for relay and raising. Fields are 1-to-1 meta-data as needed originally by - `RemoteActorError.msgdata: dict`. 
+ `RemoteActorError.msgdata: dict` but now are defined here. + + Note: this msg shuttles `ContextCancelled` and `StreamOverrun` + as well is used to rewrap any `MsgTypeError` for relay-reponse + to bad `Yield.pld` senders during an IPC ctx's streaming dialog + phase. ''' src_uid: tuple[str, str] @@ -428,6 +442,10 @@ class Error( # `StreamOverrun` sender: tuple[str, str]|None = None + # for the `MsgTypeError` case where the receiver side + # decodes the underlying original `Msg`-subtype + _msg_dict: dict|None = None + # TODO: should be make a msg version of `ContextCancelled?` # and/or with a scope field or a full `ActorCancelled`? @@ -486,6 +504,11 @@ __msg_types__: list[Msg] = ( _payload_msgs ) +# TODO: use new type declaration syntax for msg-type-spec +# https://docs.python.org/3/library/typing.html#type-aliases +# https://docs.python.org/3/reference/simple_stmts.html#type +MsgType: TypeAlias = Union[*__msg_types__] + def mk_msg_spec( payload_type_union: Union[Type] = Any, -- 2.34.1 From a35c1d40ab37a9cc45a23b35ff3a843b9449cda8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 9 Apr 2024 10:36:25 -0400 Subject: [PATCH 224/378] Refine `MsgTypeError` handling to relay-up-on-`.recv()` Such that `Channel.recv()` + `MsgpackTCPStream.recv()` originating msg-type-errors are not raised at the IPC transport layer but instead relayed up the runtime stack for eventual handling by user-app code via the `Context`/`MsgStream` layer APIs. This design choice leads to a substantial amount of flexibility and modularity, and avoids `MsgTypeError` handling policies from being coupled to a particular backend IPC transport layer: - receive-side msg-type errors, as can be raised and handled in the `.open_stream()` "nasty" phase of a ctx, whilst being packed at the `MsgCodec`/transport layer (keeping the underlying src decode error coupled to the specific transport + interchange lib) and then relayed upward to app code for custom handling like a normal Error` msg. - the policy options for handling such cases could be implemented as `@acm` wrappers around `.open_context()`/`.open_stream()` blocks (and their respective delivered primitives) OR just plain old async generators around `MsgStream.receive()` such that both built-in policy handling and custom user-app solutions can be swapped without touching any `tractor` internals or providing specialized "registry APIs". -> eg. the ignore and relay-invalid-msg-to-sender approach can be more easily implemented as embedded `try: except MsgTypeError:` blocks around `MsgStream.receive()` possibly applied as either of an injected wrapper type around a stream or an async gen that `async for`s from the stream. - any performance based AOT-lang extensions used to implement a policy for handling recv-side errors space can avoid knowledge of the lower level IPC `Channel` (and-downward) primitives. - `Context` consuming code can choose to let all msg-type-errs bubble and handle them manually (like any other remote `Error` shuttled exception). - we can keep (as before) send-side msg type checks can be raised locally and cause offending senders to error and adjust before the streaming phase of an IPC ctx. Impl (related) deats: - obvi make `MsgpackTCPStream.recv()` yield up any `MsgTypeError` constructed by `_mk_msg_type_err()` such that the exception will eventually be relayed up to `._rpc.process_messages()` and from their delivered to the corresponding ctx-task. 
- in support of ^, make `Channel.recv()` detect said mtes and use the new `pack_from_raise()` to inject the far end `Actor.uid` for the `Error.src_uid`. - keep raising the send side equivalent (when strict enabled) errors inline immediately with no upward `Error` packing or relay. - improve `_mk_msg_type_err()` cases handling with far more detailed `MsgTypeError` "message" contents pertaining to `msgspec` specific failure-fixing-tips and type-spec mismatch info: * use `.from_decode()` constructor in recv-side case to inject the non-spec decoded `msg_dict: dict` and use the new `MsgCodec.pld_spec_str: str` when clarifying the type discrepancy with the offending field. * on send-side, if we detect that an unsupported field type was described in the original `src_type_error`, AND there is no `msgpack.Encoder.enc_hook()` set, that the real issue is likely that the user needs to extend the codec to support the non-std/custom type with a hook and link to `msgspec` docs. * if one of a `src_type/validation_error` is provided, set that error as the `.__cause__` in the new mte. --- tractor/_ipc.py | 163 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 115 insertions(+), 48 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 694eaf9e..7713811c 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -38,7 +38,6 @@ from typing import ( Protocol, Type, TypeVar, - Union, ) import msgspec @@ -47,8 +46,9 @@ import trio from tractor.log import get_logger from tractor._exceptions import ( - TransportClosed, MsgTypeError, + pack_from_raise, + TransportClosed, ) from tractor.msg import ( _ctxvar_MsgCodec, @@ -118,40 +118,75 @@ class MsgTransport(Protocol[MsgType]): ... -def _raise_msg_type_err( +def _mk_msg_type_err( msg: Any|bytes, codec: MsgCodec, - validation_err: msgspec.ValidationError|None = None, + + message: str|None = None, verb_header: str = '', -) -> None: + src_validation_error: msgspec.ValidationError|None = None, + src_type_error: TypeError|None = None, - # if side == 'send': - if validation_err is None: # send-side +) -> MsgTypeError: - import traceback - from tractor._exceptions import pformat_boxed_tb + # `Channel.send()` case + if src_validation_error is None: # send-side - fmt_spec: str = '\n'.join( - map(str, codec.msg_spec.__args__) - ) - fmt_stack: str = ( - '\n'.join(traceback.format_stack(limit=3)) - ) - tb_fmt: str = pformat_boxed_tb( - tb_str=fmt_stack, - # fields_str=header, - field_prefix=' ', - indent='', - ) - raise MsgTypeError( - f'invalid msg -> {msg}: {type(msg)}\n\n' - f'{tb_fmt}\n' - f'Valid IPC msgs are:\n\n' - # f' ------ - ------\n' - f'{fmt_spec}\n' - ) + # no src error from `msgspec.msgpack.Decoder.decode()` so + # prolly a manual type-check on our part. 
+ if message is None: + import traceback + from tractor._exceptions import pformat_boxed_tb + fmt_spec: str = '\n'.join( + map(str, codec.msg_spec.__args__) + ) + fmt_stack: str = ( + '\n'.join(traceback.format_stack(limit=3)) + ) + tb_fmt: str = pformat_boxed_tb( + tb_str=fmt_stack, + # fields_str=header, + field_prefix=' ', + indent='', + ) + message: str = ( + f'invalid msg -> {msg}: {type(msg)}\n\n' + f'{tb_fmt}\n' + f'Valid IPC msgs are:\n\n' + # f' ------ - ------\n' + f'{fmt_spec}\n', + ) + elif src_type_error: + src_message: str = str(src_type_error) + patt: str = 'type ' + type_idx: int = src_message.find('type ') + invalid_type: str = src_message[type_idx + len(patt):].split()[0] + + enc_hook: Callable|None = codec.enc.enc_hook + if enc_hook is None: + message += ( + '\n\n' + + f"The current IPC-msg codec can't encode type `{invalid_type}` !\n" + f'Maybe a `msgpack.Encoder.enc_hook()` extension is needed?\n\n' + + f'Check the `msgspec` docs for ad-hoc type extending:\n' + '|_ https://jcristharif.com/msgspec/extending.html\n' + '|_ https://jcristharif.com/msgspec/extending.html#defining-a-custom-extension-messagepack-only\n' + ) + + + msgtyperr = MsgTypeError( + message=message, + ipc_msg=msg, + ) + # ya, might be `None` + msgtyperr.__cause__ = src_type_error + return msgtyperr + + # `Channel.recv()` case else: # decode the msg-bytes using the std msgpack # interchange-prot (i.e. without any @@ -161,29 +196,31 @@ def _raise_msg_type_err( msg_dict: dict = msgspec.msgpack.decode(msg) msg_type_name: str = msg_dict['msg_type'] msg_type = getattr(msgtypes, msg_type_name) - errmsg: str = ( + message: str = ( f'invalid `{msg_type_name}` IPC msg\n\n' ) if verb_header: - errmsg = f'{verb_header} ' + errmsg + message = f'{verb_header} ' + message # XXX see if we can determine the exact invalid field # such that we can comprehensively report the # specific field's type problem - msgspec_msg: str = validation_err.args[0].rstrip('`') + msgspec_msg: str = src_validation_error.args[0].rstrip('`') msg, _, maybe_field = msgspec_msg.rpartition('$.') obj = object() if (field_val := msg_dict.get(maybe_field, obj)) is not obj: - field_type: Union[Type] = msg_type.__signature__.parameters[ - maybe_field - ].annotation - errmsg += ( + message += ( f'{msg.rstrip("`")}\n\n' f'{msg_type}\n' - f' |_.{maybe_field}: {field_type} = {field_val!r}\n' + f' |_.{maybe_field}: {codec.pld_spec_str} = {field_val!r}\n' ) - raise MsgTypeError(errmsg) from validation_err + msgtyperr = MsgTypeError.from_decode( + message=message, + msgdict=msg_dict, + ) + msgtyperr.__cause__ = src_validation_error + return msgtyperr # TODO: not sure why we have to inherit here, but it seems to be an @@ -325,12 +362,15 @@ class MsgpackTCPStream(MsgTransport): # and always raise such that spec violations # are never allowed to be caught silently! except msgspec.ValidationError as verr: - # re-raise as type error - _raise_msg_type_err( + msgtyperr: MsgTypeError = _mk_msg_type_err( msg=msg_bytes, codec=codec, - validation_err=verr, + src_validation_error=verr, ) + # XXX deliver up to `Channel.recv()` where + # a re-raise and `Error`-pack can inject the far + # end actor `.uid`. 
+ yield msgtyperr except ( msgspec.DecodeError, @@ -387,7 +427,7 @@ class MsgpackTCPStream(MsgTransport): if type(msg) not in msgtypes.__msg_types__: if strict_types: - _raise_msg_type_err( + raise _mk_msg_type_err( msg, codec=codec, ) @@ -400,11 +440,16 @@ class MsgpackTCPStream(MsgTransport): try: bytes_data: bytes = codec.encode(msg) except TypeError as typerr: - raise MsgTypeError( - 'A msg field violates the current spec\n' - f'{codec.pld_spec}\n\n' - f'{pretty_struct.Struct.pformat(msg)}' - ) from typerr + msgtyperr: MsgTypeError = _mk_msg_type_err( + msg, + codec=codec, + message=( + f'IPC-msg-spec violation in\n\n' + f'{pretty_struct.Struct.pformat(msg)}' + ), + src_type_error=typerr, + ) + raise msgtyperr from typerr # supposedly the fastest says, # https://stackoverflow.com/a/54027962 @@ -719,13 +764,35 @@ class Channel: assert self._transport while True: try: - async for item in self._transport: - yield item + async for msg in self._transport: + match msg: + # NOTE: if transport/interchange delivers + # a type error, we pack it with the far + # end peer `Actor.uid` and relay the + # `Error`-msg upward to the `._rpc` stack + # for normal RAE handling. + case MsgTypeError(): + yield pack_from_raise( + local_err=msg, + cid=msg.cid, + + # XXX we pack it here bc lower + # layers have no notion of an + # actor-id ;) + src_uid=self.uid, + ) + case _: + yield msg + + # TODO: if we were gonna do this it should be + # done up at the `MsgStream` layer! + # # sent = yield item # if sent is not None: # # optimization, passing None through all the # # time is pointless # await self._transport.send(sent) + except trio.BrokenResourceError: # if not self._autorecon: -- 2.34.1 From 8839bb06a342f95d21d72e4c150638c00bd2b4c5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 9 Apr 2024 13:46:34 -0400 Subject: [PATCH 225/378] Start tidying up `._context`, use `pack_from_raise()` Mostly removing commented (and replaced) code blocks lingering from the ctxc semantics work and new typed-msg-spec `MsgType`s handling AND use the new `._exceptions.pack_from_raise()` helper to construct `StreamOverrun` msgs. Deaterz: - clean out the drain loop now that it's implemented to handle our struct msg types including the `dict`-msg bits left in as fallback-reminders, any notes/todos better summarized at the top of their blocks, remove any `_final_result_is_set()` related duplicate/legacy tidbits. - use a `case Error()` block in drain loop with fallthrough to `_:` always resulting in an rte raise. - move "XXX" notes into the doc-string for `._deliver_msg()` as a "rules" section. - use `match:` syntax for logging the `result_or_err: MsgType` outcome from the final `.result()` call inside `open_context_from_portal()`. - generally speaking use `MsgType` type annotations throughout! 
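For the overrun case specifically, roughly the packing pattern now used
(a sketch with made-up arg values; see `pack_from_raise()` and
`StreamOverrun` from the earlier `._exceptions` patch):

    from tractor._exceptions import StreamOverrun, pack_from_raise
    from tractor.msg import Error

    def box_overrun(
        cid: str,
        sender_uid: tuple[str, str],
    ) -> Error:
        # raise-then-catch inside the helper so the resulting `Error`
        # carries a real traceback string for the remote side.
        return pack_from_raise(
            local_err=StreamOverrun(
                f'stream overrun by sender {sender_uid}',
                sender=sender_uid,  # extra RAE field -> `._extra_msgdata`
            ),
            cid=cid,
        )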
--- tractor/_context.py | 226 ++++++++++++++++++-------------------------- tractor/_portal.py | 4 +- tractor/_runtime.py | 1 - 3 files changed, 95 insertions(+), 136 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 6a634166..69f28aca 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -49,20 +49,21 @@ from ._exceptions import ( InternalError, RemoteActorError, StreamOverrun, - pack_error, + pack_from_raise, unpack_error, _raise_from_no_key_in_msg, ) from .log import get_logger from .msg import ( + Error, + MsgType, + MsgCodec, NamespacePath, - Msg, Return, Started, Stop, Yield, current_codec, - MsgCodec, pretty_struct, ) from ._ipc import Channel @@ -107,8 +108,7 @@ async def _drain_to_final_msg( # wait for a final context result by collecting (but # basically ignoring) any bi-dir-stream msgs still in transit # from the far end. - # pre_result_drained: list[dict] = [] - pre_result_drained: list[Msg] = [] + pre_result_drained: list[MsgType] = [] while not ( ctx.maybe_error and not ctx._final_result_is_set() @@ -168,7 +168,7 @@ async def _drain_to_final_msg( # pray to the `trio` gawds that we're corrent with this # msg: dict = await ctx._recv_chan.receive() - msg: Msg = await ctx._recv_chan.receive() + msg: MsgType = await ctx._recv_chan.receive() # always capture unexpected/non-result msgs pre_result_drained.append(msg) @@ -191,13 +191,12 @@ async def _drain_to_final_msg( raise match msg: + + # final result arrived! case Return( - cid=cid, + # cid=cid, pld=res, ): - # try: - # ctx._result: Any = msg['return'] - # ctx._result: Any = msg.pld ctx._result: Any = res log.runtime( 'Context delivered final draining msg:\n' @@ -210,13 +209,9 @@ async def _drain_to_final_msg( # TODO: ^ we don't need it right? break - # except KeyError: - # except AttributeError: + # far end task is still streaming to us so discard + # and report depending on local ctx state. case Yield(): - # if 'yield' in msg: - - # far end task is still streaming to us so discard - # and report per local context state. if ( (ctx._stream.closed and (reason := 'stream was already closed') @@ -257,45 +252,34 @@ async def _drain_to_final_msg( ) continue + # stream terminated, but no result yet.. + # # TODO: work out edge cases here where # a stream is open but the task also calls # this? # -[ ] should be a runtime error if a stream is open right? # Stop() case Stop(): - # elif 'stop' in msg: log.cancel( 'Remote stream terminated due to "stop" msg:\n\n' f'{pformat(msg)}\n' ) continue - # It's an internal error if any other msg type without - # a`'cid'` field arrives here! - case _: - # if not msg.get('cid'): - if not msg.cid: - raise InternalError( - 'Unexpected cid-missing msg?\n\n' - f'{msg}\n' - ) + # remote error msg, likely already handled inside + # `Context._deliver_msg()` + case Error(): - # XXX fallthrough to handle expected error XXX - # TODO: replace this with `ctx.maybe_raise()` + # TODO: can we replace this with `ctx.maybe_raise()`? + # -[ ] would this be handier for this case maybe? + # async with maybe_raise_on_exit() as raises: + # if raises: + # log.error('some msg about raising..') # - # TODO: would this be handier for this case maybe? 
- # async with maybe_raise_on_exit() as raises: - # if raises: - # log.error('some msg about raising..') - re: Exception|None = ctx._remote_error if re: - log.critical( - 'Remote ctx terminated due to "error" msg:\n' - f'{re}' - ) assert msg is ctx._cancel_msg - # NOTE: this solved a super dupe edge case XD + # NOTE: this solved a super duper edge case XD # this was THE super duper edge case of: # - local task opens a remote task, # - requests remote cancellation of far end @@ -312,9 +296,10 @@ async def _drain_to_final_msg( # does not re-raise any ctxc it receives # IFF **it** was the cancellation # requester.. - # will raise if necessary, ow break from - # loop presuming any error terminates the - # context! + # + # XXX will raise if necessary but ow break + # from loop presuming any supressed error + # (ctxc) should terminate the context! ctx._maybe_raise_remote_err( re, # NOTE: obvi we don't care if we @@ -338,6 +323,7 @@ async def _drain_to_final_msg( log.critical('SHOULD NEVER GET HERE!?') assert msg is ctx._cancel_msg assert error.msgdata == ctx._remote_error.msgdata + assert error.ipc_msg == ctx._remote_error.ipc_msg from .devx._debug import pause await pause() ctx._maybe_cancel_and_set_remote_error(error) @@ -346,6 +332,20 @@ async def _drain_to_final_msg( else: # bubble the original src key error raise + + # XXX should pretty much never get here unless someone + # overrides the default `MsgType` spec. + case _: + # It's definitely an internal error if any other + # msg type without a`'cid'` field arrives here! + if not msg.cid: + raise InternalError( + 'Unexpected cid-missing msg?\n\n' + f'{msg}\n' + ) + + raise RuntimeError('Unknown msg type: {msg}') + else: log.cancel( 'Skipping `MsgStream` drain since final outcome is set\n\n' @@ -1342,8 +1342,11 @@ class Context: # `._cancel_called == True`. not raise_overrun_from_self and isinstance(remote_error, RemoteActorError) - and remote_error.msgdata['boxed_type_str'] == 'StreamOverrun' - and tuple(remote_error.msgdata['sender']) == our_uid + + and remote_error.boxed_type_str == 'StreamOverrun' + + # and tuple(remote_error.msgdata['sender']) == our_uid + and tuple(remote_error.sender) == our_uid ): # NOTE: we set the local scope error to any "self # cancellation" error-response thus "absorbing" @@ -1412,16 +1415,11 @@ class Context: assert self._recv_chan raise_overrun: bool = not self._allow_overruns - # res_placeholder: int = id(self) if ( - # self._result == res_placeholder - # and not self._remote_error self.maybe_error is None - # not self._remote_error - # and not self._local_error - and not self._recv_chan._closed # type: ignore + and + not self._recv_chan._closed # type: ignore ): - # wait for a final context result/error by "draining" # (by more or less ignoring) any bi-dir-stream "yield" # msgs still in transit from the far end. @@ -1432,7 +1430,6 @@ class Context: for msg in drained_msgs: # TODO: mask this by default.. - # if 'return' in msg: if isinstance(msg, Return): # from .devx import pause # await pause() @@ -1448,6 +1445,9 @@ class Context: ) self.maybe_raise( + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). 
raise_overrun_from_self=( raise_overrun and @@ -1458,34 +1458,12 @@ class Context: (not self._cancel_called) ) ) - # if ( - # (re := self._remote_error) - # # and self._result == res_placeholder - # ): - # self._maybe_raise_remote_err( - # re, - # # NOTE: obvi we don't care if we - # # overran the far end if we're already - # # waiting on a final result (msg). - # # raise_overrun_from_self=False, - # raise_overrun_from_self=( - # raise_overrun - # and - # # only when we ARE NOT the canceller - # # should we raise overruns, bc ow we're - # # raising something we know might happen - # # during cancellation ;) - # (not self._cancel_called) - # ), - # ) - # if maybe_err: - # self._result = maybe_err - return self.outcome - # TODO: switch this with above which should be named - # `.wait_for_outcome()` and instead do - # a `.outcome.Outcome.unwrap()` ? + # TODO: switch this with above! + # -[ ] should be named `.wait_for_outcome()` and instead do + # a `.outcome.Outcome.unwrap()` ? + # # @property # def result(self) -> Any|None: # if self._final_result_is_set(): @@ -1544,7 +1522,6 @@ class Context: return None def _final_result_is_set(self) -> bool: - # return not (self._result == id(self)) return self._result is not Unresolved # def get_result_nowait(self) -> Any|None: @@ -1761,8 +1738,7 @@ class Context: async def _deliver_msg( self, - # msg: dict, - msg: Msg, + msg: MsgType, ) -> bool: ''' @@ -1776,6 +1752,20 @@ class Context: `._scope_nursery: trio.Nursery`) which ensures that such messages are queued up and eventually sent if possible. + XXX RULES XXX + ------ - ------ + - NEVER raise remote errors from this method; a runtime task caller. + An error "delivered" to a ctx should always be raised by + the corresponding local task operating on the + `Portal`/`Context` APIs. + + - NEVER `return` early before delivering the msg! + bc if the error is a ctxc and there is a task waiting on + `.result()` we need the msg to be + `send_chan.send_nowait()`-ed over the `._recv_chan` so + that the error is relayed to that waiter task and thus + raised in user code! + ''' cid: str = self.cid chan: Channel = self.chan @@ -1806,28 +1796,14 @@ class Context: ) self._cancel_msg: dict = msg - # NOTE: this will not raise an error, merely set + # XXX NOTE: this will not raise an error, merely set # `._remote_error` and maybe cancel any task currently # entered in `Portal.open_context()` presuming the # error is "cancel causing" (i.e. a `ContextCancelled` # or `RemoteActorError`). self._maybe_cancel_and_set_remote_error(re) - # XXX NEVER do this XXX..!! - # bc if the error is a ctxc and there is a task - # waiting on `.result()` we need the msg to be sent - # over the `send_chan`/`._recv_chan` so that the error - # is relayed to that waiter task.. - # return True - # - # XXX ALSO NO!! XXX - # => NEVER raise remote errors from the calling - # runtime task, they should always be raised by - # consumer side tasks operating on the - # `Portal`/`Context` APIs. - # if self._remote_error: - # self._maybe_raise_remote_err(error) - + # XXX only case where returning early is fine! if self._in_overrun: log.warning( f'Queueing OVERRUN msg on caller task:\n' @@ -1946,31 +1922,27 @@ class Context: # anything different. return False else: - # txt += f'\n{msg}\n' # raise local overrun and immediately pack as IPC # msg for far end. 
- try: - raise StreamOverrun( + err_msg: Error = pack_from_raise( + local_err=StreamOverrun( txt, sender=from_uid, - ) - except StreamOverrun as err: - err_msg: dict[str, dict] = pack_error( - err, - cid=cid, - ) - try: - # relay condition to sender side remote task - await chan.send(err_msg) - return True + ), + cid=cid, + ) + try: + # relay condition to sender side remote task + await chan.send(err_msg) + return True - except trio.BrokenResourceError: - # XXX: local consumer has closed their side - # so cancel the far end streaming task - log.warning( - 'Channel for ctx is already closed?\n' - f'|_{chan}\n' - ) + # XXX: local consumer has closed their side of + # the IPC so cancel the far end streaming task + except trio.BrokenResourceError: + log.warning( + 'Channel for ctx is already closed?\n' + f'|_{chan}\n' + ) # ow, indicate unable to deliver by default return False @@ -2379,28 +2351,17 @@ async def open_context_from_portal( # an exception type boxed in a `RemoteActorError` # is returned (meaning it was obvi not raised) # that we want to log-report on. - msgdata: str|None = getattr( - result_or_err, - 'msgdata', - None - ) - match (msgdata, result_or_err): - case ( - {'tb_str': tbstr}, - ContextCancelled(), - ): - log.cancel(tbstr) + match result_or_err: + case ContextCancelled() as ctxc: + log.cancel(ctxc.tb_str) - case ( - {'tb_str': tbstr}, - RemoteActorError(), - ): + case RemoteActorError() as rae: log.exception( 'Context remotely errored!\n' f'<= peer: {uid}\n' f' |_ {nsf}()\n\n' - f'{tbstr}' + f'{rae.tb_str}' ) case (None, _): log.runtime( @@ -2410,7 +2371,6 @@ async def open_context_from_portal( f'`{result_or_err}`\n' ) - finally: # XXX: (MEGA IMPORTANT) if this is a root opened process we # wait for any immediate child in debug before popping the diff --git a/tractor/_portal.py b/tractor/_portal.py index 957eae59..e4db93a6 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -46,6 +46,7 @@ from ._state import ( from ._ipc import Channel from .log import get_logger from .msg import ( + Error, NamespacePath, Return, ) @@ -69,8 +70,7 @@ log = get_logger(__name__) # `._raise_from_no_key_in_msg()` (after tweak to # accept a `chan: Channel` arg) in key block! def _unwrap_msg( - # msg: dict[str, Any], - msg: Return, + msg: Return|Error, channel: Channel, hide_tb: bool = True, diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 4be5ea1f..4d90c591 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -49,7 +49,6 @@ from pprint import pformat import signal import sys from typing import ( - Any, Callable, TYPE_CHECKING, ) -- 2.34.1 From af013912acbb5aabadae26d168ae5c45599c9f71 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 9 Apr 2024 13:58:10 -0400 Subject: [PATCH 226/378] Relay `MsgTypeError`s upward in RPC loop via `._deliver_ctx_payload()` --- tractor/_rpc.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index d935909f..a95dbfec 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -47,12 +47,13 @@ from ._context import ( Context, ) from ._exceptions import ( - ModuleNotExposed, - is_multi_cancelled, ContextCancelled, + ModuleNotExposed, + MsgTypeError, + TransportClosed, + is_multi_cancelled, pack_error, unpack_error, - TransportClosed, ) from .devx import ( maybe_wait_for_debugger, @@ -636,7 +637,7 @@ async def _invoke( # (callee) task, so relay this cancel signal to the # other side. 
ctxc = ContextCancelled( - msg, + message=msg, boxed_type=trio.Cancelled, canceller=canceller, ) @@ -826,7 +827,12 @@ async def process_messages( | Stop(cid=cid) | Return(cid=cid) | CancelAck(cid=cid) - | Error(cid=cid) # RPC-task ctx specific + + # `.cid` means RPC-ctx-task specific + | Error(cid=cid) + + # recv-side `MsgType` decode violation + | MsgTypeError(cid=cid) ): # deliver response to local caller/waiter # via its per-remote-context memory channel. -- 2.34.1 From 0dcaf5f3b2c878c1c1417dba4efc6488a4ea5152 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 9 Apr 2024 13:58:51 -0400 Subject: [PATCH 227/378] TO-CHERRY: Error on `breakpoint()` without `debug_mode=True`? Not sure if this is a good tactic (yet) but it at least covers us from getting user's confused by `breakpoint()` usage causing REPL clobbering. Always set an explicit rte raising breakpoint hook such that the user realizes they can't use `.pause_from_sync()` without enabling debug mode. ** CHERRY-PICK into `pause_from_sync_w_greenback` branch! ** --- tractor/_root.py | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 91d7c83f..bc55fd9e 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -22,9 +22,10 @@ from contextlib import asynccontextmanager from functools import partial import importlib import logging +import os import signal import sys -import os +from typing import Callable import warnings @@ -99,19 +100,35 @@ async def open_root_actor( # Override the global debugger hook to make it play nice with # ``trio``, see much discussion in: # https://github.com/python-trio/trio/issues/1155#issuecomment-742964018 + builtin_bp_handler: Callable = sys.breakpointhook + orig_bp_path: str|None = os.environ.get( + 'PYTHONBREAKPOINT', + None, + ) if ( + debug_mode + and await _debug.maybe_init_greenback( raise_not_found=False, ) ): - builtin_bp_handler = sys.breakpointhook - orig_bp_path: str|None = os.environ.get( - 'PYTHONBREAKPOINT', - None, - ) os.environ['PYTHONBREAKPOINT'] = ( 'tractor.devx._debug.pause_from_sync' ) + else: + # TODO: disable `breakpoint()` by default (without + # `greenback`) since it will break any multi-actor + # usage by a clobbered TTY's stdstreams! + def block_bps(*args, **kwargs): + raise RuntimeError( + '`tractor` blocks built-in `breakpoint()` calls by default!\n' + 'If you need to us it please install `greenback` and set ' + '`debug_mode=True` when opening the runtime ' + '(either via `.open_nursery()` or `open_root_actor()`)\n' + ) + + sys.breakpointhook = block_bps + # os.environ['PYTHONBREAKPOINT'] = None # attempt to retreive ``trio``'s sigint handler and stash it # on our debugger lock state. 
@@ -368,12 +385,14 @@ async def open_root_actor( _state._last_actor_terminated = actor # restore built-in `breakpoint()` hook state - sys.breakpointhook = builtin_bp_handler - if orig_bp_path is not None: - os.environ['PYTHONBREAKPOINT'] = orig_bp_path - else: - # clear env back to having no entry - os.environ.pop('PYTHONBREAKPOINT') + if debug_mode: + if builtin_bp_handler is not None: + sys.breakpointhook = builtin_bp_handler + if orig_bp_path is not None: + os.environ['PYTHONBREAKPOINT'] = orig_bp_path + else: + # clear env back to having no entry + os.environ.pop('PYTHONBREAKPOINT') logger.runtime("Root actor terminated") -- 2.34.1 From 7aaa2a61ec7288ba73e4b6d7d11405ed6971b453 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 11 Apr 2024 20:23:55 -0400 Subject: [PATCH 228/378] Add msg-from-dict constructor helper Handy for re-constructing a struct-`MsgType` from a `dict` decoded from wire-bytes wherein the msg failed to decode normally due to a field type error but you'd still like to show the "potential" msg in struct form, say inside a `MsgTypeError`'s meta data. Supporting deats: - add a `.msg.types.from_dict_msg()` to implement it (the helper). - also a `.msg.types._msg_table: dict[str, MsgType]` for supporting this func ^ as well as providing just a general `MsgType`-by-`str`-name lookup. Unrelated: - Drop commented idea for still supporting `dict`-msg set via `enc/dec_hook()`s that would translate to/from `MsgType`s, but that would require a duplicate impl in the runtime.. so eff that XD --- tractor/msg/pretty_struct.py | 1 + tractor/msg/types.py | 117 ++++++++++++----------------------- 2 files changed, 40 insertions(+), 78 deletions(-) diff --git a/tractor/msg/pretty_struct.py b/tractor/msg/pretty_struct.py index 412b6ed6..a67bbd26 100644 --- a/tractor/msg/pretty_struct.py +++ b/tractor/msg/pretty_struct.py @@ -140,6 +140,7 @@ class Struct( return sin_props + # TODO: make thisi a mod-func! def pformat( self, field_indent: int = 2, diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 14db09cd..9787504b 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -447,6 +447,29 @@ class Error( _msg_dict: dict|None = None +def from_dict_msg( + dict_msg: dict, + + msgT: MsgType|None = None, + tag_field: str = 'msg_type' + +) -> MsgType: + ''' + Helper to build a specific `MsgType` struct from + a "vanilla" decoded `dict`-ified equivalent of the + msg: i.e. if the `msgpack.Decoder.type == Any`. + + ''' + msg_type_tag_field: str = ( + msgT.__struct_config__.tag_field + if msgT is not None + else tag_field + ) + # XXX ensure tag field is removed + msgT_name: str = dict_msg.pop(msg_type_tag_field) + msgT: MsgType = _msg_table[msgT_name] + return msgT(**dict_msg) + # TODO: should be make a msg version of `ContextCancelled?` # and/or with a scope field or a full `ActorCancelled`? # class Cancelled(Msg): @@ -498,12 +521,18 @@ _payload_msgs: list[Msg] = [ # built-in SC shuttle protocol msg type set in # approx order of the IPC txn-state spaces. 
-__msg_types__: list[Msg] = ( +__msg_types__: list[MsgType] = ( _runtime_msgs + _payload_msgs ) + +_msg_table: dict[str, MsgType] = { + msgT.__name__: msgT + for msgT in __msg_types__ +} + # TODO: use new type declaration syntax for msg-type-spec # https://docs.python.org/3/library/typing.html#type-aliases # https://docs.python.org/3/reference/simple_stmts.html#type @@ -660,6 +689,11 @@ def mk_msg_spec( 'Generating new IPC msg-spec\n' f'{ipc_spec}\n' ) + assert ( + ipc_spec + and + ipc_spec is not Any + ) return ( ipc_spec, msgtypes_table[spec_build_method] + ipc_msg_types, @@ -669,9 +703,9 @@ def mk_msg_spec( # TODO: make something similar to this inside `._codec` such that # user can just pass a type table of some sort? # -[ ] we would need to decode all msgs to `pretty_struct.Struct` -# and then call `.to_dict()` on them? +# and then call `.to_dict()` on them? # -[ ] we're going to need to re-impl all the stuff changed in the -# runtime port such that it can handle dicts or `Msg`s? +# runtime port such that it can handle dicts or `Msg`s? # # def mk_dict_msg_codec_hooks() -> tuple[Callable, Callable]: # ''' @@ -679,88 +713,15 @@ def mk_msg_spec( # manual convertion from our above native `Msg` set # to `dict` equivalent (wire msgs) in order to keep legacy compat # with the original runtime implementation. - +# # Note: this is is/was primarly used while moving the core # runtime over to using native `Msg`-struct types wherein we # start with the send side emitting without loading # a typed-decoder and then later flipping the switch over to # load to the native struct types once all runtime usage has # been adjusted appropriately. - +# # ''' -# def enc_to_dict(msg: Any) -> Any: -# ''' -# Encode `Msg`-structs to `dict` msgs instead -# of using `msgspec.msgpack.Decoder.type`-ed -# features. - -# ''' -# match msg: -# case Start(): -# dctmsg: dict = pretty_struct.Struct.to_dict( -# msg -# )['pld'] - -# case Error(): -# dctmsg: dict = pretty_struct.Struct.to_dict( -# msg -# )['pld'] -# return {'error': dctmsg} - - -# def dec_from_dict( -# type: Type, -# obj: Any, -# ) -> Any: -# ''' -# Decode to `Msg`-structs from `dict` msgs instead -# of using `msgspec.msgpack.Decoder.type`-ed -# features. 
- -# ''' -# cid: str = obj.get('cid') -# match obj: -# case {'cmd': pld}: -# return Start( -# cid=cid, -# pld=pld, -# ) -# case {'functype': pld}: -# return StartAck( -# cid=cid, -# functype=pld, -# # pld=IpcCtxSpec( -# # functype=pld, -# # ), -# ) -# case {'started': pld}: -# return Started( -# cid=cid, -# pld=pld, -# ) -# case {'yield': pld}: -# return Yield( -# cid=obj['cid'], -# pld=pld, -# ) -# case {'stop': pld}: -# return Stop( -# cid=cid, -# ) -# case {'return': pld}: -# return Return( -# cid=cid, -# pld=pld, -# ) - -# case {'error': pld}: -# return Error( -# cid=cid, -# pld=ErrorData( -# **pld -# ), -# ) - # return ( # # enc_to_dict, # dec_from_dict, -- 2.34.1 From dbc445ff9dedd214398925d4e7c863f1d1332cfd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 11 Apr 2024 20:42:54 -0400 Subject: [PATCH 229/378] Expose `tractor.msg.PayloadT` from subpkg --- tractor/msg/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index 443b781b..8f13f5f8 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -53,6 +53,9 @@ from .types import ( Error as Error, + # type-var for `.pld` field + PayloadT as PayloadT, + # full msg class set from above as list __msg_types__ as __msg_types__, -- 2.34.1 From 322e015d3264b7fc1f70ba4ba71204ef8d90b52d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 11 Apr 2024 21:04:48 -0400 Subject: [PATCH 230/378] Add custom `MsgCodec.__repr__()` Sure makes console grokability a lot better by showing only the customizeable fields. Further, clean up `mk_codec()` a bunch by removing the `ipc_msg_spec` param since we don't plan to support another msg-set (for now) which allows cleaning out a buncha logic that was mostly just a source of bugs.. Also, - add temporary `log.info()` around codec application. - throw in some sanity `assert`s to `limit_msg_spec()`. - add but mask out the `extend_msg_spec()` idea since it seems `msgspec` won't allow `Decoder.type` extensions when using a custom `dec_hook()` for some extension type.. (not sure what approach to take here yet). --- tractor/msg/_codec.py | 137 +++++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 54 deletions(-) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index de3316c8..e117457f 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -37,6 +37,7 @@ from contextlib import ( # ContextVar, # Token, # ) +import textwrap from typing import ( Any, Callable, @@ -59,7 +60,9 @@ from tractor.msg.types import ( mk_msg_spec, MsgType, ) +from tractor.log import get_logger +log = get_logger(__name__) # TODO: overall IPC msg-spec features (i.e. in this mod)! # @@ -87,6 +90,27 @@ class MsgCodec(Struct): pld_spec: Union[Type[Struct]]|None + def __repr__(self) -> str: + speclines: str = textwrap.indent( + self.pformat_msg_spec(), + prefix=' '*3, + ) + body: str = textwrap.indent( + f'|_lib = {self.lib.__name__!r}\n' + f'|_enc_hook: {self.enc.enc_hook}\n' + f'|_dec_hook: {self.dec.dec_hook}\n' + f'|_pld_spec: {self.pld_spec_str}\n' + # f'|\n' + f'|__msg_spec__:\n' + f'{speclines}\n', + prefix=' '*2, + ) + return ( + f'<{type(self).__name__}(\n' + f'{body}' + ')>' + ) + @property def pld_spec_str(self) -> str: spec: Union[Type]|Type = self.pld_spec @@ -163,8 +187,8 @@ class MsgCodec(Struct): ) -> bytes: ''' - Encode input python objects to `msgpack` bytes for transfer - on a tranport protocol connection. + Encode input python objects to `msgpack` bytes for + transfer on a tranport protocol connection. 
''' return self._enc.encode(py_obj) @@ -325,15 +349,9 @@ class MsgCodec(Struct): def mk_codec( - ipc_msg_spec: Union[Type[Struct]]|Any|None = None, - # - # ^TODO^: in the long run, do we want to allow using a diff IPC `Msg`-set? - # it would break the runtime, but maybe say if you wanted - # to add some kinda field-specific or wholesale `.pld` ecryption? - # struct type unions set for `Decoder` # https://jcristharif.com/msgspec/structs.html#tagged-unions - ipc_pld_spec: Union[Type[Struct]]|Any|None = None, + ipc_pld_spec: Union[Type[Struct]]|Any = Any, # TODO: offering a per-msg(-field) type-spec such that # the fields can be dynamically NOT decoded and left as `Raw` @@ -352,7 +370,6 @@ def mk_codec( dec_hook: Callable|None = None, enc_hook: Callable|None = None, # ------ - ------ - **kwargs, # # Encoder: # write_buffer_size=write_buffer_size, @@ -367,44 +384,19 @@ def mk_codec( `msgspec` ;). ''' - if ( - ipc_msg_spec is not None - and ipc_pld_spec - ): - raise RuntimeError( - f'If a payload spec is provided,\n' - "the builtin SC-shuttle-protocol's msg set\n" - f'(i.e. a `{MsgType}`) MUST be used!\n\n' - f'However both values were passed as => mk_codec(\n' - f' ipc_msg_spec={ipc_msg_spec}`\n' - f' ipc_pld_spec={ipc_pld_spec}`\n)\n' - ) - - elif ( - ipc_pld_spec - and - - # XXX required for now (or maybe forever?) until - # we can dream up a way to allow parameterizing and/or - # custom overrides to the `Msg`-spec protocol itself? - ipc_msg_spec is None - ): - # (manually) generate a msg-payload-spec for all relevant - # god-boxing-msg subtypes, parameterizing the `Msg.pld: PayloadT` - # for the decoder such that all sub-type msgs in our SCIPP - # will automatically decode to a type-"limited" payload (`Struct`) - # object (set). - ( - ipc_msg_spec, - msg_types, - ) = mk_msg_spec( - payload_type_union=ipc_pld_spec, - ) - assert len(ipc_msg_spec.__args__) == len(msg_types) - assert ipc_msg_spec - - else: - ipc_msg_spec = ipc_msg_spec or Any + # (manually) generate a msg-payload-spec for all relevant + # god-boxing-msg subtypes, parameterizing the `Msg.pld: PayloadT` + # for the decoder such that all sub-type msgs in our SCIPP + # will automatically decode to a type-"limited" payload (`Struct`) + # object (set). + ( + ipc_msg_spec, + msg_types, + ) = mk_msg_spec( + payload_type_union=ipc_pld_spec, + ) + assert len(ipc_msg_spec.__args__) == len(msg_types) + assert ipc_msg_spec enc = msgpack.Encoder( enc_hook=enc_hook, @@ -418,8 +410,6 @@ def mk_codec( _enc=enc, _dec=dec, pld_spec=ipc_pld_spec, - # payload_msg_specs=payload_msg_specs, - # **kwargs, ) # sanity on expected backend support @@ -500,8 +490,16 @@ def apply_codec( - https://github.com/oremanj/tricycle/blob/master/tricycle/_tests/test_tree_var.py ''' + __tracebackhide__: bool = True orig: MsgCodec = _ctxvar_MsgCodec.get() assert orig is not codec + if codec.pld_spec is None: + breakpoint() + + log.info( + 'Applying new msg-spec codec\n\n' + f'{codec}\n' + ) token: RunVarToken = _ctxvar_MsgCodec.set(codec) # TODO: for TreeVar approach, see docs for @cm `.being()` API: @@ -518,7 +516,10 @@ def apply_codec( _ctxvar_MsgCodec.reset(token) assert _ctxvar_MsgCodec.get() is orig - + log.info( + 'Reverted to last msg-spec codec\n\n' + f'{orig}\n' + ) def current_codec() -> MsgCodec: ''' @@ -532,14 +533,15 @@ def current_codec() -> MsgCodec: @cm def limit_msg_spec( - payload_types: Union[Type[Struct]], + payload_spec: Union[Type[Struct]], # TODO: don't need this approach right? # -> related to the `MsgCodec._payload_decs` stuff above.. 
# tagged_structs: list[Struct]|None = None, **codec_kwargs, -): + +) -> MsgCodec: ''' Apply a `MsgCodec` that will natively decode the SC-msg set's `Msg.pld: Union[Type[Struct]]` payload fields using @@ -547,10 +549,37 @@ def limit_msg_spec( for all IPC contexts in use by the current `trio.Task`. ''' + __tracebackhide__: bool = True + curr_codec = current_codec() msgspec_codec: MsgCodec = mk_codec( - payload_types=payload_types, + ipc_pld_spec=payload_spec, **codec_kwargs, ) with apply_codec(msgspec_codec) as applied_codec: assert applied_codec is msgspec_codec yield msgspec_codec + + assert curr_codec is current_codec() + + +# XXX: msgspec won't allow this with non-struct custom types +# like `NamespacePath`!@! +# @cm +# def extend_msg_spec( +# payload_spec: Union[Type[Struct]], + +# ) -> MsgCodec: +# ''' +# Extend the current `MsgCodec.pld_spec` (type set) by extending +# the payload spec to **include** the types specified by +# `payload_spec`. + +# ''' +# codec: MsgCodec = current_codec() +# pld_spec: Union[Type] = codec.pld_spec +# extended_spec: Union[Type] = pld_spec|payload_spec + +# with limit_msg_spec(payload_types=extended_spec) as ext_codec: +# # import pdbp; pdbp.set_trace() +# assert ext_codec.pld_spec == extended_spec +# yield ext_codec -- 2.34.1 From eec240a70a6fea5412f970a22c6b91fc3472410c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 11 Apr 2024 21:24:02 -0400 Subject: [PATCH 231/378] Tweak some `pformat_boxed_tb()` indent inputs - add some `tb_str: str` indent-prefix args for diff indent levels for the body vs. the surrounding "ascii box". - ^-use it-^ from `RemoteActorError.__repr()__` obvi. - use new `msg.types.from_dict_msg()` in impl of `MsgTypeError.payload_msg`, handy for showing what the message "would have looked like in `Struct` form" had it not failed it's type constraints. --- tractor/_exceptions.py | 73 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index a31aa11e..31b7b36e 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -40,7 +40,7 @@ from tractor._state import current_actor from tractor.log import get_logger from tractor.msg import ( Error, - Msg, + MsgType, Stop, Yield, pretty_struct, @@ -130,7 +130,10 @@ def pformat_boxed_tb( tb_str: str, fields_str: str|None = None, field_prefix: str = ' |_', - indent: str = ' '*2 + + tb_box_indent: int|None = None, + tb_body_indent: int = 1, + ) -> str: if ( fields_str @@ -139,15 +142,19 @@ def pformat_boxed_tb( ): fields: str = textwrap.indent( fields_str, - # prefix=' '*2, - # prefix=' |_', prefix=field_prefix, ) else: fields = fields_str or '' - # body_indent: str = len(field_prefix) * ' ' - body: str = ( + tb_body = tb_str + if tb_body_indent: + tb_body: str = textwrap.indent( + tb_str, + prefix=tb_body_indent * ' ', + ) + + tb_box: str = ( # orig # f' |\n' @@ -158,21 +165,29 @@ def pformat_boxed_tb( f'|\n' f' ------ - ------\n\n' - f'{tb_str}\n' + # f'{tb_str}\n' + f'{tb_body}' f' ------ - ------\n' f'_|\n' ) - if len(indent): - body: str = textwrap.indent( - body, - # prefix=body_indent, - prefix=indent, + tb_box_indent: str = ( + tb_box_indent + or + 1 + + # (len(field_prefix)) + # ? ^-TODO-^ ? 
if you wanted another indent level + ) + if tb_box_indent > 0: + tb_box: str = textwrap.indent( + tb_box, + prefix=tb_box_indent * ' ', ) return ( fields + - body + tb_box ) @@ -316,7 +331,7 @@ class RemoteActorError(Exception): if self._ipc_msg is None: return None - msg_type: Msg = type(self._ipc_msg) + msg_type: MsgType = type(self._ipc_msg) fields: dict[str, Any] = { k: v for _, k, v in pretty_struct.iter_fields(self._ipc_msg) @@ -493,7 +508,10 @@ class RemoteActorError(Exception): tb_str=self.tb_str, fields_str=fields, field_prefix=' |_', - indent=' ', # no indent? + # ^- is so that it's placed like so, + # just after dict[str, Any]: ''' - If the underlying IPC `Msg` was received from a remote + If the underlying IPC `MsgType` was received from a remote actor but was unable to be decoded to a native `Yield`|`Started`|`Return` struct, the interchange backend native format decoder can be used to stash a `dict` @@ -643,22 +661,21 @@ class MsgTypeError( return self.msgdata.get('_msg_dict') @property - def payload_msg(self) -> Msg|None: + def payload_msg( + self, + ) -> MsgType|None: ''' Attempt to construct what would have been the original - `Msg`-with-payload subtype (i.e. an instance from the set + `MsgType`-with-payload subtype (i.e. an instance from the set of msgs in `.msg.types._payload_msgs`) which failed validation. ''' - msg_dict: dict = self.msg_dict.copy() - name: str = msg_dict.pop('msg_type') - msg_type: Msg = getattr( - msgtypes, - name, - Msg, - ) - return msg_type(**msg_dict) + if msg_dict := self.msg_dict.copy(): + return msgtypes.from_dict_msg( + dict_msg=msg_dict, + ) + return None @property def cid(self) -> str: @@ -908,7 +925,7 @@ def is_multi_cancelled(exc: BaseException) -> bool: def _raise_from_no_key_in_msg( ctx: Context, - msg: Msg, + msg: MsgType, src_err: KeyError, log: StackLevelAdapter, # caller specific `log` obj -- 2.34.1 From faa7194daf345ef35a67e7ffa3d239fc8226d4f1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 12 Apr 2024 11:47:10 -0400 Subject: [PATCH 232/378] TOSQUASH 322e015d Fix `mk_codec()` input arg --- tractor/msg/_codec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index e117457f..82fd2011 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -420,7 +420,7 @@ def mk_codec( # instance of the default `msgspec.msgpack` codec settings, i.e. # no custom structs, hooks or other special types. -_def_msgspec_codec: MsgCodec = mk_codec(ipc_msg_spec=Any) +_def_msgspec_codec: MsgCodec = mk_codec(ipc_pld_spec=Any) # The built-in IPC `Msg` spec. 
# Our composing "shuttle" protocol which allows `tractor`-app code -- 2.34.1 From 3fb3608879a44d46c8d13509e0b5e24930d043b4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 12 Apr 2024 11:49:50 -0400 Subject: [PATCH 233/378] Extend recv-side `MsgTypeError` default message Display the new `MsgCodec.pld_spec_str` and format the incorrect field value to be placed entirely (txt block wise) right of the "type annot" part of the line: Iow if you had a bad `dict` value where something else should be it'd look something like this: MsgTypeError: + import textwrap + # `Channel.send()` case if src_validation_error is None: # send-side @@ -209,10 +211,24 @@ def _mk_msg_type_err( msg, _, maybe_field = msgspec_msg.rpartition('$.') obj = object() if (field_val := msg_dict.get(maybe_field, obj)) is not obj: + field_name_expr: str = ( + f' |_{maybe_field}: {codec.pld_spec_str} = ' + ) + fmt_val_lines: list[str] = pformat(field_val).splitlines() + fmt_val: str = ( + f'{fmt_val_lines[0]}\n' + + + textwrap.indent( + '\n'.join(fmt_val_lines[1:]), + prefix=' '*len(field_name_expr), + ) + ) message += ( f'{msg.rstrip("`")}\n\n' - f'{msg_type}\n' - f' |_.{maybe_field}: {codec.pld_spec_str} = {field_val!r}\n' + f'<{msg_type.__qualname__}(\n' + # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' + f'{field_name_expr}{fmt_val}\n' + f')>' ) msgtyperr = MsgTypeError.from_decode( @@ -338,7 +354,7 @@ class MsgpackTCPStream(MsgTransport): # self._task = task self._codec = codec log.runtime( - 'Using new codec in {self}.recv()\n' + f'Using new codec in {self}.recv()\n' f'codec: {self._codec}\n\n' f'msg_bytes: {msg_bytes}\n' ) @@ -420,7 +436,7 @@ class MsgpackTCPStream(MsgTransport): if self._codec.pld_spec != codec.pld_spec: self._codec = codec log.runtime( - 'Using new codec in {self}.send()\n' + f'Using new codec in {self}.send()\n' f'codec: {self._codec}\n\n' f'msg: {msg}\n' ) -- 2.34.1 From df548257adc2a7f7252341a1e031fd1f28e1fdc1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sat, 13 Apr 2024 15:19:08 -0400 Subject: [PATCH 234/378] IPC ctx refinements around `MsgTypeError` awareness Add a bit of special handling for msg-type-errors with a dedicated log-msg detailing which `.side: str` is the sender/causer and avoiding a `._scope.cancel()` call in such cases since the local task might be written to handle and tolerate the badly (typed) IPC msg. As part of ^, change the ctx task-pair "side" semantics from "caller" -> "callee" to be "parent" -> "child" which better matches the cross-process SC-linked-task supervision hierarchy, and `trio.Nursery.parent_task`; in `trio` the task that opens a nursery is also named the "parent". Impl deats / fixes around the `.side` semantics: - ensure that `._portal: Portal` is set ASAP after `Actor.start_remote_task()` such that if the `Started` transaction fails, the parent-vs.-child sides are still denoted correctly (since `._portal` being set is the predicate for that). - add a helper func `Context.peer_side(side: str) -> str:` which inverts from "child" to "parent" and vice versa, useful for logging info. Other tweaks: - make `_drain_to_final_msg()` return a tuple of a maybe-`Return` and the list of other `pre_result_drained: list[MsgType]` such that we don't ever have to warn about the return msg getting captured as a pre-"result" msg. - Add some strictness flags to `.started()` which allow for toggling whether to error or warn log about mismatching roundtripped `Started` msgs prior to IPC transit. 
--- tractor/_context.py | 179 ++++++++++++++++++++++++++++++++------------ 1 file changed, 132 insertions(+), 47 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 69f28aca..fc16289b 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -47,6 +47,7 @@ import trio from ._exceptions import ( ContextCancelled, InternalError, + MsgTypeError, RemoteActorError, StreamOverrun, pack_from_raise, @@ -59,12 +60,14 @@ from .msg import ( MsgType, MsgCodec, NamespacePath, + PayloadT, Return, Started, Stop, Yield, current_codec, pretty_struct, + types as msgtypes, ) from ._ipc import Channel from ._streaming import MsgStream @@ -88,7 +91,10 @@ async def _drain_to_final_msg( hide_tb: bool = True, msg_limit: int = 6, -) -> list[dict]: +) -> tuple[ + Return|None, + list[MsgType] +]: ''' Drain IPC msgs delivered to the underlying rx-mem-chan `Context._recv_chan` from the runtime in search for a final @@ -109,6 +115,7 @@ async def _drain_to_final_msg( # basically ignoring) any bi-dir-stream msgs still in transit # from the far end. pre_result_drained: list[MsgType] = [] + return_msg: Return|None = None while not ( ctx.maybe_error and not ctx._final_result_is_set() @@ -169,8 +176,6 @@ async def _drain_to_final_msg( # pray to the `trio` gawds that we're corrent with this # msg: dict = await ctx._recv_chan.receive() msg: MsgType = await ctx._recv_chan.receive() - # always capture unexpected/non-result msgs - pre_result_drained.append(msg) # NOTE: we get here if the far end was # `ContextCancelled` in 2 cases: @@ -207,11 +212,13 @@ async def _drain_to_final_msg( # if ctx._recv_chan: # await ctx._recv_chan.aclose() # TODO: ^ we don't need it right? + return_msg = msg break # far end task is still streaming to us so discard # and report depending on local ctx state. case Yield(): + pre_result_drained.append(msg) if ( (ctx._stream.closed and (reason := 'stream was already closed') @@ -236,7 +243,10 @@ async def _drain_to_final_msg( f'{pformat(msg)}\n' ) - return pre_result_drained + return ( + return_msg, + pre_result_drained, + ) # drain up to the `msg_limit` hoping to get # a final result or error/ctxc. @@ -260,6 +270,7 @@ async def _drain_to_final_msg( # -[ ] should be a runtime error if a stream is open right? # Stop() case Stop(): + pre_result_drained.append(msg) log.cancel( 'Remote stream terminated due to "stop" msg:\n\n' f'{pformat(msg)}\n' @@ -269,7 +280,6 @@ async def _drain_to_final_msg( # remote error msg, likely already handled inside # `Context._deliver_msg()` case Error(): - # TODO: can we replace this with `ctx.maybe_raise()`? # -[ ] would this be handier for this case maybe? # async with maybe_raise_on_exit() as raises: @@ -336,6 +346,7 @@ async def _drain_to_final_msg( # XXX should pretty much never get here unless someone # overrides the default `MsgType` spec. case _: + pre_result_drained.append(msg) # It's definitely an internal error if any other # msg type without a`'cid'` field arrives here! if not msg.cid: @@ -352,7 +363,10 @@ async def _drain_to_final_msg( f'{ctx.outcome}\n' ) - return pre_result_drained + return ( + return_msg, + pre_result_drained, + ) class Unresolved: @@ -719,21 +733,36 @@ class Context: Return string indicating which task this instance is wrapping. ''' - return 'caller' if self._portal else 'callee' + return 'parent' if self._portal else 'child' + @staticmethod + def peer_side(side: str) -> str: + match side: + case 'child': + return 'parent' + case 'parent': + return 'child' + + # TODO: remove stat! 
+ # -[ ] re-implement the `.experiemental._pubsub` stuff + # with `MsgStream` and that should be last usage? + # -[ ] remove from `tests/legacy_one_way_streaming.py`! async def send_yield( self, data: Any, - ) -> None: + ''' + Deprecated method for what now is implemented in `MsgStream`. + We need to rework / remove some stuff tho, see above. + + ''' warnings.warn( "`Context.send_yield()` is now deprecated. " "Use ``MessageStream.send()``. ", DeprecationWarning, stacklevel=2, ) - # await self.chan.send({'yield': data, 'cid': self.cid}) await self.chan.send( Yield( cid=self.cid, @@ -742,12 +771,11 @@ class Context: ) async def send_stop(self) -> None: - # await pause() - # await self.chan.send({ - # # Stop( - # 'stop': True, - # 'cid': self.cid - # }) + ''' + Terminate a `MsgStream` dialog-phase by sending the IPC + equiv of a `StopIteration`. + + ''' await self.chan.send( Stop(cid=self.cid) ) @@ -843,6 +871,7 @@ class Context: # self-cancel (ack) or, # peer propagated remote cancellation. + msgtyperr: bool = False if isinstance(error, ContextCancelled): whom: str = ( @@ -854,6 +883,16 @@ class Context: f'{error}' ) + elif isinstance(error, MsgTypeError): + msgtyperr = True + peer_side: str = self.peer_side(self.side) + log.error( + f'IPC dialog error due to msg-type caused by {peer_side!r} side\n\n' + + f'{error}\n' + f'{pformat(self)}\n' + ) + else: log.error( f'Remote context error:\n\n' @@ -894,9 +933,9 @@ class Context: # if `._cancel_called` then `.cancel_acked and .cancel_called` # always should be set. and not self._is_self_cancelled() - and not cs.cancel_called and not cs.cancelled_caught + and not msgtyperr ): # TODO: it'd sure be handy to inject our own # `trio.Cancelled` subtype here ;) @@ -1001,7 +1040,7 @@ class Context: # when the runtime finally receives it during teardown # (normally in `.result()` called from # `Portal.open_context().__aexit__()`) - if side == 'caller': + if side == 'parent': if not self._portal: raise InternalError( 'No portal found!?\n' @@ -1423,7 +1462,10 @@ class Context: # wait for a final context result/error by "draining" # (by more or less ignoring) any bi-dir-stream "yield" # msgs still in transit from the far end. - drained_msgs: list[dict] = await _drain_to_final_msg( + ( + return_msg, + drained_msgs, + ) = await _drain_to_final_msg( ctx=self, hide_tb=hide_tb, ) @@ -1441,7 +1483,10 @@ class Context: log.cancel( 'Ctx drained pre-result msgs:\n' - f'{pformat(drained_msgs)}' + f'{pformat(drained_msgs)}\n\n' + + f'Final return msg:\n' + f'{return_msg}\n' ) self.maybe_raise( @@ -1608,7 +1653,13 @@ class Context: async def started( self, - value: Any | None = None + + # TODO: how to type this so that it's the + # same as the payload type? Is this enough? + value: PayloadT|None = None, + + strict_parity: bool = False, + complain_no_parity: bool = True, ) -> None: ''' @@ -1629,7 +1680,7 @@ class Context: f'called `.started()` twice on context with {self.chan.uid}' ) - started = Started( + started_msg = Started( cid=self.cid, pld=value, ) @@ -1650,28 +1701,54 @@ class Context: # https://zguide.zeromq.org/docs/chapter7/#The-Cheap-or-Nasty-Pattern # codec: MsgCodec = current_codec() - msg_bytes: bytes = codec.encode(started) + msg_bytes: bytes = codec.encode(started_msg) try: # be a "cheap" dialog (see above!) - rt_started = codec.decode(msg_bytes) - if rt_started != started: + if ( + strict_parity + or + complain_no_parity + ): + rt_started: Started = codec.decode(msg_bytes) - # TODO: break these methods out from the struct subtype? 
- diff = pretty_struct.Struct.__sub__(rt_started, started) + # XXX something is prolly totes cucked with the + # codec state! + if isinstance(rt_started, dict): + rt_started = msgtypes.from_dict_msg( + dict_msg=rt_started, + ) + raise RuntimeError( + 'Failed to roundtrip `Started` msg?\n' + f'{pformat(rt_started)}\n' + ) - complaint: str = ( - 'Started value does not match after codec rountrip?\n\n' - f'{diff}' - ) - # TODO: rn this will pretty much always fail with - # any other sequence type embeded in the - # payload... - if self._strict_started: - raise ValueError(complaint) - else: - log.warning(complaint) + if rt_started != started_msg: + # TODO: break these methods out from the struct subtype? - await self.chan.send(rt_started) + diff = pretty_struct.Struct.__sub__( + rt_started, + started_msg, + ) + complaint: str = ( + 'Started value does not match after codec rountrip?\n\n' + f'{diff}' + ) + + # TODO: rn this will pretty much always fail with + # any other sequence type embeded in the + # payload... + if ( + self._strict_started + or + strict_parity + ): + raise ValueError(complaint) + else: + log.warning(complaint) + + # started_msg = rt_started + + await self.chan.send(started_msg) # raise any msg type error NO MATTER WHAT! except msgspec.ValidationError as verr: @@ -1682,7 +1759,7 @@ class Context: src_validation_error=verr, verb_header='Trying to send payload' # > 'invalid `Started IPC msgs\n' - ) + ) from verr self._started_called = True @@ -1783,13 +1860,17 @@ class Context: else: log_meth = log.runtime - log_meth( - f'Delivering error-msg to caller\n\n' + side: str = self.side - f'<= peer: {from_uid}\n' + peer_side: str = self.peer_side(side) + + log_meth( + f'Delivering IPC ctx error from {peer_side!r} to {side!r} task\n\n' + + f'<= peer {peer_side!r}: {from_uid}\n' f' |_ {nsf}()\n\n' - f'=> cid: {cid}\n' + f'=> {side!r} cid: {cid}\n' f' |_{self._task}\n\n' f'{pformat(re)}\n' @@ -1804,6 +1885,7 @@ class Context: self._maybe_cancel_and_set_remote_error(re) # XXX only case where returning early is fine! + structfmt = pretty_struct.Struct.pformat if self._in_overrun: log.warning( f'Queueing OVERRUN msg on caller task:\n' @@ -1813,7 +1895,7 @@ class Context: f'=> cid: {cid}\n' f' |_{self._task}\n\n' - f'{pformat(msg)}\n' + f'{structfmt(msg)}\n' ) self._overflow_q.append(msg) return False @@ -1827,7 +1909,7 @@ class Context: f'=> {self._task}\n' f' |_cid={self.cid}\n\n' - f'{pformat(msg)}\n' + f'{structfmt(msg)}\n' ) # NOTE: if an error is deteced we should always still @@ -2047,6 +2129,9 @@ async def open_context_from_portal( # place.. allow_overruns=allow_overruns, ) + # ASAP, so that `Context.side: str` can be determined for + # logging / tracing / debug! + ctx._portal: Portal = portal assert ctx._remote_func_type == 'context' msg: Started = await ctx._recv_chan.receive() @@ -2065,10 +2150,10 @@ async def open_context_from_portal( msg=msg, src_err=src_error, log=log, - expect_key='started', + expect_msg=Started, + # expect_key='started', ) - ctx._portal: Portal = portal uid: tuple = portal.channel.uid cid: str = ctx.cid -- 2.34.1 From 2d22713806db49afdf12332fce013d2ba6fab64d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 16:29:21 -0400 Subject: [PATCH 235/378] Add `from_dict_msg(user_pretty: bool)` flag Allows for optionally (and dynamically) constructing the "expected" `MsgType` from a `dict` into a `pretty_struct.Struct`, mostly for logging usage. 
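Rough sketch of the dict-to-msg reconstruction idea using a stand-in msg type
and lookup table (the real helper is `tractor.msg.types.from_dict_msg()`; its
`use_pretty` path additionally rebuilds the type via `defstruct()` with
a `pretty_struct.Struct` base, as shown in the diff below):

from typing import Any
from msgspec import Struct

# stand-in for a single entry of the real `_msg_table`
class Started(Struct, tag='started', tag_field='msg_type'):
    cid: str
    pld: Any

_msg_table: dict[str, type[Struct]] = {'Started': Started}

def from_dict(dict_msg: dict) -> Struct:
    # pop the tag field and look the concrete msg type back up
    msgT = _msg_table[dict_msg.pop('msg_type')]
    return msgT(**dict_msg)

msg = from_dict({'msg_type': 'Started', 'cid': '1', 'pld': 10})
assert isinstance(msg, Started) and msg.pld == 10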
--- tractor/msg/types.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 9787504b..f7654f62 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -451,7 +451,8 @@ def from_dict_msg( dict_msg: dict, msgT: MsgType|None = None, - tag_field: str = 'msg_type' + tag_field: str = 'msg_type', + use_pretty: bool = False, ) -> MsgType: ''' @@ -468,6 +469,19 @@ def from_dict_msg( # XXX ensure tag field is removed msgT_name: str = dict_msg.pop(msg_type_tag_field) msgT: MsgType = _msg_table[msgT_name] + if use_pretty: + msgT = defstruct( + name=msgT_name, + fields=[ + (key, fi.type) + for fi, key, _ + in pretty_struct.iter_fields(msgT) + ], + bases=( + pretty_struct.Struct, + msgT, + ), + ) return msgT(**dict_msg) # TODO: should be make a msg version of `ContextCancelled?` -- 2.34.1 From 2edfed75ebf4e559c668aded9fb1f4de575b5c1a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 16:32:18 -0400 Subject: [PATCH 236/378] Add `MsgTypeError.expected_msg_type` Which matches with renaming `.payload_msg` -> `.expected_msg` which is the value we attempt to construct from a vanilla-msgppack decode-to-`dict` and then construct manually into a `MsgType` using `.msg.types.from_dict_msg()`. Add a todo to use new `use_pretty` flag which currently conflicts with `._exceptions.pformat_boxed_type()` prefix formatting.. --- tractor/_exceptions.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 31b7b36e..259994a1 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -43,9 +43,12 @@ from tractor.msg import ( MsgType, Stop, Yield, - pretty_struct, types as msgtypes, ) +from tractor.msg.pretty_struct import ( + iter_fields, + Struct, +) if TYPE_CHECKING: from ._context import Context @@ -82,7 +85,7 @@ class InternalError(RuntimeError): _ipcmsg_keys: list[str] = [ fi.name for fi, k, v - in pretty_struct.iter_fields(Error) + in iter_fields(Error) ] @@ -321,7 +324,7 @@ class RemoteActorError(Exception): assert self.boxed_type is boxed_type @property - def ipc_msg(self) -> pretty_struct.Struct: + def ipc_msg(self) -> Struct: ''' Re-render the underlying `._ipc_msg: Msg` as a `pretty_struct.Struct` for introspection such that the @@ -334,12 +337,12 @@ class RemoteActorError(Exception): msg_type: MsgType = type(self._ipc_msg) fields: dict[str, Any] = { k: v for _, k, v in - pretty_struct.iter_fields(self._ipc_msg) + iter_fields(self._ipc_msg) } return defstruct( msg_type.__name__, fields=fields.keys(), - bases=(msg_type, pretty_struct.Struct), + bases=(msg_type, Struct), )(**fields) @property @@ -641,11 +644,11 @@ class MsgTypeError( ''' reprol_fields: list[str] = [ - 'payload_msg', + 'expected_msg_type', ] extra_body_fields: list[str] = [ 'cid', - 'payload_msg', + 'expected_msg', ] @property @@ -661,9 +664,7 @@ class MsgTypeError( return self.msgdata.get('_msg_dict') @property - def payload_msg( - self, - ) -> MsgType|None: + def expected_msg(self) -> MsgType|None: ''' Attempt to construct what would have been the original `MsgType`-with-payload subtype (i.e. an instance from the set @@ -674,9 +675,17 @@ class MsgTypeError( if msg_dict := self.msg_dict.copy(): return msgtypes.from_dict_msg( dict_msg=msg_dict, + # use_pretty=True, + # ^-TODO-^ would luv to use this BUT then the + # `field_prefix` in `pformat_boxed_tb()` cucks it + # all up.. 
XD ) return None + @property + def expected_msg_type(self) -> Type[MsgType]|None: + return type(self.expected_msg) + @property def cid(self) -> str: # pre-packed using `.from_decode()` constructor -- 2.34.1 From f72b97234810ad0d529eb08c1c93cc4323666330 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 17:49:18 -0400 Subject: [PATCH 237/378] Hide `._entry`/`._child` frames, tweak some more type annots --- tractor/_child.py | 1 + tractor/_entry.py | 1 + tractor/_spawn.py | 2 +- tractor/_supervise.py | 10 +++++----- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tractor/_child.py b/tractor/_child.py index bd1e830e..4226ae90 100644 --- a/tractor/_child.py +++ b/tractor/_child.py @@ -36,6 +36,7 @@ def parse_ipaddr(arg): if __name__ == "__main__": + __tracebackhide__: bool = True parser = argparse.ArgumentParser() parser.add_argument("--uid", type=parse_uid) diff --git a/tractor/_entry.py b/tractor/_entry.py index b2aae2e5..bf719abb 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -106,6 +106,7 @@ def _trio_main( Entry point for a `trio_run_in_process` subactor. ''' + __tracebackhide__: bool = True _state._current_actor = actor trio_main = partial( async_main, diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 4715bd1a..43814918 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -513,7 +513,7 @@ async def trio_proc( # }) # track subactor in current nursery - curr_actor = current_actor() + curr_actor: Actor = current_actor() curr_actor._actoruid2nursery[subactor.uid] = actor_nursery # resume caller at next checkpoint now that child is up diff --git a/tractor/_supervise.py b/tractor/_supervise.py index df3d7def..be81e4e6 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -119,11 +119,11 @@ class ActorNursery: name: str, *, bind_addrs: list[tuple[str, int]] = [_default_bind_addr], - rpc_module_paths: list[str] | None = None, - enable_modules: list[str] | None = None, - loglevel: str | None = None, # set log level per subactor - nursery: trio.Nursery | None = None, - debug_mode: bool | None = None, + rpc_module_paths: list[str]|None = None, + enable_modules: list[str]|None = None, + loglevel: str|None = None, # set log level per subactor + nursery: trio.Nursery|None = None, + debug_mode: bool|None = None, infect_asyncio: bool = False, ) -> Portal: ''' -- 2.34.1 From 38a648385999e3aef0d69db10d22468cedf62277 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 18:31:41 -0400 Subject: [PATCH 238/378] Use `_raise_from_no_key_in_msg(allow_msgs)` Instead of `allow_msg_keys` since we've fully flipped over to struct-types for msgs in the runtime. - drop the loop from `MsgStream.receive_nowait()` since `Yield/Return.pld` getting will handle both (instead of a loop of `dict`-key reads). 
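Quick sketch (with stand-in struct types) of why the key-loop can go away:
a single `.pld` attribute access, with `AttributeError` as the miss signal,
covers both the yield and return cases:

from typing import Any
from msgspec import Struct

class Yield(Struct):
    cid: str
    pld: Any

class Return(Struct):
    cid: str
    pld: Any

class Stop(Struct):
    cid: str

def try_pld(msg) -> Any:
    try:
        # one attr access replaces the old per-key `dict` lookup loop
        return msg.pld
    except AttributeError:
        return None  # the old "missing key" branch

assert try_pld(Yield(cid='1', pld='x')) == 'x'
assert try_pld(Return(cid='1', pld=2)) == 2
assert try_pld(Stop(cid='1')) is None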
--- tractor/_context.py | 1 - tractor/_exceptions.py | 3 +-- tractor/_streaming.py | 48 ++++++++++++++++-------------------------- 3 files changed, 19 insertions(+), 33 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index fc16289b..94956547 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -2151,7 +2151,6 @@ async def open_context_from_portal( src_err=src_error, log=log, expect_msg=Started, - # expect_key='started', ) uid: tuple = portal.channel.uid diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 259994a1..65637fb5 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -938,7 +938,6 @@ def _raise_from_no_key_in_msg( src_err: KeyError, log: StackLevelAdapter, # caller specific `log` obj - expect_key: str = 'yield', expect_msg: str = Yield, stream: MsgStream | None = None, @@ -1053,7 +1052,7 @@ def _raise_from_no_key_in_msg( # is activated above. _type: str = 'Stream' if stream else 'Context' raise MessagingError( - f"{_type} was expecting a '{expect_key.upper()}' message" + f"{_type} was expecting a {expect_msg} message" " BUT received a non-error msg:\n" f'{pformat(msg)}' ) from src_err diff --git a/tractor/_streaming.py b/tractor/_streaming.py index fcf8dafc..ac4d482e 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -44,6 +44,7 @@ from .trionics import ( BroadcastReceiver, ) from tractor.msg import ( + Return, Stop, Yield, ) @@ -82,7 +83,7 @@ class MsgStream(trio.abc.Channel): self, ctx: Context, # typing: ignore # noqa rx_chan: trio.MemoryReceiveChannel, - _broadcaster: BroadcastReceiver | None = None, + _broadcaster: BroadcastReceiver|None = None, ) -> None: self._ctx = ctx @@ -96,36 +97,26 @@ class MsgStream(trio.abc.Channel): # delegate directly to underlying mem channel def receive_nowait( self, - allow_msg_keys: list[str] = ['yield'], + allow_msgs: list[str] = Yield, ): - # msg: dict = self._rx_chan.receive_nowait() msg: Yield|Stop = self._rx_chan.receive_nowait() - for ( - i, - key, - ) in enumerate(allow_msg_keys): - try: - # return msg[key] - return msg.pld - # except KeyError as kerr: - except AttributeError as attrerr: - if i < (len(allow_msg_keys) - 1): - continue - - _raise_from_no_key_in_msg( - ctx=self._ctx, - msg=msg, - # src_err=kerr, - src_err=attrerr, - log=log, - expect_key=key, - stream=self, - ) + # TODO: replace msg equiv of this or does the `.pld` + # interface read already satisfy it? I think so, yes? + try: + return msg.pld + except AttributeError as attrerr: + _raise_from_no_key_in_msg( + ctx=self._ctx, + msg=msg, + src_err=attrerr, + log=log, + stream=self, + ) async def receive( self, - hide_tb: bool = True, + hide_tb: bool = False, ): ''' Receive a single msg from the IPC transport, the next in @@ -157,10 +148,9 @@ class MsgStream(trio.abc.Channel): try: try: msg: Yield = await self._rx_chan.receive() - # return msg['yield'] return msg.pld - # except KeyError as kerr: + # TODO: implement with match: instead? 
except AttributeError as attrerr: # src_err = kerr src_err = attrerr @@ -170,10 +160,8 @@ class MsgStream(trio.abc.Channel): _raise_from_no_key_in_msg( ctx=self._ctx, msg=msg, - # src_err=kerr, src_err=attrerr, log=log, - expect_key='yield', stream=self, ) @@ -304,7 +292,7 @@ class MsgStream(trio.abc.Channel): while not drained: try: maybe_final_msg = self.receive_nowait( - allow_msg_keys=['yield', 'return'], + allow_msgs=[Yield, Return], ) if maybe_final_msg: log.debug( -- 2.34.1 From 921f72f7fef70322231872263d9209476910e896 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 18:36:22 -0400 Subject: [PATCH 239/378] Add `maybe_enable_greenback: bool` flag to `open_root_actor()` --- tractor/_root.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index bc55fd9e..9ce470fa 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -79,6 +79,7 @@ async def open_root_actor( # enables the multi-process debugger support debug_mode: bool = False, + maybe_enable_greenback: bool = False, # `.pause_from_sync()/breakpoint()` support # internal logging loglevel: str|None = None, @@ -107,8 +108,8 @@ async def open_root_actor( ) if ( debug_mode - and - await _debug.maybe_init_greenback( + and maybe_enable_greenback + and await _debug.maybe_init_greenback( raise_not_found=False, ) ): -- 2.34.1 From eca2c02f8bb7a50611b3844fda97286590f263cc Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 18:53:42 -0400 Subject: [PATCH 240/378] Flip to `.pause()` in subactor bp example --- examples/debugging/multi_subactors.py | 1 + examples/debugging/subactor_breakpoint.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/debugging/multi_subactors.py b/examples/debugging/multi_subactors.py index 259d5268..22b13ac8 100644 --- a/examples/debugging/multi_subactors.py +++ b/examples/debugging/multi_subactors.py @@ -38,6 +38,7 @@ async def main(): """ async with tractor.open_nursery( debug_mode=True, + # loglevel='runtime', ) as n: # Spawn both actors, don't bother with collecting results diff --git a/examples/debugging/subactor_breakpoint.py b/examples/debugging/subactor_breakpoint.py index bcc304d1..4fdff484 100644 --- a/examples/debugging/subactor_breakpoint.py +++ b/examples/debugging/subactor_breakpoint.py @@ -3,17 +3,20 @@ import tractor async def breakpoint_forever(): - """Indefinitely re-enter debugger in child actor. - """ + ''' + Indefinitely re-enter debugger in child actor. + + ''' while True: await trio.sleep(0.1) - await tractor.breakpoint() + await tractor.pause() async def main(): async with tractor.open_nursery( debug_mode=True, + loglevel='cancel', ) as n: portal = await n.run_in_actor( -- 2.34.1 From 60aa16adf655ac954c88e222b054f5d843fc317b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 19:31:50 -0400 Subject: [PATCH 241/378] Pass a `use_greenback: bool` runtime var to subs Such that the top level `maybe_enable_greenback` from `open_root_actor()` can toggle the entire actor tree's usage. Read the rtv in `._rpc` tasks and only enable if set. Also, rigor up the `._rpc.process_messages()` loop to handle `Error()` and `case _:` separately such that we now raise an explicit rte for unknown / invalid msgs. Use "parent" / "child" for side descriptions in loop comments and put a fat comment before the `StartAck` in `_invoke()`. 
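Rough sketch of the separated `Error()` vs `case _:` dispatch using a stand-in
`Error` type (not the real `process_messages()` loop):

from msgspec import Struct

class Error(Struct):
    tb_str: str

def handle_runtime_msg(msg) -> None:
    match msg:
        case Error(tb_str=tb):
            # known runtime-scoped remote error: unpack and raise
            raise RuntimeError(f'remote actor errored:\n{tb}')
        case _:
            # anything unrecognized now raises explicitly instead of
            # being lumped in with the error branch
            raise RuntimeError(f'Invalid or unknown msg type: {msg!r}')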
--- tractor/_root.py | 12 +++++++-- tractor/_rpc.py | 68 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 22 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 9ce470fa..1964a067 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -116,6 +116,8 @@ async def open_root_actor( os.environ['PYTHONBREAKPOINT'] = ( 'tractor.devx._debug.pause_from_sync' ) + _state._runtime_vars['use_greenback'] = True + else: # TODO: disable `breakpoint()` by default (without # `greenback`) since it will break any multi-actor @@ -386,14 +388,20 @@ async def open_root_actor( _state._last_actor_terminated = actor # restore built-in `breakpoint()` hook state - if debug_mode: + if ( + debug_mode + and + maybe_enable_greenback + ): if builtin_bp_handler is not None: sys.breakpointhook = builtin_bp_handler + if orig_bp_path is not None: os.environ['PYTHONBREAKPOINT'] = orig_bp_path + else: # clear env back to having no entry - os.environ.pop('PYTHONBREAKPOINT') + os.environ.pop('PYTHONBREAKPOINT', None) logger.runtime("Root actor terminated") diff --git a/tractor/_rpc.py b/tractor/_rpc.py index a95dbfec..86c3e27d 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -41,7 +41,6 @@ from trio import ( TaskStatus, ) -from .msg import NamespacePath from ._ipc import Channel from ._context import ( Context, @@ -61,6 +60,11 @@ from .devx import ( ) from . import _state from .log import get_logger +from .msg import ( + current_codec, + MsgCodec, + NamespacePath, +) from tractor.msg.types import ( CancelAck, Error, @@ -98,6 +102,7 @@ async def _invoke_non_context( Context | BaseException ] = trio.TASK_STATUS_IGNORED, ): + __tracebackhide__: bool = True # TODO: can we unify this with the `context=True` impl below? if inspect.isasyncgen(coro): @@ -398,7 +403,11 @@ async def _invoke( __tracebackhide__: bool = hide_tb treat_as_gen: bool = False - if _state.debug_mode(): + if ( + _state.debug_mode() + and + _state._runtime_vars['use_greenback'] + ): # XXX for .pause_from_sync()` usage we need to make sure # `greenback` is boostrapped in the subactor! await _debug.maybe_init_greenback() @@ -512,10 +521,22 @@ async def _invoke( # wrapper that calls `Context.started()` and then does # the `await coro()`? - # a "context" endpoint type is the most general and - # "least sugary" type of RPC ep with support for + # ------ - ------ + # a "context" endpoint is the most general and + # "least sugary" type of RPC with support for # bi-dir streaming B) - # StartAck + # + # the concurrency relation is simlar to a task nursery + # wherein a "parent" task (the one that enters + # `trio.open_nursery()` in some actor "opens" (via + # `Portal.open_context()`) an IPC ctx to another peer + # (which is maybe a sub-) actor who then schedules (aka + # `trio.Nursery.start()`s) a new "child" task to execute + # the `@context` annotated func; that is this func we're + # running directly below! + # ------ - ------ + # + # StartAck: respond immediately with endpoint info await chan.send( StartAck( cid=cid, @@ -524,11 +545,11 @@ async def _invoke( ) # TODO: should we also use an `.open_context()` equiv - # for this callee side by factoring the impl from + # for this child side by factoring the impl from # `Portal.open_context()` into a common helper? # # NOTE: there are many different ctx state details - # in a callee side instance according to current impl: + # in a child side instance according to current impl: # - `.cancelled_caught` can never be `True`. 
# -> the below scope is never exposed to the # `@context` marked RPC function. @@ -554,7 +575,7 @@ async def _invoke( # NOTE: this happens IFF `ctx._scope.cancel()` is # called by any of, - # - *this* callee task manually calling `ctx.cancel()`. + # - *this* child task manually calling `ctx.cancel()`. # - the runtime calling `ctx._deliver_msg()` which # itself calls `ctx._maybe_cancel_and_set_remote_error()` # which cancels the scope presuming the input error @@ -631,10 +652,11 @@ async def _invoke( # f' |_{ctx}' ) - # task-contex was either cancelled by request using - # ``Portal.cancel_actor()`` or ``Context.cancel()`` - # on the far end, or it was cancelled by the local - # (callee) task, so relay this cancel signal to the + # task-contex was either cancelled by request + # using ``Portal.cancel_actor()`` or + # ``Context.cancel()`` on the far end, or it + # was cancelled by the local child (or callee) + # task, so relay this cancel signal to the # other side. ctxc = ContextCancelled( message=msg, @@ -655,7 +677,7 @@ async def _invoke( ) as scope_error: - # always set this (callee) side's exception as the + # always set this (child) side's exception as the # local error on the context ctx._local_error: BaseException = scope_error @@ -1024,9 +1046,8 @@ async def process_messages( trio.Event(), ) - # XXX remote (runtime scoped) error or uknown - # msg (type). - case Error() | _: + # runtime-scoped remote error (since no `.cid`) + case Error(): # NOTE: this is the non-rpc error case, # that is, an error **not** raised inside # a call to ``_invoke()`` (i.e. no cid was @@ -1034,10 +1055,6 @@ async def process_messages( # this error to all local channel # consumers (normally portals) by marking # the channel as errored - log.exception( - f'Unhandled IPC msg:\n\n' - f'{msg}\n' - ) # assert chan.uid chan._exc: Exception = unpack_error( msg, @@ -1045,6 +1062,17 @@ async def process_messages( ) raise chan._exc + # unknown/invalid msg type? 
+ case _: + codec: MsgCodec = current_codec() + message: str = ( + f'Unhandled IPC msg for codec?\n\n' + f'|_{codec}\n\n' + f'{msg}\n' + ) + log.exception(message) + raise RuntimeError(message) + log.runtime( 'Waiting on next IPC msg from\n' f'peer: {chan.uid}\n' -- 2.34.1 From b209990d045c9069b11279916dc529bbe3152b0b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 19:39:57 -0400 Subject: [PATCH 242/378] Flip a last `MultiError` to a beg, add todo on `@stream` func --- tests/test_context_stream_semantics.py | 4 ++-- tests/test_legacy_one_way_streaming.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index 5df133d8..cedddf73 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -246,10 +246,10 @@ def test_simple_context( trio.run(main) except error_parent: pass - except trio.MultiError as me: + except BaseExceptionGroup as beg: # XXX: on windows it seems we may have to expect the group error from tractor._exceptions import is_multi_cancelled - assert is_multi_cancelled(me) + assert is_multi_cancelled(beg) else: trio.run(main) diff --git a/tests/test_legacy_one_way_streaming.py b/tests/test_legacy_one_way_streaming.py index 1e7ec987..6092bca7 100644 --- a/tests/test_legacy_one_way_streaming.py +++ b/tests/test_legacy_one_way_streaming.py @@ -38,10 +38,13 @@ async def async_gen_stream(sequence): assert cs.cancelled_caught +# TODO: deprecated either remove entirely +# or re-impl in terms of `MsgStream` one-sides +# wrapper, but at least remove `Portal.open_stream_from()` @tractor.stream async def context_stream( ctx: tractor.Context, - sequence + sequence: list[int], ): for i in sequence: await ctx.send_yield(i) -- 2.34.1 From 829dfa75201c12148cc0a137fef3eed4480fc17f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 19:41:29 -0400 Subject: [PATCH 243/378] Add defaul rtv for `use_greeback: bool = False` --- tractor/_state.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tractor/_state.py b/tractor/_state.py index 9e4e9473..b76e8ac9 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -30,11 +30,16 @@ if TYPE_CHECKING: _current_actor: Actor|None = None # type: ignore # noqa _last_actor_terminated: Actor|None = None + +# TODO: mk this a `msgspec.Struct`! _runtime_vars: dict[str, Any] = { '_debug_mode': False, '_is_root': False, '_root_mailbox': (None, None), '_registry_addrs': [], + + # for `breakpoint()` support + 'use_greenback': False, } -- 2.34.1 From 3869e91b19b0a4bfe0fb99123704515c06227722 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 19:50:09 -0400 Subject: [PATCH 244/378] More msg-spec tests tidying - Drop `test_msg_spec_xor_pld_spec()` since we no longer support `ipc_msg_spec` arg to `mk_codec()`. - Expect `MsgTypeError`s around `.open_context()` calls when `add_codec_hooks == False`. - toss in some `.pause()` points in the subactor ctx body whilst hacking out a `.pld` protocol for debug mode TTY locking. 
--- tests/test_caps_based_msging.py | 65 ++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/tests/test_caps_based_msging.py b/tests/test_caps_based_msging.py index f7cab2a5..9a73ba8d 100644 --- a/tests/test_caps_based_msging.py +++ b/tests/test_caps_based_msging.py @@ -14,19 +14,20 @@ from typing import ( from contextvars import ( Context, ) -# from inspect import Parameter from msgspec import ( structs, msgpack, - # defstruct, Struct, ValidationError, ) import pytest import tractor -from tractor import _state +from tractor import ( + _state, + MsgTypeError, +) from tractor.msg import ( _codec, _ctxvar_MsgCodec, @@ -47,21 +48,6 @@ from tractor.msg.types import ( import trio -def test_msg_spec_xor_pld_spec(): - ''' - If the `.msg.types.Msg`-set is overridden, we - can't also support a `Msg.pld` spec. - - ''' - # apply custom hooks and set a `Decoder` which only - # loads `NamespacePath` types. - with pytest.raises(RuntimeError): - mk_codec( - ipc_msg_spec=Any, - ipc_pld_spec=NamespacePath, - ) - - def mk_custom_codec( pld_spec: Union[Type]|Any, add_hooks: bool, @@ -134,7 +120,9 @@ def mk_custom_codec( f'{uid}\n' 'FAILED DECODE\n' f'type-> {obj_type}\n' - f'obj-arg-> `{obj}`: {type(obj)}\n' + f'obj-arg-> `{obj}`: {type(obj)}\n\n' + f'current codec:\n' + f'{current_codec()}\n' ) # TODO: figure out the ignore subsys for this! # -[ ] option whether to defense-relay backc the msg @@ -409,7 +397,9 @@ async def send_back_values( pld_spec=ipc_pld_spec, add_hooks=add_hooks, ) - with apply_codec(nsp_codec) as codec: + with ( + apply_codec(nsp_codec) as codec, + ): chk_codec_applied( expect_codec=nsp_codec, enter_value=codec, @@ -459,7 +449,7 @@ async def send_back_values( # XXX NOTE XXX THIS WON'T WORK WITHOUT SPECIAL # `str` handling! or special debug mode IPC # msgs! - # await tractor.pause() + await tractor.pause() raise RuntimeError( f'NOT-EXPECTED able to roundtrip value given spec:\n' @@ -470,7 +460,8 @@ async def send_back_values( break # move on to streaming block.. except tractor.MsgTypeError: - # await tractor.pause() + await tractor.pause() + if expect_send: raise RuntimeError( f'EXPECTED to `.started()` value given spec:\n' @@ -652,12 +643,42 @@ def test_codec_hooks_mod( pld_spec_type_strs: list[str] = enc_type_union(ipc_pld_spec) + # XXX should raise an mte (`MsgTypeError`) + # when `add_codec_hooks == False` bc the input + # `expect_ipc_send` kwarg has a nsp which can't be + # serialized! + # + # TODO:can we ensure this happens from the + # `Return`-side (aka the sub) as well? + if not add_codec_hooks: + try: + async with p.open_context( + send_back_values, + expect_debug=debug_mode, + pld_spec_type_strs=pld_spec_type_strs, + add_hooks=add_codec_hooks, + started_msg_bytes=nsp_codec.encode(expected_started), + + # XXX NOTE bc we send a `NamespacePath` in this kwarg + expect_ipc_send=expect_ipc_send, + + ) as (ctx, first): + pytest.fail('ctx should fail to open without custom enc_hook!?') + + # this test passes bc we can go no further! + except MsgTypeError: + # teardown nursery + await p.cancel_actor() + return + # TODO: send the original nsp here and # test with `limit_msg_spec()` above? # await tractor.pause() print('PARENT opening IPC ctx!\n') async with ( + # XXX should raise an mte (`MsgTypeError`) + # when `add_codec_hooks == False`.. 
p.open_context( send_back_values, expect_debug=debug_mode, -- 2.34.1 From d4155396bf6eddb5cdc175d12d4c23431aed14d7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 14 Apr 2024 19:52:44 -0400 Subject: [PATCH 245/378] Relay `SIGUSR1` to subactors for `stackscope` tracing Since obvi we don't want to just only see the trace in the root most of the time ;) Currently the sig keeps firing twice in the root though, and i'm not sure why yet.. --- tractor/devx/_stackscope.py | 61 ++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/tractor/devx/_stackscope.py b/tractor/devx/_stackscope.py index 706b85d3..38c7af1d 100644 --- a/tractor/devx/_stackscope.py +++ b/tractor/devx/_stackscope.py @@ -23,12 +23,31 @@ into each ``trio.Nursery`` except it links the lifetimes of memory space disjoint, parallel executing tasks in separate actors. ''' +from __future__ import annotations +import multiprocessing as mp from signal import ( signal, SIGUSR1, ) +import traceback +from typing import TYPE_CHECKING import trio +from tractor import ( + _state, + log as logmod, +) + +log = logmod.get_logger(__name__) + + +if TYPE_CHECKING: + from tractor._spawn import ProcessType + from tractor import ( + Actor, + ActorNursery, + ) + @trio.lowlevel.disable_ki_protection def dump_task_tree() -> None: @@ -41,9 +60,15 @@ def dump_task_tree() -> None: recurse_child_tasks=True ) ) - log = get_console_log('cancel') + log = get_console_log( + name=__name__, + level='cancel', + ) + actor: Actor = _state.current_actor() log.pdb( - f'Dumping `stackscope` tree:\n\n' + f'Dumping `stackscope` tree for actor\n' + f'{actor.name}: {actor}\n' + f' |_{mp.current_process()}\n\n' f'{tree_str}\n' ) # import logging @@ -56,8 +81,13 @@ def dump_task_tree() -> None: # ).exception("Error printing task tree") -def signal_handler(sig: int, frame: object) -> None: - import traceback +def signal_handler( + sig: int, + frame: object, + + relay_to_subs: bool = True, + +) -> None: try: trio.lowlevel.current_trio_token( ).run_sync_soon(dump_task_tree) @@ -65,6 +95,26 @@ def signal_handler(sig: int, frame: object) -> None: # not in async context -- print a normal traceback traceback.print_stack() + if not relay_to_subs: + return + + an: ActorNursery + for an in _state.current_actor()._actoruid2nursery.values(): + + subproc: ProcessType + subactor: Actor + for subactor, subproc, _ in an._children.values(): + log.pdb( + f'Relaying `SIGUSR1`[{sig}] to sub-actor\n' + f'{subactor}\n' + f' |_{subproc}\n' + ) + + if isinstance(subproc, trio.Process): + subproc.send_signal(sig) + + elif isinstance(subproc, mp.Process): + subproc._send_signal(sig) def enable_stack_on_sig( @@ -82,3 +132,6 @@ def enable_stack_on_sig( # NOTE: not the above can be triggered from # a (xonsh) shell using: # kill -SIGUSR1 @$(pgrep -f '') + # + # for example if you were looking to trace a `pytest` run + # kill -SIGUSR1 @$(pgrep -f 'pytest') -- 2.34.1 From 5dfff3f75a8cdeb0b4705efca8bab562660359dd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 15 Apr 2024 15:20:00 -0400 Subject: [PATCH 246/378] Tweak a couple more log message fmts --- tractor/_context.py | 2 +- tractor/_streaming.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 94956547..027f15ff 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1902,7 +1902,7 @@ class Context: try: log.runtime( - f'Delivering msg from IPC ctx:\n' + f'Delivering msg from IPC ctx:\n\n' f'<= 
{from_uid}\n' f' |_ {nsf}()\n\n' diff --git a/tractor/_streaming.py b/tractor/_streaming.py index ac4d482e..16e32cea 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -377,14 +377,17 @@ class MsgStream(trio.abc.Channel): # await rx_chan.aclose() if not self._eoc: - log.cancel( - 'Stream closed by self before it received an EoC?\n' - 'Setting eoc manually..\n..' - ) - self._eoc: bool = trio.EndOfChannel( - f'Context stream closed by self({self._ctx.side})\n' + message: str = ( + f'Context stream closed by {self._ctx.side!r}\n' f'|_{self}\n' ) + log.cancel( + 'Stream self-closed before receiving EoC\n\n' + + + message + ) + self._eoc = trio.EndOfChannel(message) + # ?XXX WAIT, why do we not close the local mem chan `._rx_chan` XXX? # => NO, DEFINITELY NOT! <= # if we're a bi-dir ``MsgStream`` BECAUSE this same -- 2.34.1 From a73b24cf4ab6dfb84029f6ed546662c7153afd97 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 16 Apr 2024 10:09:45 -0400 Subject: [PATCH 247/378] First draft, sub-msg-spec for debugger `Lock` sys Since it's totes possible to have a spec applied that won't permit `str`s, might as well formalize a small msg set for subactors to request the tree-wide TTY `Lock`. BTW, I'm prolly not going into every single change here in this first WIP since there's still a variety of broken stuff mostly to do with races on the codec apply being done in a `trio.lowleve.RunVar`; it should be re-done with a `ContextVar` such that each task does NOT mutate the global setting.. New msg set and usage is simply: - `LockStatus` which is the reponse msg delivered from `lock_tty_for_child()` - `LockRelease` a one-off request msg from the subactor to drop the `Lock` from a `MsgStream.send()`. - use these msgs throughout the root and sub sides of the locking ctx funcs: `lock_tty_for_child()` & `wait_for_parent_stdin_hijack()` The codec is now applied in both the root and sub `Lock` request tasks: - for root inside `lock_tty_for_child()` before the `.started()`. - for subs, inside `wait_for_parent_stdin_hijack()` since we only want to affect the codec *for the locking task*. - (hence the need for ctx-var as mentioned above but currently this can cause races which will break against other app tasks competing for the codec setting). - add a `apply_debug_codec()` helper for use in both cases. - add more detailed logging to both the root and sub side of `Lock` requesting funcs including requiring that the sub-side task "uid" (a `tuple[str, int]` = (trio.Task.name, id(trio.Task)` be provided (more on this later). A main issue discovered while proto-testing all this was the ability of a sub to "double lock" (leading to self-deadlock) via an error in `wait_for_parent_stdin_hijack()` which, for ex., can happen in debug mode via crash handling of a `MsgTypeError` received from the root during a codec applied msg-spec race! Originally I was attempting to solve this by making the SIGINT override handler more resilient but this case is somewhat impossible to detect by an external root task other then checking for duplicate ownership via the new `subactor_task_uid`. => SO NOW, we always stick the current task uid in the `Lock._blocked: set` and raise an rte on a double request by the same remote task. Included is a variety of small refinements: - finally figured out how to mark a variety of `.__exit__()` frames with `pdbp.hideframe()` to actually hide them B) - add cls methods around managing `Lock._locking_task_cs` from root only. - re-org all the `Lock` attrs into those only used in root vs. 
subactors and proto-prep a new `DebugStatus` actor-singleton to be used in subs. - add a `Lock.repr()` to contextually print the current conc primitives. - rename our `Pdb`-subtype to `PdbREPL`. - rigor out the SIGINT handler a bit, originally to try and hack-solve the double-lock issue mentioned above, but now just with better logging and logic for most (all?) possible hang cases that should be hang-recoverable after enough ctrl-c mashing by the user.. well hopefully: - using `Lock.repr()` for both root and sub cases. - lots more `log.warn()`s and handler reversions on stale lock or cs detection. - factor `._pause()` impl a little better moving the actual repl entry to a new `_enter_repl_sync()` (originally for easier wrapping in the sub case with `apply_codec()`). --- tractor/devx/__init__.py | 1 - tractor/devx/_debug.py | 1049 ++++++++++++++++++++++++++++---------- 2 files changed, 792 insertions(+), 258 deletions(-) diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index c4676e3f..7ea2b25c 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -27,7 +27,6 @@ from ._debug import ( pause as pause, pause_from_sync as pause_from_sync, shield_sigint_handler as shield_sigint_handler, - MultiActorPdb as MultiActorPdb, open_crash_handler as open_crash_handler, maybe_open_crash_handler as maybe_open_crash_handler, post_mortem as post_mortem, diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 26155b22..51e74379 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -25,6 +25,7 @@ from contextlib import ( asynccontextmanager as acm, contextmanager as cm, nullcontext, + _GeneratorContextManager, ) from functools import ( partial, @@ -33,6 +34,7 @@ from functools import ( import os import signal import sys +import textwrap import threading import traceback from typing import ( @@ -40,6 +42,7 @@ from typing import ( Callable, AsyncIterator, AsyncGenerator, + TypeAlias, TYPE_CHECKING, ) from types import ( @@ -47,17 +50,23 @@ from types import ( ModuleType, ) +from msgspec import Struct import pdbp import sniffio import tractor import trio -from trio.lowlevel import current_task +from trio.lowlevel import ( + current_task, + Task, +) from trio import ( TaskStatus, - # Task, ) from tractor.log import get_logger +from tractor.msg import ( + _codec, +) from tractor._state import ( current_actor, is_root_process, @@ -76,6 +85,36 @@ if TYPE_CHECKING: log = get_logger(__name__) +# XXX HACKZONE XXX +# hide exit stack frames on nurseries and cancel-scopes! +# |_ so avoid seeing it when the `pdbp` REPL is first engaged from +# inside a `trio.open_nursery()` scope (with no line after it +# in before the block end??). +# +# TODO: FINALLY got this workin originally with +# `@pdbp.hideframe` around the `wrapper()` def embedded inside +# `_ki_protection_decoratior()`.. which is in the module: +# /home/goodboy/.virtualenvs/tractor311/lib/python3.11/site-packages/trio/_core/_ki.py +# +# -[ ] make an issue and patch for `trio` core? maybe linked +# to the long outstanding `pdb` one below? +# |_ it's funny that there's frame hiding throughout `._run.py` +# but not where it matters on the below exit funcs.. +# +# -[ ] provide a patchset for the lonstanding +# |_ https://github.com/python-trio/trio/issues/1155 +# +# -[ ] make a linked issue to ^ and propose allowing all the +# `._core._run` code to have their `__tracebackhide__` value +# configurable by a `RunVar` to allow getting scheduler frames +# if desired through configuration? 
+# +# -[ ] maybe dig into the core `pdb` issue why the extra frame is shown +# at all? +# +pdbp.hideframe(trio._core._run.NurseryManager.__aexit__) +pdbp.hideframe(trio._core._run.CancelScope.__exit__) +pdbp.hideframe(_GeneratorContextManager.__exit__) __all__ = [ 'breakpoint', @@ -83,6 +122,28 @@ __all__ = [ ] +class LockStatus( + Struct, + tag=True, + tag_field='msg_type', +): + subactor_uid: tuple[str, str] + cid: str + locked: bool + + +class LockRelease( + Struct, + tag=True, + tag_field='msg_type', +): + subactor_uid: tuple[str, str] + cid: str + + +__msg_spec__: TypeAlias = LockStatus|LockRelease + + class Lock: ''' Actor global debug lock state. @@ -90,41 +151,111 @@ class Lock: Mostly to avoid a lot of ``global`` declarations for now XD. ''' - repl: MultiActorPdb | None = None + # XXX local ref to the `Pbp` instance, ONLY set in the + # actor-process that currently has activated a REPL + # i.e. it will be `None` (unset) in any other actor-process + # that does not have this lock acquired in the root proc. + repl: PdbREPL|None = None + # placeholder for function to set a ``trio.Event`` on debugger exit # pdb_release_hook: Callable | None = None - _trio_handler: Callable[ - [int, FrameType | None], Any - ] | int | None = None + _trio_handler: ( + Callable[[int, FrameType|None], Any] + |int + | None + ) = None - # actor-wide variable pointing to current task name using debugger - local_task_in_debug: str | None = None + remote_task_in_debug: str|None = None - # NOTE: set by the current task waiting on the root tty lock from - # the CALLER side of the `lock_tty_for_child()` context entry-call - # and must be cancelled if this actor is cancelled via IPC - # request-message otherwise deadlocks with the parent actor may - # ensure + @staticmethod + def get_locking_task_cs() -> trio.CancelScope|None: + if is_root_process(): + return Lock._locking_task_cs + + raise RuntimeError( + '`Lock.locking_task_cs` is invalid in subactors!' + ) + + @staticmethod + def set_locking_task_cs( + cs: trio.CancelScope, + ) -> None: + if not is_root_process(): + raise RuntimeError( + '`Lock.locking_task_cs` is invalid in subactors!' + ) + + Lock._locking_task_cs = cs + + # SUBACTOR ONLY + # ------ - ------- + local_task_in_debug: Task|None = None _debugger_request_cs: trio.CancelScope|None = None + local_pdb_complete: trio.Event|None = None - # NOTE: set only in the root actor for the **local** root spawned task - # which has acquired the lock (i.e. this is on the callee side of - # the `lock_tty_for_child()` context entry). - _root_local_task_cs_in_debug: trio.CancelScope|None = None + # ROOT ONLY + # ------ - ------- + # the root-actor-ONLY singletons for, + # + # - the uid of the actor who's task is using a REPL + # - a literal task-lock, + # - a shielded-cancel-scope around the acquiring task*, + # - a broadcast event to signal no-actor using a REPL in tree, + # - a filter list to block subs-by-uid from locking. + # + # * in case it needs to be manually cancelled in root due to + # a stale lock condition (eg. 
IPC failure with the locking + # child + global_actor_in_debug: tuple[str, str]|None = None + no_remote_has_tty: trio.Event|None = None + _locking_task_cs: trio.CancelScope|None = None - # actor tree-wide actor uid that supposedly has the tty lock - global_actor_in_debug: tuple[str, str] = None - - local_pdb_complete: trio.Event | None = None - no_remote_has_tty: trio.Event | None = None - - # lock in root actor preventing multi-access to local tty _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock() + _blocked: set[tuple[str, str]] = set() # `Actor.uid` block list + # TODO: should go on `PbpREPL`? _orig_sigint_handler: Callable | None = None - _blocked: set[tuple[str, str]] = set() + @classmethod + def repr(cls) -> str: + + # both root and subs + fields: str = ( + f'repl: {cls.repl}\n' + f'local_repl_task: {cls.local_task_in_debug}\n' + ) + + if is_root_process(): + lock_stats: trio.LockStatistics = cls._debug_lock.statistics() + fields += ( + f'global_actor_in_debug: {cls.global_actor_in_debug}\n' + f'no_remote_has_tty: {cls.no_remote_has_tty}\n' + f'remote_task_in_debug: {cls.remote_task_in_debug}\n' + f'_locking_task_cs: {cls.get_locking_task_cs()}\n' + f'_blocked: {cls._blocked}\n\n' + + f'_debug_lock: {cls._debug_lock}\n' + f'lock_stats: {lock_stats}\n' + ) + else: + fields += ( + f'local_task_in_debug: {cls.local_task_in_debug}\n' + f'local_pdb_complete: {cls.local_pdb_complete}\n' + f'_debugger_request_cs: {cls._debugger_request_cs}\n' + ) + + body: str = textwrap.indent( + fields, + prefix=' |_', + ) + return ( + f'<{cls.__name__}(\n' + f'{body}' + ')>' + ) + + # TODO: move to PdbREPL! @classmethod def shield_sigint(cls): ''' @@ -218,19 +349,35 @@ class Lock: else: cls._debug_lock.release() - except RuntimeError: + except RuntimeError as rte: # uhhh makes no sense but been seeing the non-owner # release error even though this is definitely the task # that locked? owner = cls._debug_lock.statistics().owner + # if ( + # owner + # and + # cls.remote_task_in_debug is None + # ): + # raise RuntimeError( + # 'Stale `Lock` detected, no remote task active!?\n' + # f'|_{owner}\n' + # # f'{Lock}' + # ) from rte + if owner: - raise + raise rte + + # OW suppress, can't member why tho .. XD + # something somethin corrupts a cancel-scope + # somewhere.. try: # sometimes the ``trio`` might already be terminated in # which case this call will raise. if cls.local_pdb_complete is not None: cls.local_pdb_complete.set() + finally: # restore original sigint handler cls.unshield_sigint() @@ -241,10 +388,33 @@ class Lock: cls.local_task_in_debug = None +# TODO: actually use this instead throughout for subs! +class DebugStatus: + ''' + Singleton-state for debugging machinery in a subactor. + + Composes conc primitives for syncing with a root actor to + acquire the tree-global (TTY) `Lock` such that only ever one + actor's task can have the REPL active at a given time. + + ''' + repl: PdbREPL|None = None + lock_status: LockStatus|None = None + + repl_task: Task|None = None + # local_task_in_debug: Task|None = None + + req_cs: trio.CancelScope|None = None + # _debugger_request_cs: trio.CancelScope|None = None + + repl_release: trio.Event|None = None + # local_pdb_complete: trio.Event|None = None + class TractorConfig(pdbp.DefaultConfig): ''' - Custom ``pdbp`` goodness :surfer: + Custom `pdbp` config which tries to use the best tradeoff + between pretty and minimal. 
''' use_pygments: bool = True @@ -255,21 +425,41 @@ class TractorConfig(pdbp.DefaultConfig): # fixes line spacing issue when resizing terminal B) truncate_long_lines: bool = False + # ------ - ------ + # our own custom config vars mostly + # for syncing with the actor tree's singleton + # TTY `Lock`. -class MultiActorPdb(pdbp.Pdb): + +class PdbREPL(pdbp.Pdb): ''' - Add teardown hooks to the regular ``pdbp.Pdb``. + Add teardown hooks and local state describing any + ongoing TTY `Lock` request dialog. ''' # override the pdbp config with our coolio one + # NOTE: this is only loaded when no `~/.pdbrc` exists + # so we should prolly pass it into the .__init__() instead? + # i dunno, see the `DefaultFactory` and `pdb.Pdb` impls. DefaultConfig = TractorConfig + status = DebugStatus + # def preloop(self): # print('IN PRELOOP') # super().preloop() - # TODO: figure out how to disallow recursive .set_trace() entry - # since that'll cause deadlock for us. + # TODO: cleaner re-wrapping of all this? + # -[ ] figure out how to disallow recursive .set_trace() entry + # since that'll cause deadlock for us. + # -[ ] maybe a `@cm` to call `super().()`? + # -[ ] look at hooking into the `pp` hook specially with our + # own set of pretty-printers? + # * `.pretty_struct.Struct.pformat()` + # * `.pformat(MsgType.pld)` + # * `.pformat(Error.tb_str)`? + # * .. maybe more? + # def set_continue(self): try: super().set_continue() @@ -282,6 +472,17 @@ class MultiActorPdb(pdbp.Pdb): finally: Lock.release() + # TODO: special handling where we just want the next LOC and + # not to resume to the next pause/crash point? + # def set_next( + # self, + # frame: FrameType + # ) -> None: + # try: + # super().set_next(frame) + # finally: + # Lock.release() + # XXX NOTE: we only override this because apparently the stdlib pdb # bois likes to touch the SIGINT handler as much as i like to touch # my d$%&. @@ -314,7 +515,8 @@ class MultiActorPdb(pdbp.Pdb): @acm async def _acquire_debug_lock_from_root_task( - uid: tuple[str, str] + subactor_uid: tuple[str, str], + remote_task_uid: str, ) -> AsyncIterator[trio.StrictFIFOLock]: ''' @@ -326,16 +528,31 @@ async def _acquire_debug_lock_from_root_task( to the ``pdb`` repl. ''' - task_name: str = current_task().name + # task_name: str = current_task().name we_acquired: bool = False log.runtime( - f"Attempting to acquire TTY lock, remote task: {task_name}:{uid}" + f'Attempting to acquire TTY lock for,\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {remote_task_uid}\n' ) try: - log.runtime( - f"entering lock checkpoint, remote task: {task_name}:{uid}" + pre_msg: str = ( + f'Entering lock checkpoint for sub-actor\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {remote_task_uid}\n' ) + stats = Lock._debug_lock.statistics() + if owner := stats.owner: + # and Lock.no_remote_has_tty is not None + pre_msg += ( + f'\n' + f'`Lock` already held by local task\n' + f'{owner}\n\n' + f'On behalf of remote task: {Lock.remote_task_in_debug!r}\n' + ) + log.runtime(pre_msg) + # NOTE: if the surrounding cancel scope from the # `lock_tty_for_child()` caller is cancelled, this line should # unblock and NOT leave us in some kind of @@ -349,9 +566,14 @@ async def _acquire_debug_lock_from_root_task( # can try to avoid clobbering any connection from a child # that's currently relying on it. 
Lock.no_remote_has_tty = trio.Event() + Lock.remote_task_in_debug = remote_task_uid - Lock.global_actor_in_debug = uid - log.runtime(f"TTY lock acquired, remote task: {task_name}:{uid}") + Lock.global_actor_in_debug = subactor_uid + log.runtime( + f'TTY lock acquired for,\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {remote_task_uid}\n' + ) # NOTE: critical section: this yield is unshielded! @@ -368,7 +590,8 @@ async def _acquire_debug_lock_from_root_task( finally: if ( we_acquired - and Lock._debug_lock.locked() + and + Lock._debug_lock.locked() ): Lock._debug_lock.release() @@ -380,16 +603,37 @@ async def _acquire_debug_lock_from_root_task( stats = Lock._debug_lock.statistics() if ( not stats.owner + # and Lock.no_remote_has_tty is not None ): - log.runtime(f"No more tasks waiting on tty lock! says {uid}") + # log.runtime( + log.info( + f'No more child ctx tasks hold the TTY lock!\n' + f'last subactor: {subactor_uid}\n' + f'remote task: {remote_task_uid}\n' + ) if Lock.no_remote_has_tty is not None: + # set and release Lock.no_remote_has_tty.set() Lock.no_remote_has_tty = None + Lock.remote_task_in_debug = None + else: + log.warning( + 'Not signalling `Lock.no_remote_has_tty` since it has value:\n' + f'{Lock.no_remote_has_tty}\n' + ) + else: + log.info( + f'A child ctx tasks still holds the TTY lock ??\n' + f'last subactor: {subactor_uid}\n' + f'remote task: {remote_task_uid}\n' + f'current local owner task: {stats.owner}\n' + ) Lock.global_actor_in_debug = None - log.runtime( - f"TTY lock released, remote task: {task_name}:{uid}" + 'TTY lock released by child\n' + f'last subactor: {subactor_uid}\n' + f'remote task: {remote_task_uid}\n' ) @@ -397,9 +641,14 @@ async def _acquire_debug_lock_from_root_task( async def lock_tty_for_child( ctx: tractor.Context, - subactor_uid: tuple[str, str] -) -> str: + # TODO: when we finally get a `Start.params: ParamSpec` + # working it'd sure be nice to have `msgspec` auto-decode this + # to an actual tuple XD + subactor_uid: tuple[str, str], + subactor_task_uid: tuple[str, int], + +) -> LockStatus|LockRelease: ''' Lock the TTY in the root process of an actor tree in a new inter-actor-context-task such that the ``pdbp`` debugger console @@ -411,53 +660,141 @@ async def lock_tty_for_child( highly reliable at releasing the mutex complete! ''' - task_name: str = current_task().name + + req_task_uid: tuple = tuple(subactor_task_uid) + if req_task_uid in Lock._blocked: + raise RuntimeError( + f'The same remote task already has an active request for TTY lock ??\n\n' + f'task uid: {req_task_uid}\n' + f'subactor uid: {subactor_uid}\n' + ) + + Lock._blocked.add(req_task_uid) + + root_task_name: str = current_task().name if tuple(subactor_uid) in Lock._blocked: log.warning( - f'Actor {subactor_uid} is blocked from acquiring debug lock\n' - f"remote task: {task_name}:{subactor_uid}" + f'Subactor is blocked from acquiring debug lock..\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {subactor_task_uid}\n' ) ctx._enter_debugger_on_cancel: bool = False await ctx.cancel(f'Debug lock blocked for {subactor_uid}') - return 'pdb_lock_blocked' + return LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=False, + ) # TODO: when we get to true remote debugging # this will deliver stdin data? 
log.debug( - "Attempting to acquire TTY lock\n" - f"remote task: {task_name}:{subactor_uid}" + 'Subactor attempting to acquire TTY lock\n' + f'root task: {root_task_name}\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {subactor_task_uid}\n' ) - - log.debug(f"Actor {subactor_uid} is WAITING on stdin hijack lock") Lock.shield_sigint() - try: with ( + # NOTE: though a cs is created for every subactor lock + # REQUEST in this ctx-child task, only the root-task + # holding the `Lock` (on behalf of the ctx parent task + # in a subactor) will set + # `Lock._locking_task_cs` such that if the + # lock holdingn task ever needs to be cancelled (since + # it's shielded by default) that global ref can be + # used to do so! trio.CancelScope(shield=True) as debug_lock_cs, + + _codec.limit_msg_spec( + payload_spec=__msg_spec__, + ) as codec, ): - Lock._root_local_task_cs_in_debug = debug_lock_cs - async with _acquire_debug_lock_from_root_task(subactor_uid): + # sanity? + # TODO: don't need the ref right? + assert codec is _codec.current_codec() + + async with _acquire_debug_lock_from_root_task( + subactor_uid, + subactor_task_uid, + ): + # XXX SUPER IMPORTANT BELOW IS ON THIS LINE XXX + # without that the root cs might be, + # - set and then removed in the finally block by + # a task that never acquired the lock, leaving + # - the task that DID acquire the lock STUCK since + # it's original cs was GC-ed bc the first task + # already set the global ref to `None` + Lock.set_locking_task_cs(debug_lock_cs) # indicate to child that we've locked stdio - await ctx.started('Locked') - log.debug( - f"Actor {subactor_uid} acquired stdin hijack lock" + await ctx.started( + LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=True, + ) ) + log.debug( f'Actor {subactor_uid} acquired TTY lock') + # wait for unlock pdb by child async with ctx.open_stream() as stream: - assert await stream.receive() == 'pdb_unlock' + release_msg: LockRelease = await stream.receive() - return "pdb_unlock_complete" + # TODO: security around only releasing if + # these match? + log.pdb( + f'TTY lock released requested\n\n' + f'{release_msg}\n' + ) + assert release_msg.cid == ctx.cid + assert release_msg.subactor_uid == tuple(subactor_uid) + + log.debug(f'Actor {subactor_uid} released TTY lock') + + return LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=False, + ) finally: - Lock._root_local_task_cs_in_debug = None + debug_lock_cs.cancel() + Lock.set_locking_task_cs(None) Lock.unshield_sigint() +@cm +def apply_debug_codec() -> _codec.MsgCodec: + ''' + Apply the subactor TTY `Lock`-ing protocol's msgspec temporarily + (only in the current task). + + ''' + with ( + _codec.limit_msg_spec( + payload_spec=__msg_spec__, + ) as debug_codec, + ): + assert debug_codec is _codec.current_codec() + log.pdb( + 'Applied `.devx._debug` msg-spec via codec\n' + f'{debug_codec}\n' + ) + yield debug_codec + + log.pdb( + 'REMOVED `.devx._debug` msg-spec via codec\n' + f'{debug_codec}\n' + ) + + async def wait_for_parent_stdin_hijack( actor_uid: tuple[str, str], + task_uid: tuple[str, int], task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED ): ''' @@ -476,25 +813,30 @@ async def wait_for_parent_stdin_hijack( ''' from .._discovery import get_root - with trio.CancelScope(shield=True) as cs: + with ( + trio.CancelScope(shield=True) as cs, + apply_debug_codec(), + ): Lock._debugger_request_cs = cs - try: + # TODO: merge into sync async with ? 
async with get_root() as portal: - # this syncs to child's ``Context.started()`` call. async with portal.open_context( lock_tty_for_child, subactor_uid=actor_uid, + subactor_task_uid=task_uid, - ) as (ctx, val): - - log.debug('locked context') - assert val == 'Locked' + ) as (ctx, resp): + log.pdb( + 'Subactor locked TTY per msg\n' + f'{resp}\n' + ) + assert resp.subactor_uid == actor_uid + assert resp.cid async with ctx.open_stream() as stream: - try: - # unblock local caller + try: # to unblock local caller assert Lock.local_pdb_complete task_status.started(cs) @@ -503,14 +845,22 @@ async def wait_for_parent_stdin_hijack( await Lock.local_pdb_complete.wait() finally: - # TODO: shielding currently can cause hangs... - # with trio.CancelScope(shield=True): - await stream.send('pdb_unlock') + await stream.send( + LockRelease( + subactor_uid=actor_uid, + cid=resp.cid, + ) + ) # sync with callee termination - assert await ctx.result() == "pdb_unlock_complete" + status: LockStatus = await ctx.result() + assert not status.locked - log.debug('exitting child side locking task context') + log.pdb( + 'TTY lock was released for subactor with msg\n\n' + f'{status}\n\n' + 'Exitting {ctx.side!r} side locking of locking ctx' + ) except ContextCancelled: log.warning('Root actor cancelled debug lock') @@ -518,12 +868,17 @@ async def wait_for_parent_stdin_hijack( finally: Lock.local_task_in_debug = None - log.debug('Exiting debugger from child') + log.debug('Exiting debugger TTY lock request func from child') -def mk_mpdb() -> MultiActorPdb: + log.cancel('Reverting SIGINT handler!') + Lock.unshield_sigint() + + + +def mk_mpdb() -> PdbREPL: ''' - Deliver a new `MultiActorPdb`: a multi-process safe `pdbp` + Deliver a new `PdbREPL`: a multi-process safe `pdbp` REPL using the magic of SC! Our `pdb.Pdb` subtype accomplishes multi-process safe debugging @@ -538,7 +893,7 @@ def mk_mpdb() -> MultiActorPdb: by either explicit requests in the runtime or ''' - pdb = MultiActorPdb() + pdb = PdbREPL() # Always shield out SIGINTs for subactors when REPL is active. # @@ -560,7 +915,6 @@ def mk_mpdb() -> MultiActorPdb: def shield_sigint_handler( signum: int, frame: 'frame', # type: ignore # noqa - # pdb_obj: MultiActorPdb | None = None, *args, ) -> None: @@ -577,6 +931,7 @@ def shield_sigint_handler( uid_in_debug: tuple[str, str]|None = Lock.global_actor_in_debug actor: Actor = current_actor() + case_handled: bool = False def do_cancel(): # If we haven't tried to cancel the runtime then do that instead @@ -586,107 +941,202 @@ def shield_sigint_handler( actor.cancel_soon() # If the runtime is already cancelled it likely means the user - # hit ctrl-c again because teardown didn't full take place in + # hit ctrl-c again because teardown didn't fully take place in # which case we do the "hard" raising of a local KBI. else: raise KeyboardInterrupt + # try to see if the supposed (sub)actor in debug still + # has an active connection to *this* actor, and if not + # it's likely they aren't using the TTY lock / debugger + # and we should propagate SIGINT normally. any_connected: bool = False - if uid_in_debug is not None: - # try to see if the supposed (sub)actor in debug still - # has an active connection to *this* actor, and if not - # it's likely they aren't using the TTY lock / debugger - # and we should propagate SIGINT normally. 
- chans: list[tractor.Channel] = actor._peers.get(tuple(uid_in_debug)) + chans: list[tractor.Channel] = actor._peers.get( + tuple(uid_in_debug) + ) if chans: any_connected = any(chan.connected() for chan in chans) if not any_connected: log.warning( 'A global actor reported to be in debug ' - 'but no connection exists for this child:\n' - f'{uid_in_debug}\n' + 'but no connection exists for this child!?\n' + f'subactor_uid: {uid_in_debug}\n\n' 'Allowing SIGINT propagation..' ) return do_cancel() # only set in the actor actually running the REPL - pdb_obj: MultiActorPdb|None = Lock.repl + repl: PdbREPL|None = Lock.repl + # TODO: maybe we should flatten out all these cases using + # a match/case? + # # root actor branch that reports whether or not a child # has locked debugger. - if ( - is_root_process() - and uid_in_debug is not None + if is_root_process(): + lock_cs: trio.CancelScope = Lock.get_locking_task_cs() - # XXX: only if there is an existing connection to the - # (sub-)actor in debug do we ignore SIGINT in this - # parent! Otherwise we may hang waiting for an actor - # which has already terminated to unlock. - and any_connected - ): - # we are root and some actor is in debug mode - # if uid_in_debug is not None: + log.warning( + f'root {actor.uid} handling SIGINT\n' + f'any_connected: {any_connected}\n\n' - if pdb_obj: - name = uid_in_debug[0] - if name != 'root': - log.pdb( - f"Ignoring SIGINT, child in debug mode: `{uid_in_debug}`" - ) + f'{Lock.repr()}\n' + ) + + maybe_stale_lock_cs: bool = ( + lock_cs is not None + # and not lock_cs.cancel_called + and uid_in_debug is None + ) + if maybe_stale_lock_cs: + log.warning( + 'Stale `Lock._locking_task_cs: CancelScope` DETECTED?\n' + f'|_{lock_cs}\n\n' + ) + lock_cs.cancel() + + if uid_in_debug: # "someone" is (ostensibly) using debug `Lock` + name_in_debug: str = uid_in_debug[0] + if ( + not repl # but it's NOT us, the root actor. + ): + # sanity: since no repl ref is set, we def shouldn't + # be the lock owner! + assert name_in_debug != 'root' + + # XXX: only if there is an existing connection to the + # (sub-)actor in debug do we ignore SIGINT in this + # parent! Otherwise we may hang waiting for an actor + # which has already terminated to unlock. + if any_connected: # there are subactors we can contact + # NOTE: don't emit this with `.pdb()` level in + # root without a higher level. + log.debug( + f'Ignoring SIGINT while debug REPL in use by child\n' + f'subactor: {uid_in_debug}\n' + ) + # returns here minus tail logic + case_handled = True + + else: + message: str = ( + f'Ignoring SIGINT while debug REPL SUPPOSEDLY in use by child\n' + f'subactor: {uid_in_debug}\n\n' + f'BUT, no child actors are contactable!?!?\n\n' + + # f'Reverting to def `trio` SIGINT handler..\n' + ) + + if maybe_stale_lock_cs: + lock_cs.cancel() + message += ( + 'Maybe `Lock._locking_task_cs: CancelScope` is stale?\n' + f'|_{lock_cs}\n\n' + ) + + log.warning(message) + Lock.unshield_sigint() + case_handled = True else: + assert name_in_debug == 'root' # we are the registered locker + assert repl # we have a pdb REPL engaged log.pdb( - "Ignoring SIGINT while in debug mode" + f'Ignoring SIGINT while debug REPL in use\n' + f'root actor: {uid_in_debug}\n' ) - elif ( - is_root_process() - ): - if pdb_obj: - log.pdb( - "Ignoring SIGINT since debug mode is enabled" + # returns here minus tail logic + case_handled = True + + # root actor still has this SIGINT handler active without + # an actor using the `Lock` (a bug state) ?? 
+ # => so immediately cancel any stale lock cs and revert + # the handler! + else: + # XXX revert back to ``trio`` handler since this handler shouldn't + # be enabled withtout an actor using a debug REPL! + log.warning( + 'Ignoring SIGINT in root actor but no actor using a `pdb` REPL?\n' + 'Reverting SIGINT handler to `trio` default!\n' ) - if ( - Lock._root_local_task_cs_in_debug - and not Lock._root_local_task_cs_in_debug.cancel_called - ): - Lock._root_local_task_cs_in_debug.cancel() + if maybe_stale_lock_cs: + lock_cs.cancel() - # revert back to ``trio`` handler asap! Lock.unshield_sigint() + case_handled = True # child actor that has locked the debugger elif not is_root_process(): + log.warning( + f'Subactor {actor.uid} handling SIGINT\n\n' + f'{Lock.repr()}\n' + ) - chan: Channel = actor._parent_chan - if not chan or not chan.connected(): + rent_chan: Channel = actor._parent_chan + if ( + rent_chan is None + or + not rent_chan.connected() + ): log.warning( - 'A global actor reported to be in debug ' - 'but no connection exists for its parent:\n' + 'A global sub-actor reported to be in debug ' + 'but it has no connection to its parent ??\n' f'{uid_in_debug}\n' 'Allowing SIGINT propagation..' ) - return do_cancel() + Lock.unshield_sigint() + # do_cancel() + case_handled = True - task: str | None = Lock.local_task_in_debug + task: str|None = Lock.local_task_in_debug if ( task - and pdb_obj + and + repl ): + # if repl: log.pdb( - f"Ignoring SIGINT while task in debug mode: `{task}`" + f'Ignoring SIGINT while local task using debug REPL\n' + f'|_{task}\n' + f' |_{repl}\n' ) + case_handled = True + else: + msg: str = ( + 'SIGINT shield handler still active BUT, \n\n' + ) + if task is None: + msg += ( + f'- No local task claims to be in debug?\n' + f' |_{task}\n\n' + ) + + if repl is None: + msg += ( + f'- No local REPL is currently active?\n' + f' |_{repl}\n\n' + ) + + log.warning( + msg + + + 'Reverting handler to `trio` default!\n' + ) + Lock.unshield_sigint() + case_handled = True + + # XXX ensure that the reverted-to-handler actually is + # able to rx what should have been **this** KBI ;) + do_cancel() + # raise KeyboardInterrupt # TODO: how to handle the case of an intermediary-child actor # that **is not** marked in debug mode? See oustanding issue: # https://github.com/goodboy/tractor/issues/320 # elif debug_mode(): - else: # XXX: shouldn't ever get here? - raise RuntimeError("WTFWTFWTF") - # raise KeyboardInterrupt("WTFWTFWTF") - # NOTE: currently (at least on ``fancycompleter`` 0.9.2) # it looks to be that the last command that was run (eg. ll) # will be repeated by default. @@ -695,31 +1145,37 @@ def shield_sigint_handler( # we want to alert the user that more input is expect since # nothing has been done dur to ignoring sigint. if ( - pdb_obj # only when this actor has a REPL engaged + repl # only when this actor has a REPL engaged ): # XXX: yah, mega hack, but how else do we catch this madness XD - if pdb_obj.shname == 'xonsh': - pdb_obj.stdout.write(pdb_obj.prompt) + if repl.shname == 'xonsh': + repl.stdout.write(repl.prompt) - pdb_obj.stdout.flush() + repl.stdout.flush() # TODO: make this work like sticky mode where if there is output # detected as written to the tty we redraw this part underneath # and erase the past draw of this same bit above? 
- # pdb_obj.sticky = True - # pdb_obj._print_if_sticky() + # repl.sticky = True + # repl._print_if_sticky() # also see these links for an approach from ``ptk``: # https://github.com/goodboy/tractor/issues/130#issuecomment-663752040 # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py + if not case_handled: + log.critical( + f'{actor.uid} UNHANDLED SIGINT !?!?\n' + # TODO: pprint for `Lock`? + ) + _pause_msg: str = 'Attaching to pdb REPL in actor' def _set_trace( actor: tractor.Actor|None = None, - pdb: MultiActorPdb|None = None, + pdb: PdbREPL|None = None, shield: bool = False, extra_frames_up_when_async: int = 1, @@ -767,14 +1223,16 @@ def _set_trace( log.pdb( f'{msg}\n' '|\n' - f'|_ {actor.uid}\n' + # TODO: make an `Actor.__repr()__` + # f'|_ {current_task()} @ {actor.name}\n' + f'|_ {current_task()}\n' ) # no f!#$&* idea, but when we're in async land # we need 2x frames up? for i in range(extra_frames_up_when_async): frame: FrameType = frame.f_back log.debug( - f'Going up frame {i} -> {frame}\n' + f'Going up frame_{i}:\n|_{frame}\n' ) # engage ze REPL @@ -787,7 +1245,7 @@ async def _pause( debug_func: Callable = _set_trace, # NOTE: must be passed in the `.pause_from_sync()` case! - pdb: MultiActorPdb|None = None, + pdb: PdbREPL|None = None, # TODO: allow caller to pause despite task cancellation, # exactly the same as wrapping with: @@ -799,6 +1257,8 @@ async def _pause( # shield: bool = False, hide_tb: bool = True, + extra_frames_up_when_async: int = 4, + task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED ) -> None: @@ -813,7 +1273,9 @@ async def _pause( __tracebackhide__: bool = hide_tb actor: Actor = current_actor() try: - task_name: str = trio.lowlevel.current_task().name + # TODO: use the `Task` instance instead for `is` checks + # below! + task: Task = trio.lowlevel.current_task() except RuntimeError as rte: if actor.is_infected_aio(): raise RuntimeError( @@ -821,48 +1283,142 @@ async def _pause( 'for infected `asyncio` mode!' ) from rte + # task_name: str = task.name + if ( not Lock.local_pdb_complete - or Lock.local_pdb_complete.is_set() + or + Lock.local_pdb_complete.is_set() ): Lock.local_pdb_complete = trio.Event() if debug_func is not None: - debug_func = partial( - debug_func, - ) + debug_func = partial(debug_func) if pdb is None: - pdb: MultiActorPdb = mk_mpdb() + pdb: PdbREPL = mk_mpdb() + + def _enter_repl_sync( + debug_func: Callable, + ) -> None: + __tracebackhide__: bool = hide_tb + try: + # TODO: do we want to support using this **just** for the + # locking / common code (prolly to help address #320)? + # + if debug_func is None: + task_status.started(Lock) + else: + # block here one (at the appropriate frame *up*) where + # ``breakpoint()`` was awaited and begin handling stdio. + log.debug('Entering sync world of the `pdb` REPL..') + try: + # log.critical( + # f'stack len: {len(pdb.stack)}\n' + # ) + debug_func( + actor, + pdb, + extra_frames_up_when_async=extra_frames_up_when_async, + shield=shield, + ) + except BaseException: + log.exception( + 'Failed to invoke internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + raise + + except bdb.BdbQuit: + Lock.release() + raise + + except BaseException: + log.exception( + 'Failed to engage debugger via `_pause()` ??\n' + ) + raise + + if is_root_process(): + + # we also wait in the root-parent for any child that + # may have the tty locked prior + # TODO: wait, what about multiple root tasks acquiring it though? 
+ if Lock.global_actor_in_debug == actor.uid: + # re-entrant root process already has it: noop. + log.warning( + f'{task.name}@{actor.uid} already has TTY lock\n' + f'ignoring..' + ) + await trio.lowlevel.checkpoint() + return + + # XXX: since we need to enter pdb synchronously below, + # we have to release the lock manually from pdb completion + # callbacks. Can't think of a nicer way then this atm. + if Lock._debug_lock.locked(): + log.warning( + 'attempting to shield-acquire active TTY lock' + f' owned by {Lock.global_actor_in_debug}' + ) + + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + with trio.CancelScope(shield=True): + await Lock._debug_lock.acquire() + else: + # may be cancelled + await Lock._debug_lock.acquire() + + Lock.global_actor_in_debug = actor.uid + Lock.local_task_in_debug = task + Lock.repl = pdb + + # enter REPL from root, no TTY locking IPC ctx necessary + _enter_repl_sync(debug_func) + return # next branch is mutex and for subactors # TODO: need a more robust check for the "root" actor - if ( + elif ( not is_root_process() and actor._parent_chan # a connected child ): - if Lock.local_task_in_debug: # Recurrence entry case: this task already has the lock and # is likely recurrently entering a breakpoint - if Lock.local_task_in_debug == task_name: - # noop on recurrent entry case but we want to trigger - # a checkpoint to allow other actors error-propagate and - # potetially avoid infinite re-entries in some subactor. + # + # NOTE: noop on recurrent entry case but we want to trigger + # a checkpoint to allow other actors error-propagate and + # potetially avoid infinite re-entries in some + # subactor that would otherwise not bubble until the + # next checkpoint was hit. + if ( + (repl_task := Lock.local_task_in_debug) + and + repl_task is task + ): + log.warning( + f'{task.name}@{actor.uid} already has TTY lock\n' + f'ignoring..' + ) await trio.lowlevel.checkpoint() return - # if **this** actor is already in debug mode block here - # waiting for the control to be released - this allows - # support for recursive entries to `tractor.breakpoint()` - log.warning(f"{actor.uid} already has a debug lock, waiting...") - + # if **this** actor is already in debug REPL we want + # to maintain actor-local-task mutex access, so block + # here waiting for the control to be released - this + # -> allows for recursive entries to `tractor.pause()` + log.warning( + f'{task.name}@{actor.uid} already has TTY lock\n' + f'waiting for release..' + ) await Lock.local_pdb_complete.wait() await trio.sleep(0.1) # mark local actor as "in debug mode" to avoid recurrent # entries/requests to the root process - Lock.local_task_in_debug = task_name + Lock.local_task_in_debug = task # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without @@ -875,91 +1431,54 @@ async def _pause( # actor._service_n.cancel_scope.shield = shield # ``` # but not entirely sure if that's a sane way to implement it? - try: - with trio.CancelScope(shield=True): - await actor._service_n.start( + + # NOTE: MUST it here bc multiple tasks are spawned by any + # one sub-actor AND there will be a race between when the + # root locking task delivers the `Started(pld=LockStatus)` + # and when the REPL is actually entered here. SO ensure + # the codec is set before either are run! 
+ # + with ( + # _codec.limit_msg_spec( + # payload_spec=__msg_spec__, + # ) as debug_codec, + trio.CancelScope(shield=shield), + ): + # async with trio.open_nursery() as tn: + # tn.cancel_scope.shield = True + try: + # cs: trio.CancelScope = await tn.start( + cs: trio.CancelScope = await actor._service_n.start( wait_for_parent_stdin_hijack, actor.uid, + (task.name, id(task)), ) + # our locker task should be the one in ctx + # with the root actor + assert Lock._debugger_request_cs is cs + + # XXX used by the SIGINT handler to check if + # THIS actor is in REPL interaction Lock.repl = pdb - except RuntimeError: - Lock.release() + except RuntimeError: + Lock.release() - if actor._cancel_called: - # service nursery won't be usable and we - # don't want to lock up the root either way since - # we're in (the midst of) cancellation. - return + if actor._cancel_called: + # service nursery won't be usable and we + # don't want to lock up the root either way since + # we're in (the midst of) cancellation. + return - raise - - elif is_root_process(): - - # we also wait in the root-parent for any child that - # may have the tty locked prior - # TODO: wait, what about multiple root tasks acquiring it though? - if Lock.global_actor_in_debug == actor.uid: - # re-entrant root process already has it: noop. - return - - # XXX: since we need to enter pdb synchronously below, - # we have to release the lock manually from pdb completion - # callbacks. Can't think of a nicer way then this atm. - if Lock._debug_lock.locked(): - log.warning( - 'Root actor attempting to shield-acquire active tty lock' - f' owned by {Lock.global_actor_in_debug}') - - # must shield here to avoid hitting a ``Cancelled`` and - # a child getting stuck bc we clobbered the tty - with trio.CancelScope(shield=True): - await Lock._debug_lock.acquire() - else: - # may be cancelled - await Lock._debug_lock.acquire() - - Lock.global_actor_in_debug = actor.uid - Lock.local_task_in_debug = task_name - Lock.repl = pdb - - try: - # TODO: do we want to support using this **just** for the - # locking / common code (prolly to help address #320)? - # - if debug_func is None: - task_status.started(Lock) - - else: - # block here one (at the appropriate frame *up*) where - # ``breakpoint()`` was awaited and begin handling stdio. - log.debug('Entering sync world of the `pdb` REPL..') - try: - # log.critical( - # f'stack len: {len(pdb.stack)}\n' - # ) - debug_func( - actor, - pdb, - extra_frames_up_when_async=2, - shield=shield, - ) - except BaseException: - log.exception( - 'Failed to invoke internal `debug_func = ' - f'{debug_func.func.__name__}`\n' - ) raise - except bdb.BdbQuit: - Lock.release() - raise + # enter REPL + + try: + _enter_repl_sync(debug_func) + finally: + Lock.unshield_sigint() - except BaseException: - log.exception( - 'Failed to engage debugger via `_pause()` ??\n' - ) - raise # XXX: apparently we can't do this without showing this frame # in the backtrace on first entry to the REPL? 
Seems like an odd @@ -1017,15 +1536,21 @@ async def pause( # __tracebackhide__: bool = True # super().__exit__(*args, **kwargs) - trio.CancelScope.__enter__.__tracebackhide__ = True - trio.CancelScope.__exit__.__tracebackhide__ = True + # trio.CancelScope.__enter__.__tracebackhide__ = True + # trio.CancelScope.__exit__.__tracebackhide__ = True # import types # with trio.CancelScope(shield=shield) as cs: # cs.__exit__ = types.MethodType(_exit, cs) # cs.__exit__.__tracebackhide__ = True - with trio.CancelScope(shield=shield) as cs: + # TODO: LOL, solved this with the `pdb.hideframe` stuff + # at top-of-mod.. so I guess we can just only use this + # block right? + with trio.CancelScope( + shield=shield, + ) as cs: + print(f'debug cs is {cs}\n') # setattr(cs.__exit__.__func__, '__tracebackhide__', True) # setattr(cs.__enter__.__func__, '__tracebackhide__', True) @@ -1135,7 +1660,7 @@ def pause_from_sync( # raises on not-found by default greenback: ModuleType = maybe_import_greenback() - mdb: MultiActorPdb = mk_mpdb() + mdb: PdbREPL = mk_mpdb() # run async task which will lock out the root proc's TTY. if not Lock.is_main_trio_thread(): @@ -1157,7 +1682,7 @@ def pause_from_sync( ) ) # TODO: maybe the `trio.current_task()` id/name if avail? - Lock.local_task_in_debug: str = str(threading.current_thread().name) + Lock.local_task_in_debug: str = str(threading.current_thread()) else: # we are presumably the `trio.run()` + main thread greenback.await_( @@ -1167,7 +1692,7 @@ def pause_from_sync( hide_tb=hide_tb, ) ) - Lock.local_task_in_debug: str = current_task().name + Lock.local_task_in_debug: str = current_task() # TODO: ensure we aggressively make the user aware about # entering the global ``breakpoint()`` built-in from sync @@ -1198,7 +1723,11 @@ async def breakpoint(**kwargs): '`tractor.breakpoint()` is deprecated!\n' 'Please use `tractor.pause()` instead!\n' ) - await pause(**kwargs) + __tracebackhide__: bool = True + await pause( + # extra_frames_up_when_async=6, + **kwargs + ) _crash_msg: str = ( @@ -1208,11 +1737,11 @@ _crash_msg: str = ( def _post_mortem( actor: tractor.Actor, - pdb: MultiActorPdb, + pdb: PdbREPL, shield: bool = False, # only for compat with `._set_trace()`.. - extra_frames_up_when_async=0, + extra_frames_up_when_async=1, ) -> None: ''' @@ -1225,7 +1754,11 @@ def _post_mortem( log.pdb( f'{_crash_msg}\n' '|\n' - f'|_ {actor.uid}\n' + f'|_ {current_task()}\n' + + # f'|_ @{actor.uid}\n' + # TODO: make an `Actor.__repr()__` + # f'|_ {current_task()} @ {actor.name}\n' ) # TODO: only replacing this to add the @@ -1278,9 +1811,12 @@ async def _maybe_enter_pm(err): @acm async def acquire_debug_lock( subactor_uid: tuple[str, str], -) -> AsyncGenerator[None, tuple]: +) -> AsyncGenerator[ + trio.CancelScope|None, + tuple, +]: ''' - Grab root's debug lock on entry, release on exit. + Request to acquire the TTY `Lock` in the root actor, release on exit. This helper is for actor's who don't actually need to acquired the debugger but want to wait until the lock is free in the @@ -1297,7 +1833,7 @@ async def acquire_debug_lock( wait_for_parent_stdin_hijack, subactor_uid, ) - yield None + yield cs cs.cancel() @@ -1328,7 +1864,6 @@ async def maybe_wait_for_debugger( # Instead try to wait for pdb to be released before # tearing down. 
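
(Aside, not from the patch itself: the root-side "shield the acquire if
contended" idiom used on `Lock._debug_lock` above can be sketched
standalone with plain `trio` primitives; the function name here is
hypothetical:)

    import trio

    async def shielded_acquire(lock: trio.StrictFIFOLock) -> None:
        # if another task (likely servicing a subactor's lock request)
        # already holds the TTY lock, shield the wait so a stray
        # cancellation can't leave a child stuck with a clobbered tty.
        if lock.locked():
            with trio.CancelScope(shield=True):
                await lock.acquire()
        else:
            # uncontended: keep normal cancellation semantics
            await lock.acquire()

Either way the matching `.release()` still has to happen from the
REPL-exit callback, per the comment in the hunk above.
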
in_debug: tuple[str, str]|None = Lock.global_actor_in_debug - debug_complete: trio.Event|None = Lock.no_remote_has_tty if in_debug == current_actor().uid: log.debug( @@ -1340,7 +1875,7 @@ async def maybe_wait_for_debugger( elif in_debug: msg += ( - f'Debug `Lock` in use by subactor: {in_debug}\n' + f'Debug `Lock` in use by subactor\n|\n|_{in_debug}\n' ) # TODO: could this make things more deterministic? # wait to see if a sub-actor task will be @@ -1358,17 +1893,17 @@ async def maybe_wait_for_debugger( for istep in range(poll_steps): if ( - debug_complete - and not debug_complete.is_set() + Lock.no_remote_has_tty is not None + and not Lock.no_remote_has_tty.is_set() and in_debug is not None ): log.pdb( msg + - 'Root is waiting on tty lock to release..\n' + '\nRoot is waiting on tty lock to release..\n' ) with trio.CancelScope(shield=True): - await debug_complete.wait() + await Lock.no_remote_has_tty.wait() log.pdb( f'Child subactor released debug lock\n' f'|_{in_debug}\n' @@ -1378,8 +1913,8 @@ async def maybe_wait_for_debugger( if ( in_debug is None and ( - debug_complete is None - or debug_complete.is_set() + Lock.no_remote_has_tty is None + or Lock.no_remote_has_tty.is_set() ) ): log.pdb( -- 2.34.1 From d0e76100735c3f817fadf85fa32e462defa76a19 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 17 Apr 2024 23:19:31 -0400 Subject: [PATCH 248/378] The src error to `_raise_from_no_key_in_msg()` is always an attr-error now! --- tractor/_exceptions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 65637fb5..4ace626f 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -935,7 +935,7 @@ def is_multi_cancelled(exc: BaseException) -> bool: def _raise_from_no_key_in_msg( ctx: Context, msg: MsgType, - src_err: KeyError, + src_err: AttributeError, log: StackLevelAdapter, # caller specific `log` obj expect_msg: str = Yield, @@ -994,7 +994,7 @@ def _raise_from_no_key_in_msg( ctx.chan, hide_tb=hide_tb, - ) from None + ) from src_err # `MsgStream` termination msg. # TODO: does it make more sense to pack -- 2.34.1 From 77a15ebf19927c5e8f15a7d93f003a6162f59797 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 12:47:28 -0400 Subject: [PATCH 249/378] Use `DebugStatus` around subactor lock requests Breaks out all the (sub)actor local conc primitives from `Lock` (which is now only used in and by the root actor) such that there's an explicit distinction between a task that's "consuming" the `Lock` (remotely) vs. the root-side service tasks which do the actual acquire on behalf of the requesters. `DebugStatus` changeover deats: ------ - ------ - move all the actor-local vars over `DebugStatus` including: - move `_trio_handler` and `_orig_sigint_handler` - `local_task_in_debug` now `repl_task` - `_debugger_request_cs` now `req_cs` - `local_pdb_complete` now `repl_release` - drop all ^ fields from `Lock.repr()` obvi.. - move over the `.[un]shield_sigint()` and `.is_main_trio_thread()` methods. - add some new attrs/meths: - `DebugStatus.repl` for the currently running `Pdb` in-actor singleton. - `.repr()` for pprint of state (like `Lock`). - Note: that even when a root-actor task is in REPL, the `DebugStatus` is still used for certain actor-local state mgmt, such as SIGINT handler shielding. - obvi change all lock-requester code bits to now use a `DebugStatus` in their local actor-state instead of `Lock`, i.e. change usage from `Lock` in `._runtime` and `._root`. 
- use new `Lock.get_locking_task_cs()` API in when checking for sub-in-debug from `._runtime.Actor._stream_handler()`. Unrelated to topic-at-hand tweaks: ------ - ------ - drop the commented bits about hiding `@[a]cm` stack frames from `_debug.pause()` and simplify to only one block with the `shield` passthrough since we already solved the issue with cancel-scopes using `@pdbp.hideframe` B) - this includes all the extra logging about the extra frame for the user (good thing i put in that wasted effort back then eh..) - put the `try/except BaseException` with `log.exception()` around the whole of `._pause()` to ensure we don't miss in-func errors which can cause hangs.. - allow passing in `portal: Portal` to `Actor.start_remote_task()` such that `Portal` task spawning methods are always denoted correctly in terms of `Context.side`. - lotsa logging tweaks, decreasing a bit of noise from `.runtime()`s. --- tractor/_root.py | 2 +- tractor/_runtime.py | 101 ++++---- tractor/devx/_debug.py | 573 +++++++++++++++++++---------------------- 3 files changed, 322 insertions(+), 354 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 1964a067..afe91e7f 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -135,7 +135,7 @@ async def open_root_actor( # attempt to retreive ``trio``'s sigint handler and stash it # on our debugger lock state. - _debug.Lock._trio_handler = signal.getsignal(signal.SIGINT) + _debug.DebugStatus._trio_handler = signal.getsignal(signal.SIGINT) # mark top most level process as root actor _state._runtime_vars['_is_root'] = True diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 4d90c591..72866d43 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -267,10 +267,13 @@ class Actor: self._listeners: list[trio.abc.Listener] = [] self._parent_chan: Channel|None = None self._forkserver_info: tuple|None = None + + # track each child/sub-actor in it's locally + # supervising nursery self._actoruid2nursery: dict[ - tuple[str, str], + tuple[str, str], # sub-`Actor.uid` ActorNursery|None, - ] = {} # type: ignore # noqa + ] = {} # when provided, init the registry addresses property from # input via the validator. @@ -659,12 +662,18 @@ class Actor: # TODO: NEEEDS TO BE TESTED! # actually, no idea if this ever even enters.. XD + # + # XXX => YES IT DOES, when i was testing ctl-c + # from broken debug TTY locking due to + # msg-spec races on application using RunVar... pdb_user_uid: tuple = pdb_lock.global_actor_in_debug if ( pdb_user_uid and local_nursery ): - entry: tuple|None = local_nursery._children.get(pdb_user_uid) + entry: tuple|None = local_nursery._children.get( + tuple(pdb_user_uid) + ) if entry: proc: trio.Process _, proc, _ = entry @@ -674,10 +683,10 @@ class Actor: and poll() is None ): log.cancel( - 'Root actor reports no-more-peers, BUT ' + 'Root actor reports no-more-peers, BUT\n' 'a DISCONNECTED child still has the debug ' - 'lock!\n' - f'root uid: {self.uid}\n' + 'lock!\n\n' + # f'root uid: {self.uid}\n' f'last disconnected child uid: {uid}\n' f'locking child uid: {pdb_user_uid}\n' ) @@ -703,9 +712,8 @@ class Actor: # if a now stale local task has the TTY lock still # we cancel it to allow servicing other requests for # the lock. 
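
(Aside, not from the patch: the `_trio_handler` stash shown in the
`_root.py` hunk above is the usual save-then-restore dance around
`signal.signal()`; a minimal sketch with hypothetical names:)

    import signal

    # stash whatever handler `trio` (or the user) installed for SIGINT
    _orig_sigint = signal.getsignal(signal.SIGINT)

    def shield_sigint(debug_handler) -> None:
        # swap in the debugger's handler while a REPL is active
        signal.signal(signal.SIGINT, debug_handler)

    def unshield_sigint() -> None:
        # revert to the previously stashed handler
        signal.signal(signal.SIGINT, _orig_sigint)
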
- db_cs: trio.CancelScope|None = pdb_lock._root_local_task_cs_in_debug if ( - db_cs + (db_cs := pdb_lock.get_locking_task_cs()) and not db_cs.cancel_called and uid == pdb_user_uid ): @@ -742,7 +750,7 @@ class Actor: except KeyError: log.warning( 'Ignoring invalid IPC ctx msg!\n\n' - f'<= sender: {uid}\n' + f'<= sender: {uid}\n\n' # XXX don't need right since it's always in msg? # f'=> cid: {cid}\n\n' @@ -796,7 +804,7 @@ class Actor: cid, # side, )] - log.runtime( + log.debug( f'Retreived cached IPC ctx for\n' f'peer: {chan.uid}\n' f'cid:{cid}\n' @@ -835,10 +843,14 @@ class Actor: nsf: NamespacePath, kwargs: dict, + # determines `Context.side: str` + portal: Portal|None = None, + # IPC channel config msg_buffer_size: int|None = None, allow_overruns: bool = False, load_nsf: bool = False, + ack_timeout: float = 3, ) -> Context: ''' @@ -863,10 +875,12 @@ class Actor: msg_buffer_size=msg_buffer_size, allow_overruns=allow_overruns, ) + ctx._portal = portal if ( 'self' in nsf - or not load_nsf + or + not load_nsf ): ns, _, func = nsf.partition(':') else: @@ -874,42 +888,29 @@ class Actor: # -[ ] but, how to do `self:`?? ns, func = nsf.to_tuple() + msg = msgtypes.Start( + ns=ns, + func=func, + kwargs=kwargs, + uid=self.uid, + cid=cid, + ) log.runtime( - 'Sending cmd to\n' - f'peer: {chan.uid} => \n' - '\n' - f'=> {ns}.{func}({kwargs})\n' + 'Sending RPC start msg\n\n' + f'=> peer: {chan.uid}\n' + f' |_ {ns}.{func}({kwargs})\n' ) - await chan.send( - msgtypes.Start( - ns=ns, - func=func, - kwargs=kwargs, - uid=self.uid, - cid=cid, - ) - ) - # {'cmd': ( - # ns, - # func, - # kwargs, - # self.uid, - # cid, - # )} - # ) + await chan.send(msg) - # Wait on first response msg and validate; this should be - # immediate. - # first_msg: dict = await ctx._recv_chan.receive() - # functype: str = first_msg.get('functype') - - first_msg: msgtypes.StartAck = await ctx._recv_chan.receive() + # NOTE wait on first `StartAck` response msg and validate; + # this should be immediate and does not (yet) wait for the + # remote child task to sync via `Context.started()`. + with trio.fail_after(ack_timeout): + first_msg: msgtypes.StartAck = await ctx._recv_chan.receive() try: functype: str = first_msg.functype except AttributeError: raise unpack_error(first_msg, chan) - # if 'error' in first_msg: - # raise unpack_error(first_msg, chan) if functype not in ( 'asyncfunc', @@ -917,7 +918,7 @@ class Actor: 'context', ): raise ValueError( - f'{first_msg} is an invalid response packet?' + f'Invalid `StartAck.functype: str = {first_msg!r}` ??' ) ctx._remote_func_type = functype @@ -1162,7 +1163,7 @@ class Actor: # kill any debugger request task to avoid deadlock # with the root actor in this tree - dbcs = _debug.Lock._debugger_request_cs + dbcs = _debug.DebugStatus.req_cs if dbcs is not None: msg += ( '>> Cancelling active debugger request..\n' @@ -1237,9 +1238,9 @@ class Actor: except KeyError: # NOTE: during msging race conditions this will often # emit, some examples: - # - callee returns a result before cancel-msg/ctxc-raised - # - callee self raises ctxc before caller send request, - # - callee errors prior to cancel req. + # - child returns a result before cancel-msg/ctxc-raised + # - child self raises ctxc before parent send request, + # - child errors prior to cancel req. 
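
(Aside, not from the patch: the new `ack_timeout` handling in the
`start_remote_task()` hunk above is just a bounded first-receive;
a rough sketch assuming `recv_chan` is any `trio` receive channel:)

    import trio

    async def recv_start_ack(
        recv_chan: trio.MemoryReceiveChannel,
        ack_timeout: float = 3,
    ):
        # raises `trio.TooSlowError` if the remote child never acks
        # the task-start request within the timeout.
        with trio.fail_after(ack_timeout):
            return await recv_chan.receive()
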
log.cancel( 'Cancel request invalid, RPC task already completed?\n\n' f'<= canceller: {requesting_uid}\n\n' @@ -1302,15 +1303,15 @@ class Actor: flow_info: str = ( f'<= canceller: {requesting_uid}\n' f'=> ipc-parent: {parent_chan}\n' - f' |_{ctx}\n' + f'|_{ctx}\n' ) log.runtime( - 'Waiting on RPC task to cancel\n' + 'Waiting on RPC task to cancel\n\n' f'{flow_info}' ) await is_complete.wait() log.runtime( - f'Sucessfully cancelled RPC task\n' + f'Sucessfully cancelled RPC task\n\n' f'{flow_info}' ) return True @@ -1536,8 +1537,8 @@ async def async_main( ''' # attempt to retreive ``trio``'s sigint handler and stash it - # on our debugger lock state. - _debug.Lock._trio_handler = signal.getsignal(signal.SIGINT) + # on our debugger state. + _debug.DebugStatus._trio_handler = signal.getsignal(signal.SIGINT) is_registered: bool = False try: diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 51e74379..e4ab7d83 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -160,12 +160,6 @@ class Lock: # placeholder for function to set a ``trio.Event`` on debugger exit # pdb_release_hook: Callable | None = None - _trio_handler: ( - Callable[[int, FrameType|None], Any] - |int - | None - ) = None - remote_task_in_debug: str|None = None @staticmethod @@ -188,12 +182,6 @@ class Lock: Lock._locking_task_cs = cs - # SUBACTOR ONLY - # ------ - ------- - local_task_in_debug: Task|None = None - _debugger_request_cs: trio.CancelScope|None = None - local_pdb_complete: trio.Event|None = None - # ROOT ONLY # ------ - ------- # the root-actor-ONLY singletons for, @@ -214,16 +202,12 @@ class Lock: _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock() _blocked: set[tuple[str, str]] = set() # `Actor.uid` block list - # TODO: should go on `PbpREPL`? - _orig_sigint_handler: Callable | None = None - @classmethod def repr(cls) -> str: # both root and subs fields: str = ( f'repl: {cls.repl}\n' - f'local_repl_task: {cls.local_task_in_debug}\n' ) if is_root_process(): @@ -238,12 +222,6 @@ class Lock: f'_debug_lock: {cls._debug_lock}\n' f'lock_stats: {lock_stats}\n' ) - else: - fields += ( - f'local_task_in_debug: {cls.local_task_in_debug}\n' - f'local_pdb_complete: {cls.local_pdb_complete}\n' - f'_debugger_request_cs: {cls._debugger_request_cs}\n' - ) body: str = textwrap.indent( fields, @@ -255,7 +233,101 @@ class Lock: ')>' ) - # TODO: move to PdbREPL! + @classmethod + def release(cls): + try: + if not DebugStatus.is_main_trio_thread(): + trio.from_thread.run_sync( + cls._debug_lock.release + ) + else: + cls._debug_lock.release() + + except RuntimeError as rte: + # uhhh makes no sense but been seeing the non-owner + # release error even though this is definitely the task + # that locked? + owner = cls._debug_lock.statistics().owner + # if ( + # owner + # and + # cls.remote_task_in_debug is None + # ): + # raise RuntimeError( + # 'Stale `Lock` detected, no remote task active!?\n' + # f'|_{owner}\n' + # # f'{Lock}' + # ) from rte + + if owner: + raise rte + + # OW suppress, can't member why tho .. XD + # something somethin corrupts a cancel-scope + # somewhere.. + + try: + # sometimes the ``trio`` might already be terminated in + # which case this call will raise. + if DebugStatus.repl_release is not None: + DebugStatus.repl_release.set() + + finally: + cls.repl = None + cls.global_actor_in_debug = None + + # restore original sigint handler + DebugStatus.unshield_sigint() + # actor-local state, irrelevant for non-root. 
+ DebugStatus.repl_task = None + + +# TODO: actually use this instead throughout for subs! +class DebugStatus: + ''' + Singleton-state for debugging machinery in a subactor. + + Composes conc primitives for syncing with a root actor to + acquire the tree-global (TTY) `Lock` such that only ever one + actor's task can have the REPL active at a given time. + + Methods to shield the process' `SIGINT` handler are used + whenever a local task is an active REPL. + + ''' + repl: PdbREPL|None = None + repl_task: Task|None = None + req_cs: trio.CancelScope|None = None + repl_release: trio.Event|None = None + + lock_status: LockStatus|None = None + + _orig_sigint_handler: Callable | None = None + _trio_handler: ( + Callable[[int, FrameType|None], Any] + |int + | None + ) = None + + + @classmethod + def repr(cls) -> str: + fields: str = ( + f'repl: {cls.repl}\n' + f'repl_task: {cls.repl_task}\n' + f'repl_release: {cls.repl_release}\n' + f'req_cs: {cls.req_cs}\n' + ) + body: str = textwrap.indent( + fields, + prefix=' |_', + ) + return ( + f'<{cls.__name__}(\n' + f'{body}' + ')>' + ) + @classmethod def shield_sigint(cls): ''' @@ -339,77 +411,6 @@ class Lock: # is not threading.main_thread() # ) - @classmethod - def release(cls): - try: - if not cls.is_main_trio_thread(): - trio.from_thread.run_sync( - cls._debug_lock.release - ) - else: - cls._debug_lock.release() - - except RuntimeError as rte: - # uhhh makes no sense but been seeing the non-owner - # release error even though this is definitely the task - # that locked? - owner = cls._debug_lock.statistics().owner - # if ( - # owner - # and - # cls.remote_task_in_debug is None - # ): - # raise RuntimeError( - # 'Stale `Lock` detected, no remote task active!?\n' - # f'|_{owner}\n' - # # f'{Lock}' - # ) from rte - - if owner: - raise rte - - # OW suppress, can't member why tho .. XD - # something somethin corrupts a cancel-scope - # somewhere.. - - try: - # sometimes the ``trio`` might already be terminated in - # which case this call will raise. - if cls.local_pdb_complete is not None: - cls.local_pdb_complete.set() - - finally: - # restore original sigint handler - cls.unshield_sigint() - cls.repl = None - - # actor-local state, irrelevant for non-root. - cls.global_actor_in_debug = None - cls.local_task_in_debug = None - - -# TODO: actually use this instead throughout for subs! -class DebugStatus: - ''' - Singleton-state for debugging machinery in a subactor. - - Composes conc primitives for syncing with a root actor to - acquire the tree-global (TTY) `Lock` such that only ever one - actor's task can have the REPL active at a given time. - - ''' - repl: PdbREPL|None = None - lock_status: LockStatus|None = None - - repl_task: Task|None = None - # local_task_in_debug: Task|None = None - - req_cs: trio.CancelScope|None = None - # _debugger_request_cs: trio.CancelScope|None = None - - repl_release: trio.Event|None = None - # local_pdb_complete: trio.Event|None = None - class TractorConfig(pdbp.DefaultConfig): ''' @@ -445,6 +446,7 @@ class PdbREPL(pdbp.Pdb): status = DebugStatus + # def preloop(self): # print('IN PRELOOP') # super().preloop() @@ -660,16 +662,19 @@ async def lock_tty_for_child( highly reliable at releasing the mutex complete! 
''' - req_task_uid: tuple = tuple(subactor_task_uid) if req_task_uid in Lock._blocked: raise RuntimeError( + f'Double lock request!?\n' f'The same remote task already has an active request for TTY lock ??\n\n' f'task uid: {req_task_uid}\n' - f'subactor uid: {subactor_uid}\n' - ) + f'subactor uid: {subactor_uid}\n\n' - Lock._blocked.add(req_task_uid) + 'This might be mean that the requesting task ' + 'in `wait_for_parent_stdin_hijack()` may have crashed?\n' + 'Consider that an internal bug exists given the TTY ' + '`Lock`ing IPC dialog..\n' + ) root_task_name: str = current_task().name if tuple(subactor_uid) in Lock._blocked: @@ -695,8 +700,9 @@ async def lock_tty_for_child( f'subactor_uid: {subactor_uid}\n' f'remote task: {subactor_task_uid}\n' ) - Lock.shield_sigint() + DebugStatus.shield_sigint() try: + Lock._blocked.add(req_task_uid) with ( # NOTE: though a cs is created for every subactor lock # REQUEST in this ctx-child task, only the root-task @@ -708,6 +714,9 @@ async def lock_tty_for_child( # used to do so! trio.CancelScope(shield=True) as debug_lock_cs, + # TODO: make this ONLY limit the pld_spec such that we + # can on-error-decode-`.pld: Raw` fields in + # `Context._deliver_msg()`? _codec.limit_msg_spec( payload_spec=__msg_spec__, ) as codec, @@ -763,8 +772,9 @@ async def lock_tty_for_child( finally: debug_lock_cs.cancel() + Lock._blocked.remove(req_task_uid) Lock.set_locking_task_cs(None) - Lock.unshield_sigint() + DebugStatus.unshield_sigint() @cm @@ -817,7 +827,7 @@ async def wait_for_parent_stdin_hijack( trio.CancelScope(shield=True) as cs, apply_debug_codec(), ): - Lock._debugger_request_cs = cs + DebugStatus.req_cs = cs try: # TODO: merge into sync async with ? async with get_root() as portal: @@ -829,7 +839,7 @@ async def wait_for_parent_stdin_hijack( ) as (ctx, resp): log.pdb( - 'Subactor locked TTY per msg\n' + 'Subactor locked TTY with msg\n\n' f'{resp}\n' ) assert resp.subactor_uid == actor_uid @@ -837,12 +847,12 @@ async def wait_for_parent_stdin_hijack( async with ctx.open_stream() as stream: try: # to unblock local caller - assert Lock.local_pdb_complete + assert DebugStatus.repl_release task_status.started(cs) # wait for local task to exit and # release the REPL - await Lock.local_pdb_complete.wait() + await DebugStatus.repl_release.wait() finally: await stream.send( @@ -867,12 +877,12 @@ async def wait_for_parent_stdin_hijack( raise finally: - Lock.local_task_in_debug = None + DebugStatus.repl_task = None log.debug('Exiting debugger TTY lock request func from child') log.cancel('Reverting SIGINT handler!') - Lock.unshield_sigint() + DebugStatus.unshield_sigint() @@ -901,7 +911,7 @@ def mk_mpdb() -> PdbREPL: # in which case schedule the SIGINT shielding override # to in the main thread. # https://docs.python.org/3/library/signal.html#signals-and-threads - Lock.shield_sigint() + DebugStatus.shield_sigint() # XXX: These are the important flags mentioned in # https://github.com/python-trio/trio/issues/1155 @@ -1036,7 +1046,8 @@ def shield_sigint_handler( ) log.warning(message) - Lock.unshield_sigint() + # Lock.unshield_sigint() + DebugStatus.unshield_sigint() case_handled = True else: @@ -1064,7 +1075,7 @@ def shield_sigint_handler( if maybe_stale_lock_cs: lock_cs.cancel() - Lock.unshield_sigint() + DebugStatus.unshield_sigint() case_handled = True # child actor that has locked the debugger @@ -1086,11 +1097,11 @@ def shield_sigint_handler( f'{uid_in_debug}\n' 'Allowing SIGINT propagation..' 
) - Lock.unshield_sigint() + DebugStatus.unshield_sigint() # do_cancel() case_handled = True - task: str|None = Lock.local_task_in_debug + task: str|None = DebugStatus.repl_task if ( task and @@ -1124,7 +1135,7 @@ def shield_sigint_handler( + 'Reverting handler to `trio` default!\n' ) - Lock.unshield_sigint() + DebugStatus.unshield_sigint() case_handled = True # XXX ensure that the reverted-to-handler actually is @@ -1200,32 +1211,15 @@ def _set_trace( pdb and actor is not None ) - # or shield ): - msg: str = _pause_msg - if shield: - # log.warning( - msg = ( - '\n\n' - ' ------ - ------\n' - 'Debugger invoked with `shield=True` so an extra\n' - '`trio.CancelScope.__exit__()` frame is shown..\n' - '\n' - 'Try going up one frame to see your pause point!\n' - '\n' - ' SORRY we need to fix this!\n' - ' ------ - ------\n\n' - ) + msg - - # pdbp.set_trace() # TODO: maybe print the actor supervion tree up to the # root here? Bo + log.pdb( - f'{msg}\n' + f'{_pause_msg}\n' '|\n' # TODO: make an `Actor.__repr()__` - # f'|_ {current_task()} @ {actor.name}\n' - f'|_ {current_task()}\n' + f'|_ {current_task()} @ {actor.uid}\n' ) # no f!#$&* idea, but when we're in async land # we need 2x frames up? @@ -1286,11 +1280,11 @@ async def _pause( # task_name: str = task.name if ( - not Lock.local_pdb_complete + not DebugStatus.repl_release or - Lock.local_pdb_complete.is_set() + DebugStatus.repl_release.is_set() ): - Lock.local_pdb_complete = trio.Event() + DebugStatus.repl_release = trio.Event() if debug_func is not None: debug_func = partial(debug_func) @@ -1333,71 +1327,14 @@ async def _pause( Lock.release() raise - except BaseException: - log.exception( - 'Failed to engage debugger via `_pause()` ??\n' - ) - raise + try: + if is_root_process(): - if is_root_process(): - - # we also wait in the root-parent for any child that - # may have the tty locked prior - # TODO: wait, what about multiple root tasks acquiring it though? - if Lock.global_actor_in_debug == actor.uid: - # re-entrant root process already has it: noop. - log.warning( - f'{task.name}@{actor.uid} already has TTY lock\n' - f'ignoring..' - ) - await trio.lowlevel.checkpoint() - return - - # XXX: since we need to enter pdb synchronously below, - # we have to release the lock manually from pdb completion - # callbacks. Can't think of a nicer way then this atm. - if Lock._debug_lock.locked(): - log.warning( - 'attempting to shield-acquire active TTY lock' - f' owned by {Lock.global_actor_in_debug}' - ) - - # must shield here to avoid hitting a ``Cancelled`` and - # a child getting stuck bc we clobbered the tty - with trio.CancelScope(shield=True): - await Lock._debug_lock.acquire() - else: - # may be cancelled - await Lock._debug_lock.acquire() - - Lock.global_actor_in_debug = actor.uid - Lock.local_task_in_debug = task - Lock.repl = pdb - - # enter REPL from root, no TTY locking IPC ctx necessary - _enter_repl_sync(debug_func) - return # next branch is mutex and for subactors - - # TODO: need a more robust check for the "root" actor - elif ( - not is_root_process() - and actor._parent_chan # a connected child - ): - if Lock.local_task_in_debug: - - # Recurrence entry case: this task already has the lock and - # is likely recurrently entering a breakpoint - # - # NOTE: noop on recurrent entry case but we want to trigger - # a checkpoint to allow other actors error-propagate and - # potetially avoid infinite re-entries in some - # subactor that would otherwise not bubble until the - # next checkpoint was hit. 
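
(Aside, not from the patch: the recurrent-entry noop described in the
NOTE above reduces to an identity check plus a `trio` checkpoint;
a minimal sketch with hypothetical argument names:)

    import trio

    async def already_in_repl(repl_task, current_task) -> bool:
        # same task re-entering its own active REPL: do nothing but
        # still checkpoint so peer tasks/actors can error-propagate
        # instead of looping on infinite re-entries.
        if repl_task is current_task:
            await trio.lowlevel.checkpoint()
            return True
        return False
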
- if ( - (repl_task := Lock.local_task_in_debug) - and - repl_task is task - ): + # we also wait in the root-parent for any child that + # may have the tty locked prior + # TODO: wait, what about multiple root tasks acquiring it though? + if Lock.global_actor_in_debug == actor.uid: + # re-entrant root process already has it: noop. log.warning( f'{task.name}@{actor.uid} already has TTY lock\n' f'ignoring..' @@ -1405,79 +1342,137 @@ async def _pause( await trio.lowlevel.checkpoint() return - # if **this** actor is already in debug REPL we want - # to maintain actor-local-task mutex access, so block - # here waiting for the control to be released - this - # -> allows for recursive entries to `tractor.pause()` - log.warning( - f'{task.name}@{actor.uid} already has TTY lock\n' - f'waiting for release..' - ) - await Lock.local_pdb_complete.wait() - await trio.sleep(0.1) - - # mark local actor as "in debug mode" to avoid recurrent - # entries/requests to the root process - Lock.local_task_in_debug = task - - # this **must** be awaited by the caller and is done using the - # root nursery so that the debugger can continue to run without - # being restricted by the scope of a new task nursery. - - # TODO: if we want to debug a trio.Cancelled triggered exception - # we have to figure out how to avoid having the service nursery - # cancel on this task start? I *think* this works below: - # ```python - # actor._service_n.cancel_scope.shield = shield - # ``` - # but not entirely sure if that's a sane way to implement it? - - # NOTE: MUST it here bc multiple tasks are spawned by any - # one sub-actor AND there will be a race between when the - # root locking task delivers the `Started(pld=LockStatus)` - # and when the REPL is actually entered here. SO ensure - # the codec is set before either are run! - # - with ( - # _codec.limit_msg_spec( - # payload_spec=__msg_spec__, - # ) as debug_codec, - trio.CancelScope(shield=shield), - ): - # async with trio.open_nursery() as tn: - # tn.cancel_scope.shield = True - try: - # cs: trio.CancelScope = await tn.start( - cs: trio.CancelScope = await actor._service_n.start( - wait_for_parent_stdin_hijack, - actor.uid, - (task.name, id(task)), + # XXX: since we need to enter pdb synchronously below, + # we have to release the lock manually from pdb completion + # callbacks. Can't think of a nicer way then this atm. + if Lock._debug_lock.locked(): + log.warning( + 'attempting to shield-acquire active TTY lock' + f' owned by {Lock.global_actor_in_debug}' ) - # our locker task should be the one in ctx - # with the root actor - assert Lock._debugger_request_cs is cs - # XXX used by the SIGINT handler to check if - # THIS actor is in REPL interaction - Lock.repl = pdb + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + with trio.CancelScope(shield=True): + await Lock._debug_lock.acquire() + else: + # may be cancelled + await Lock._debug_lock.acquire() - except RuntimeError: - Lock.release() + Lock.global_actor_in_debug = actor.uid + DebugStatus.repl_task = task + DebugStatus.repl = Lock.repl = pdb - if actor._cancel_called: - # service nursery won't be usable and we - # don't want to lock up the root either way since - # we're in (the midst of) cancellation. 
+ # enter REPL from root, no TTY locking IPC ctx necessary + _enter_repl_sync(debug_func) + return # next branch is mutex and for subactors + + # TODO: need a more robust check for the "root" actor + elif ( + not is_root_process() + and actor._parent_chan # a connected child + ): + if DebugStatus.repl_task: + + # Recurrence entry case: this task already has the lock and + # is likely recurrently entering a breakpoint + # + # NOTE: noop on recurrent entry case but we want to trigger + # a checkpoint to allow other actors error-propagate and + # potetially avoid infinite re-entries in some + # subactor that would otherwise not bubble until the + # next checkpoint was hit. + if ( + (repl_task := DebugStatus.repl_task) + and + repl_task is task + ): + log.warning( + f'{task.name}@{actor.uid} already has TTY lock\n' + f'ignoring..' + ) + await trio.lowlevel.checkpoint() return - raise + # if **this** actor is already in debug REPL we want + # to maintain actor-local-task mutex access, so block + # here waiting for the control to be released - this + # -> allows for recursive entries to `tractor.pause()` + log.warning( + f'{task.name}@{actor.uid} already has TTY lock\n' + f'waiting for release..' + ) + await DebugStatus.repl_release.wait() + await trio.sleep(0.1) - # enter REPL + # mark local actor as "in debug mode" to avoid recurrent + # entries/requests to the root process + DebugStatus.repl_task = task - try: - _enter_repl_sync(debug_func) - finally: - Lock.unshield_sigint() + # this **must** be awaited by the caller and is done using the + # root nursery so that the debugger can continue to run without + # being restricted by the scope of a new task nursery. + + # TODO: if we want to debug a trio.Cancelled triggered exception + # we have to figure out how to avoid having the service nursery + # cancel on this task start? I *think* this works below: + # ```python + # actor._service_n.cancel_scope.shield = shield + # ``` + # but not entirely sure if that's a sane way to implement it? + + # NOTE: MUST it here bc multiple tasks are spawned by any + # one sub-actor AND there will be a race between when the + # root locking task delivers the `Started(pld=LockStatus)` + # and when the REPL is actually entered here. SO ensure + # the codec is set before either are run! + # + with ( + # _codec.limit_msg_spec( + # payload_spec=__msg_spec__, + # ) as debug_codec, + trio.CancelScope(shield=shield), + ): + # async with trio.open_nursery() as tn: + # tn.cancel_scope.shield = True + try: + # cs: trio.CancelScope = await tn.start( + cs: trio.CancelScope = await actor._service_n.start( + wait_for_parent_stdin_hijack, + actor.uid, + (task.name, id(task)), + ) + # our locker task should be the one in ctx + # with the root actor + assert DebugStatus.req_cs is cs + + # XXX used by the SIGINT handler to check if + # THIS actor is in REPL interaction + Lock.repl = pdb + + except RuntimeError: + Lock.release() + + if actor._cancel_called: + # service nursery won't be usable and we + # don't want to lock up the root either way since + # we're in (the midst of) cancellation. 
+ return + + raise + + # enter REPL + + try: + _enter_repl_sync(debug_func) + finally: + DebugStatus.unshield_sigint() + + except BaseException: + log.exception( + 'Failed to engage debugger via `_pause()` ??\n' + ) + raise # XXX: apparently we can't do this without showing this frame @@ -1527,45 +1522,16 @@ async def pause( ''' __tracebackhide__: bool = True - if shield: - # NOTE XXX: even hard coding this inside the `class CancelScope:` - # doesn't seem to work for me!? - # ^ XXX ^ + with trio.CancelScope( + shield=shield, + ) as cs: - # def _exit(self, *args, **kwargs): - # __tracebackhide__: bool = True - # super().__exit__(*args, **kwargs) - - # trio.CancelScope.__enter__.__tracebackhide__ = True - # trio.CancelScope.__exit__.__tracebackhide__ = True - - # import types - # with trio.CancelScope(shield=shield) as cs: - # cs.__exit__ = types.MethodType(_exit, cs) - # cs.__exit__.__tracebackhide__ = True - - # TODO: LOL, solved this with the `pdb.hideframe` stuff - # at top-of-mod.. so I guess we can just only use this - # block right? - with trio.CancelScope( - shield=shield, - ) as cs: - print(f'debug cs is {cs}\n') - # setattr(cs.__exit__.__func__, '__tracebackhide__', True) - # setattr(cs.__enter__.__func__, '__tracebackhide__', True) - - # NOTE: so the caller can always cancel even if shielded - task_status.started(cs) - return await _pause( - debug_func=debug_func, - shield=True, - task_status=task_status, - **_pause_kwargs - ) - else: + # NOTE: so the caller can always manually cancel even + # if shielded! + task_status.started(cs) return await _pause( debug_func=debug_func, - shield=False, + shield=shield, task_status=task_status, **_pause_kwargs ) @@ -1682,7 +1648,7 @@ def pause_from_sync( ) ) # TODO: maybe the `trio.current_task()` id/name if avail? - Lock.local_task_in_debug: str = str(threading.current_thread()) + DebugStatus.repl_task: str = str(threading.current_thread()) else: # we are presumably the `trio.run()` + main thread greenback.await_( @@ -1692,7 +1658,7 @@ def pause_from_sync( hide_tb=hide_tb, ) ) - Lock.local_task_in_debug: str = current_task() + DebugStatus.repl_task: str = current_task() # TODO: ensure we aggressively make the user aware about # entering the global ``breakpoint()`` built-in from sync @@ -1754,7 +1720,8 @@ def _post_mortem( log.pdb( f'{_crash_msg}\n' '|\n' - f'|_ {current_task()}\n' + # f'|_ {current_task()}\n' + f'|_ {current_task()} @ {actor.uid}\n' # f'|_ @{actor.uid}\n' # TODO: make an `Actor.__repr()__` -- 2.34.1 From 7372404d76fafb16fcab4ecfb60718caa187533c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 15:10:23 -0400 Subject: [PATCH 250/378] `NamespacePath._mk_fqnp()` handle `__mod__` for methods Need to use `__self__.__mod__` in the method case i guess.. --- tractor/msg/ptr.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tractor/msg/ptr.py b/tractor/msg/ptr.py index 4d089c3e..abe5406e 100644 --- a/tractor/msg/ptr.py +++ b/tractor/msg/ptr.py @@ -76,9 +76,11 @@ class NamespacePath(str): return self._ref @staticmethod - def _mk_fqnp(ref: type | object) -> tuple[str, str]: + def _mk_fqnp( + ref: type|object, + ) -> tuple[str, str]: ''' - Generate a minial ``str`` pair which describes a python + Generate a minial `str` pair which describes a python object's namespace path and object/type name. 
In more precise terms something like: @@ -87,10 +89,9 @@ class NamespacePath(str): of THIS type XD ''' - if ( - isfunction(ref) - ): + if isfunction(ref): name: str = getattr(ref, '__name__') + mod_name: str = ref.__module__ elif ismethod(ref): # build out the path manually i guess..? @@ -99,15 +100,19 @@ class NamespacePath(str): type(ref.__self__).__name__, ref.__func__.__name__, ]) + mod_name: str = ref.__self__.__module__ else: # object or other? # isinstance(ref, object) # and not isfunction(ref) name: str = type(ref).__name__ + mod_name: str = ref.__module__ + # TODO: return static value direactly? + # # fully qualified namespace path, tuple. fqnp: tuple[str, str] = ( - ref.__module__, + mod_name, name, ) return fqnp @@ -115,7 +120,7 @@ class NamespacePath(str): @classmethod def from_ref( cls, - ref: type | object, + ref: type|object, ) -> NamespacePath: -- 2.34.1 From 5439060cd3841900c5ac5fa25769150025d9da52 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 15:12:32 -0400 Subject: [PATCH 251/378] Start a `devx._code` mod Starting with a little sub-sys for tracing caller frames by marking them with a dunder var (`__runtimeframe__` by default) and then scanning for that frame such that code that is *calling* our APIs can be reported easily in logging / tracing output. New APIs: - `find_caller_info()` which does the scan and delivers a, - `CallerInfo` which (attempts) to expose both the runtime frame-info and frame of the caller func along with `NamespacePath` properties. Probably going to re-implement the dunder var bit as a decorator later so we can bind in the literal func-object ref instead of trying to look it up with `get_class_from_frame()`, since it's kinda hacky/non-general and def doesn't work for closure funcs.. --- tractor/devx/_code.py | 177 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 tractor/devx/_code.py diff --git a/tractor/devx/_code.py b/tractor/devx/_code.py new file mode 100644 index 00000000..01d64cd1 --- /dev/null +++ b/tractor/devx/_code.py @@ -0,0 +1,177 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Tools for code-object annotation, introspection and mutation +as it pertains to improving the grok-ability of our runtime! + +''' +from __future__ import annotations +import inspect +# import msgspec +# from pprint import pformat +from types import ( + FrameType, + FunctionType, + MethodType, + # CodeType, +) +from typing import ( + # Any, + Callable, + # TYPE_CHECKING, + Type, +) + +from tractor.msg import ( + pretty_struct, + NamespacePath, +) + + +# TODO: yeah, i don't love this and we should prolly just +# write a decorator that actually keeps a stupid ref to the func +# obj.. 
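
(Aside, not from the patch: the intended usage of the marker + scan
combo described in the commit message above, mirroring the
`__runtimeframe__` annotations added to portal/nursery methods in
a follow-up patch, looks roughly like this; the wrapper and caller
funcs are hypothetical:)

    from tractor.devx._code import (  # added by this patch
        CallerInfo,
        find_caller_info,
    )

    def some_runtime_api() -> None:
        # mark this frame as a "runtime frame"; the frame one level
        # up is then reported as the calling "user app code".
        __runtimeframe__: int = 1  # noqa
        info: CallerInfo|None = find_caller_info()
        if info:
            print(f'called from: {info.caller_nsp}()')

    def app_code() -> None:
        some_runtime_api()

    app_code()  # should print something like `__main__:app_code()`
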
+def get_class_from_frame(fr: FrameType) -> ( + FunctionType + |MethodType +): + ''' + Attempt to get the function (or method) reference + from a given `FrameType`. + + Verbatim from an SO: + https://stackoverflow.com/a/2220759 + + ''' + args, _, _, value_dict = inspect.getargvalues(fr) + + # we check the first parameter for the frame function is + # named 'self' + if ( + len(args) + and + # TODO: other cases for `@classmethod` etc..?) + args[0] == 'self' + ): + # in that case, 'self' will be referenced in value_dict + instance: object = value_dict.get('self') + if instance: + # return its class + return getattr( + instance, + '__class__', + None, + ) + + # return None otherwise + return None + + +def func_ref_from_frame( + frame: FrameType, +) -> Callable: + func_name: str = frame.f_code.co_name + try: + return frame.f_globals[func_name] + except KeyError: + cls: Type|None = get_class_from_frame(frame) + if cls: + return getattr( + cls, + func_name, + ) + + +# TODO: move all this into new `.devx._code`! +# -[ ] prolly create a `@runtime_api` dec? +# -[ ] ^- make it capture and/or accept buncha optional +# meta-data like a fancier version of `@pdbp.hideframe`. +# +class CallerInfo(pretty_struct.Struct): + rt_fi: inspect.FrameInfo + call_frame: FrameType + + @property + def api_func_ref(self) -> Callable|None: + return func_ref_from_frame(self.rt_fi.frame) + + @property + def api_nsp(self) -> NamespacePath|None: + func: FunctionType = self.api_func_ref + if func: + return NamespacePath.from_ref(func) + + return '' + + @property + def caller_func_ref(self) -> Callable|None: + return func_ref_from_frame(self.call_frame) + + @property + def caller_nsp(self) -> NamespacePath|None: + func: FunctionType = self.caller_func_ref + if func: + return NamespacePath.from_ref(func) + + return '' + + +def find_caller_info( + dunder_var: str = '__runtimeframe__', + iframes:int = 1, + check_frame_depth: bool = True, + +) -> CallerInfo|None: + ''' + Scan up the callstack for a frame with a `dunder_var: str` variable + and return the `iframes` frames above it. + + By default we scan for a `__runtimeframe__` scope var which + denotes a `tractor` API above which (one frame up) is "user + app code" which "called into" the `tractor` method or func. + + TODO: ex with `Portal.open_context()` + + ''' + # TODO: use this instead? + # https://docs.python.org/3/library/inspect.html#inspect.getouterframes + frames: list[inspect.FrameInfo] = inspect.stack() + for fi in frames: + assert ( + fi.function + == + fi.frame.f_code.co_name + ) + this_frame: FrameType = fi.frame + dunder_val: int|None = this_frame.f_locals.get(dunder_var) + if dunder_val: + go_up_iframes: int = ( + dunder_val # could be 0 or `True` i guess? 
+ or + iframes + ) + rt_frame: FrameType = fi.frame + call_frame = rt_frame + for i in range(go_up_iframes): + call_frame = call_frame.f_back + + return CallerInfo( + rt_fi=fi, + call_frame=call_frame, + ) + + return None -- 2.34.1 From d2f6428e46fae1d6eff2d9f5db02d460e86d1f2c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 15:17:50 -0400 Subject: [PATCH 252/378] Annotate nursery and portal methods for `CallerInfo` scanning --- tractor/_portal.py | 27 +++++++++++++++++++++------ tractor/_supervise.py | 9 ++++++++- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index e4db93a6..052dd8ef 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -161,17 +161,18 @@ class Portal: self._expect_result = await self.actor.start_remote_task( self.channel, nsf=NamespacePath(f'{ns}:{func}'), - kwargs=kwargs + kwargs=kwargs, + portal=self, ) async def _return_once( self, ctx: Context, - ) -> dict[str, Any]: + ) -> Return: assert ctx._remote_func_type == 'asyncfunc' # single response - msg: dict = await ctx._recv_chan.receive() + msg: Return = await ctx._recv_chan.receive() return msg async def result(self) -> Any: @@ -247,6 +248,8 @@ class Portal: purpose. ''' + __runtimeframe__: int = 1 # noqa + chan: Channel = self.channel if not chan.connected(): log.runtime( @@ -324,16 +327,18 @@ class Portal: internals! ''' + __runtimeframe__: int = 1 # noqa nsf = NamespacePath( f'{namespace_path}:{function_name}' ) - ctx = await self.actor.start_remote_task( + ctx: Context = await self.actor.start_remote_task( chan=self.channel, nsf=nsf, kwargs=kwargs, + portal=self, ) - ctx._portal = self - msg = await self._return_once(ctx) + ctx._portal: Portal = self + msg: Return = await self._return_once(ctx) return _unwrap_msg( msg, self.channel, @@ -384,6 +389,7 @@ class Portal: self.channel, nsf=nsf, kwargs=kwargs, + portal=self, ) ctx._portal = self return _unwrap_msg( @@ -398,6 +404,14 @@ class Portal: **kwargs, ) -> AsyncGenerator[MsgStream, None]: + ''' + Legacy one-way streaming API. + + TODO: re-impl on top `Portal.open_context()` + an async gen + around `Context.open_stream()`. + + ''' + __runtimeframe__: int = 1 # noqa if not inspect.isasyncgenfunction(async_gen_func): if not ( @@ -411,6 +425,7 @@ class Portal: self.channel, nsf=NamespacePath.from_ref(async_gen_func), kwargs=kwargs, + portal=self, ) ctx._portal = self diff --git a/tractor/_supervise.py b/tractor/_supervise.py index be81e4e6..dc65cc65 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -131,7 +131,12 @@ class ActorNursery: "main task" besides the runtime. ''' - loglevel = loglevel or self._actor.loglevel or get_loglevel() + __runtimeframe__: int = 1 # noqa + loglevel: str = ( + loglevel + or self._actor.loglevel + or get_loglevel() + ) # configure and pass runtime state _rtv = _state._runtime_vars.copy() @@ -209,6 +214,7 @@ class ActorNursery: the actor is terminated. ''' + __runtimeframe__: int = 1 # noqa mod_path: str = fn.__module__ if name is None: @@ -257,6 +263,7 @@ class ActorNursery: directly without any far end graceful ``trio`` cancellation. 
''' + __runtimeframe__: int = 1 # noqa self.cancelled = True # TODO: impl a repr for spawn more compact -- 2.34.1 From 4aa24f8518ea6f6ac0a4b1ef60ac5ccb75311ecb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 15:18:29 -0400 Subject: [PATCH 253/378] TOSQUASH 77a15eb use `DebugStatus` in `._rpc` --- tractor/_rpc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 86c3e27d..576e988b 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -814,7 +814,7 @@ async def process_messages( # should use it? # https://github.com/python-trio/trio/issues/467 log.runtime( - 'Entering IPC msg loop:\n' + 'Entering RPC msg loop:\n' f'peer: {chan.uid}\n' f'|_{chan}\n' ) @@ -876,7 +876,7 @@ async def process_messages( # XXX NOTE XXX don't start entire actor # runtime cancellation if this actor is # currently in debug mode! - pdb_complete: trio.Event|None = _debug.Lock.local_pdb_complete + pdb_complete: trio.Event|None = _debug.DebugStatus.repl_release if pdb_complete: await pdb_complete.wait() @@ -1073,7 +1073,7 @@ async def process_messages( log.exception(message) raise RuntimeError(message) - log.runtime( + log.transport( 'Waiting on next IPC msg from\n' f'peer: {chan.uid}\n' f'|_{chan}\n' -- 2.34.1 From e5f0b450cf54175d7be6c3ddf41bcb087ef2d7b1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 15:40:26 -0400 Subject: [PATCH 254/378] Add some `bytes` annots --- tractor/_ipc.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index a5b44a4e..f76d4ef5 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -314,8 +314,7 @@ class MsgpackTCPStream(MsgTransport): while True: try: - header = await self.recv_stream.receive_exactly(4) - + header: bytes = await self.recv_stream.receive_exactly(4) except ( ValueError, ConnectionResetError, @@ -337,8 +336,7 @@ class MsgpackTCPStream(MsgTransport): size, = struct.unpack(" Date: Thu, 18 Apr 2024 15:41:06 -0400 Subject: [PATCH 255/378] Tweak `current_actor()` failure msg --- tractor/_state.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tractor/_state.py b/tractor/_state.py index b76e8ac9..30346a6a 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -66,7 +66,7 @@ def current_actor( err_on_no_runtime and _current_actor is None ): - msg: str = 'No local actor has been initialized yet' + msg: str = 'No local actor has been initialized yet?\n' from ._exceptions import NoRuntime if last := last_actor(): @@ -79,8 +79,8 @@ def current_actor( # this process. else: msg += ( - 'No last actor found?\n' - 'Did you forget to open one of:\n\n' + # 'No last actor found?\n' + '\nDid you forget to call one of,\n' '- `tractor.open_root_actor()`\n' '- `tractor.open_nursery()`\n' ) -- 2.34.1 From d51be2a36ae4e73cf269c06f4a0fcbff5bc8f7cb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 15:53:34 -0400 Subject: [PATCH 256/378] Proto in new `Context` refinements As per some newly added features and APIs: - pass `portal: Portal` to `Actor.start_remote_task()` from `open_context_from_portal()` marking `Portal.open_context()` as always being the "parent" task side. - add caller tracing via `.devx._code.CallerInfo/.find_caller_info()` called in `mk_context()` and (for now) a `__runtimeframe__: int = 2` inside `open_context_from_portal()` such that any enter-er of `Portal.open_context()` will be reported. 
- pass in a new `._caller_info` attr which is used in 2 new meths: - `.repr_caller: str` for showing the name of the app-code-func. - `.repr_api: str` for showing the API ep, which for now we just hardcode to `Portal.open_context()` since ow its gonna show the mod func name `open_context_from_portal()`. - use those new props ^ in the `._deliver_msg()` flow body log msg content for much clearer msg-flow tracing Bo - add `Context._cancel_on_msgerr: bool` to toggle whether a delivered `MsgTypeError` should trigger a `._scope.cancel()` call. - also (temporarily) add separate `.cancel()` emissions for both cases as i work through hacking out the maybe `MsgType.pld: Raw` support. --- tractor/_context.py | 147 +++++++++++++++++++++++++++++++++----------- 1 file changed, 111 insertions(+), 36 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 027f15ff..2230598d 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -26,6 +26,7 @@ disjoint, parallel executing tasks in separate actors. from __future__ import annotations from collections import deque from contextlib import asynccontextmanager as acm +from contextvars import ContextVar from dataclasses import ( dataclass, field, @@ -56,6 +57,7 @@ from ._exceptions import ( ) from .log import get_logger from .msg import ( + _codec, Error, MsgType, MsgCodec, @@ -80,6 +82,9 @@ if TYPE_CHECKING: from ._portal import Portal from ._runtime import Actor from ._ipc import MsgTransport + from .devx._code import ( + CallerInfo, + ) log = get_logger(__name__) @@ -499,6 +504,18 @@ class Context: _started_called: bool = False _stream_opened: bool = False _stream: MsgStream|None = None + _pld_codec_var: ContextVar[MsgCodec] = ContextVar( + 'pld_codec', + default=_codec._def_msgspec_codec, # i.e. `Any`-payloads + ) + + @property + def pld_codec(self) -> MsgCodec|None: + return self._pld_codec_var.get() + + # caller of `Portal.open_context()` for + # logging purposes mostly + _caller_info: CallerInfo|None = None # overrun handling machinery # NOTE: none of this provides "backpressure" to the remote @@ -525,6 +542,7 @@ class Context: # TODO: figure out how we can enforce this without losing our minds.. _strict_started: bool = False + _cancel_on_msgerr: bool = True def __str__(self) -> str: ds: str = '=' @@ -857,6 +875,7 @@ class Context: # TODO: never do this right? # if self._remote_error: # return + peer_side: str = self.peer_side(self.side) # XXX: denote and set the remote side's error so that # after we cancel whatever task is the opener of this @@ -864,14 +883,15 @@ class Context: # appropriately. log.runtime( 'Setting remote error for ctx\n\n' - f'<= remote ctx uid: {self.chan.uid}\n' - f'=>{error}' + f'<= {peer_side!r}: {self.chan.uid}\n' + f'=> {self.side!r}\n\n' + f'{error}' ) self._remote_error: BaseException = error # self-cancel (ack) or, # peer propagated remote cancellation. 
- msgtyperr: bool = False + msgerr: bool = False if isinstance(error, ContextCancelled): whom: str = ( @@ -884,7 +904,7 @@ class Context: ) elif isinstance(error, MsgTypeError): - msgtyperr = True + msgerr = True peer_side: str = self.peer_side(self.side) log.error( f'IPC dialog error due to msg-type caused by {peer_side!r} side\n\n' @@ -935,13 +955,24 @@ class Context: and not self._is_self_cancelled() and not cs.cancel_called and not cs.cancelled_caught - and not msgtyperr + and ( + msgerr + and + # NOTE: allow user to config not cancelling the + # local scope on `MsgTypeError`s + self._cancel_on_msgerr + ) ): # TODO: it'd sure be handy to inject our own # `trio.Cancelled` subtype here ;) # https://github.com/goodboy/tractor/issues/368 + log.cancel('Cancelling local `.open_context()` scope!') self._scope.cancel() + else: + log.cancel('NOT cancelling local `.open_context()` scope!') + + # TODO: maybe we should also call `._res_scope.cancel()` if it # exists to support cancelling any drain loop hangs? @@ -966,9 +997,7 @@ class Context: dmaddr = dst_maddr @property - def repr_rpc( - self, - ) -> str: + def repr_rpc(self) -> str: # TODO: how to show the transport interchange fmt? # codec: str = self.chan.transport.codec_key outcome_str: str = self.repr_outcome( @@ -980,6 +1009,27 @@ class Context: f'{self._nsf}() -> {outcome_str}:' ) + @property + def repr_caller(self) -> str: + ci: CallerInfo|None = self._caller_info + if ci: + return ( + f'{ci.caller_nsp}()' + # f'|_api: {ci.api_nsp}' + ) + + return '' + + @property + def repr_api(self) -> str: + # ci: CallerInfo|None = self._caller_info + # if ci: + # return ( + # f'{ci.api_nsp}()\n' + # ) + + return 'Portal.open_context()' + async def cancel( self, timeout: float = 0.616, @@ -1184,8 +1234,9 @@ class Context: ) # NOTE: in one way streaming this only happens on the - # caller side inside `Actor.start_remote_task()` so if you try - # to send a stop from the caller to the callee in the + # parent-ctx-task side (on the side that calls + # `Actor.start_remote_task()`) so if you try to send + # a stop from the caller to the callee in the # single-direction-stream case you'll get a lookup error # currently. ctx: Context = actor.get_context( @@ -1850,6 +1901,19 @@ class Context: send_chan: trio.MemorySendChannel = self._send_chan nsf: NamespacePath = self._nsf + side: str = self.side + if side == 'child': + assert not self._portal + peer_side: str = self.peer_side(side) + + flow_body: str = ( + f'<= peer {peer_side!r}: {from_uid}\n' + f' |_<{nsf}()>\n\n' + + f'=> {side!r}: {self._task}\n' + f' |_<{self.repr_api} @ {self.repr_caller}>\n\n' + ) + re: Exception|None if re := unpack_error( msg, @@ -1860,18 +1924,10 @@ class Context: else: log_meth = log.runtime - side: str = self.side - - peer_side: str = self.peer_side(side) - log_meth( f'Delivering IPC ctx error from {peer_side!r} to {side!r} task\n\n' - f'<= peer {peer_side!r}: {from_uid}\n' - f' |_ {nsf}()\n\n' - - f'=> {side!r} cid: {cid}\n' - f' |_{self._task}\n\n' + f'{flow_body}' f'{pformat(re)}\n' ) @@ -1884,30 +1940,27 @@ class Context: # or `RemoteActorError`). self._maybe_cancel_and_set_remote_error(re) - # XXX only case where returning early is fine! + # TODO: expose as mod func instead! 
structfmt = pretty_struct.Struct.pformat if self._in_overrun: log.warning( - f'Queueing OVERRUN msg on caller task:\n' - f'<= peer: {from_uid}\n' - f' |_ {nsf}()\n\n' + f'Queueing OVERRUN msg on caller task:\n\n' - f'=> cid: {cid}\n' - f' |_{self._task}\n\n' + f'{flow_body}' f'{structfmt(msg)}\n' ) self._overflow_q.append(msg) + + # XXX NOTE XXX + # overrun is the ONLY case where returning early is fine! return False try: log.runtime( f'Delivering msg from IPC ctx:\n\n' - f'<= {from_uid}\n' - f' |_ {nsf}()\n\n' - f'=> {self._task}\n' - f' |_cid={self.cid}\n\n' + f'{flow_body}' f'{structfmt(msg)}\n' ) @@ -1939,6 +1992,7 @@ class Context: f'cid: {self.cid}\n' 'Failed to deliver msg:\n' f'send_chan: {send_chan}\n\n' + f'{pformat(msg)}\n' ) return False @@ -2092,6 +2146,12 @@ async def open_context_from_portal( ''' __tracebackhide__: bool = hide_tb + # denote this frame as a "runtime frame" for stack + # introspection where we report the caller code in logging + # and error message content. + # NOTE: 2 bc of the wrapping `@acm` + __runtimeframe__: int = 2 # noqa + # conduct target func method structural checks if not inspect.iscoroutinefunction(func) and ( getattr(func, '_tractor_contex_function', False) @@ -2119,6 +2179,8 @@ async def open_context_from_portal( nsf=nsf, kwargs=kwargs, + portal=portal, + # NOTE: it's imporant to expose this since you might # get the case where the parent who opened the context does # not open a stream until after some slow startup/init @@ -2129,13 +2191,17 @@ async def open_context_from_portal( # place.. allow_overruns=allow_overruns, ) - # ASAP, so that `Context.side: str` can be determined for - # logging / tracing / debug! - ctx._portal: Portal = portal - assert ctx._remote_func_type == 'context' - msg: Started = await ctx._recv_chan.receive() + assert ctx._caller_info + # XXX NOTE since `._scope` is NOT set BEFORE we retreive the + # `Started`-msg any cancellation triggered + # in `._maybe_cancel_and_set_remote_error()` will + # NOT actually cancel the below line! + # -> it's expected that if there is an error in this phase of + # the dialog, the `Error` msg should be raised from the `msg` + # handling block below. + msg: Started = await ctx._recv_chan.receive() try: # the "first" value here is delivered by the callee's # ``Context.started()`` call. @@ -2145,6 +2211,7 @@ async def open_context_from_portal( # except KeyError as src_error: except AttributeError as src_error: + log.exception('Raising from unexpected msg!\n') _raise_from_no_key_in_msg( ctx=ctx, msg=msg, @@ -2570,7 +2637,6 @@ async def open_context_from_portal( None, ) - def mk_context( chan: Channel, cid: str, @@ -2592,6 +2658,10 @@ def mk_context( recv_chan: trio.MemoryReceiveChannel send_chan, recv_chan = trio.open_memory_channel(msg_buffer_size) + # TODO: only scan caller-info if log level so high! + from .devx._code import find_caller_info + caller_info: CallerInfo|None = find_caller_info() + ctx = Context( chan=chan, cid=cid, @@ -2600,6 +2670,7 @@ def mk_context( _recv_chan=recv_chan, _nsf=nsf, _task=trio.lowlevel.current_task(), + _caller_info=caller_info, **kwargs, ) # TODO: we can drop the old placeholder yah? @@ -2610,7 +2681,11 @@ def mk_context( def context(func: Callable) -> Callable: ''' - Mark an async function as a streaming routine with ``@context``. + Mark an (async) function as an SC-supervised, inter-`Actor`, + child-`trio.Task`, IPC endpoint otherwise known more + colloquially as a (RPC) "context". 
+ + Functions annotated the fundamental IPC endpoint type offered by `tractor`. ''' # TODO: apply whatever solution ``mypy`` ends up picking for this: -- 2.34.1 From dd6a4d49d818d7ebca2805f6c1cd155509f7b5dc Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 18 Apr 2024 16:24:59 -0400 Subject: [PATCH 257/378] Go back to `ContextVar` for codec mgmt Turns out we do want per-task inheritance particularly if there's to be per `Context` dynamic mutation of the spec; we don't want mutation in some task to affect any parent/global setting. Turns out since we use a common "feeder task" in the rpc loop, we need to offer a per `Context` payload decoder sys anyway in order to enable per-task controls for inter-actor multi-task-ctx scenarios. --- tractor/msg/_codec.py | 126 +++++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 51 deletions(-) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 82fd2011..766a297a 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -33,25 +33,29 @@ from __future__ import annotations from contextlib import ( contextmanager as cm, ) -# from contextvars import ( -# ContextVar, -# Token, -# ) +from contextvars import ( + ContextVar, + Token, +) import textwrap from typing import ( Any, Callable, Type, + TYPE_CHECKING, Union, ) from types import ModuleType import msgspec -from msgspec import msgpack -from trio.lowlevel import ( - RunVar, - RunVarToken, +from msgspec import ( + msgpack, + # Raw, ) +# from trio.lowlevel import ( +# RunVar, +# RunVarToken, +# ) # TODO: see notes below from @mikenerone.. # from tricycle import TreeVar @@ -62,6 +66,9 @@ from tractor.msg.types import ( ) from tractor.log import get_logger +if TYPE_CHECKING: + from tractor._context import Context + log = get_logger(__name__) # TODO: overall IPC msg-spec features (i.e. in this mod)! @@ -157,24 +164,6 @@ class MsgCodec(Struct): lib: ModuleType = msgspec - # TODO: a sub-decoder system as well? - # payload_msg_specs: Union[Type[Struct]] = Any - # see related comments in `.msg.types` - # _payload_decs: ( - # dict[ - # str, - # msgpack.Decoder, - # ] - # |None - # ) = None - # OR - # ) = { - # # pre-seed decoders for std-py-type-set for use when - # # `MsgType.pld == None|Any`. - # None: msgpack.Decoder(Any), - # Any: msgpack.Decoder(Any), - # } - # TODO: use `functools.cached_property` for these ? # https://docs.python.org/3/library/functools.html#functools.cached_property @property @@ -210,7 +199,25 @@ class MsgCodec(Struct): # https://jcristharif.com/msgspec/usage.html#typed-decoding return self._dec.decode(msg) - # TODO: do we still want to try and support the sub-decoder with + # TODO: a sub-decoder system as well? + # payload_msg_specs: Union[Type[Struct]] = Any + # see related comments in `.msg.types` + # _payload_decs: ( + # dict[ + # str, + # msgpack.Decoder, + # ] + # |None + # ) = None + # OR + # ) = { + # # pre-seed decoders for std-py-type-set for use when + # # `MsgType.pld == None|Any`. + # None: msgpack.Decoder(Any), + # Any: msgpack.Decoder(Any), + # } + # + # -[ ] do we still want to try and support the sub-decoder with # `.Raw` technique in the case that the `Generic` approach gives # future grief? # @@ -429,6 +436,9 @@ _def_msgspec_codec: MsgCodec = mk_codec(ipc_pld_spec=Any) # _def_tractor_codec: MsgCodec = mk_codec( ipc_pld_spec=Any, + + # TODO: use this for debug mode locking prot? 
+ # ipc_pld_spec=Raw, ) # TODO: IDEALLY provides for per-`trio.Task` specificity of the # IPC msging codec used by the transport layer when doing @@ -462,11 +472,9 @@ _def_tractor_codec: MsgCodec = mk_codec( # TODO: STOP USING THIS, since it's basically a global and won't # allow sub-IPC-ctxs to limit the msg-spec however desired.. -_ctxvar_MsgCodec: MsgCodec = RunVar( +# _ctxvar_MsgCodec: MsgCodec = RunVar( +_ctxvar_MsgCodec: ContextVar[MsgCodec] = ContextVar( 'msgspec_codec', - - # TODO: move this to our new `Msg`-spec! - # default=_def_msgspec_codec, default=_def_tractor_codec, ) @@ -475,23 +483,36 @@ _ctxvar_MsgCodec: MsgCodec = RunVar( def apply_codec( codec: MsgCodec, + ctx: Context|None = None, + ) -> MsgCodec: ''' - Dynamically apply a `MsgCodec` to the current task's - runtime context such that all IPC msgs are processed - with it for that task. + Dynamically apply a `MsgCodec` to the current task's runtime + context such that all (of a certain class of payload + containing i.e. `MsgType.pld: PayloadT`) IPC msgs are + processed with it for that task. + + Uses a `contextvars.ContextVar` to ensure the scope of any + codec setting matches the current `Context` or + `._rpc.process_messages()` feeder task's prior setting without + mutating any surrounding scope. + + When a `ctx` is supplied, only mod its `Context.pld_codec`. - Uses a `tricycle.TreeVar` to ensure the scope of the codec matches the `@cm` block and DOES NOT change to the original (default) value in new tasks (as it does for `ContextVar`). - See the docs: - - https://tricycle.readthedocs.io/en/latest/reference.html#tree-variables - - https://github.com/oremanj/tricycle/blob/master/tricycle/_tests/test_tree_var.py - ''' __tracebackhide__: bool = True - orig: MsgCodec = _ctxvar_MsgCodec.get() + + if ctx is not None: + var: ContextVar = ctx._var_pld_codec + else: + # use IPC channel-connection "global" codec + var: ContextVar = _ctxvar_MsgCodec + + orig: MsgCodec = var.get() + assert orig is not codec if codec.pld_spec is None: breakpoint() @@ -500,22 +521,25 @@ def apply_codec( 'Applying new msg-spec codec\n\n' f'{codec}\n' ) - token: RunVarToken = _ctxvar_MsgCodec.set(codec) + token: Token = var.set(codec) - # TODO: for TreeVar approach, see docs for @cm `.being()` API: - # https://tricycle.readthedocs.io/en/latest/reference.html#tree-variables - # try: - # with _ctxvar_MsgCodec.being(codec): - # new = _ctxvar_MsgCodec.get() - # assert new is codec - # yield codec + # ?TODO? 
for TreeVar approach which copies from the + # cancel-scope of the prior value, NOT the prior task + # See the docs: + # - https://tricycle.readthedocs.io/en/latest/reference.html#tree-variables + # - https://github.com/oremanj/tricycle/blob/master/tricycle/_tests/test_tree_var.py + # ^- see docs for @cm `.being()` API + # with _ctxvar_MsgCodec.being(codec): + # new = _ctxvar_MsgCodec.get() + # assert new is codec + # yield codec try: - yield _ctxvar_MsgCodec.get() + yield var.get() finally: - _ctxvar_MsgCodec.reset(token) + var.reset(token) - assert _ctxvar_MsgCodec.get() is orig + assert var.get() is orig log.info( 'Reverted to last msg-spec codec\n\n' f'{orig}\n' -- 2.34.1 From d18cf32e28421f0347a04292ac84ec4da6508722 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 21 Apr 2024 17:02:39 -0400 Subject: [PATCH 258/378] Mark `.pld` msgs as also taking `msgspec.Raw` --- tractor/msg/types.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index f7654f62..59ec2a4e 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -38,6 +38,7 @@ from typing import ( from msgspec import ( defstruct, # field, + Raw, Struct, # UNSET, # UnsetType, @@ -105,7 +106,7 @@ class Msg( # TODO: could also be set to `msgspec.Raw` if the sub-decoders # approach is preferred over the generic parameterization # approach as take by `mk_msg_spec()` below. - pld: PayloadT + pld: PayloadT|Raw class Aid( @@ -332,7 +333,7 @@ class Started( decorated IPC endpoint. ''' - pld: PayloadT + pld: PayloadT|Raw # TODO: instead of using our existing `Start` @@ -349,7 +350,7 @@ class Yield( Per IPC transmission of a value from `await MsgStream.send()`. ''' - pld: PayloadT + pld: PayloadT|Raw class Stop( @@ -377,7 +378,7 @@ class Return( func-as-`trio.Task`. ''' - pld: PayloadT + pld: PayloadT|Raw class CancelAck( @@ -710,7 +711,9 @@ def mk_msg_spec( ) return ( ipc_spec, - msgtypes_table[spec_build_method] + ipc_msg_types, + msgtypes_table[spec_build_method] + + + ipc_msg_types, ) -- 2.34.1 From 7b020c42cc69ce8e0e03fe4427acb1200e9f4c75 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 21 Apr 2024 17:08:27 -0400 Subject: [PATCH 259/378] Drop more `dict`-msg cruft from `._exceptions` --- tractor/_exceptions.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 4ace626f..90163241 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -170,7 +170,7 @@ def pformat_boxed_tb( f' ------ - ------\n\n' # f'{tb_str}\n' f'{tb_body}' - f' ------ - ------\n' + f' ------ - ------\n' f'_|\n' ) tb_box_indent: str = ( @@ -972,8 +972,6 @@ def _raise_from_no_key_in_msg( # an internal error should never get here try: cid: str = msg.cid - # cid: str = msg['cid'] - # except KeyError as src_err: except AttributeError as src_err: raise MessagingError( f'IPC `Context` rx-ed msg without a ctx-id (cid)!?\n' @@ -985,7 +983,6 @@ def _raise_from_no_key_in_msg( # TODO: test that shows stream raising an expected error!!! # raise the error message in a boxed exception type! - # if msg.get('error'): if isinstance(msg, Error): # match msg: # case Error(): @@ -1001,7 +998,6 @@ def _raise_from_no_key_in_msg( # the stream._eoc outside this in the calleer always? 
# case Stop(): elif ( - # msg.get('stop') isinstance(msg, Stop) or ( stream -- 2.34.1 From 0df7d557db65df64b46021f613cc203ee7579e33 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 22 Apr 2024 18:01:09 -0400 Subject: [PATCH 260/378] Move `MsgTypeError` maker func to `._exceptions` Since it's going to be used from the IPC primitive APIs (`Context`/`MsgStream`) for similarly handling payload type spec validation errors and bc it's really not well situation in the IPC module XD Summary of (impl) tweaks: - obvi move `_mk_msg_type_err()` and import and use it in `._ipc`; ends up avoiding a lot of ad-hoc imports we had from `._exceptions` anyway! - mask out "new codec" runtime log emission from `MsgpackTCPStream`. - allow passing a (coming in next commit) `codec: MsgDec` (message decoder) which supports the same required `.pld_spec_str: str` attr. - for send side logging use existing `MsgCodec..pformat_msg_spec()`. - rename `_raise_from_no_key_in_msg()` to the now more appropriate `_raise_from_unexpected_msg()`, but leaving alias for now. --- tractor/_exceptions.py | 136 ++++++++++++++++++++++++++++++++++++++++- tractor/_ipc.py | 130 ++------------------------------------- 2 files changed, 138 insertions(+), 128 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 90163241..b2ba6e84 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -24,6 +24,7 @@ import importlib from pprint import pformat from typing import ( Any, + Callable, Type, TYPE_CHECKING, ) @@ -32,8 +33,11 @@ import traceback import trio from msgspec import ( - structs, defstruct, + msgpack, + Raw, + structs, + ValidationError, ) from tractor._state import current_actor @@ -44,6 +48,8 @@ from tractor.msg import ( Stop, Yield, types as msgtypes, + MsgCodec, + MsgDec, ) from tractor.msg.pretty_struct import ( iter_fields, @@ -932,7 +938,7 @@ def is_multi_cancelled(exc: BaseException) -> bool: return False -def _raise_from_no_key_in_msg( +def _raise_from_unexpected_msg( ctx: Context, msg: MsgType, src_err: AttributeError, @@ -1032,7 +1038,6 @@ def _raise_from_no_key_in_msg( # that arrived which is probably the source of this stream # closure ctx.maybe_raise() - raise eoc from src_err if ( @@ -1052,3 +1057,128 @@ def _raise_from_no_key_in_msg( " BUT received a non-error msg:\n" f'{pformat(msg)}' ) from src_err + + +_raise_from_no_key_in_msg = _raise_from_unexpected_msg + + +def _mk_msg_type_err( + msg: Any|bytes|Raw, + codec: MsgCodec|MsgDec, + + message: str|None = None, + verb_header: str = '', + + src_validation_error: ValidationError|None = None, + src_type_error: TypeError|None = None, + +) -> MsgTypeError: + ''' + Compose a `MsgTypeError` from an input runtime context. + + ''' + # `Channel.send()` case + if src_validation_error is None: + + if isinstance(codec, MsgDec): + raise RuntimeError( + '`codec` must be a `MsgCodec` for send-side errors?' + ) + + # no src error from `msgspec.msgpack.Decoder.decode()` so + # prolly a manual type-check on our part. 
+ if message is None: + fmt_spec: str = codec.pformat_msg_spec() + fmt_stack: str = ( + '\n'.join(traceback.format_stack(limit=3)) + ) + tb_fmt: str = pformat_boxed_tb( + tb_str=fmt_stack, + # fields_str=header, + field_prefix=' ', + indent='', + ) + message: str = ( + f'invalid msg -> {msg}: {type(msg)}\n\n' + f'{tb_fmt}\n' + f'Valid IPC msgs are:\n\n' + # f' ------ - ------\n' + f'{fmt_spec}\n', + ) + elif src_type_error: + src_message: str = str(src_type_error) + patt: str = 'type ' + type_idx: int = src_message.find('type ') + invalid_type: str = src_message[type_idx + len(patt):].split()[0] + + enc_hook: Callable|None = codec.enc.enc_hook + if enc_hook is None: + message += ( + '\n\n' + + f"The current IPC-msg codec can't encode type `{invalid_type}` !\n" + f'Maybe a `msgpack.Encoder.enc_hook()` extension is needed?\n\n' + + f'Check the `msgspec` docs for ad-hoc type extending:\n' + '|_ https://jcristharif.com/msgspec/extending.html\n' + '|_ https://jcristharif.com/msgspec/extending.html#defining-a-custom-extension-messagepack-only\n' + ) + + + msgtyperr = MsgTypeError( + message=message, + ipc_msg=msg, + ) + # ya, might be `None` + msgtyperr.__cause__ = src_type_error + return msgtyperr + + # `Channel.recv()` case + else: + # decode the msg-bytes using the std msgpack + # interchange-prot (i.e. without any + # `msgspec.Struct` handling) so that we can + # determine what `.msg.types.Msg` is the culprit + # by reporting the received value. + msg_dict: dict = msgpack.decode(msg) + msg_type_name: str = msg_dict['msg_type'] + msg_type = getattr(msgtypes, msg_type_name) + message: str = ( + f'invalid `{msg_type_name}` IPC msg\n\n' + ) + if verb_header: + message = f'{verb_header} ' + message + + # XXX see if we can determine the exact invalid field + # such that we can comprehensively report the + # specific field's type problem + msgspec_msg: str = src_validation_error.args[0].rstrip('`') + msg, _, maybe_field = msgspec_msg.rpartition('$.') + obj = object() + if (field_val := msg_dict.get(maybe_field, obj)) is not obj: + field_name_expr: str = ( + f' |_{maybe_field}: {codec.pld_spec_str} = ' + ) + fmt_val_lines: list[str] = pformat(field_val).splitlines() + fmt_val: str = ( + f'{fmt_val_lines[0]}\n' + + + textwrap.indent( + '\n'.join(fmt_val_lines[1:]), + prefix=' '*len(field_name_expr), + ) + ) + message += ( + f'{msg.rstrip("`")}\n\n' + f'<{msg_type.__qualname__}(\n' + # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' + f'{field_name_expr}{fmt_val}\n' + f')>' + ) + + msgtyperr = MsgTypeError.from_decode( + message=message, + msgdict=msg_dict, + ) + msgtyperr.__cause__ = src_validation_error + return msgtyperr diff --git a/tractor/_ipc.py b/tractor/_ipc.py index f76d4ef5..70774bed 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -49,6 +49,7 @@ from tractor._exceptions import ( MsgTypeError, pack_from_raise, TransportClosed, + _mk_msg_type_err, ) from tractor.msg import ( _ctxvar_MsgCodec, @@ -118,127 +119,6 @@ class MsgTransport(Protocol[MsgType]): ... -def _mk_msg_type_err( - msg: Any|bytes, - codec: MsgCodec, - - message: str|None = None, - verb_header: str = '', - - src_validation_error: msgspec.ValidationError|None = None, - src_type_error: TypeError|None = None, - -) -> MsgTypeError: - - import textwrap - - # `Channel.send()` case - if src_validation_error is None: # send-side - - # no src error from `msgspec.msgpack.Decoder.decode()` so - # prolly a manual type-check on our part. 
- if message is None: - import traceback - from tractor._exceptions import pformat_boxed_tb - - fmt_spec: str = '\n'.join( - map(str, codec.msg_spec.__args__) - ) - fmt_stack: str = ( - '\n'.join(traceback.format_stack(limit=3)) - ) - tb_fmt: str = pformat_boxed_tb( - tb_str=fmt_stack, - # fields_str=header, - field_prefix=' ', - indent='', - ) - message: str = ( - f'invalid msg -> {msg}: {type(msg)}\n\n' - f'{tb_fmt}\n' - f'Valid IPC msgs are:\n\n' - # f' ------ - ------\n' - f'{fmt_spec}\n', - ) - elif src_type_error: - src_message: str = str(src_type_error) - patt: str = 'type ' - type_idx: int = src_message.find('type ') - invalid_type: str = src_message[type_idx + len(patt):].split()[0] - - enc_hook: Callable|None = codec.enc.enc_hook - if enc_hook is None: - message += ( - '\n\n' - - f"The current IPC-msg codec can't encode type `{invalid_type}` !\n" - f'Maybe a `msgpack.Encoder.enc_hook()` extension is needed?\n\n' - - f'Check the `msgspec` docs for ad-hoc type extending:\n' - '|_ https://jcristharif.com/msgspec/extending.html\n' - '|_ https://jcristharif.com/msgspec/extending.html#defining-a-custom-extension-messagepack-only\n' - ) - - - msgtyperr = MsgTypeError( - message=message, - ipc_msg=msg, - ) - # ya, might be `None` - msgtyperr.__cause__ = src_type_error - return msgtyperr - - # `Channel.recv()` case - else: - # decode the msg-bytes using the std msgpack - # interchange-prot (i.e. without any - # `msgspec.Struct` handling) so that we can - # determine what `.msg.types.Msg` is the culprit - # by reporting the received value. - msg_dict: dict = msgspec.msgpack.decode(msg) - msg_type_name: str = msg_dict['msg_type'] - msg_type = getattr(msgtypes, msg_type_name) - message: str = ( - f'invalid `{msg_type_name}` IPC msg\n\n' - ) - if verb_header: - message = f'{verb_header} ' + message - - # XXX see if we can determine the exact invalid field - # such that we can comprehensively report the - # specific field's type problem - msgspec_msg: str = src_validation_error.args[0].rstrip('`') - msg, _, maybe_field = msgspec_msg.rpartition('$.') - obj = object() - if (field_val := msg_dict.get(maybe_field, obj)) is not obj: - field_name_expr: str = ( - f' |_{maybe_field}: {codec.pld_spec_str} = ' - ) - fmt_val_lines: list[str] = pformat(field_val).splitlines() - fmt_val: str = ( - f'{fmt_val_lines[0]}\n' - + - textwrap.indent( - '\n'.join(fmt_val_lines[1:]), - prefix=' '*len(field_name_expr), - ) - ) - message += ( - f'{msg.rstrip("`")}\n\n' - f'<{msg_type.__qualname__}(\n' - # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' - f'{field_name_expr}{fmt_val}\n' - f')>' - ) - - msgtyperr = MsgTypeError.from_decode( - message=message, - msgdict=msg_dict, - ) - msgtyperr.__cause__ = src_validation_error - return msgtyperr - - # TODO: not sure why we have to inherit here, but it seems to be an # issue with ``get_msg_transport()`` returning a ``Type[Protocol]``; # probably should make a `mypy` issue? @@ -299,10 +179,10 @@ class MsgpackTCPStream(MsgTransport): _codec._ctxvar_MsgCodec.get() ) # TODO: mask out before release? 
- log.runtime( - f'New {self} created with codec\n' - f'codec: {self._codec}\n' - ) + # log.runtime( + # f'New {self} created with codec\n' + # f'codec: {self._codec}\n' + # ) async def _iter_packets(self) -> AsyncGenerator[dict, None]: ''' -- 2.34.1 From a51632ffa67c31fa26cce004fa8d2bfad1e222c1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 22 Apr 2024 18:24:42 -0400 Subject: [PATCH 261/378] Add a `MsgDec` for receive-only decoding In prep for a "payload receiver" abstraction that will wrap `MsgType.pld`-IO delivery from `Context` and `MsgStream`, adds a small `msgspec.msgpack.Decoder` shim which delegates an API similar to `MsgCodec` and is offered via a `.msg._codec.mk_dec()` factory. Detalles: - move over the TODOs/comments from `.msg.types.Start` to to `MsgDec.spec` since it's probably the ideal spot to start thinking about it from a consumer code PoV. - move codec reversion assert and log emit into `finally:` block. - flip default `.types._tractor_codec = mk_codec_ipc_pld(ipc_pld_spec=Raw)` in prep for always doing payload-delayed decodes. - make `MsgCodec._dec` private with public property getter. - change `CancelAck` to NOT derive from `Return` so it's mutex in `match/case:` handling. --- tractor/msg/__init__.py | 2 + tractor/msg/_codec.py | 156 ++++++++++++++++++++++++++++++++++------ tractor/msg/types.py | 33 +-------- 3 files changed, 141 insertions(+), 50 deletions(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index 8f13f5f8..d968f6cf 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -34,6 +34,7 @@ from ._codec import ( apply_codec as apply_codec, mk_codec as mk_codec, MsgCodec as MsgCodec, + MsgDec as MsgDec, current_codec as current_codec, ) @@ -50,6 +51,7 @@ from .types import ( Yield as Yield, Stop as Stop, Return as Return, + CancelAck as CancelAck, Error as Error, diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 766a297a..104f7d99 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -50,7 +50,7 @@ from types import ModuleType import msgspec from msgspec import ( msgpack, - # Raw, + Raw, ) # from trio.lowlevel import ( # RunVar, @@ -71,6 +71,108 @@ if TYPE_CHECKING: log = get_logger(__name__) + +# TODO: unify with `MsgCodec` by making `._dec` part this? +class MsgDec(Struct): + ''' + An IPC msg decoder. + + Normally used to decode only a payload: `MsgType.pld: + PayloadT` field before delivery to IPC consumer code. + + ''' + _dec: msgpack.Decoder + + @property + def dec(self) -> msgpack.Decoder: + return self._dec + + # struct type unions + # https://jcristharif.com/msgspec/structs.html#tagged-unions + # + # ^-TODO-^: make a wrapper type for this such that alt + # backends can be represented easily without a `Union` needed, + # AND so that we have better support for wire transport. + # + # -[ ] maybe `FieldSpec` is a good name since msg-spec + # better applies to a `MsgType[FieldSpec]`? + # + # -[ ] both as part of the `.open_context()` call AND as part of the + # immediate ack-reponse (see similar below) + # we should do spec matching and fail if anything is awry? + # + # -[ ] eventually spec should be generated/parsed from the + # type-annots as # desired in GH issue: + # https://github.com/goodboy/tractor/issues/365 + # + # -[ ] semantics of the mismatch case + # - when caller-callee specs we should raise + # a `MsgTypeError` or `MsgSpecError` or similar? + # + # -[ ] wrapper types for both spec types such that we can easily + # IPC transport them? 
+ # - `TypeSpec: Union[Type]` + # * also a `.__contains__()` for doing `None in + # TypeSpec[None|int]` since rn you need to do it on + # `.__args__` for unions.. + # - `MsgSpec: Union[Type[Msg]] + # + # -[ ] auto-genning this from new (in 3.12) type parameter lists Bo + # |_ https://docs.python.org/3/reference/compound_stmts.html#type-params + # |_ historical pep 695: https://peps.python.org/pep-0695/ + # |_ full lang spec: https://typing.readthedocs.io/en/latest/spec/ + # |_ on annotation scopes: + # https://docs.python.org/3/reference/executionmodel.html#annotation-scopes + # |_ 3.13 will have subscriptable funcs Bo + # https://peps.python.org/pep-0718/ + @property + def spec(self) -> Union[Type[Struct]]: + # NOTE: defined and applied inside `mk_codec()` + return self._dec.type + + # no difference, as compared to a `MsgCodec` which defines the + # `MsgType.pld: PayloadT` part of its spec separately + pld_spec = spec + + # TODO: would get moved into `FieldSpec.__str__()` right? + @property + def spec_str(self) -> str: + + # TODO: could also use match: instead? + spec: Union[Type]|Type = self.spec + + # `typing.Union` case + if getattr(spec, '__args__', False): + return str(spec) + + # just a single type + else: + return spec.__name__ + + pld_spec_str = spec_str + + def decode( + self, + raw: Raw|bytes, + ) -> Any: + return self._dec.decode(raw) + + @property + def hook(self) -> Callable|None: + return self._dec.dec_hook + + +def mk_dec( + spec: Union[Type[Struct]]|Any = Any, + dec_hook: Callable|None = None, + +) -> MsgDec: + + return msgpack.Decoder( + type=spec, # like `Msg[Any]` + dec_hook=dec_hook, + ) + # TODO: overall IPC msg-spec features (i.e. in this mod)! # # -[ ] API changes towards being interchange lib agnostic! @@ -94,8 +196,7 @@ class MsgCodec(Struct): ''' _enc: msgpack.Encoder _dec: msgpack.Decoder - - pld_spec: Union[Type[Struct]]|None + _pld_spec: Type[Struct]|Raw|Any def __repr__(self) -> str: speclines: str = textwrap.indent( @@ -118,14 +219,21 @@ class MsgCodec(Struct): ')>' ) + @property + def pld_spec(self) -> Type[Struct]|Raw|Any: + return self._pld_spec + @property def pld_spec_str(self) -> str: - spec: Union[Type]|Type = self.pld_spec # TODO: could also use match: instead? + spec: Union[Type]|Type = self.pld_spec + + # `typing.Union` case if getattr(spec, '__args__', False): - # `typing.Union` case return str(spec) + + # just a single type else: return spec.__name__ @@ -133,6 +241,7 @@ class MsgCodec(Struct): # https://jcristharif.com/msgspec/structs.html#tagged-unions @property def msg_spec(self) -> Union[Type[Struct]]: + # NOTE: defined and applied inside `mk_codec()` return self._dec.type def msg_spec_items( @@ -157,8 +266,9 @@ class MsgCodec(Struct): def pformat_msg_spec( self, msg: MsgType|None = None, + join_char: str = '\n', ) -> str: - return '\n'.join( + return join_char.join( self.msg_spec_items(msg=msg).values() ) @@ -405,18 +515,25 @@ def mk_codec( assert len(ipc_msg_spec.__args__) == len(msg_types) assert ipc_msg_spec + # TODO: use this shim instead? + # bc.. unification, err somethin? 
+ # dec: MsgDec = mk_dec( + # spec=ipc_msg_spec, + # dec_hook=dec_hook, + # ) + + dec = msgpack.Decoder( + type=ipc_msg_spec, + dec_hook=dec_hook, + ) enc = msgpack.Encoder( enc_hook=enc_hook, ) - dec = msgpack.Decoder( - type=ipc_msg_spec, # like `Msg[Any]` - dec_hook=dec_hook, - ) codec = MsgCodec( _enc=enc, _dec=dec, - pld_spec=ipc_pld_spec, + _pld_spec=ipc_pld_spec, ) # sanity on expected backend support @@ -435,10 +552,9 @@ _def_msgspec_codec: MsgCodec = mk_codec(ipc_pld_spec=Any) # https://jcristharif.com/msgspec/supported-types.html # _def_tractor_codec: MsgCodec = mk_codec( - ipc_pld_spec=Any, - # TODO: use this for debug mode locking prot? - # ipc_pld_spec=Raw, + # ipc_pld_spec=Any, + ipc_pld_spec=Raw, ) # TODO: IDEALLY provides for per-`trio.Task` specificity of the # IPC msging codec used by the transport layer when doing @@ -538,12 +654,12 @@ def apply_codec( yield var.get() finally: var.reset(token) + log.info( + 'Reverted to last msg-spec codec\n\n' + f'{orig}\n' + ) + assert var.get() is orig - assert var.get() is orig - log.info( - 'Reverted to last msg-spec codec\n\n' - f'{orig}\n' - ) def current_codec() -> MsgCodec: ''' @@ -574,7 +690,7 @@ def limit_msg_spec( ''' __tracebackhide__: bool = True - curr_codec = current_codec() + curr_codec: MsgCodec = current_codec() msgspec_codec: MsgCodec = mk_codec( ipc_pld_spec=payload_spec, **codec_kwargs, diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 59ec2a4e..cb124324 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -266,35 +266,7 @@ class Start( # TODO: enforcing a msg-spec in terms `Msg.pld` # parameterizable msgs to be used in the appls IPC dialog. - # - # -[ ] both as part of the `.open_context()` call AND as part of the - # immediate ack-reponse (see similar below) - # we should do spec matching and fail if anything is awry? - # - # -[ ] eventually spec should be generated/parsed from the - # type-annots as # desired in GH issue: - # https://github.com/goodboy/tractor/issues/365 - # - # -[ ] semantics of the mismatch case - # - when caller-callee specs we should raise - # a `MsgTypeError` or `MsgSpecError` or similar? - # - # -[ ] wrapper types for both spec types such that we can easily - # IPC transport them? - # - `TypeSpec: Union[Type]` - # * also a `.__contains__()` for doing `None in - # TypeSpec[None|int]` since rn you need to do it on - # `.__args__` for unions.. 
- # - `MsgSpec: Union[Type[Msg]] - # - # -[ ] auto-genning this from new (in 3.12) type parameter lists Bo - # |_ https://docs.python.org/3/reference/compound_stmts.html#type-params - # |_ historical pep 695: https://peps.python.org/pep-0695/ - # |_ full lang spec: https://typing.readthedocs.io/en/latest/spec/ - # |_ on annotation scopes: - # https://docs.python.org/3/reference/executionmodel.html#annotation-scopes - # |_ 3.13 will have subscriptable funcs Bo - # https://peps.python.org/pep-0718/ + # => SEE `._codec.MsgDec` for more <= pld_spec: str = str(Any) @@ -382,7 +354,8 @@ class Return( class CancelAck( - Return, + Msg, + Generic[PayloadT], ): ''' Deliver the `bool` return-value from a cancellation `Actor` -- 2.34.1 From 5eb91449210420bf33598eabccfb8c86a6673436 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 23 Apr 2024 17:43:45 -0400 Subject: [PATCH 262/378] First draft "payload receiver in a new `.msg._ops` As per much tinkering, re-designs and preceding rubber-ducking via many "commit msg novelas", **finally** this adds the (hopefully) final missing layer for typed msg safety: `tractor.msg._ops.PldRx` (or `PayloadReceiver`? haven't decided how verbose to go..) Design justification summary: ------ - ------ - need a way to be as-close-as-possible to the `tractor`-application such that when `MsgType.pld: PayloadT` validation takes place, it is straightforward and obvious how user code can decide to handle any resulting `MsgTypeError`. - there should be a common and optional-yet-modular way to modify **how** data delivered via IPC (possibly embedded as user defined, type-constrained `.pld: msgspec.Struct`s) can be handled and processed during fault conditions and/or IPC "msg attacks". - support for nested type constraints within a `MsgType.pld` field should be simple to define, implement and understand at runtime. - a layer between the app-level IPC primitive APIs (`Context`/`MsgStream`) and application-task code (consumer code of those APIs) should be easily customized and prove-to-be-as-such through demonstrably rigorous internal (sub-sys) use! -> eg. via seemless runtime RPC eps support like `Actor.cancel()` -> by correctly implementing our `.devx._debug.Lock` REPL TTY mgmt dialog prot, via a dead simple payload-as-ctl-msg-spec. There are some fairly detailed doc strings included so I won't duplicate that content, the majority of the work here is actually somewhat of a factoring of many similar blocks that are doing more or less the same `msg = await Context._rx_chan.receive()` with boilerplate for `Error`/`Stop` handling via `_raise_from_no_key_in_msg()`. The new `PldRx` basically provides a shim layer for this common "receive msg, decode its payload, yield it up to the consuming app task" by pairing the RPC feeder mem-chan with a msg-payload decoder and expecting IPC API internals to use **one** API instead of re-implementing the same pattern all over the place XD `PldRx` breakdown ------ - ------ - for now only expects a `._msgdec: MsgDec` which allows for override-able `MsgType.pld` validation and most obviously used in the impl of `.dec_msg()`, the decode message method. - provides multiple mem-chan receive options including: |_ `.recv_pld()` which does the e2e operation of receiving a payload item. |_ a sync `.recv_pld_nowait()` version. |_ a `.recv_msg_w_pld()` which optionally allows retreiving both the shuttling `MsgType` as well as it's `.pld` body for use cases where info on both is important (eg. draining a `MsgStream`). 
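
Roughly the consumer-side call pattern all this is aiming at, as a sketch
only: the `consume()` wrapper, the `ctx` arg and the `int` pld-spec below are
purely illustrative, and exact kwargs/import paths may drift in follow-up
patches; method names are as per the breakdown above:

    from tractor.msg._codec import mk_dec
    from tractor.msg._ops import PldRx
    from tractor.msg.types import Started

    async def consume(ctx):  # an already-opened `Context`
        # pair a payload-decoder (here constrained to `int`) with the
        # ctx's rx mem-chan; `mk_context(pld_spec=int)` normally does
        # this allocation for you.
        pld_rx = PldRx(_msgdec=mk_dec(spec=int))

        # rx the next IPC msg and decode its `.pld`; a payload that
        # doesn't match the spec should surface as a `MsgTypeError`.
        first: int = await pld_rx.recv_pld(ctx=ctx, expect_msg=Started)

        # or grab both the shuttling msg AND its decoded payload,
        # eg. when draining a `MsgStream`.
        msg, pld = await pld_rx.recv_msg_w_pld(ipc=ctx)

For now only the IPC primitives (`Context`/`MsgStream`/`Portal`) are expected
to call these methods directly; app code keeps using the same high level APIs
as before.
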
Dirty internal changeover/implementation deatz: ------ - ------ - obvi move over all the IPC "primitives" that previously had the duplicate recv-n-yield logic: - `MsgStream.receive[_nowait]()` delegating instead to the equivalent `PldRx.recv_pld[_nowait]()`. - add `Context._pld_rx: PldRx`, created and passed in by `mk_context()`; use it for the `.started()` -> `first: Started` retrieval inside `open_context_from_portal()`. - all the relevant `Portal` invocation methods: `.result()`, `.run_from_ns()`, `.run()`; also allows for dropping `_unwrap_msg()` and `.Portal_return_once()` outright Bo - rename `Context.ctx._recv_chan` -> `._rx_chan`. - add detailed `Context._scope` info for logging whether or not it's cancelled inside `_maybe_cancel_and_set_remote_error()`. - move `._context._drain_to_final_msg()` -> `._ops.drain_to_final_msg()` since it's really not necessarily ctx specific per say, and it does kinda fit with "msg operations" more abstractly ;) --- tractor/_context.py | 438 +++++++------------------------- tractor/_portal.py | 127 +++++----- tractor/_runtime.py | 6 +- tractor/_streaming.py | 71 ++---- tractor/msg/_ops.py | 563 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 738 insertions(+), 467 deletions(-) create mode 100644 tractor/msg/_ops.py diff --git a/tractor/_context.py b/tractor/_context.py index 2230598d..abcb90e4 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -25,26 +25,31 @@ disjoint, parallel executing tasks in separate actors. ''' from __future__ import annotations from collections import deque -from contextlib import asynccontextmanager as acm -from contextvars import ContextVar +from contextlib import ( + asynccontextmanager as acm, +) from dataclasses import ( dataclass, field, ) from functools import partial import inspect -import msgspec from pprint import pformat from typing import ( Any, Callable, AsyncGenerator, + Type, TYPE_CHECKING, + Union, ) import warnings - +# ------ - ------ import trio - +from msgspec import ( + ValidationError, +) +# ------ - ------ from ._exceptions import ( ContextCancelled, InternalError, @@ -53,7 +58,6 @@ from ._exceptions import ( StreamOverrun, pack_from_raise, unpack_error, - _raise_from_no_key_in_msg, ) from .log import get_logger from .msg import ( @@ -70,8 +74,12 @@ from .msg import ( current_codec, pretty_struct, types as msgtypes, + _ops as msgops, +) +from ._ipc import ( + Channel, + _mk_msg_type_err, ) -from ._ipc import Channel from ._streaming import MsgStream from ._state import ( current_actor, @@ -86,294 +94,9 @@ if TYPE_CHECKING: CallerInfo, ) - log = get_logger(__name__) -async def _drain_to_final_msg( - ctx: Context, - - hide_tb: bool = True, - msg_limit: int = 6, - -) -> tuple[ - Return|None, - list[MsgType] -]: - ''' - Drain IPC msgs delivered to the underlying rx-mem-chan - `Context._recv_chan` from the runtime in search for a final - result or error msg. - - The motivation here is to ideally capture errors during ctxc - conditions where a canc-request/or local error is sent but the - local task also excepts and enters the - `Portal.open_context().__aexit__()` block wherein we prefer to - capture and raise any remote error or ctxc-ack as part of the - `ctx.result()` cleanup and teardown sequence. - - ''' - __tracebackhide__: bool = hide_tb - raise_overrun: bool = not ctx._allow_overruns - - # wait for a final context result by collecting (but - # basically ignoring) any bi-dir-stream msgs still in transit - # from the far end. 
- pre_result_drained: list[MsgType] = [] - return_msg: Return|None = None - while not ( - ctx.maybe_error - and not ctx._final_result_is_set() - ): - try: - # TODO: can remove? - # await trio.lowlevel.checkpoint() - - # NOTE: this REPL usage actually works here dawg! Bo - # from .devx._debug import pause - # await pause() - - # TODO: bad idea? - # -[ ] wrap final outcome channel wait in a scope so - # it can be cancelled out of band if needed? - # - # with trio.CancelScope() as res_cs: - # ctx._res_scope = res_cs - # msg: dict = await ctx._recv_chan.receive() - # if res_cs.cancelled_caught: - - # TODO: ensure there's no more hangs, debugging the - # runtime pretty preaase! - # from .devx._debug import pause - # await pause() - - # TODO: can remove this finally? - # we have no more need for the sync draining right - # since we're can kinda guarantee the async - # `.receive()` below will never block yah? - # - # if ( - # ctx._cancel_called and ( - # ctx.cancel_acked - # # or ctx.chan._cancel_called - # ) - # # or not ctx._final_result_is_set() - # # ctx.outcome is not - # # or ctx.chan._closed - # ): - # try: - # msg: dict = await ctx._recv_chan.receive_nowait()() - # except trio.WouldBlock: - # log.warning( - # 'When draining already `.cancel_called` ctx!\n' - # 'No final msg arrived..\n' - # ) - # break - # else: - # msg: dict = await ctx._recv_chan.receive() - - # TODO: don't need it right jefe? - # with trio.move_on_after(1) as cs: - # if cs.cancelled_caught: - # from .devx._debug import pause - # await pause() - - # pray to the `trio` gawds that we're corrent with this - # msg: dict = await ctx._recv_chan.receive() - msg: MsgType = await ctx._recv_chan.receive() - - # NOTE: we get here if the far end was - # `ContextCancelled` in 2 cases: - # 1. we requested the cancellation and thus - # SHOULD NOT raise that far end error, - # 2. WE DID NOT REQUEST that cancel and thus - # SHOULD RAISE HERE! - except trio.Cancelled: - - # CASE 2: mask the local cancelled-error(s) - # only when we are sure the remote error is - # the source cause of this local task's - # cancellation. - ctx.maybe_raise() - - # CASE 1: we DID request the cancel we simply - # continue to bubble up as normal. - raise - - match msg: - - # final result arrived! - case Return( - # cid=cid, - pld=res, - ): - ctx._result: Any = res - log.runtime( - 'Context delivered final draining msg:\n' - f'{pformat(msg)}' - ) - # XXX: only close the rx mem chan AFTER - # a final result is retreived. - # if ctx._recv_chan: - # await ctx._recv_chan.aclose() - # TODO: ^ we don't need it right? - return_msg = msg - break - - # far end task is still streaming to us so discard - # and report depending on local ctx state. - case Yield(): - pre_result_drained.append(msg) - if ( - (ctx._stream.closed - and (reason := 'stream was already closed') - ) - or (ctx.cancel_acked - and (reason := 'ctx cancelled other side') - ) - or (ctx._cancel_called - and (reason := 'ctx called `.cancel()`') - ) - or (len(pre_result_drained) > msg_limit - and (reason := f'"yield" limit={msg_limit}') - ) - ): - log.cancel( - 'Cancelling `MsgStream` drain since ' - f'{reason}\n\n' - f'<= {ctx.chan.uid}\n' - f' |_{ctx._nsf}()\n\n' - f'=> {ctx._task}\n' - f' |_{ctx._stream}\n\n' - - f'{pformat(msg)}\n' - ) - return ( - return_msg, - pre_result_drained, - ) - - # drain up to the `msg_limit` hoping to get - # a final result or error/ctxc. 
- else: - log.warning( - 'Ignoring "yield" msg during `ctx.result()` drain..\n' - f'<= {ctx.chan.uid}\n' - f' |_{ctx._nsf}()\n\n' - f'=> {ctx._task}\n' - f' |_{ctx._stream}\n\n' - - f'{pformat(msg)}\n' - ) - continue - - # stream terminated, but no result yet.. - # - # TODO: work out edge cases here where - # a stream is open but the task also calls - # this? - # -[ ] should be a runtime error if a stream is open right? - # Stop() - case Stop(): - pre_result_drained.append(msg) - log.cancel( - 'Remote stream terminated due to "stop" msg:\n\n' - f'{pformat(msg)}\n' - ) - continue - - # remote error msg, likely already handled inside - # `Context._deliver_msg()` - case Error(): - # TODO: can we replace this with `ctx.maybe_raise()`? - # -[ ] would this be handier for this case maybe? - # async with maybe_raise_on_exit() as raises: - # if raises: - # log.error('some msg about raising..') - # - re: Exception|None = ctx._remote_error - if re: - assert msg is ctx._cancel_msg - # NOTE: this solved a super duper edge case XD - # this was THE super duper edge case of: - # - local task opens a remote task, - # - requests remote cancellation of far end - # ctx/tasks, - # - needs to wait for the cancel ack msg - # (ctxc) or some result in the race case - # where the other side's task returns - # before the cancel request msg is ever - # rxed and processed, - # - here this surrounding drain loop (which - # iterates all ipc msgs until the ack or - # an early result arrives) was NOT exiting - # since we are the edge case: local task - # does not re-raise any ctxc it receives - # IFF **it** was the cancellation - # requester.. - # - # XXX will raise if necessary but ow break - # from loop presuming any supressed error - # (ctxc) should terminate the context! - ctx._maybe_raise_remote_err( - re, - # NOTE: obvi we don't care if we - # overran the far end if we're already - # waiting on a final result (msg). - # raise_overrun_from_self=False, - raise_overrun_from_self=raise_overrun, - ) - - break # OOOOOF, yeah obvi we need this.. - - # XXX we should never really get here - # right! since `._deliver_msg()` should - # always have detected an {'error': ..} - # msg and already called this right!?! - elif error := unpack_error( - msg=msg, - chan=ctx._portal.channel, - hide_tb=False, - ): - log.critical('SHOULD NEVER GET HERE!?') - assert msg is ctx._cancel_msg - assert error.msgdata == ctx._remote_error.msgdata - assert error.ipc_msg == ctx._remote_error.ipc_msg - from .devx._debug import pause - await pause() - ctx._maybe_cancel_and_set_remote_error(error) - ctx._maybe_raise_remote_err(error) - - else: - # bubble the original src key error - raise - - # XXX should pretty much never get here unless someone - # overrides the default `MsgType` spec. - case _: - pre_result_drained.append(msg) - # It's definitely an internal error if any other - # msg type without a`'cid'` field arrives here! - if not msg.cid: - raise InternalError( - 'Unexpected cid-missing msg?\n\n' - f'{msg}\n' - ) - - raise RuntimeError('Unknown msg type: {msg}') - - else: - log.cancel( - 'Skipping `MsgStream` drain since final outcome is set\n\n' - f'{ctx.outcome}\n' - ) - - return ( - return_msg, - pre_result_drained, - ) - - class Unresolved: ''' Placeholder value for `Context._result` until @@ -423,9 +146,12 @@ class Context: # the "feeder" channels for delivering message values to the # local task from the runtime's msg processing loop. 
- _recv_chan: trio.MemoryReceiveChannel + _rx_chan: trio.MemoryReceiveChannel _send_chan: trio.MemorySendChannel + # payload receiver + _pld_rx: msgops.PldRx + # full "namespace-path" to target RPC function _nsf: NamespacePath @@ -447,7 +173,7 @@ class Context: _task: trio.lowlevel.Task|None = None # TODO: cs around result waiting so we can cancel any - # permanently blocking `._recv_chan.receive()` call in + # permanently blocking `._rx_chan.receive()` call in # a drain loop? # _res_scope: trio.CancelScope|None = None @@ -504,14 +230,6 @@ class Context: _started_called: bool = False _stream_opened: bool = False _stream: MsgStream|None = None - _pld_codec_var: ContextVar[MsgCodec] = ContextVar( - 'pld_codec', - default=_codec._def_msgspec_codec, # i.e. `Any`-payloads - ) - - @property - def pld_codec(self) -> MsgCodec|None: - return self._pld_codec_var.get() # caller of `Portal.open_context()` for # logging purposes mostly @@ -916,9 +634,8 @@ class Context: else: log.error( f'Remote context error:\n\n' - + # f'{pformat(self)}\n' f'{error}\n' - f'{pformat(self)}\n' ) # always record the cancelling actor's uid since its @@ -955,24 +672,49 @@ class Context: and not self._is_self_cancelled() and not cs.cancel_called and not cs.cancelled_caught - and ( - msgerr - and - # NOTE: allow user to config not cancelling the - # local scope on `MsgTypeError`s - self._cancel_on_msgerr - ) ): - # TODO: it'd sure be handy to inject our own - # `trio.Cancelled` subtype here ;) - # https://github.com/goodboy/tractor/issues/368 - log.cancel('Cancelling local `.open_context()` scope!') - self._scope.cancel() + if not ( + msgerr + # NOTE: we allow user to config not cancelling the + # local scope on `MsgTypeError`s + and not self._cancel_on_msgerr + ): + # TODO: it'd sure be handy to inject our own + # `trio.Cancelled` subtype here ;) + # https://github.com/goodboy/tractor/issues/368 + message: str = 'Cancelling `Context._scope` !\n\n' + self._scope.cancel() + + else: + message: str = ( + 'NOT Cancelling `Context._scope` since,\n' + f'Context._cancel_on_msgerr = {self._cancel_on_msgerr}\n\n' + f'AND we got a msg-type-error!\n' + f'{error}\n' + ) else: - log.cancel('NOT cancelling local `.open_context()` scope!') + message: str = 'NOT cancelling `Context._scope` !\n\n' + scope_info: str = 'No `self._scope: CancelScope` was set/used ?' + if cs: + scope_info: str = ( + f'self._scope: {cs}\n' + f'|_ .cancel_called: {cs.cancel_called}\n' + f'|_ .cancelled_caught: {cs.cancelled_caught}\n' + f'|_ ._cancel_status: {cs._cancel_status}\n\n' + f'{self}\n' + f'|_ ._is_self_cancelled(): {self._is_self_cancelled()}\n' + f'|_ ._cancel_on_msgerr: {self._cancel_on_msgerr}\n\n' + + f'msgerr: {msgerr}\n' + ) + log.cancel( + message + + + f'{scope_info}' + ) # TODO: maybe we should also call `._res_scope.cancel()` if it # exists to support cancelling any drain loop hangs? @@ -1256,7 +998,7 @@ class Context: # a ``.open_stream()`` block prior or there was some other # unanticipated error or cancellation from ``trio``. - if ctx._recv_chan._closed: + if ctx._rx_chan._closed: raise trio.ClosedResourceError( 'The underlying channel for this stream was already closed!\n' ) @@ -1276,7 +1018,7 @@ class Context: # stream WAS NOT just closed normally/gracefully. async with MsgStream( ctx=self, - rx_chan=ctx._recv_chan, + rx_chan=ctx._rx_chan, ) as stream: # NOTE: we track all existing streams per portal for @@ -1427,13 +1169,12 @@ class Context: # boxed `StreamOverrun`. 
This is mostly useful for # supressing such faults during # cancellation/error/final-result handling inside - # `_drain_to_final_msg()` such that we do not + # `msg._ops.drain_to_final_msg()` such that we do not # raise such errors particularly in the case where # `._cancel_called == True`. not raise_overrun_from_self and isinstance(remote_error, RemoteActorError) - - and remote_error.boxed_type_str == 'StreamOverrun' + and remote_error.boxed_type is StreamOverrun # and tuple(remote_error.msgdata['sender']) == our_uid and tuple(remote_error.sender) == our_uid @@ -1503,12 +1244,12 @@ class Context: if self._final_result_is_set(): return self._result - assert self._recv_chan + assert self._rx_chan raise_overrun: bool = not self._allow_overruns if ( self.maybe_error is None and - not self._recv_chan._closed # type: ignore + not self._rx_chan._closed # type: ignore ): # wait for a final context result/error by "draining" # (by more or less ignoring) any bi-dir-stream "yield" @@ -1516,7 +1257,7 @@ class Context: ( return_msg, drained_msgs, - ) = await _drain_to_final_msg( + ) = await msgops.drain_to_final_msg( ctx=self, hide_tb=hide_tb, ) @@ -1802,8 +1543,7 @@ class Context: await self.chan.send(started_msg) # raise any msg type error NO MATTER WHAT! - except msgspec.ValidationError as verr: - from tractor._ipc import _mk_msg_type_err + except ValidationError as verr: raise _mk_msg_type_err( msg=msg_bytes, codec=codec, @@ -1890,7 +1630,7 @@ class Context: - NEVER `return` early before delivering the msg! bc if the error is a ctxc and there is a task waiting on `.result()` we need the msg to be - `send_chan.send_nowait()`-ed over the `._recv_chan` so + `send_chan.send_nowait()`-ed over the `._rx_chan` so that the error is relayed to that waiter task and thus raised in user code! @@ -2201,24 +1941,11 @@ async def open_context_from_portal( # -> it's expected that if there is an error in this phase of # the dialog, the `Error` msg should be raised from the `msg` # handling block below. - msg: Started = await ctx._recv_chan.receive() - try: - # the "first" value here is delivered by the callee's - # ``Context.started()`` call. - # first: Any = msg['started'] - first: Any = msg.pld - ctx._started_called: bool = True - - # except KeyError as src_error: - except AttributeError as src_error: - log.exception('Raising from unexpected msg!\n') - _raise_from_no_key_in_msg( - ctx=ctx, - msg=msg, - src_err=src_error, - log=log, - expect_msg=Started, - ) + first: Any = await ctx._pld_rx.recv_pld( + ctx=ctx, + expect_msg=Started, + ) + ctx._started_called: bool = True uid: tuple = portal.channel.uid cid: str = ctx.cid @@ -2540,7 +2267,7 @@ async def open_context_from_portal( # we tear down the runtime feeder chan last # to avoid premature stream clobbers. if ( - (rxchan := ctx._recv_chan) + (rxchan := ctx._rx_chan) # maybe TODO: yes i know the below check is # touching `trio` memchan internals..BUT, there are @@ -2583,7 +2310,7 @@ async def open_context_from_portal( # underlying feeder channel is # once-and-only-CLOSED! with trio.CancelScope(shield=True): - await ctx._recv_chan.aclose() + await ctx._rx_chan.aclose() # XXX: we always raise remote errors locally and # generally speaking mask runtime-machinery related @@ -2628,9 +2355,9 @@ async def open_context_from_portal( # FINALLY, remove the context from runtime tracking and # exit! 
log.runtime( - 'Removing IPC ctx opened with peer\n' - f'{uid}\n' - f'|_{ctx}\n' + 'De-allocating IPC ctx opened with {ctx.side!r} peer \n' + f'uid: {uid}\n' + f'cid: {ctx.cid}\n' ) portal.actor._contexts.pop( (uid, cid), @@ -2643,6 +2370,7 @@ def mk_context( nsf: NamespacePath, msg_buffer_size: int = 2**6, + pld_spec: Union[Type] = Any, **kwargs, @@ -2662,12 +2390,18 @@ def mk_context( from .devx._code import find_caller_info caller_info: CallerInfo|None = find_caller_info() + pld_rx = msgops.PldRx( + # _rx_mc=recv_chan, + _msgdec=_codec.mk_dec(spec=pld_spec) + ) + ctx = Context( chan=chan, cid=cid, _actor=current_actor(), _send_chan=send_chan, - _recv_chan=recv_chan, + _rx_chan=recv_chan, + _pld_rx=pld_rx, _nsf=nsf, _task=trio.lowlevel.current_task(), _caller_info=caller_info, diff --git a/tractor/_portal.py b/tractor/_portal.py index 052dd8ef..97268972 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -31,7 +31,7 @@ from typing import ( Any, Callable, AsyncGenerator, - # Type, + TYPE_CHECKING, ) from functools import partial from dataclasses import dataclass @@ -46,12 +46,12 @@ from ._state import ( from ._ipc import Channel from .log import get_logger from .msg import ( - Error, + # Error, NamespacePath, Return, ) from ._exceptions import ( - unpack_error, + # unpack_error, NoResult, ) from ._context import ( @@ -62,42 +62,44 @@ from ._streaming import ( MsgStream, ) +if TYPE_CHECKING: + from ._runtime import Actor log = get_logger(__name__) -# TODO: rename to `unwrap_result()` and use -# `._raise_from_no_key_in_msg()` (after tweak to -# accept a `chan: Channel` arg) in key block! -def _unwrap_msg( - msg: Return|Error, - channel: Channel, +# TODO: remove and/or rework? +# -[ ] rename to `unwrap_result()` and use +# `._raise_from_unexpected_msg()` (after tweak to accept a `chan: +# Channel` arg) in key block?? +# -[ ] pretty sure this is entirely covered by +# `_exceptions._raise_from_unexpected_msg()` so REMOVE! +# def _unwrap_msg( +# msg: Return|Error, +# ctx: Context, - hide_tb: bool = True, +# hide_tb: bool = True, -) -> Any: - ''' - Unwrap a final result from a `{return: }` IPC msg. +# ) -> Any: +# ''' +# Unwrap a final result from a `{return: }` IPC msg. - ''' - __tracebackhide__: bool = hide_tb +# ''' +# __tracebackhide__: bool = hide_tb +# try: +# return msg.pld +# except AttributeError as err: - try: - return msg.pld - # return msg['return'] - # except KeyError as ke: - except AttributeError as err: +# # internal error should never get here +# # assert msg.get('cid'), ( +# assert msg.cid, ( +# "Received internal error at portal?" +# ) - # internal error should never get here - # assert msg.get('cid'), ( - assert msg.cid, ( - "Received internal error at portal?" - ) - - raise unpack_error( - msg, - channel - ) from err +# raise unpack_error( +# msg, +# ctx.chan, +# ) from err class Portal: @@ -123,17 +125,21 @@ class Portal: # connected (peer) actors. cancel_timeout: float = 0.5 - def __init__(self, channel: Channel) -> None: + def __init__( + self, + channel: Channel, + ) -> None: + self.chan = channel # during the portal's lifetime - self._result_msg: dict|None = None + self._final_result: Any|None = None # When set to a ``Context`` (when _submit_for_result is called) # it is expected that ``result()`` will be awaited at some # point. 
- self._expect_result: Context | None = None + self._expect_result_ctx: Context|None = None self._streams: set[MsgStream] = set() - self.actor = current_actor() + self.actor: Actor = current_actor() @property def channel(self) -> Channel: @@ -147,6 +153,7 @@ class Portal: ) return self.chan + # TODO: factor this out into an `ActorNursery` wrapper async def _submit_for_result( self, ns: str, @@ -154,27 +161,18 @@ class Portal: **kwargs ) -> None: - assert self._expect_result is None, ( - "A pending main result has already been submitted" - ) + if self._expect_result_ctx is not None: + raise RuntimeError( + 'A pending main result has already been submitted' + ) - self._expect_result = await self.actor.start_remote_task( + self._expect_result_ctx = await self.actor.start_remote_task( self.channel, nsf=NamespacePath(f'{ns}:{func}'), kwargs=kwargs, portal=self, ) - async def _return_once( - self, - ctx: Context, - - ) -> Return: - - assert ctx._remote_func_type == 'asyncfunc' # single response - msg: Return = await ctx._recv_chan.receive() - return msg - async def result(self) -> Any: ''' Return the result(s) from the remote actor's "main" task. @@ -188,7 +186,7 @@ class Portal: raise exc # not expecting a "main" result - if self._expect_result is None: + if self._expect_result_ctx is None: log.warning( f"Portal for {self.channel.uid} not expecting a final" " result?\nresult() should only be called if subactor" @@ -196,17 +194,15 @@ class Portal: return NoResult # expecting a "main" result - assert self._expect_result + assert self._expect_result_ctx - if self._result_msg is None: - self._result_msg = await self._return_once( - self._expect_result + if self._final_result is None: + self._final_result: Any = await self._expect_result_ctx._pld_rx.recv_pld( + ctx=self._expect_result_ctx, + expect_msg=Return, ) - return _unwrap_msg( - self._result_msg, - self.channel, - ) + return self._final_result async def _cancel_streams(self): # terminate all locally running async generator @@ -337,11 +333,9 @@ class Portal: kwargs=kwargs, portal=self, ) - ctx._portal: Portal = self - msg: Return = await self._return_once(ctx) - return _unwrap_msg( - msg, - self.channel, + return await ctx._pld_rx.recv_pld( + ctx=ctx, + expect_msg=Return, ) async def run( @@ -391,10 +385,9 @@ class Portal: kwargs=kwargs, portal=self, ) - ctx._portal = self - return _unwrap_msg( - await self._return_once(ctx), - self.channel, + return await ctx._pld_rx.recv_pld( + ctx=ctx, + expect_msg=Return, ) @acm @@ -436,7 +429,7 @@ class Portal: # deliver receive only stream async with MsgStream( ctx=ctx, - rx_chan=ctx._recv_chan, + rx_chan=ctx._rx_chan, ) as rchan: self._streams.add(rchan) yield rchan diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 72866d43..3e4066e0 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -817,8 +817,8 @@ class Actor: state.max_buffer_size = msg_buffer_size except KeyError: - log.runtime( - f'Creating NEW IPC ctx for\n' + log.debug( + f'Allocate new IPC ctx for\n' f'peer: {chan.uid}\n' f'cid: {cid}\n' ) @@ -906,7 +906,7 @@ class Actor: # this should be immediate and does not (yet) wait for the # remote child task to sync via `Context.started()`. 
with trio.fail_after(ack_timeout): - first_msg: msgtypes.StartAck = await ctx._recv_chan.receive() + first_msg: msgtypes.StartAck = await ctx._rx_chan.receive() try: functype: str = first_msg.functype except AttributeError: diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 16e32cea..764b7c1e 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -35,7 +35,7 @@ import warnings import trio from ._exceptions import ( - _raise_from_no_key_in_msg, + # _raise_from_no_key_in_msg, ContextCancelled, ) from .log import get_logger @@ -44,8 +44,9 @@ from .trionics import ( BroadcastReceiver, ) from tractor.msg import ( - Return, - Stop, + # Return, + # Stop, + MsgType, Yield, ) @@ -94,24 +95,23 @@ class MsgStream(trio.abc.Channel): self._eoc: bool|trio.EndOfChannel = False self._closed: bool|trio.ClosedResourceError = False + # TODO: could we make this a direct method bind to `PldRx`? + # -> receive_nowait = PldRx.recv_pld + # |_ means latter would have to accept `MsgStream`-as-`self`? + # => should be fine as long as, + # -[ ] both define `._rx_chan` + # -[ ] .ctx is bound into `PldRx` using a `@cm`? + # # delegate directly to underlying mem channel def receive_nowait( self, - allow_msgs: list[str] = Yield, + expect_msg: MsgType = Yield, ): - msg: Yield|Stop = self._rx_chan.receive_nowait() - # TODO: replace msg equiv of this or does the `.pld` - # interface read already satisfy it? I think so, yes? - try: - return msg.pld - except AttributeError as attrerr: - _raise_from_no_key_in_msg( - ctx=self._ctx, - msg=msg, - src_err=attrerr, - log=log, - stream=self, - ) + ctx: Context = self._ctx + return ctx._pld_rx.recv_pld_nowait( + ctx=ctx, + expect_msg=expect_msg, + ) async def receive( self, @@ -146,24 +146,9 @@ class MsgStream(trio.abc.Channel): src_err: Exception|None = None # orig tb try: - try: - msg: Yield = await self._rx_chan.receive() - return msg.pld - # TODO: implement with match: instead? - except AttributeError as attrerr: - # src_err = kerr - src_err = attrerr - - # NOTE: may raise any of the below error types - # includg EoC when a 'stop' msg is found. - _raise_from_no_key_in_msg( - ctx=self._ctx, - msg=msg, - src_err=attrerr, - log=log, - stream=self, - ) + ctx: Context = self._ctx + return await ctx._pld_rx.recv_pld(ctx=ctx) # XXX: the stream terminates on either of: # - via `self._rx_chan.receive()` raising after manual closure @@ -228,7 +213,7 @@ class MsgStream(trio.abc.Channel): # probably want to instead raise the remote error # over the end-of-stream connection error since likely # the remote error was the source cause? - ctx: Context = self._ctx + # ctx: Context = self._ctx ctx.maybe_raise( raise_ctxc_from_self_call=True, ) @@ -292,7 +277,8 @@ class MsgStream(trio.abc.Channel): while not drained: try: maybe_final_msg = self.receive_nowait( - allow_msgs=[Yield, Return], + # allow_msgs=[Yield, Return], + expect_msg=Yield, ) if maybe_final_msg: log.debug( @@ -472,6 +458,9 @@ class MsgStream(trio.abc.Channel): self, # use memory channel size by default self._rx_chan._state.max_buffer_size, # type: ignore + + # TODO: can remove this kwarg right since + # by default behaviour is to do this anyway? 
receive_afunc=self.receive, ) @@ -517,19 +506,11 @@ class MsgStream(trio.abc.Channel): raise self._closed try: - # await self._ctx.chan.send( - # payload={ - # 'yield': data, - # 'cid': self._ctx.cid, - # }, - # # hide_tb=hide_tb, - # ) await self._ctx.chan.send( payload=Yield( cid=self._ctx.cid, pld=data, ), - # hide_tb=hide_tb, ) except ( trio.ClosedResourceError, @@ -562,7 +543,7 @@ def stream(func: Callable) -> Callable: ''' # TODO: apply whatever solution ``mypy`` ends up picking for this: # https://github.com/python/mypy/issues/2087#issuecomment-769266912 - func._tractor_stream_function = True # type: ignore + func._tractor_stream_function: bool = True # type: ignore sig = inspect.signature(func) params = sig.parameters diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py new file mode 100644 index 00000000..e78b79a4 --- /dev/null +++ b/tractor/msg/_ops.py @@ -0,0 +1,563 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Near-application abstractions for `MsgType.pld: PayloadT|Raw` +delivery, filtering and type checking as well as generic +operational helpers for processing transaction flows. + +''' +from __future__ import annotations +from contextlib import ( + # asynccontextmanager as acm, + contextmanager as cm, +) +from pprint import pformat +from typing import ( + Any, + Type, + TYPE_CHECKING, + # Union, +) +# ------ - ------ +from msgspec import ( + msgpack, + Raw, + Struct, + ValidationError, +) +import trio +# ------ - ------ +from tractor.log import get_logger +from tractor._exceptions import ( + MessagingError, + InternalError, + _raise_from_unexpected_msg, + MsgTypeError, + _mk_msg_type_err, + pack_from_raise, +) +from ._codec import ( + mk_dec, + MsgDec, +) +from .types import ( + CancelAck, + Error, + MsgType, + PayloadT, + Return, + Started, + Stop, + Yield, + # pretty_struct, +) + + +if TYPE_CHECKING: + from tractor._context import Context + from tractor._streaming import MsgStream + + +log = get_logger(__name__) + + +class PldRx(Struct): + ''' + A "msg payload receiver". + + The pairing of a "feeder" `trio.abc.ReceiveChannel` and an + interchange-specific (eg. msgpack) payload field decoder. The + validation/type-filtering rules are runtime mutable and allow + type constraining the set of `MsgType.pld: Raw|PayloadT` + values at runtime, per IPC task-context. + + This abstraction, being just below "user application code", + allows for the equivalent of our `MsgCodec` (used for + typer-filtering IPC dialog protocol msgs against a msg-spec) + but with granular control around payload delivery (i.e. the + data-values user code actually sees and uses (the blobs that + are "shuttled" by the wrapping dialog prot) such that invalid + `.pld: Raw` can be decoded and handled by IPC-primitive user + code (i.e. 
that operates on `Context` and `Msgstream` APIs) + without knowledge of the lower level `Channel`/`MsgTransport` + primitives nor the `MsgCodec` in use. Further, lazily decoding + payload blobs allows for topical (and maybe intentionally + "partial") encryption of msg field subsets. + + ''' + # TODO: better to bind it here? + # _rx_mc: trio.MemoryReceiveChannel + _msgdec: MsgDec = mk_dec(spec=Any) + + _ipc: Context|MsgStream|None = None + + @cm + def apply_to_ipc( + self, + ipc_prim: Context|MsgStream, + + ) -> PldRx: + ''' + Apply this payload receiver to an IPC primitive type, one + of `Context` or `MsgStream`. + + ''' + self._ipc = ipc_prim + try: + yield self + finally: + self._ipc = None + + @property + def dec(self) -> msgpack.Decoder: + return self._msgdec.dec + + def recv_pld_nowait( + self, + # TODO: make this `MsgStream` compat as well, see above^ + # ipc_prim: Context|MsgStream, + ctx: Context, + + ipc_msg: MsgType|None = None, + expect_msg: Type[MsgType]|None = None, + + **kwargs, + + ) -> Any|Raw: + + msg: MsgType = ( + ipc_msg + or + + # sync-rx msg from underlying IPC feeder (mem-)chan + ctx._rx_chan.receive_nowait() + ) + return self.dec_msg( + msg, + ctx=ctx, + expect_msg=expect_msg, + ) + + async def recv_pld( + self, + ctx: Context, + ipc_msg: MsgType|None = None, + expect_msg: Type[MsgType]|None = None, + + **kwargs + + ) -> Any|Raw: + ''' + Receive a `MsgType`, then decode and return its `.pld` field. + + ''' + msg: MsgType = ( + ipc_msg + or + + # async-rx msg from underlying IPC feeder (mem-)chan + await ctx._rx_chan.receive() + ) + return self.dec_msg( + msg, + ctx=ctx, + expect_msg=expect_msg, + ) + + def dec_msg( + self, + msg: MsgType, + ctx: Context, + expect_msg: Type[MsgType]|None = None, + + ) -> PayloadT|Raw: + ''' + Decode a msg's payload field: `MsgType.pld: PayloadT|Raw` and + return the value or raise an appropriate error. + + ''' + match msg: + # payload-data shuttle msg; deliver the `.pld` value + # directly to IPC (primitive) client-consumer code. + case ( + Started(pld=pld) # sync phase + |Yield(pld=pld) # streaming phase + |Return(pld=pld) # termination phase + ): + try: + pld: PayloadT = self._msgdec.decode(pld) + log.runtime( + 'Decode msg payload\n\n' + f'{msg}\n\n' + f'{pld}\n' + ) + return pld + + # XXX pld-type failure + except ValidationError as src_err: + msgterr: MsgTypeError = _mk_msg_type_err( + msg=msg, + codec=self._dec, + src_validation_error=src_err, + ) + msg: Error = pack_from_raise( + local_err=msgterr, + cid=msg.cid, + src_uid=ctx.chan.uid, + ) + + # XXX some other decoder specific failure? + # except TypeError as src_error: + # from .devx import mk_pdb + # mk_pdb().set_trace() + # raise src_error + + # a runtime-internal RPC endpoint response. + # always passthrough since (internal) runtime + # responses are generally never exposed to consumer + # code. + case CancelAck( + pld=bool(cancelled) + ): + return cancelled + + case Error(): + src_err = MessagingError( + 'IPC dialog termination by msg' + ) + + case _: + src_err = InternalError( + 'Unknown IPC msg ??\n\n' + f'{msg}\n' + ) + + # fallthrough and raise from `src_err` + _raise_from_unexpected_msg( + ctx=ctx, + msg=msg, + src_err=src_err, + log=log, + expect_msg=expect_msg, + hide_tb=False, + ) + + async def recv_msg_w_pld( + self, + ipc: Context|MsgStream, + + ) -> tuple[MsgType, PayloadT]: + ''' + Retrieve the next avail IPC msg, decode it's payload, and return + the pair of refs. 
+ + ''' + msg: MsgType = await ipc._rx_chan.receive() + + # TODO: is there some way we can inject the decoded + # payload into an existing output buffer for the original + # msg instance? + pld: PayloadT = self.dec_msg( + msg, + ctx=ipc, + ) + return msg, pld + + +async def drain_to_final_msg( + ctx: Context, + + hide_tb: bool = True, + msg_limit: int = 6, + +) -> tuple[ + Return|None, + list[MsgType] +]: + ''' + Drain IPC msgs delivered to the underlying IPC primitive's + rx-mem-chan (eg. `Context._rx_chan`) from the runtime in + search for a final result or error. + + The motivation here is to ideally capture errors during ctxc + conditions where a canc-request/or local error is sent but the + local task also excepts and enters the + `Portal.open_context().__aexit__()` block wherein we prefer to + capture and raise any remote error or ctxc-ack as part of the + `ctx.result()` cleanup and teardown sequence. + + ''' + __tracebackhide__: bool = hide_tb + raise_overrun: bool = not ctx._allow_overruns + + # wait for a final context result by collecting (but + # basically ignoring) any bi-dir-stream msgs still in transit + # from the far end. + pre_result_drained: list[MsgType] = [] + return_msg: Return|None = None + while not ( + ctx.maybe_error + and not ctx._final_result_is_set() + ): + try: + # TODO: can remove? + # await trio.lowlevel.checkpoint() + + # NOTE: this REPL usage actually works here dawg! Bo + # from .devx._debug import pause + # await pause() + + # TODO: bad idea? + # -[ ] wrap final outcome channel wait in a scope so + # it can be cancelled out of band if needed? + # + # with trio.CancelScope() as res_cs: + # ctx._res_scope = res_cs + # msg: dict = await ctx._rx_chan.receive() + # if res_cs.cancelled_caught: + + # TODO: ensure there's no more hangs, debugging the + # runtime pretty preaase! + # from .devx._debug import pause + # await pause() + + # TODO: can remove this finally? + # we have no more need for the sync draining right + # since we're can kinda guarantee the async + # `.receive()` below will never block yah? + # + # if ( + # ctx._cancel_called and ( + # ctx.cancel_acked + # # or ctx.chan._cancel_called + # ) + # # or not ctx._final_result_is_set() + # # ctx.outcome is not + # # or ctx.chan._closed + # ): + # try: + # msg: dict = await ctx._rx_chan.receive_nowait()() + # except trio.WouldBlock: + # log.warning( + # 'When draining already `.cancel_called` ctx!\n' + # 'No final msg arrived..\n' + # ) + # break + # else: + # msg: dict = await ctx._rx_chan.receive() + + # TODO: don't need it right jefe? + # with trio.move_on_after(1) as cs: + # if cs.cancelled_caught: + # from .devx._debug import pause + # await pause() + + # pray to the `trio` gawds that we're corrent with this + # msg: dict = await ctx._rx_chan.receive() + msg, pld = await ctx._pld_rx.recv_msg_w_pld(ipc=ctx) + + # NOTE: we get here if the far end was + # `ContextCancelled` in 2 cases: + # 1. we requested the cancellation and thus + # SHOULD NOT raise that far end error, + # 2. WE DID NOT REQUEST that cancel and thus + # SHOULD RAISE HERE! + except trio.Cancelled: + + # CASE 2: mask the local cancelled-error(s) + # only when we are sure the remote error is + # the source cause of this local task's + # cancellation. + ctx.maybe_raise() + + # CASE 1: we DID request the cancel we simply + # continue to bubble up as normal. + raise + + match msg: + + # final result arrived! 
+ case Return( + # cid=cid, + # pld=res, + ): + # ctx._result: Any = res + ctx._result: Any = pld + log.runtime( + 'Context delivered final draining msg:\n' + f'{pformat(msg)}' + ) + # XXX: only close the rx mem chan AFTER + # a final result is retreived. + # if ctx._rx_chan: + # await ctx._rx_chan.aclose() + # TODO: ^ we don't need it right? + return_msg = msg + break + + # far end task is still streaming to us so discard + # and report depending on local ctx state. + case Yield(): + pre_result_drained.append(msg) + if ( + (ctx._stream.closed + and (reason := 'stream was already closed') + ) + or (ctx.cancel_acked + and (reason := 'ctx cancelled other side') + ) + or (ctx._cancel_called + and (reason := 'ctx called `.cancel()`') + ) + or (len(pre_result_drained) > msg_limit + and (reason := f'"yield" limit={msg_limit}') + ) + ): + log.cancel( + 'Cancelling `MsgStream` drain since ' + f'{reason}\n\n' + f'<= {ctx.chan.uid}\n' + f' |_{ctx._nsf}()\n\n' + f'=> {ctx._task}\n' + f' |_{ctx._stream}\n\n' + + f'{pformat(msg)}\n' + ) + return ( + return_msg, + pre_result_drained, + ) + + # drain up to the `msg_limit` hoping to get + # a final result or error/ctxc. + else: + log.warning( + 'Ignoring "yield" msg during `ctx.result()` drain..\n' + f'<= {ctx.chan.uid}\n' + f' |_{ctx._nsf}()\n\n' + f'=> {ctx._task}\n' + f' |_{ctx._stream}\n\n' + + f'{pformat(msg)}\n' + ) + continue + + # stream terminated, but no result yet.. + # + # TODO: work out edge cases here where + # a stream is open but the task also calls + # this? + # -[ ] should be a runtime error if a stream is open right? + # Stop() + case Stop(): + pre_result_drained.append(msg) + log.cancel( + 'Remote stream terminated due to "stop" msg:\n\n' + f'{pformat(msg)}\n' + ) + continue + + # remote error msg, likely already handled inside + # `Context._deliver_msg()` + case Error(): + # TODO: can we replace this with `ctx.maybe_raise()`? + # -[ ] would this be handier for this case maybe? + # async with maybe_raise_on_exit() as raises: + # if raises: + # log.error('some msg about raising..') + # + re: Exception|None = ctx._remote_error + if re: + assert msg is ctx._cancel_msg + # NOTE: this solved a super duper edge case XD + # this was THE super duper edge case of: + # - local task opens a remote task, + # - requests remote cancellation of far end + # ctx/tasks, + # - needs to wait for the cancel ack msg + # (ctxc) or some result in the race case + # where the other side's task returns + # before the cancel request msg is ever + # rxed and processed, + # - here this surrounding drain loop (which + # iterates all ipc msgs until the ack or + # an early result arrives) was NOT exiting + # since we are the edge case: local task + # does not re-raise any ctxc it receives + # IFF **it** was the cancellation + # requester.. + # + # XXX will raise if necessary but ow break + # from loop presuming any supressed error + # (ctxc) should terminate the context! + ctx._maybe_raise_remote_err( + re, + # NOTE: obvi we don't care if we + # overran the far end if we're already + # waiting on a final result (msg). + # raise_overrun_from_self=False, + raise_overrun_from_self=raise_overrun, + ) + + break # OOOOOF, yeah obvi we need this.. + + # XXX we should never really get here + # right! since `._deliver_msg()` should + # always have detected an {'error': ..} + # msg and already called this right!?! 
+ # elif error := unpack_error( + # msg=msg, + # chan=ctx._portal.channel, + # hide_tb=False, + # ): + # log.critical('SHOULD NEVER GET HERE!?') + # assert msg is ctx._cancel_msg + # assert error.msgdata == ctx._remote_error.msgdata + # assert error.ipc_msg == ctx._remote_error.ipc_msg + # from .devx._debug import pause + # await pause() + # ctx._maybe_cancel_and_set_remote_error(error) + # ctx._maybe_raise_remote_err(error) + + else: + # bubble the original src key error + raise + + # XXX should pretty much never get here unless someone + # overrides the default `MsgType` spec. + case _: + pre_result_drained.append(msg) + # It's definitely an internal error if any other + # msg type without a`'cid'` field arrives here! + if not msg.cid: + raise InternalError( + 'Unexpected cid-missing msg?\n\n' + f'{msg}\n' + ) + + raise RuntimeError('Unknown msg type: {msg}') + + else: + log.cancel( + 'Skipping `MsgStream` drain since final outcome is set\n\n' + f'{ctx.outcome}\n' + ) + + return ( + return_msg, + pre_result_drained, + ) -- 2.34.1 From 18e97a8f9ae88000bfaa29b713c7b282006d363f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 24 Apr 2024 12:31:05 -0400 Subject: [PATCH 263/378] Use `Context._stream` in `_raise_from_unexpected_msg()` Instead of expecting it to be passed in (as it was prior), when determining if a `Stop` msg is a valid end-of-channel signal use the `ctx._stream: MsgStream|None` attr which **must** be set by any stream opening API; either of: - `Context.open_stream()` - `Portal.open_stream_from()` Adjust the case block logic to match with fallthrough from any EoC to a closed error if necessary. Change the `_type: str` to match the failing IPC-prim name in the tail case we raise a `MessagingError`. Other: - move `.sender: tuple` uid attr up to `RemoteActorError` since `Error` optionally defines it as a field and for boxed `StreamOverrun`s (an ignore case we check for in the runtime during cancellation) we want it readable from the boxing rae. - drop still unused `InternalActorError`. --- tractor/_exceptions.py | 107 +++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 58 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index b2ba6e84..8d9274fe 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -532,7 +532,8 @@ class RemoteActorError(Exception): self, ) -> BaseException: ''' - Unpack the inner-most source error from it's original IPC msg data. + Unpack the inner-most source error from it's original IPC + msg data. We attempt to reconstruct (as best as we can) the original `Exception` from as it would have been raised in the @@ -570,6 +571,14 @@ class RemoteActorError(Exception): # # boxed_type=get_type_ref(.. # raise NotImplementedError + @property + def sender(self) -> tuple[str, str]|None: + if ( + (msg := self._ipc_msg) + and (value := msg.sender) + ): + return tuple(value) + class ContextCancelled(RemoteActorError): ''' @@ -734,20 +743,6 @@ class StreamOverrun( handled by app code using `MsgStream.send()/.receive()`. ''' - @property - def sender(self) -> tuple[str, str] | None: - value = self._ipc_msg.sender - if value: - return tuple(value) - - -# class InternalActorError(RemoteActorError): -# ''' -# Boxed (Remote) internal `tractor` error indicating failure of some -# primitive, machinery state or lowlevel task that should never -# occur. 
- -# ''' class TransportClosed(trio.ClosedResourceError): @@ -945,7 +940,6 @@ def _raise_from_unexpected_msg( log: StackLevelAdapter, # caller specific `log` obj expect_msg: str = Yield, - stream: MsgStream | None = None, # allow "deeper" tbs when debugging B^o hide_tb: bool = True, @@ -987,6 +981,8 @@ def _raise_from_unexpected_msg( ) from src_err # TODO: test that shows stream raising an expected error!!! + stream: MsgStream|None + _type: str = 'Context' # raise the error message in a boxed exception type! if isinstance(msg, Error): @@ -1003,55 +999,50 @@ def _raise_from_unexpected_msg( # TODO: does it make more sense to pack # the stream._eoc outside this in the calleer always? # case Stop(): - elif ( - isinstance(msg, Stop) - or ( - stream - and stream._eoc - ) - ): - log.debug( - f'Context[{cid}] stream was stopped by remote side\n' - f'cid: {cid}\n' - ) + elif stream := ctx._stream: + _type: str = 'MsgStream' - # TODO: if the a local task is already blocking on - # a `Context.result()` and thus a `.receive()` on the - # rx-chan, we close the chan and set state ensuring that - # an eoc is raised! + if ( + stream._eoc + or + isinstance(msg, Stop) + ): + log.debug( + f'Context[{cid}] stream was stopped by remote side\n' + f'cid: {cid}\n' + ) - # XXX: this causes ``ReceiveChannel.__anext__()`` to - # raise a ``StopAsyncIteration`` **and** in our catch - # block below it will trigger ``.aclose()``. - eoc = trio.EndOfChannel( - f'Context stream ended due to msg:\n\n' - f'{pformat(msg)}\n' - ) - # XXX: important to set so that a new `.receive()` - # call (likely by another task using a broadcast receiver) - # doesn't accidentally pull the `return` message - # value out of the underlying feed mem chan which is - # destined for the `Context.result()` call during ctx-exit! - stream._eoc: Exception = eoc + # TODO: if the a local task is already blocking on + # a `Context.result()` and thus a `.receive()` on the + # rx-chan, we close the chan and set state ensuring that + # an eoc is raised! - # in case there already is some underlying remote error - # that arrived which is probably the source of this stream - # closure - ctx.maybe_raise() - raise eoc from src_err + # XXX: this causes ``ReceiveChannel.__anext__()`` to + # raise a ``StopAsyncIteration`` **and** in our catch + # block below it will trigger ``.aclose()``. + eoc = trio.EndOfChannel( + f'Context stream ended due to msg:\n\n' + f'{pformat(msg)}\n' + ) + # XXX: important to set so that a new `.receive()` + # call (likely by another task using a broadcast receiver) + # doesn't accidentally pull the `return` message + # value out of the underlying feed mem chan which is + # destined for the `Context.result()` call during ctx-exit! + stream._eoc: Exception = eoc - if ( - stream - and stream._closed - ): - # TODO: our own error subtype? - raise trio.ClosedResourceError( - 'This stream was closed' - ) + # in case there already is some underlying remote error + # that arrived which is probably the source of this stream + # closure + ctx.maybe_raise() + raise eoc from src_err + + if stream._closed: + # TODO: our own error subtype? + raise trio.ClosedResourceError('This stream was closed') # always re-raise the source error if no translation error case # is activated above. 
- _type: str = 'Stream' if stream else 'Context' raise MessagingError( f"{_type} was expecting a {expect_msg} message" " BUT received a non-error msg:\n" -- 2.34.1 From 6aa52417efecccdae5bb46ee78d88d172d70cd7e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 24 Apr 2024 12:43:08 -0400 Subject: [PATCH 264/378] Set `Context._stream` in `Portal.open_stream_from()`.. --- tractor/_portal.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index 97268972..f3928657 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -420,7 +420,6 @@ class Portal: kwargs=kwargs, portal=self, ) - ctx._portal = self # ensure receive-only stream entrypoint assert ctx._remote_func_type == 'asyncgen' @@ -430,9 +429,10 @@ class Portal: async with MsgStream( ctx=ctx, rx_chan=ctx._rx_chan, - ) as rchan: - self._streams.add(rchan) - yield rchan + ) as stream: + self._streams.add(stream) + ctx._stream = stream + yield stream finally: @@ -454,7 +454,7 @@ class Portal: # XXX: should this always be done? # await recv_chan.aclose() - self._streams.remove(rchan) + self._streams.remove(stream) # NOTE: impl is found in `._context`` mod to make # reading/groking the details simpler code-org-wise. This -- 2.34.1 From 6b30c86eca20f01ac0b33787cd3caa04f08dd136 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 24 Apr 2024 13:07:05 -0400 Subject: [PATCH 265/378] Try out `msgspec` encode-buffer optimization As per the reco: https://jcristharif.com/msgspec/perf-tips.html#reusing-an-output-buffe BUT, seems to cause this error in `pikerd`.. `BufferError: Existing exports of data: object cannot be re-sized` Soo no idea? Maybe there's a tweak needed that we can glean from tests/examples in the `msgspec` repo? Disabling for now. --- tractor/msg/__init__.py | 5 +++++ tractor/msg/_codec.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index d968f6cf..13739cdb 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -37,6 +37,11 @@ from ._codec import ( MsgDec as MsgDec, current_codec as current_codec, ) +# currently can't bc circular with `._context` +# from ._ops import ( +# PldRx as PldRx, +# _drain_to_final_msg as _drain_to_final_msg, +# ) from .types import ( Msg as Msg, diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 104f7d99..e3540c3d 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -280,17 +280,32 @@ class MsgCodec(Struct): def enc(self) -> msgpack.Encoder: return self._enc + # TODO: reusing encode buffer for perf? + # https://jcristharif.com/msgspec/perf-tips.html#reusing-an-output-buffer + _buf: bytearray = bytearray() + def encode( self, py_obj: Any, + use_buf: bool = False, + # ^-XXX-^ uhh why am i getting this? + # |_BufferError: Existing exports of data: object cannot be re-sized + ) -> bytes: ''' Encode input python objects to `msgpack` bytes for transfer on a tranport protocol connection. 
+ When `use_buf == True` use the output buffer optimization: + https://jcristharif.com/msgspec/perf-tips.html#reusing-an-output-buffer + ''' - return self._enc.encode(py_obj) + if use_buf: + self._enc.encode_into(py_obj, self._buf) + return self._buf + else: + return self._enc.encode(py_obj) @property def dec(self) -> msgpack.Decoder: -- 2.34.1 From 188ff0e0e50a6beea81f0a2b318412a754c13a99 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 25 Apr 2024 12:33:10 -0400 Subject: [PATCH 266/378] Another `._rpc` mod passthrough - tweaking logging to include more `MsgType` dumps on IPC faults. - removing some commented cruft. - comment formatting / cleanups / add-ons. - more type annots. - fill out some TODO content. --- tractor/_rpc.py | 163 ++++++++++++++++++++++++------------------------ 1 file changed, 80 insertions(+), 83 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 576e988b..de975a90 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -181,12 +181,11 @@ async def _invoke_non_context( # way: using the linked IPC context machinery. failed_resp: bool = False try: - await chan.send( - StartAck( - cid=cid, - functype='asyncfunc', - ) + ack = StartAck( + cid=cid, + functype='asyncfunc', ) + await chan.send(ack) except ( trio.ClosedResourceError, trio.BrokenResourceError, @@ -194,12 +193,11 @@ async def _invoke_non_context( ) as ipc_err: failed_resp = True if is_rpc: - raise + raise ipc_err else: - # TODO: should this be an `.exception()` call? - log.warning( - f'Failed to respond to non-rpc request: {func}\n' - f'{ipc_err}' + log.exception( + f'Failed to respond to runtime RPC request for\n\n' + f'{ack}\n' ) with cancel_scope as cs: @@ -220,20 +218,19 @@ async def _invoke_non_context( and chan.connected() ): try: - await chan.send( - return_msg( - cid=cid, - pld=result, - ) + ret_msg = return_msg( + cid=cid, + pld=result, ) + await chan.send(ret_msg) except ( BrokenPipeError, trio.BrokenResourceError, ): log.warning( - 'Failed to return result:\n' - f'{func}@{actor.uid}\n' - f'remote chan: {chan.uid}' + 'Failed to send RPC result?\n' + f'|_{func}@{actor.uid}() -> {ret_msg}\n\n' + f'x=> peer: {chan.uid}\n' ) @acm @@ -250,7 +247,7 @@ async def _errors_relayed_via_ipc( ] = trio.TASK_STATUS_IGNORED, ) -> None: - __tracebackhide__: bool = hide_tb # TODO: use hide_tb here? + __tracebackhide__: bool = hide_tb try: yield # run RPC invoke body @@ -262,23 +259,19 @@ async def _errors_relayed_via_ipc( KeyboardInterrupt, ) as err: - # always hide this frame from debug REPL if the crash - # originated from an rpc task and we DID NOT fail due to - # an IPC transport error! + # NOTE: always hide this frame from debug REPL call stack + # if the crash originated from an RPC task and we DID NOT + # fail due to an IPC transport error! if ( is_rpc - and chan.connected() + and + chan.connected() ): __tracebackhide__: bool = hide_tb + # TODO: maybe we'll want different "levels" of debugging + # eventualy such as ('app', 'supervisory', 'runtime') ? if not is_multi_cancelled(err): - - # TODO: maybe we'll want different "levels" of debugging - # eventualy such as ('app', 'supervisory', 'runtime') ? - - # if not isinstance(err, trio.ClosedResourceError) and ( - # if not is_multi_cancelled(err) and ( - entered_debug: bool = False if ( ( @@ -310,19 +303,18 @@ async def _errors_relayed_via_ipc( # strange bug in our transport layer itself? Going # to keep this open ended for now. 
entered_debug = await _debug._maybe_enter_pm(err) - if not entered_debug: log.exception( 'RPC task crashed\n' f'|_{ctx}' ) - # always (try to) ship RPC errors back to caller + # ALWAYS try to ship RPC errors back to parent/caller task if is_rpc: - # + # TODO: tests for this scenario: # - RPC caller closes connection before getting a response - # should **not** crash this actor.. + # should **not** crash this actor.. await try_ship_error_to_remote( chan, err, @@ -331,33 +323,41 @@ async def _errors_relayed_via_ipc( hide_tb=hide_tb, ) - # error is probably from above coro running code *not from - # the target rpc invocation since a scope was never - # allocated around the coroutine await. + # if the ctx cs is NOT allocated, the error is likely from + # above `coro` invocation machinery NOT from inside the + # `coro` itself, i.e. err is NOT a user application error. if ctx._scope is None: # we don't ever raise directly here to allow the # msg-loop-scheduler to continue running for this # channel. task_status.started(err) - # always reraise KBIs so they propagate at the sys-process - # level. + # always reraise KBIs so they propagate at the sys-process level. if isinstance(err, KeyboardInterrupt): raise - - # RPC task bookeeping + # RPC task bookeeping. + # since RPC tasks are scheduled inside a flat + # `Actor._service_n`, we add "handles" to each such that + # they can be individually ccancelled. finally: try: - ctx, func, is_complete = actor._rpc_tasks.pop( + ctx: Context + func: Callable + is_complete: trio.Event + ( + ctx, + func, + is_complete, + ) = actor._rpc_tasks.pop( (chan, ctx.cid) ) is_complete.set() except KeyError: + # If we're cancelled before the task returns then the + # cancel scope will not have been inserted yet if is_rpc: - # If we're cancelled before the task returns then the - # cancel scope will not have been inserted yet log.warning( 'RPC task likely errored or cancelled before start?' f'|_{ctx._task}\n' @@ -372,7 +372,7 @@ async def _errors_relayed_via_ipc( finally: if not actor._rpc_tasks: - log.runtime("All RPC tasks have completed") + log.runtime('All RPC tasks have completed') actor._ongoing_rpc_tasks.set() @@ -414,19 +414,16 @@ async def _invoke( # TODO: possibly a specially formatted traceback # (not sure what typing is for this..)? - # tb = None + # tb: TracebackType = None cancel_scope = CancelScope() - # activated cancel scope ref - cs: CancelScope|None = None - + cs: CancelScope|None = None # ref when activated ctx = actor.get_context( chan=chan, cid=cid, nsf=NamespacePath.from_ref(func), - # TODO: if we wanted to get cray and support it? - # side='callee', + # NOTE: no portal passed bc this is the "child"-side # We shouldn't ever need to pass this through right? # it's up to the soon-to-be called rpc task to @@ -459,8 +456,8 @@ async def _invoke( kwargs['stream'] = ctx + # handle decorated ``@tractor.context`` async function elif getattr(func, '_tractor_context_function', False): - # handle decorated ``@tractor.context`` async function kwargs['ctx'] = ctx context = True @@ -474,7 +471,8 @@ async def _invoke( task_status=task_status, ): if not ( - inspect.isasyncgenfunction(func) or + inspect.isasyncgenfunction(func) + or inspect.iscoroutinefunction(func) ): raise TypeError(f'{func} must be an async function!') @@ -486,8 +484,7 @@ async def _invoke( except TypeError: raise - # TODO: implement all these cases in terms of the - # `Context` one! + # TODO: impl all these cases in terms of the `Context` one! 
if not context: await _invoke_non_context( actor, @@ -503,7 +500,7 @@ async def _invoke( return_msg, task_status, ) - # below is only for `@context` funcs + # XXX below fallthrough is ONLY for `@context` eps return # our most general case: a remote SC-transitive, @@ -580,9 +577,6 @@ async def _invoke( # itself calls `ctx._maybe_cancel_and_set_remote_error()` # which cancels the scope presuming the input error # is not a `.cancel_acked` pleaser. - # - currently a never-should-happen-fallthrough case - # inside ._context._drain_to_final_msg()`.. - # # TODO: remove this ^ right? if ctx._scope.cancelled_caught: our_uid: tuple = actor.uid @@ -598,9 +592,7 @@ async def _invoke( if cs.cancel_called: canceller: tuple = ctx.canceller - msg: str = ( - 'actor was cancelled by ' - ) + msg: str = 'actor was cancelled by ' # NOTE / TODO: if we end up having # ``Actor._cancel_task()`` call @@ -623,6 +615,8 @@ async def _invoke( else: msg += 'a remote peer' + # TODO: move this "div centering" into + # a helper for use elsewhere! div_chars: str = '------ - ------' div_offset: int = ( round(len(msg)/2)+1 @@ -702,11 +696,9 @@ async def _invoke( ctx: Context = actor._contexts.pop(( chan.uid, cid, - # ctx.side, )) merr: Exception|None = ctx.maybe_error - ( res_type_str, res_str, @@ -720,7 +712,7 @@ async def _invoke( ) log.runtime( f'IPC context terminated with a final {res_type_str}\n\n' - f'{ctx}\n' + f'{ctx}' ) @@ -806,13 +798,19 @@ async def process_messages( and `Actor.cancel()` process-wide-runtime-shutdown requests (as utilized inside `Portal.cancel_actor()` ). - ''' assert actor._service_n # state sanity # TODO: once `trio` get's an "obvious way" for req/resp we # should use it? - # https://github.com/python-trio/trio/issues/467 + # -[ ] existing GH https://github.com/python-trio/trio/issues/467 + # -[ ] for other transports (like QUIC) we can possibly just + # entirely avoid the feeder mem-chans since each msg will be + # delivered with a ctx-id already? + # + # |_ for ex, from `aioquic` which exposed "stream ids": + # - https://github.com/aiortc/aioquic/blob/main/src/aioquic/quic/connection.py#L1175 + # - https://github.com/aiortc/aioquic/blob/main/src/aioquic/quic/connection.py#L659 log.runtime( 'Entering RPC msg loop:\n' f'peer: {chan.uid}\n' @@ -850,7 +848,7 @@ async def process_messages( | Return(cid=cid) | CancelAck(cid=cid) - # `.cid` means RPC-ctx-task specific + # `.cid` indicates RPC-ctx-task scoped | Error(cid=cid) # recv-side `MsgType` decode violation @@ -1046,16 +1044,16 @@ async def process_messages( trio.Event(), ) - # runtime-scoped remote error (since no `.cid`) + # runtime-scoped remote (internal) error + # (^- bc no `Error.cid` -^) + # + # NOTE: this is the non-rpc error case, that + # is, an error NOT raised inside a call to + # `_invoke()` (i.e. no cid was provided in the + # msg - see above). Raise error inline and + # mark the channel as "globally errored" for + # all downstream consuming primitives. case Error(): - # NOTE: this is the non-rpc error case, - # that is, an error **not** raised inside - # a call to ``_invoke()`` (i.e. no cid was - # provided in the msg - see above). 
Push - # this error to all local channel - # consumers (normally portals) by marking - # the channel as errored - # assert chan.uid chan._exc: Exception = unpack_error( msg, chan=chan, @@ -1111,7 +1109,7 @@ async def process_messages( f'|_{chan.raddr}\n' ) - # transport **was** disconnected + # transport **WAS** disconnected return True except ( @@ -1150,12 +1148,11 @@ async def process_messages( finally: # msg debugging for when he machinery is brokey log.runtime( - 'Exiting IPC msg loop with\n' - f'peer: {chan.uid}\n' + 'Exiting IPC msg loop with final msg\n\n' + f'<= peer: {chan.uid}\n' f'|_{chan}\n\n' - 'final msg:\n' - f'{pformat(msg)}\n' + f'{pformat(msg)}\n\n' ) - # transport **was not** disconnected + # transport **WAS NOT** disconnected return False -- 2.34.1 From c25c77c57372b3c4e4eb73e334ab93895d81d8ab Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 25 Apr 2024 12:36:14 -0400 Subject: [PATCH 267/378] Flip back `StartAck` timeout to `inf`.. --- tractor/_runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 3e4066e0..d7cc548a 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -850,7 +850,7 @@ class Actor: msg_buffer_size: int|None = None, allow_overruns: bool = False, load_nsf: bool = False, - ack_timeout: float = 3, + ack_timeout: float = float('inf'), ) -> Context: ''' -- 2.34.1 From 4bab998ff901d00d46a005f28a47f0bec7d0d3b4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 25 Apr 2024 12:38:05 -0400 Subject: [PATCH 268/378] Add `Context.peer_side: str` property, mk static-meth private. --- tractor/_context.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index abcb90e4..3ed54d7d 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -472,13 +472,17 @@ class Context: return 'parent' if self._portal else 'child' @staticmethod - def peer_side(side: str) -> str: + def _peer_side(side: str) -> str: match side: case 'child': return 'parent' case 'parent': return 'child' + @property + def peer_side(self) -> str: + return self._peer_side(self.side) + # TODO: remove stat! # -[ ] re-implement the `.experiemental._pubsub` stuff # with `MsgStream` and that should be last usage? @@ -512,9 +516,7 @@ class Context: equiv of a `StopIteration`. ''' - await self.chan.send( - Stop(cid=self.cid) - ) + await self.chan.send(Stop(cid=self.cid)) def _maybe_cancel_and_set_remote_error( self, @@ -593,7 +595,6 @@ class Context: # TODO: never do this right? # if self._remote_error: # return - peer_side: str = self.peer_side(self.side) # XXX: denote and set the remote side's error so that # after we cancel whatever task is the opener of this @@ -601,7 +602,7 @@ class Context: # appropriately. log.runtime( 'Setting remote error for ctx\n\n' - f'<= {peer_side!r}: {self.chan.uid}\n' + f'<= {self.peer_side!r}: {self.chan.uid}\n' f'=> {self.side!r}\n\n' f'{error}' ) @@ -623,9 +624,8 @@ class Context: elif isinstance(error, MsgTypeError): msgerr = True - peer_side: str = self.peer_side(self.side) log.error( - f'IPC dialog error due to msg-type caused by {peer_side!r} side\n\n' + f'IPC dialog error due to msg-type caused by {self.peer_side!r} side\n\n' f'{error}\n' f'{pformat(self)}\n' @@ -1067,12 +1067,12 @@ class Context: except trio.EndOfChannel as eoc: if ( eoc - and stream.closed + and + stream.closed ): # sanity, can remove? 
assert eoc is stream._eoc - # from .devx import pause - # await pause() + log.warning( 'Stream was terminated by EoC\n\n' # NOTE: won't show the error but @@ -1644,10 +1644,9 @@ class Context: side: str = self.side if side == 'child': assert not self._portal - peer_side: str = self.peer_side(side) flow_body: str = ( - f'<= peer {peer_side!r}: {from_uid}\n' + f'<= peer {self.peer_side!r}: {from_uid}\n' f' |_<{nsf}()>\n\n' f'=> {side!r}: {self._task}\n' @@ -1665,7 +1664,7 @@ class Context: log_meth = log.runtime log_meth( - f'Delivering IPC ctx error from {peer_side!r} to {side!r} task\n\n' + f'Delivering IPC ctx error from {self.peer_side!r} to {side!r} task\n\n' f'{flow_body}' @@ -2330,7 +2329,7 @@ async def open_context_from_portal( and ctx.cancel_acked ): log.cancel( - 'Context cancelled by caller task\n' + 'Context cancelled by {ctx.side!r}-side task\n' f'|_{ctx._task}\n\n' f'{repr(scope_err)}\n' @@ -2364,6 +2363,7 @@ async def open_context_from_portal( None, ) + def mk_context( chan: Channel, cid: str, -- 2.34.1 From adba454d1db681d5f2b54a3b35db461fa3ecf754 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 25 Apr 2024 16:19:39 -0400 Subject: [PATCH 269/378] Use `Context.[peer_]side` in ctxc messages --- tractor/_rpc.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index de975a90..d5899d44 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -592,7 +592,7 @@ async def _invoke( if cs.cancel_called: canceller: tuple = ctx.canceller - msg: str = 'actor was cancelled by ' + explain: str = f'{ctx.side!r}-side task was cancelled by ' # NOTE / TODO: if we end up having # ``Actor._cancel_task()`` call @@ -602,24 +602,28 @@ async def _invoke( if ctx._cancel_called: # TODO: test for this!!!!! canceller: tuple = our_uid - msg += 'itself ' + explain += 'itself ' # if the channel which spawned the ctx is the # one that cancelled it then we report that, vs. # it being some other random actor that for ex. # some actor who calls `Portal.cancel_actor()` # and by side-effect cancels this ctx. + # + # TODO: determine if the ctx peer task was the + # exact task which cancelled, vs. some other + # task in the same actor. elif canceller == ctx.chan.uid: - msg += 'its caller' + explain += f'its {ctx.peer_side!r}-side peer' else: - msg += 'a remote peer' + explain += 'a remote peer' # TODO: move this "div centering" into # a helper for use elsewhere! div_chars: str = '------ - ------' div_offset: int = ( - round(len(msg)/2)+1 + round(len(explain)/2)+1 + round(len(div_chars)/2)+1 ) @@ -630,11 +634,12 @@ async def _invoke( + f'{div_chars}\n' ) - msg += ( + explain += ( div_str + f'<= canceller: {canceller}\n' - f'=> uid: {our_uid}\n' - f' |_{ctx._task}()' + f'=> cancellee: {our_uid}\n' + # TODO: better repr for ctx tasks.. + f' |_{ctx.side!r} {ctx._task}' # TODO: instead just show the # ctx.__str__() here? @@ -653,7 +658,7 @@ async def _invoke( # task, so relay this cancel signal to the # other side. ctxc = ContextCancelled( - message=msg, + message=explain, boxed_type=trio.Cancelled, canceller=canceller, ) -- 2.34.1 From 08fcd3fb032ea8193508c54d4d25d9de3d949c29 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 25 Apr 2024 20:00:13 -0400 Subject: [PATCH 270/378] Mk `.msg.pretty_struct.Struct.pformat()` a mod func More along the lines of `msgspec.struct` and also far more useful internally for pprinting `MsgTypes`. Of course add method aliases. 
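
For reference, a rough usage sketch of the new mod-level func next to
the method aliases (the `Point` struct below is just a hypothetical
example type, not part of this change):

    from tractor.msg import pretty_struct

    class Point(pretty_struct.Struct):
        x: int
        y: int

    pt = Point(x=1, y=2)

    # the module-level helper and the method alias (also bound to
    # `__repr__`) should render the same multi-line form:
    assert (
        pretty_struct.pformat(pt)
        == pt.pformat()
        == repr(pt)
    )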
--- tractor/msg/pretty_struct.py | 110 +++++++++++++++++------------------ 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/tractor/msg/pretty_struct.py b/tractor/msg/pretty_struct.py index a67bbd26..f27fb89c 100644 --- a/tractor/msg/pretty_struct.py +++ b/tractor/msg/pretty_struct.py @@ -102,6 +102,59 @@ def iter_fields(struct: Struct) -> Iterator[ ) +def pformat( + struct: Struct, + field_indent: int = 2, + indent: int = 0, + +) -> str: + ''' + Recursion-safe `pprint.pformat()` style formatting of + a `msgspec.Struct` for sane reading by a human using a REPL. + + ''' + # global whitespace indent + ws: str = ' '*indent + + # field whitespace indent + field_ws: str = ' '*(field_indent + indent) + + # qtn: str = ws + struct.__class__.__qualname__ + qtn: str = struct.__class__.__qualname__ + + obj_str: str = '' # accumulator + fi: structs.FieldInfo + k: str + v: Any + for fi, k, v in iter_fields(struct): + + # TODO: how can we prefer `Literal['option1', 'option2, + # ..]` over .__name__ == `Literal` but still get only the + # latter for simple types like `str | int | None` etc..? + ft: type = fi.type + typ_name: str = getattr(ft, '__name__', str(ft)) + + # recurse to get sub-struct's `.pformat()` output Bo + if isinstance(v, Struct): + val_str: str = v.pformat( + indent=field_indent + indent, + field_indent=indent + field_indent, + ) + + else: # the `pprint` recursion-safe format: + # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr + val_str: str = saferepr(v) + + # TODO: LOLOL use `textwrap.indent()` instead dawwwwwg! + obj_str += (field_ws + f'{k}: {typ_name} = {val_str},\n') + + return ( + f'{qtn}(\n' + f'{obj_str}' + f'{ws})' + ) + + class Struct( _Struct, @@ -140,65 +193,12 @@ class Struct( return sin_props - # TODO: make thisi a mod-func! - def pformat( - self, - field_indent: int = 2, - indent: int = 0, - - ) -> str: - ''' - Recursion-safe `pprint.pformat()` style formatting of - a `msgspec.Struct` for sane reading by a human using a REPL. - - ''' - # global whitespace indent - ws: str = ' '*indent - - # field whitespace indent - field_ws: str = ' '*(field_indent + indent) - - # qtn: str = ws + self.__class__.__qualname__ - qtn: str = self.__class__.__qualname__ - - obj_str: str = '' # accumulator - fi: structs.FieldInfo - k: str - v: Any - for fi, k, v in iter_fields(self): - - # TODO: how can we prefer `Literal['option1', 'option2, - # ..]` over .__name__ == `Literal` but still get only the - # latter for simple types like `str | int | None` etc..? - ft: type = fi.type - typ_name: str = getattr(ft, '__name__', str(ft)) - - # recurse to get sub-struct's `.pformat()` output Bo - if isinstance(v, Struct): - val_str: str = v.pformat( - indent=field_indent + indent, - field_indent=indent + field_indent, - ) - - else: # the `pprint` recursion-safe format: - # https://docs.python.org/3.11/library/pprint.html#pprint.saferepr - val_str: str = saferepr(v) - - # TODO: LOLOL use `textwrap.indent()` instead dawwwwwg! - obj_str += (field_ws + f'{k}: {typ_name} = {val_str},\n') - - return ( - f'{qtn}(\n' - f'{obj_str}' - f'{ws})' - ) - + pformat = pformat + # __str__ = __repr__ = pformat # TODO: use a pprint.PrettyPrinter instance around ONLY rendering # inside a known tty? # def __repr__(self) -> str: # ... 
- - # __str__ = __repr__ = pformat __repr__ = pformat def copy( -- 2.34.1 From c383978402ca4ca53608fd74a24402bf24918115 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 26 Apr 2024 12:45:10 -0400 Subject: [PATCH 271/378] Add more useful `MsgDec.__repr__()` Basically exact same as that for `MsgCodec` with the `.spec` displayed via a better (maybe multi-line) `.spec_str: str` generated from a common new set of helper mod funcs factored out msg-codec meths: - `mk_msgspec_table()` to gen a `MsgType` name -> msg table. - `pformat_msgspec()` to `str`-ify said table values nicely.q Also add a new `MsgCodec.msg_spec_str: str` prop which delegates to the above for the same. --- tractor/msg/_codec.py | 123 ++++++++++++++++++++++++++++-------------- 1 file changed, 82 insertions(+), 41 deletions(-) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index e3540c3d..901c0da1 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -75,7 +75,7 @@ log = get_logger(__name__) # TODO: unify with `MsgCodec` by making `._dec` part this? class MsgDec(Struct): ''' - An IPC msg decoder. + An IPC msg (payload) decoder. Normally used to decode only a payload: `MsgType.pld: PayloadT` field before delivery to IPC consumer code. @@ -87,6 +87,31 @@ class MsgDec(Struct): def dec(self) -> msgpack.Decoder: return self._dec + def __repr__(self) -> str: + + speclines: str = self.spec_str + + # in multi-typed spec case we stick the list + # all on newlines after the |__pld_spec__:, + # OW it's prolly single type spec-value + # so just leave it on same line. + if '\n' in speclines: + speclines: str = '\n' + textwrap.indent( + speclines, + prefix=' '*3, + ) + + body: str = textwrap.indent( + f'|_dec_hook: {self.dec.dec_hook}\n' + f'|__pld_spec__: {speclines}\n', + prefix=' '*2, + ) + return ( + f'<{type(self).__name__}(\n' + f'{body}' + ')>' + ) + # struct type unions # https://jcristharif.com/msgspec/structs.html#tagged-unions # @@ -137,17 +162,7 @@ class MsgDec(Struct): # TODO: would get moved into `FieldSpec.__str__()` right? @property def spec_str(self) -> str: - - # TODO: could also use match: instead? - spec: Union[Type]|Type = self.spec - - # `typing.Union` case - if getattr(spec, '__args__', False): - return str(spec) - - # just a single type - else: - return spec.__name__ + return pformat_msgspec(codec=self) pld_spec_str = spec_str @@ -168,9 +183,57 @@ def mk_dec( ) -> MsgDec: - return msgpack.Decoder( - type=spec, # like `Msg[Any]` - dec_hook=dec_hook, + return MsgDec( + _dec=msgpack.Decoder( + type=spec, # like `Msg[Any]` + dec_hook=dec_hook, + ) + ) + + +def mk_msgspec_table( + dec: msgpack.Decoder, + msg: MsgType|None = None, + +) -> dict[str, MsgType]|str: + ''' + Fill out a `dict` of `MsgType`s keyed by name + for a given input `msgspec.msgpack.Decoder` + as defined by its `.type: Union[Type]` setting. + + If `msg` is provided, only deliver a `dict` with a single + entry for that type. 
+ + ''' + msgspec: Union[Type]|Type = dec.type + + if not (msgtypes := getattr(msgspec, '__args__', False)): + msgtypes = [msgspec] + + msgt_table: dict[str, MsgType] = { + msgt: str(msgt) + for msgt in msgtypes + } + if msg: + msgt: MsgType = type(msg) + str_repr: str = msgt_table[msgt] + return {msgt: str_repr} + + return msgt_table + + +def pformat_msgspec( + codec: MsgCodec|MsgDec, + msg: MsgType|None = None, + join_char: str = '\n', + +) -> str: + dec: msgpack.Decoder = getattr(codec, 'dec', codec) + return join_char.join( + mk_msgspec_table( + dec=dec, + msg=msg, + ).values() ) # TODO: overall IPC msg-spec features (i.e. in this mod)! @@ -200,7 +263,7 @@ class MsgCodec(Struct): def __repr__(self) -> str: speclines: str = textwrap.indent( - self.pformat_msg_spec(), + pformat_msgspec(codec=self), prefix=' '*3, ) body: str = textwrap.indent( @@ -244,33 +307,11 @@ class MsgCodec(Struct): # NOTE: defined and applied inside `mk_codec()` return self._dec.type - def msg_spec_items( - self, - msg: MsgType|None = None, - - ) -> dict[str, MsgType]|str: - - msgt_table: dict[str, MsgType] = { - msgt: str(msgt) - for msgt in self.msg_spec.__args__ - } - if msg: - msgt: MsgType = type(msg) - str_repr: str = msgt_table[msgt] - return {msgt: str_repr} - - return msgt_table - # TODO: some way to make `pretty_struct.Struct` use this # wrapped field over the `.msg_spec` one? - def pformat_msg_spec( - self, - msg: MsgType|None = None, - join_char: str = '\n', - ) -> str: - return join_char.join( - self.msg_spec_items(msg=msg).values() - ) + @property + def msg_spec_str(self) -> str: + return pformat_msgspec(self.msg_spec) lib: ModuleType = msgspec -- 2.34.1 From a5a0e6854b57875bcd6820ed7c58106f7bab55a1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 26 Apr 2024 13:03:07 -0400 Subject: [PATCH 272/378] Use new `Msg[Co]Dec` repr meths in `._exceptions` Particularly when logging around `MsgTypeError`s. Other: - make `_raise_from_unexpected_msg()`'s `expect_msg` a non-default value arg, must always be passed by caller. - drop `'canceller'` from `_body_fields` ow it shows up twice for ctxc. - use `.msg.pretty_struct.pformat()`. - parameterize `RemoteActorError.reprol()` (repr-one-line method) to show `RemoteActorError[]( ..` to make obvi the boxed remote error type. - re-impl `.boxed_type_str` as `str`-casting the `.boxed_type` value which is guaranteed to render non-`None`. --- tractor/_exceptions.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 8d9274fe..f2ff8c21 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -54,6 +54,7 @@ from tractor.msg import ( from tractor.msg.pretty_struct import ( iter_fields, Struct, + pformat as struct_format, ) if TYPE_CHECKING: @@ -108,6 +109,10 @@ _body_fields: list[str] = list( 'relay_path', '_msg_dict', 'cid', + + # since only ctxc should show it but `Error` does + # have it as an optional field. + 'canceller', } ) @@ -382,6 +387,9 @@ class RemoteActorError(Exception): ''' Error type raised by original remote faulting actor. + When the error has only been relayed a single actor-hop + this will be the same as the `.boxed_type`. + ''' if self._src_type is None: self._src_type = get_err_type( @@ -396,7 +404,8 @@ class RemoteActorError(Exception): String-name of the (last hop's) boxed error type. 
''' - return self._ipc_msg.boxed_type_str + bt: Type[BaseException] = self.boxed_type + return str(bt.__name__) @property def boxed_type(self) -> str: @@ -492,7 +501,11 @@ class RemoteActorError(Exception): ''' # TODO: use this matryoshka emjoi XD # => 🪆 - reprol_str: str = f'{type(self).__name__}(' + reprol_str: str = ( + f'{type(self).__name__}' # type name + f'[{self.boxed_type_str}]' # parameterized by boxed type + '(' # init-style look + ) _repr: str = self._mk_fields_str( self.reprol_fields, end_char=' ', @@ -653,8 +666,8 @@ class MsgTypeError( - `Yield` - TODO: any embedded `.pld` type defined by user code? - Normally the source of an error is re-raised from some `.msg._codec` - decode which itself raises in a backend interchange + Normally the source of an error is re-raised from some + `.msg._codec` decode which itself raises in a backend interchange lib (eg. a `msgspec.ValidationError`). ''' @@ -939,7 +952,7 @@ def _raise_from_unexpected_msg( src_err: AttributeError, log: StackLevelAdapter, # caller specific `log` obj - expect_msg: str = Yield, + expect_msg: Type[MsgType], # allow "deeper" tbs when debugging B^o hide_tb: bool = True, @@ -1037,16 +1050,16 @@ def _raise_from_unexpected_msg( ctx.maybe_raise() raise eoc from src_err + # TODO: our own transport/IPC-broke error subtype? if stream._closed: - # TODO: our own error subtype? raise trio.ClosedResourceError('This stream was closed') # always re-raise the source error if no translation error case # is activated above. raise MessagingError( - f"{_type} was expecting a {expect_msg} message" - " BUT received a non-error msg:\n" - f'{pformat(msg)}' + f'{_type} was expecting a {expect_msg.__name__!r} message' + ' BUT received a non-error msg:\n\n' + f'{struct_format(msg)}' ) from src_err @@ -1079,13 +1092,11 @@ def _mk_msg_type_err( # no src error from `msgspec.msgpack.Decoder.decode()` so # prolly a manual type-check on our part. 
if message is None: - fmt_spec: str = codec.pformat_msg_spec() fmt_stack: str = ( '\n'.join(traceback.format_stack(limit=3)) ) tb_fmt: str = pformat_boxed_tb( tb_str=fmt_stack, - # fields_str=header, field_prefix=' ', indent='', ) @@ -1093,8 +1104,7 @@ def _mk_msg_type_err( f'invalid msg -> {msg}: {type(msg)}\n\n' f'{tb_fmt}\n' f'Valid IPC msgs are:\n\n' - # f' ------ - ------\n' - f'{fmt_spec}\n', + f'{codec.msg_spec_str}\n', ) elif src_type_error: src_message: str = str(src_type_error) -- 2.34.1 From 61db040702f46902d75187c60d5705e2b46cc8eb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 26 Apr 2024 13:13:04 -0400 Subject: [PATCH 273/378] More bitty (runtime) logging tweaks --- tractor/_portal.py | 6 +++--- tractor/_streaming.py | 8 ++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index f3928657..e25a6c70 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -254,11 +254,11 @@ class Portal: return False reminfo: str = ( - f'`Portal.cancel_actor()` => {self.channel.uid}\n' - f' |_{chan}\n' + f'Portal.cancel_actor() => {self.channel.uid}\n' + f'|_{chan}\n' ) log.cancel( - f'Sending runtime `.cancel()` request to peer\n\n' + f'Requesting runtime cancel for peer\n\n' f'{reminfo}' ) diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 764b7c1e..dd4cd0e1 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -364,14 +364,10 @@ class MsgStream(trio.abc.Channel): if not self._eoc: message: str = ( - f'Context stream closed by {self._ctx.side!r}\n' + f'Stream self-closed by {self._ctx.side!r}-side before EoC\n' f'|_{self}\n' ) - log.cancel( - 'Stream self-closed before receiving EoC\n\n' - + - message - ) + log.cancel(message) self._eoc = trio.EndOfChannel(message) # ?XXX WAIT, why do we not close the local mem chan `._rx_chan` XXX? -- 2.34.1 From d285a3479a06b3f85d21238d54f87028ecdafb78 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 26 Apr 2024 13:18:06 -0400 Subject: [PATCH 274/378] Make `.msg.types.Msg.pld: Raw` only, since `PldRx`.. --- tractor/msg/types.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index cb124324..63c0a467 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -56,6 +56,7 @@ log = get_logger('tractor.msgspec') PayloadT = TypeVar('PayloadT') +# TODO: PayloadMsg class Msg( Struct, Generic[PayloadT], @@ -81,7 +82,7 @@ class Msg( tree. ''' - cid: str|None # call/context-id + cid: str # call/context-id # ^-TODO-^: more explicit type? # -[ ] use UNSET here? # https://jcristharif.com/msgspec/supported-types.html#unset @@ -106,7 +107,7 @@ class Msg( # TODO: could also be set to `msgspec.Raw` if the sub-decoders # approach is preferred over the generic parameterization # approach as take by `mk_msg_spec()` below. - pld: PayloadT|Raw + pld: Raw class Aid( -- 2.34.1 From a3429268ead5f5c2b158eef53ff805eaa0d61757 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 26 Apr 2024 15:29:50 -0400 Subject: [PATCH 275/378] First draft payload-spec limit API Add new task-scope oriented `PldRx.pld_spec` management API similar to `.msg._codec.limit_msg_spec()`, but obvi built to process and filter `MsgType.pld` values. New API related changes include: - new per-task singleton getter `msg._ops.current_pldrx()` which delivers the current (global) payload receiver via a new `_ctxvar_PldRx: ContextVar` configured with a default `_def_any_pldec: MsgDec[Any]` decoder. 
- a `PldRx.limit_plds()` which sets the decoder (`.type` underneath) for the specific payload rx instance. - `.msg._ops.limit_plds()` which obtains the current task-scoped `PldRx` and applies the pld spec via a new `PldRx.limit_plds()`. - rename `PldRx._msgdec` -> `._pldec`. - add `.pld_dec` as pub attr for -^ Unrelated adjustments: - use `.msg.pretty_struct.pformat()` where handy. - always pass `expect_msg: MsgType`. - add a `case Stop()` to `PldRx.dec_msg()` which will `log.warning()` when a stop is received by no stream was open on this receiving side since we rarely want that to raise since it's prolly just a runtime race or mistake in user code. Other: --- tractor/msg/_ops.py | 162 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 145 insertions(+), 17 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index e78b79a4..5a9ab46a 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -25,12 +25,12 @@ from contextlib import ( # asynccontextmanager as acm, contextmanager as cm, ) -from pprint import pformat +from contextvars import ContextVar from typing import ( Any, Type, TYPE_CHECKING, - # Union, + Union, ) # ------ - ------ from msgspec import ( @@ -63,7 +63,7 @@ from .types import ( Started, Stop, Yield, - # pretty_struct, + pretty_struct, ) @@ -75,6 +75,9 @@ if TYPE_CHECKING: log = get_logger(__name__) +_def_any_pldec: MsgDec = mk_dec() + + class PldRx(Struct): ''' A "msg payload receiver". @@ -101,10 +104,13 @@ class PldRx(Struct): ''' # TODO: better to bind it here? # _rx_mc: trio.MemoryReceiveChannel - _msgdec: MsgDec = mk_dec(spec=Any) - + _pldec: MsgDec _ipc: Context|MsgStream|None = None + @property + def pld_dec(self) -> MsgDec: + return self._pldec + @cm def apply_to_ipc( self, @@ -122,9 +128,29 @@ class PldRx(Struct): finally: self._ipc = None + @cm + def limit_plds( + self, + spec: Union[Type[Struct]], + + ) -> MsgDec: + ''' + Type-limit the loadable msg payloads via an applied + `MsgDec` given an input spec, revert to prior decoder on + exit. + + ''' + orig_dec: MsgDec = self._pldec + limit_dec: MsgDec = mk_dec(spec=spec) + try: + self._pldec = limit_dec + yield limit_dec + finally: + self._pldec = orig_dec + @property def dec(self) -> msgpack.Decoder: - return self._msgdec.dec + return self._pldec.dec def recv_pld_nowait( self, @@ -182,7 +208,7 @@ class PldRx(Struct): self, msg: MsgType, ctx: Context, - expect_msg: Type[MsgType]|None = None, + expect_msg: Type[MsgType]|None, ) -> PayloadT|Raw: ''' @@ -199,11 +225,11 @@ class PldRx(Struct): |Return(pld=pld) # termination phase ): try: - pld: PayloadT = self._msgdec.decode(pld) + pld: PayloadT = self._pldec.decode(pld) log.runtime( - 'Decode msg payload\n\n' - f'{msg}\n\n' - f'{pld}\n' + 'Decoded msg payload\n\n' + f'{msg}\n' + f'|_pld={pld!r}' ) return pld @@ -237,9 +263,42 @@ class PldRx(Struct): case Error(): src_err = MessagingError( - 'IPC dialog termination by msg' + 'IPC ctx dialog terminated without `Return`-ing a result' ) + case Stop(cid=cid): + message: str = ( + f'{ctx.side!r}-side of ctx received stream-`Stop` from ' + f'{ctx.peer_side!r} peer ?\n' + f'|_cid: {cid}\n\n' + + f'{pretty_struct.pformat(msg)}\n' + ) + if ctx._stream is None: + explain: str = ( + f'BUT, no `MsgStream` (was) open(ed) on this ' + f'{ctx.side!r}-side of the IPC ctx?\n' + f'Maybe check your code for streaming phase race conditions?\n' + ) + log.warning( + message + + + explain + ) + # let caller decide what to do when only one + # side opened a stream, don't raise. 
+ return msg + + else: + explain: str = ( + 'Received a `Stop` when it should NEVER be possible!?!?\n' + ) + # TODO: this is constructed inside + # `_raise_from_unexpected_msg()` but maybe we + # should pass it in? + # src_err = trio.EndOfChannel(explain) + src_err = None + case _: src_err = InternalError( 'Unknown IPC msg ??\n\n' @@ -259,6 +318,7 @@ class PldRx(Struct): async def recv_msg_w_pld( self, ipc: Context|MsgStream, + expect_msg: MsgType, ) -> tuple[MsgType, PayloadT]: ''' @@ -274,10 +334,75 @@ class PldRx(Struct): pld: PayloadT = self.dec_msg( msg, ctx=ipc, + expect_msg=expect_msg, ) return msg, pld +# Always maintain a task-context-global `PldRx` +_def_pld_rx: PldRx = PldRx( + _pldec=_def_any_pldec, +) +_ctxvar_PldRx: ContextVar[PldRx] = ContextVar( + 'pld_rx', + default=_def_pld_rx, +) + + +def current_pldrx() -> PldRx: + ''' + Return the current `trio.Task.context`'s msg-payload + receiver, the post IPC but pre-app code `MsgType.pld` + filter. + + Modification of the current payload spec via `limit_plds()` + allows an application to contextually filter typed IPC msg + content delivered via wire transport. + + ''' + return _ctxvar_PldRx.get() + + +@cm +def limit_plds( + spec: Union[Type[Struct]], + **kwargs, + +) -> MsgDec: + ''' + Apply a `MsgCodec` that will natively decode the SC-msg set's + `Msg.pld: Union[Type[Struct]]` payload fields using + tagged-unions of `msgspec.Struct`s from the `payload_types` + for all IPC contexts in use by the current `trio.Task`. + + ''' + __tracebackhide__: bool = True + try: + # sanity on orig settings + orig_pldrx: PldRx = current_pldrx() + orig_pldec: MsgDec = orig_pldrx.pld_dec + + with orig_pldrx.limit_plds( + spec=spec, + **kwargs, + ) as pldec: + log.info( + 'Applying payload-decoder\n\n' + f'{pldec}\n' + ) + yield pldec + finally: + log.info( + 'Reverted to previous payload-decoder\n\n' + f'{orig_pldec}\n' + ) + assert ( + (pldrx := current_pldrx()) is orig_pldrx + and + pldrx.pld_dec is orig_pldec + ) + + async def drain_to_final_msg( ctx: Context, @@ -368,7 +493,10 @@ async def drain_to_final_msg( # pray to the `trio` gawds that we're corrent with this # msg: dict = await ctx._rx_chan.receive() - msg, pld = await ctx._pld_rx.recv_msg_w_pld(ipc=ctx) + msg, pld = await ctx._pld_rx.recv_msg_w_pld( + ipc=ctx, + expect_msg=Return, + ) # NOTE: we get here if the far end was # `ContextCancelled` in 2 cases: @@ -399,7 +527,7 @@ async def drain_to_final_msg( ctx._result: Any = pld log.runtime( 'Context delivered final draining msg:\n' - f'{pformat(msg)}' + f'{pretty_struct.pformat(msg)}' ) # XXX: only close the rx mem chan AFTER # a final result is retreived. 
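# ------ [editor sketch] ------
# Not part of the diff itself: a minimal, hypothetical usage sketch of
# the task-scoped payload-spec API introduced by this patch
# (`limit_plds()` + `current_pldrx()`). `MyPld` is an assumed example
# struct and the import path simply mirrors the module being patched
# here (`tractor.msg._ops`).
from msgspec import Struct

from tractor.msg import _ops as msgops


class MyPld(Struct):
    field: int


async def limited_rx_task() -> None:
    # narrow the `.pld` spec for (only) the current task's payload
    # receiver; the prior decoder is restored on exit.
    with msgops.limit_plds(MyPld) as pldec:
        assert msgops.current_pldrx().pld_dec is pldec
        ...  # receive `Started`/`Yield` payloads as usual
# ------ [end editor sketch] ------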
@@ -435,7 +563,7 @@ async def drain_to_final_msg( f'=> {ctx._task}\n' f' |_{ctx._stream}\n\n' - f'{pformat(msg)}\n' + f'{pretty_struct.pformat(msg)}\n' ) return ( return_msg, @@ -452,7 +580,7 @@ async def drain_to_final_msg( f'=> {ctx._task}\n' f' |_{ctx._stream}\n\n' - f'{pformat(msg)}\n' + f'{pretty_struct.pformat(msg)}\n' ) continue @@ -467,7 +595,7 @@ async def drain_to_final_msg( pre_result_drained.append(msg) log.cancel( 'Remote stream terminated due to "stop" msg:\n\n' - f'{pformat(msg)}\n' + f'{pretty_struct.pformat(msg)}\n' ) continue -- 2.34.1 From 979af795883fa92d6e81146129f062def013a9fe Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 11:46:56 -0400 Subject: [PATCH 276/378] First draft, package with `poetry` Bo --- pyproject.toml | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 84633806..c1064744 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,68 @@ +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +# ------ - ------ + +[tool.poetry] +name = "tractor" +version = "0.1.0a6dev0" +description='structured concurrent `trio`-"actors"' +authors = ["Tyler Goodlet "] +license = "AGPlv3" +readme = "docs/README.rst" + +# TODO: do we need this xontrib loader at all given pep420 +# and xonsh's xontrib global-autoload-via-setuptools? +# https://xon.sh/tutorial_xontrib.html#authoring-xontribs +packages = [ + {include = 'tractor' }, + # {include = 'tractor.experimental' }, + # {include = 'tractor.trionics' }, + # {include = 'tractor.msg' }, + # {include = 'tractor.devx' }, +] + +# ------ - ------ + +[tool.poetry.dependencies] +python = "^3.11" + +# trio runtime related +# proper range spec: +# https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/#id5 +trio='^0.24' +tricycle = "^0.4.1" +trio-typing = "^0.10.0" + +msgspec='^0.18.5' # interchange +wrapt = "^1.16.0" # decorators +colorlog = "^6.8.2" # logging + +# .devx tooling +stackscope = "^0.2.2" +pdbp = "^1.5.0" + + +# TODO: distributed transport using +# linux kernel networking +# 'pyroute2 + +# ------ - ------ +xontrib-vox = "^0.0.1" + +[tool.poetry.group.dev] +optional = false +[tool.poetry.group.dev.dependencies] +pytest = "^8.2.0" + +# only for xonsh as sh.. +xontrib-vox = "^0.0.1" +prompt-toolkit = "^3.0.43" +xonsh-vox-tabcomplete = "^0.5" + +# ------ - ------ + [tool.towncrier] package = "tractor" filename = "NEWS.rst" @@ -27,6 +92,7 @@ all_bullets = true name = "Trivial/Internal Changes" showcontent = true +# ------ - ------ [tool.pytest.ini_options] minversion = '6.0' @@ -46,3 +112,26 @@ log_cli = false # TODO: maybe some of these layout choices? 
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules # pythonpath = "src" + +# ------ - ------ + +[project] +keywords = [ + 'trio', + 'async', + 'concurrency', + 'structured concurrency', + 'actor model', + 'distributed', + 'multiprocessing' +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Operating System :: POSIX :: Linux", + "Framework :: Trio", + "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.11", + "Topic :: System :: Distributed Computing", +] -- 2.34.1 From f139adddcabcc91904e059616515789debbd9452 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 11:47:26 -0400 Subject: [PATCH 277/378] Add a `log.devx()` level --- tractor/log.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 6c040209..1870d4e1 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -53,6 +53,7 @@ LEVELS: dict[str, int] = { 'RUNTIME': 15, 'CANCEL': 16, 'PDB': 500, + 'DEVX': 500, } # _custom_levels: set[str] = { # lvlname.lower for lvlname in LEVELS.keys() @@ -62,6 +63,7 @@ STD_PALETTE = { 'CRITICAL': 'red', 'ERROR': 'red', 'PDB': 'white', + 'DEVX': 'cyan', 'WARNING': 'yellow', 'INFO': 'green', 'CANCEL': 'yellow', @@ -86,7 +88,8 @@ class StackLevelAdapter(logging.LoggerAdapter): ) -> None: ''' - IPC level msg-ing. + IPC transport level msg IO; generally anything below + `._ipc.Channel` and friends. ''' return self.log(5, msg) @@ -102,7 +105,7 @@ class StackLevelAdapter(logging.LoggerAdapter): msg: str, ) -> None: ''' - Cancellation logging, mostly for runtime reporting. + Cancellation sequencing, mostly for runtime reporting. ''' return self.log( @@ -116,7 +119,17 @@ class StackLevelAdapter(logging.LoggerAdapter): msg: str, ) -> None: ''' - Debugger logging. + `pdb`-REPL (debugger) related statuses. + + ''' + return self.log(500, msg) + + def devx( + self, + msg: str, + ) -> None: + ''' + "Developer experience" sub-sys statuses. ''' return self.log(500, msg) -- 2.34.1 From 40c972f0ec9b902f02ad7e4566073663b11de3a6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 12:15:46 -0400 Subject: [PATCH 278/378] Mk `process_messages()` return last msg; summary logging Not sure it's **that** useful (yet) but in theory would allow avoiding certain log level usage around transient RPC requests for discovery methods (like `.register_actor()` and friends); can't hurt to be able to introspect that last message for other future cases I'd imagine as well. Adjust the calling code in `._runtime` to match; other spots are using the `trio.Nursery.start()` schedule style and are fine as is. Improve a bunch more log messages throughout a few mods mostly by going to a "summary" single-emission style where possible/appropriate: - in `._runtime` more "single summary" status style log emissions: |_mk `Actor.load_modules()` render a single mod loaded summary. |_use a summary `con_status: str` for `Actor._stream_handler()` conn setup and an equiv (`con_teardown_status`) for connection teardowns. |_similar thing in `Actor.wait_for_actor()`. - generally more usage of `.msg.pretty_struct` apis throughout `._runtime`. 
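As a rough reference for the new call signature (mirroring the updated
`._runtime._stream_handler()` call site in this patch; the bare
`actor`/`chan` names here are only illustrative):

    # `process_messages()` now returns a 2-tuple instead of a bare bool
    disconnected, last_msg = await process_messages(actor, chan)
    if disconnected:
        ...  # transport closed before a clean msg-loop exit
    else:
        ...  # `last_msg` is the final msg processed before exiting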
--- tractor/_entry.py | 2 +- tractor/_portal.py | 19 +++-- tractor/_rpc.py | 51 +++++------ tractor/_runtime.py | 203 +++++++++++++++++++++++--------------------- 4 files changed, 147 insertions(+), 128 deletions(-) diff --git a/tractor/_entry.py b/tractor/_entry.py index bf719abb..78f83283 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -146,7 +146,7 @@ def _trio_main( finally: log.info( - 'Actor terminated\n' + 'Subactor terminated\n' + actor_info ) diff --git a/tractor/_portal.py b/tractor/_portal.py index e25a6c70..806dcc7b 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -435,7 +435,6 @@ class Portal: yield stream finally: - # cancel the far end task on consumer close # NOTE: this is a special case since we assume that if using # this ``.open_fream_from()`` api, the stream is one a one @@ -496,7 +495,7 @@ class LocalPortal: async def open_portal( channel: Channel, - nursery: trio.Nursery|None = None, + tn: trio.Nursery|None = None, start_msg_loop: bool = True, shield: bool = False, @@ -504,15 +503,19 @@ async def open_portal( ''' Open a ``Portal`` through the provided ``channel``. - Spawns a background task to handle message processing (normally - done by the actor-runtime implicitly). + Spawns a background task to handle RPC processing, normally + done by the actor-runtime implicitly via a call to + `._rpc.process_messages()`. just after connection establishment. ''' actor = current_actor() assert actor was_connected: bool = False - async with maybe_open_nursery(nursery, shield=shield) as nursery: + async with maybe_open_nursery( + tn, + shield=shield, + ) as tn: if not channel.connected(): await channel.connect() @@ -524,7 +527,7 @@ async def open_portal( msg_loop_cs: trio.CancelScope|None = None if start_msg_loop: from ._runtime import process_messages - msg_loop_cs = await nursery.start( + msg_loop_cs = await tn.start( partial( process_messages, actor, @@ -544,7 +547,7 @@ async def open_portal( await channel.aclose() # cancel background msg loop task - if msg_loop_cs: + if msg_loop_cs is not None: msg_loop_cs.cancel() - nursery.cancel_scope.cancel() + tn.cancel_scope.cancel() diff --git a/tractor/_rpc.py b/tractor/_rpc.py index d5899d44..ee3151d3 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -64,11 +64,13 @@ from .msg import ( current_codec, MsgCodec, NamespacePath, + pretty_struct, ) from tractor.msg.types import ( CancelAck, Error, Msg, + MsgType, Return, Start, StartAck, @@ -774,7 +776,10 @@ async def process_messages( shield: bool = False, task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED, -) -> bool: +) -> ( + bool, # chan diconnected + MsgType, # last msg +): ''' This is the low-level, per-IPC-channel, RPC task scheduler loop. @@ -816,11 +821,6 @@ async def process_messages( # |_ for ex, from `aioquic` which exposed "stream ids": # - https://github.com/aiortc/aioquic/blob/main/src/aioquic/quic/connection.py#L1175 # - https://github.com/aiortc/aioquic/blob/main/src/aioquic/quic/connection.py#L659 - log.runtime( - 'Entering RPC msg loop:\n' - f'peer: {chan.uid}\n' - f'|_{chan}\n' - ) nursery_cancelled_before_task: bool = False msg: Msg|None = None try: @@ -834,12 +834,15 @@ async def process_messages( async for msg in chan: log.transport( # type: ignore - f'<= IPC msg from peer: {chan.uid}\n\n' + f'IPC msg from peer\n' + f'<= {chan.uid}\n\n' # TODO: avoid fmting depending on loglevel for perf? - # -[ ] specifically `pformat()` sub-call..? + # -[ ] specifically `pretty_struct.pformat()` sub-call..? 
+ # - how to only log-level-aware actually call this? # -[ ] use `.msg.pretty_struct` here now instead! - f'{pformat(msg)}\n' + # f'{pretty_struct.pformat(msg)}\n' + f'{msg}\n' ) match msg: @@ -953,10 +956,11 @@ async def process_messages( uid=actorid, ): log.runtime( - 'Handling RPC `Start` request from\n' - f'peer: {actorid}\n' - '\n' - f'=> {ns}.{funcname}({kwargs})\n' + 'Handling RPC `Start` request\n' + f'<= peer: {actorid}\n' + f' |_{ns}.{funcname}({kwargs})\n\n' + + f'{pretty_struct.pformat(msg)}\n' ) # runtime-internal endpoint: `Actor.` @@ -1097,25 +1101,24 @@ async def process_messages( parent_chan=chan, ) - except ( - TransportClosed, - ): + except TransportClosed: # channels "breaking" (for TCP streams by EOF or 104 # connection-reset) is ok since we don't have a teardown # handshake for them (yet) and instead we simply bail out of # the message loop and expect the teardown sequence to clean # up.. - # TODO: add a teardown handshake? and, + # + # TODO: maybe add a teardown handshake? and, # -[ ] don't show this msg if it's an ephemeral discovery ep call? # -[ ] figure out how this will break with other transports? log.runtime( - f'channel closed abruptly with\n' - f'peer: {chan.uid}\n' - f'|_{chan.raddr}\n' + f'IPC channel closed abruptly\n' + f'<=x peer: {chan.uid}\n' + f' |_{chan.raddr}\n' ) # transport **WAS** disconnected - return True + return (True, msg) except ( Exception, @@ -1155,9 +1158,9 @@ async def process_messages( log.runtime( 'Exiting IPC msg loop with final msg\n\n' f'<= peer: {chan.uid}\n' - f'|_{chan}\n\n' - f'{pformat(msg)}\n\n' + f' |_{chan}\n\n' + f'{pretty_struct.pformat(msg)}' ) # transport **WAS NOT** disconnected - return False + return (False, msg) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index d7cc548a..d28f4906 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -49,6 +49,7 @@ from pprint import pformat import signal import sys from typing import ( + Any, Callable, TYPE_CHECKING, ) @@ -68,7 +69,7 @@ from tractor.msg import ( pretty_struct, NamespacePath, types as msgtypes, - Msg, + MsgType, ) from ._ipc import Channel from ._context import ( @@ -96,19 +97,6 @@ from ._rpc import ( process_messages, try_ship_error_to_remote, ) -# from tractor.msg.types import ( -# Aid, -# SpawnSpec, -# Start, -# StartAck, -# Started, -# Yield, -# Stop, -# Return, -# Error, -# ) - - if TYPE_CHECKING: @@ -315,29 +303,32 @@ class Actor: self._reg_addrs = addrs async def wait_for_peer( - self, uid: tuple[str, str] + self, + uid: tuple[str, str], + ) -> tuple[trio.Event, Channel]: ''' - Wait for a connection back from a spawned actor with a `uid` - using a `trio.Event` for sync. + Wait for a connection back from a (spawned sub-)actor with + a `uid` using a `trio.Event` for sync. ''' - log.runtime(f"Waiting for peer {uid} to connect") + log.debug(f'Waiting for peer {uid!r} to connect') event = self._peer_connected.setdefault(uid, trio.Event()) await event.wait() - log.runtime(f"{uid} successfully connected back to us") + log.debug(f'{uid!r} successfully connected back to us') return event, self._peers[uid][-1] def load_modules( self, - debug_mode: bool = False, + # debug_mode: bool = False, ) -> None: ''' - Load enabled RPC py-modules locally (after process fork/spawn). + Load explicitly enabled python modules from local fs after + process spawn. Since this actor may be spawned on a different machine from the original nursery we need to try and load the local module - code (presuming it exists). + code manually (presuming it exists). 
''' try: @@ -350,16 +341,21 @@ class Actor: _mp_fixup_main._fixup_main_from_path( parent_data['init_main_from_path']) + status: str = 'Attempting to import enabled modules:\n' for modpath, filepath in self.enable_modules.items(): # XXX append the allowed module to the python path which # should allow for relative (at least downward) imports. sys.path.append(os.path.dirname(filepath)) - log.runtime(f"Attempting to import {modpath}@{filepath}") - mod = importlib.import_module(modpath) + status += ( + f'|_{modpath!r} -> {filepath!r}\n' + ) + mod: ModuleType = importlib.import_module(modpath) self._mods[modpath] = mod if modpath == '__main__': self._mods['__mp_main__'] = mod + log.runtime(status) + except ModuleNotFoundError: # it is expected the corresponding `ModuleNotExposed` error # will be raised later @@ -413,21 +409,23 @@ class Actor: chan = Channel.from_stream(stream) their_uid: tuple[str, str]|None = chan.uid - con_msg: str = '' + con_status: str = '' + + # TODO: remove this branch since can never happen? + # NOTE: `.uid` is only set after first contact if their_uid: - # NOTE: `.uid` is only set after first contact - con_msg = ( - 'IPC Re-connection from already known peer? ' + con_status = ( + 'IPC Re-connection from already known peer?\n' ) else: - con_msg = ( - 'New IPC connection to us ' + con_status = ( + 'New inbound IPC connection <=\n' ) - con_msg += ( - f'<= @{chan.raddr}\n' + con_status += ( f'|_{chan}\n' # f' |_@{chan.raddr}\n\n' + # ^-TODO-^ remove since alfready in chan.__repr__()? ) # send/receive initial handshake response try: @@ -447,13 +445,13 @@ class Actor: # a bound listener on the "arbiter" addr. the reset will be # because the handshake was never meant took place. log.warning( - con_msg + con_status + ' -> But failed to handshake? Ignoring..\n' ) return - con_msg += ( + con_status += ( f' -> Handshake with actor `{uid[0]}[{uid[1][-6:]}]` complete\n' ) # IPC connection tracking for both peers and new children: @@ -466,7 +464,7 @@ class Actor: None, ) if event: - con_msg += ( + con_status += ( ' -> Waking subactor spawn waiters: ' f'{event.statistics().tasks_waiting}\n' f' -> Registered IPC chan for child actor {uid}@{chan.raddr}\n' @@ -477,7 +475,7 @@ class Actor: event.set() else: - con_msg += ( + con_status += ( f' -> Registered IPC chan for peer actor {uid}@{chan.raddr}\n' ) # type: ignore @@ -491,13 +489,18 @@ class Actor: # TODO: can we just use list-ref directly? chans.append(chan) - log.runtime(con_msg) + con_status += ' -> Entering RPC msg loop..\n' + log.runtime(con_status) # Begin channel management - respond to remote requests and # process received reponses. disconnected: bool = False + last_msg: MsgType try: - disconnected: bool = await process_messages( + ( + disconnected, + last_msg, + ) = await process_messages( self, chan, ) @@ -598,16 +601,24 @@ class Actor: # that the IPC layer may have failed # unexpectedly since it may be the cause of # other downstream errors. - entry = local_nursery._children.get(uid) + entry: tuple|None = local_nursery._children.get(uid) if entry: proc: trio.Process _, proc, _ = entry if ( (poll := getattr(proc, 'poll', None)) - and poll() is None + and + poll() is None # proc still alive ): - log.cancel( + # TODO: change log level based on + # detecting whether chan was created for + # ephemeral `.register_actor()` request! + # -[ ] also, that should be avoidable by + # re-using any existing chan from the + # `._discovery.get_registry()` call as + # well.. 
+ log.runtime( f'Peer IPC broke but subproc is alive?\n\n' f'<=x {chan.uid}@{chan.raddr}\n' @@ -616,17 +627,17 @@ class Actor: # ``Channel`` teardown and closure sequence # drop ref to channel so it can be gc-ed and disconnected - log.runtime( - f'Disconnected IPC channel:\n' - f'uid: {chan.uid}\n' - f'|_{pformat(chan)}\n' + con_teardown_status: str = ( + f'IPC channel disconnected:\n' + f'<=x uid: {chan.uid}\n' + f' |_{pformat(chan)}\n\n' ) chans.remove(chan) # TODO: do we need to be this pedantic? if not chans: - log.runtime( - f'No more channels with {chan.uid}' + con_teardown_status += ( + f'-> No more channels with {chan.uid}' ) self._peers.pop(uid, None) @@ -640,15 +651,16 @@ class Actor: f' |_[{i}] {pformat(chan)}\n' ) - log.runtime( - f'Remaining IPC {len(self._peers)} peers:\n' - + peers_str + con_teardown_status += ( + f'-> Remaining IPC {len(self._peers)} peers: {peers_str}\n' ) # No more channels to other actors (at all) registered # as connected. if not self._peers: - log.runtime("Signalling no more peer channel connections") + con_teardown_status += ( + 'Signalling no more peer channel connections' + ) self._no_more_peers.set() # NOTE: block this actor from acquiring the @@ -723,13 +735,16 @@ class Actor: # TODO: figure out why this breaks tests.. db_cs.cancel() + log.runtime(con_teardown_status) + # finally block closure + # TODO: rename to `._deliver_payload()` since this handles # more then just `result` msgs now obvi XD async def _deliver_ctx_payload( self, chan: Channel, cid: str, - msg: Msg|MsgTypeError, + msg: MsgType|MsgTypeError, ) -> None|bool: ''' @@ -754,7 +769,7 @@ class Actor: # XXX don't need right since it's always in msg? # f'=> cid: {cid}\n\n' - f'{pretty_struct.Struct.pformat(msg)}\n' + f'{pretty_struct.pformat(msg)}\n' ) return @@ -896,9 +911,11 @@ class Actor: cid=cid, ) log.runtime( - 'Sending RPC start msg\n\n' + 'Sending RPC `Start`\n\n' f'=> peer: {chan.uid}\n' - f' |_ {ns}.{func}({kwargs})\n' + f' |_ {ns}.{func}({kwargs})\n\n' + + f'{pretty_struct.pformat(msg)}' ) await chan.send(msg) @@ -955,31 +972,29 @@ class Actor: if self._spawn_method == "trio": - # Receive runtime state from our parent - # parent_data: dict[str, Any] - # parent_data = await chan.recv() - - # TODO: maybe we should just wrap this directly - # in a `Actor.spawn_info: SpawnInfo` struct? + # Receive post-spawn runtime state from our parent. spawnspec: msgtypes.SpawnSpec = await chan.recv() self._spawn_spec = spawnspec - # TODO: eventually all these msgs as - # `msgspec.Struct` with a special mode that - # pformats them in multi-line mode, BUT only - # if "trace"/"util" mode is enabled? log.runtime( 'Received runtime spec from parent:\n\n' - f'{pformat(spawnspec)}\n' + + # TODO: eventually all these msgs as + # `msgspec.Struct` with a special mode that + # pformats them in multi-line mode, BUT only + # if "trace"/"util" mode is enabled? + f'{pretty_struct.pformat(spawnspec)}\n' ) - # accept_addrs: list[tuple[str, int]] = parent_data.pop('bind_addrs') accept_addrs: list[tuple[str, int]] = spawnspec.bind_addrs - # rvs = parent_data.pop('_runtime_vars') - rvs = spawnspec._runtime_vars + # TODO: another `Struct` for rtvs.. + rvs: dict[str, Any] = spawnspec._runtime_vars if rvs['_debug_mode']: try: - log.info( + # TODO: maybe return some status msgs upward + # to that we can emit them in `con_status` + # instead? 
+ log.devx( 'Enabling `stackscope` traces on SIGUSR1' ) from .devx import enable_stack_on_sig @@ -989,7 +1004,6 @@ class Actor: '`stackscope` not installed for use in debug mode!' ) - log.runtime(f'Runtime vars are: {rvs}') rvs['_is_root'] = False _state._runtime_vars.update(rvs) @@ -1006,18 +1020,12 @@ class Actor: for val in spawnspec.reg_addrs ] - # for attr, value in parent_data.items(): + # TODO: better then monkey patching.. + # -[ ] maybe read the actual f#$-in `._spawn_spec` XD for _, attr, value in pretty_struct.iter_fields( spawnspec, ): setattr(self, attr, value) - # if ( - # attr == 'reg_addrs' - # and value - # ): - # self.reg_addrs = [tuple(val) for val in value] - # else: - # setattr(self, attr, value) return ( chan, @@ -1026,12 +1034,11 @@ class Actor: except OSError: # failed to connect log.warning( - f'Failed to connect to parent!?\n\n' - 'Closing IPC [TCP] transport server to\n' - f'{parent_addr}\n' + f'Failed to connect to spawning parent actor!?\n' + f'x=> {parent_addr}\n' f'|_{self}\n\n' ) - await self.cancel(chan=None) # self cancel + await self.cancel(req_chan=None) # self cancel raise async def _serve_forever( @@ -1109,8 +1116,7 @@ class Actor: # chan whose lifetime limits the lifetime of its remotely # requested and locally spawned RPC tasks - similar to the # supervision semantics of a nursery wherein the actual - # implementation does start all such tasks in - # a sub-nursery. + # implementation does start all such tasks in a sub-nursery. req_chan: Channel|None, ) -> bool: @@ -1151,7 +1157,7 @@ class Actor: # other) repr fields instead of doing this all manual.. msg: str = ( f'Runtime cancel request from {requester_type}:\n\n' - f'<= .cancel(): {requesting_uid}\n' + f'<= .cancel(): {requesting_uid}\n\n' ) # TODO: what happens here when we self-cancel tho? @@ -1166,8 +1172,8 @@ class Actor: dbcs = _debug.DebugStatus.req_cs if dbcs is not None: msg += ( - '>> Cancelling active debugger request..\n' - f'|_{_debug.Lock}\n' + '-> Cancelling active debugger request..\n' + f'|_{_debug.Lock.pformat()}' ) dbcs.cancel() @@ -1418,7 +1424,12 @@ class Actor: ''' if self._server_n: - log.runtime("Shutting down channel server") + # TODO: obvi a different server type when we eventually + # support some others XD + server_prot: str = 'TCP' + log.runtime( + f'Cancelling {server_prot} server' + ) self._server_n.cancel_scope.cancel() return True @@ -1602,6 +1613,7 @@ async def async_main( assert accept_addrs try: + # TODO: why is this not with the root nursery? 
actor._server_n = await service_nursery.start( partial( actor._serve_forever, @@ -1886,13 +1898,13 @@ class Arbiter(Actor): sockaddrs: list[tuple[str, int]] = [] sockaddr: tuple[str, int] - for (aname, _), sockaddr in self._registry.items(): - log.runtime( - f'Actor mailbox info:\n' - f'aname: {aname}\n' - f'sockaddr: {sockaddr}\n' + mailbox_info: str = 'Actor registry contact infos:\n' + for uid, sockaddr in self._registry.items(): + mailbox_info += ( + f'|_uid: {uid}\n' + f'|_sockaddr: {sockaddr}\n\n' ) - if name == aname: + if name == uid[0]: sockaddrs.append(sockaddr) if not sockaddrs: @@ -1904,6 +1916,7 @@ class Arbiter(Actor): if not isinstance(uid, trio.Event): sockaddrs.append(self._registry[uid]) + log.runtime(mailbox_info) return sockaddrs async def register_actor( -- 2.34.1 From 88a0e90f82ab005dc1848cbe432d5e77758d45ad Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 12:37:17 -0400 Subject: [PATCH 279/378] Reorg frames pformatters, add `Context.repr_state` A better spot for the pretty-formatting of frame text (and thus tracebacks) is in the new `.devx._code` module: - move from `._exceptions` -> `.devx._code.pformat_boxed_tb()`. - add new `pformat_caller_frame()` factored out the use case in `._exceptions._mk_msg_type_err()` where we dump a stack trace for bad `.send()` side IPC msgs. Add some new pretty-format methods to `Context`: - explicitly implement `.pformat()` and allow an `extra_fields: dict` which can be used to inject additional fields (maybe eventually by default) such as is now used inside `._maybe_cancel_and_set_remote_error()` when reporting the internal `._scope` state in cancel logging. - add a new `.repr_state -> str` which provides a single string status depending on the internal state of the IPC ctx in terms of the shuttle protocol's "phase"; use it from `.pformat()` for the `|_state:`. - set `.started(complain_no_parity=False)` now since we presume decoding with `.pld: Raw` now with the new `PldRx` design. - use new `msgops.current_pldrx()` in `mk_context()`. --- tractor/_context.py | 163 +++++++++++++++++++++++++++++------------ tractor/_exceptions.py | 83 +++------------------ tractor/devx/_code.py | 102 ++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 121 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 3ed54d7d..f333c9ee 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -61,7 +61,6 @@ from ._exceptions import ( ) from .log import get_logger from .msg import ( - _codec, Error, MsgType, MsgCodec, @@ -103,7 +102,6 @@ class Unresolved: a final return value or raised error is resolved. ''' - ... # TODO: make this a .msg.types.Struct! @@ -116,19 +114,19 @@ class Context: NB: This class should **never be instatiated directly**, it is allocated by the runtime in 2 ways: - - by entering ``Portal.open_context()`` which is the primary - public API for any "caller" task or, + - by entering `Portal.open_context()` which is the primary + public API for any "parent" task or, - by the RPC machinery's `._rpc._invoke()` as a `ctx` arg - to a remotely scheduled "callee" function. + to a remotely scheduled "child" function. - AND is always constructed using the below ``mk_context()``. + AND is always constructed using the below `mk_context()`. Allows maintaining task or protocol specific state between 2 cancel-scope-linked, communicating and parallel executing `trio.Task`s. Contexts are allocated on each side of any task RPC-linked msg dialog, i.e. for every request to a remote actor from a `Portal`. 
On the "callee" side a context is - always allocated inside ``._rpc._invoke()``. + always allocated inside `._rpc._invoke()`. TODO: more detailed writeup on cancellation, error and streaming semantics.. @@ -262,7 +260,13 @@ class Context: _strict_started: bool = False _cancel_on_msgerr: bool = True - def __str__(self) -> str: + def pformat( + self, + extra_fields: dict[str, Any]|None = None, + # ^-TODO-^ some built-in extra state fields + # we'll want in some devx specific cases? + + ) -> str: ds: str = '=' # ds: str = ': ' @@ -279,11 +283,7 @@ class Context: outcome_str: str = self.repr_outcome( show_error_fields=True ) - outcome_typ_str: str = self.repr_outcome( - type_only=True - ) - - return ( + fmtstr: str = ( f'\n' ) # NOTE: making this return a value that can be passed to @@ -335,7 +345,8 @@ class Context: # logging perspective over `eval()`-ability since we do NOT # target serializing non-struct instances! # def __repr__(self) -> str: - __repr__ = __str__ + __str__ = pformat + __repr__ = pformat @property def cancel_called(self) -> bool: @@ -615,10 +626,10 @@ class Context: whom: str = ( 'us' if error.canceller == self._actor.uid - else 'peer' + else 'a remote peer (not us)' ) log.cancel( - f'IPC context cancelled by {whom}!\n\n' + f'IPC context was cancelled by {whom}!\n\n' f'{error}' ) @@ -626,7 +637,6 @@ class Context: msgerr = True log.error( f'IPC dialog error due to msg-type caused by {self.peer_side!r} side\n\n' - f'{error}\n' f'{pformat(self)}\n' ) @@ -696,24 +706,23 @@ class Context: else: message: str = 'NOT cancelling `Context._scope` !\n\n' - scope_info: str = 'No `self._scope: CancelScope` was set/used ?' + fmt_str: str = 'No `self._scope: CancelScope` was set/used ?' if cs: - scope_info: str = ( - f'self._scope: {cs}\n' - f'|_ .cancel_called: {cs.cancel_called}\n' - f'|_ .cancelled_caught: {cs.cancelled_caught}\n' - f'|_ ._cancel_status: {cs._cancel_status}\n\n' + fmt_str: str = self.pformat( + extra_fields={ + '._is_self_cancelled()': self._is_self_cancelled(), + '._cancel_on_msgerr': self._cancel_on_msgerr, - f'{self}\n' - f'|_ ._is_self_cancelled(): {self._is_self_cancelled()}\n' - f'|_ ._cancel_on_msgerr: {self._cancel_on_msgerr}\n\n' - - f'msgerr: {msgerr}\n' + '._scope': cs, + '._scope.cancel_called': cs.cancel_called, + '._scope.cancelled_caught': cs.cancelled_caught, + '._scope._cancel_status': cs._cancel_status, + } ) log.cancel( message + - f'{scope_info}' + fmt_str ) # TODO: maybe we should also call `._res_scope.cancel()` if it # exists to support cancelling any drain loop hangs? @@ -748,7 +757,7 @@ class Context: ) return ( # f'{self._nsf}() -{{{codec}}}-> {repr(self.outcome)}:' - f'{self._nsf}() -> {outcome_str}:' + f'{self._nsf}() -> {outcome_str}' ) @property @@ -836,7 +845,7 @@ class Context: if not self._portal: raise InternalError( 'No portal found!?\n' - 'Why is this supposed caller context missing it?' + 'Why is this supposed {self.side!r}-side ctx task missing it?!?' 
) cid: str = self.cid @@ -1274,11 +1283,11 @@ class Context: ) log.cancel( - 'Ctx drained pre-result msgs:\n' - f'{pformat(drained_msgs)}\n\n' + 'Ctx drained to final result msgs\n' + f'{return_msg}\n\n' - f'Final return msg:\n' - f'{return_msg}\n' + f'pre-result drained msgs:\n' + f'{pformat(drained_msgs)}\n' ) self.maybe_raise( @@ -1443,6 +1452,65 @@ class Context: repr(self._result) ) + @property + def repr_state(self) -> str: + ''' + A `str`-status describing the current state of this + inter-actor IPC context in terms of the current "phase" state + of the SC shuttling dialog protocol. + + ''' + merr: Exception|None = self.maybe_error + outcome: Unresolved|Exception|Any = self.outcome + + match ( + outcome, + merr, + ): + case ( + Unresolved, + ContextCancelled(), + ) if self.cancel_acked: + status = 'self-cancelled' + + case ( + Unresolved, + ContextCancelled(), + ) if ( + self.canceller + and not self._cancel_called + ): + status = 'peer-cancelled' + + case ( + Unresolved, + BaseException(), + ) if self.canceller: + status = 'errored' + + case ( + _, # any non-unresolved value + None, + ) if self._final_result_is_set(): + status = 'returned' + + case ( + Unresolved, # noqa (weird.. ruff) + None, + ): + if stream := self._stream: + if stream.closed: + status = 'streaming-finished' + else: + status = 'streaming' + elif self._started_called: + status = 'started' + + case _: + status = 'unknown!?' + + return status + async def started( self, @@ -1451,7 +1519,11 @@ class Context: value: PayloadT|None = None, strict_parity: bool = False, - complain_no_parity: bool = True, + + # TODO: this will always emit now that we do `.pld: Raw` + # passthrough.. so maybe just only complain when above strict + # flag is set? + complain_no_parity: bool = False, ) -> None: ''' @@ -1511,18 +1583,19 @@ class Context: ) raise RuntimeError( 'Failed to roundtrip `Started` msg?\n' - f'{pformat(rt_started)}\n' + f'{pretty_struct.pformat(rt_started)}\n' ) if rt_started != started_msg: # TODO: break these methods out from the struct subtype? + # TODO: make that one a mod func too.. diff = pretty_struct.Struct.__sub__( rt_started, started_msg, ) complaint: str = ( - 'Started value does not match after codec rountrip?\n\n' + 'Started value does not match after roundtrip?\n\n' f'{diff}' ) @@ -1538,8 +1611,6 @@ class Context: else: log.warning(complaint) - # started_msg = rt_started - await self.chan.send(started_msg) # raise any msg type error NO MATTER WHAT! @@ -2354,7 +2425,7 @@ async def open_context_from_portal( # FINALLY, remove the context from runtime tracking and # exit! log.runtime( - 'De-allocating IPC ctx opened with {ctx.side!r} peer \n' + f'De-allocating IPC ctx opened with {ctx.side!r} peer \n' f'uid: {uid}\n' f'cid: {ctx.cid}\n' ) @@ -2390,10 +2461,8 @@ def mk_context( from .devx._code import find_caller_info caller_info: CallerInfo|None = find_caller_info() - pld_rx = msgops.PldRx( - # _rx_mc=recv_chan, - _msgdec=_codec.mk_dec(spec=pld_spec) - ) + # TODO: when/how do we apply `.limit_plds()` from here? 
+ pld_rx: msgops.PldRx = msgops.current_pldrx() ctx = Context( chan=chan, diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index f2ff8c21..af653f92 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -46,7 +46,7 @@ from tractor.msg import ( Error, MsgType, Stop, - Yield, + # Yield, types as msgtypes, MsgCodec, MsgDec, @@ -140,71 +140,6 @@ def get_err_type(type_name: str) -> BaseException|None: return type_ref -def pformat_boxed_tb( - tb_str: str, - fields_str: str|None = None, - field_prefix: str = ' |_', - - tb_box_indent: int|None = None, - tb_body_indent: int = 1, - -) -> str: - if ( - fields_str - and - field_prefix - ): - fields: str = textwrap.indent( - fields_str, - prefix=field_prefix, - ) - else: - fields = fields_str or '' - - tb_body = tb_str - if tb_body_indent: - tb_body: str = textwrap.indent( - tb_str, - prefix=tb_body_indent * ' ', - ) - - tb_box: str = ( - - # orig - # f' |\n' - # f' ------ - ------\n\n' - # f'{tb_str}\n' - # f' ------ - ------\n' - # f' _|\n' - - f'|\n' - f' ------ - ------\n\n' - # f'{tb_str}\n' - f'{tb_body}' - f' ------ - ------\n' - f'_|\n' - ) - tb_box_indent: str = ( - tb_box_indent - or - 1 - - # (len(field_prefix)) - # ? ^-TODO-^ ? if you wanted another indent level - ) - if tb_box_indent > 0: - tb_box: str = textwrap.indent( - tb_box, - prefix=tb_box_indent * ' ', - ) - - return ( - fields - + - tb_box - ) - - def pack_from_raise( local_err: ( ContextCancelled @@ -504,12 +439,15 @@ class RemoteActorError(Exception): reprol_str: str = ( f'{type(self).__name__}' # type name f'[{self.boxed_type_str}]' # parameterized by boxed type - '(' # init-style look ) + _repr: str = self._mk_fields_str( self.reprol_fields, end_char=' ', ) + if _repr: + reprol_str += '(' # init-style call + return ( reprol_str + @@ -521,6 +459,7 @@ class RemoteActorError(Exception): Nicely formatted boxed error meta data + traceback. ''' + from tractor.devx._code import pformat_boxed_tb fields: str = self._mk_fields_str( _body_fields + @@ -1092,14 +1031,10 @@ def _mk_msg_type_err( # no src error from `msgspec.msgpack.Decoder.decode()` so # prolly a manual type-check on our part. if message is None: - fmt_stack: str = ( - '\n'.join(traceback.format_stack(limit=3)) - ) - tb_fmt: str = pformat_boxed_tb( - tb_str=fmt_stack, - field_prefix=' ', - indent='', + from tractor.devx._code import ( + pformat_caller_frame, ) + tb_fmt: str = pformat_caller_frame(stack_limit=3) message: str = ( f'invalid msg -> {msg}: {type(msg)}\n\n' f'{tb_fmt}\n' diff --git a/tractor/devx/_code.py b/tractor/devx/_code.py index 01d64cd1..8d55212b 100644 --- a/tractor/devx/_code.py +++ b/tractor/devx/_code.py @@ -23,6 +23,8 @@ from __future__ import annotations import inspect # import msgspec # from pprint import pformat +import textwrap +import traceback from types import ( FrameType, FunctionType, @@ -175,3 +177,103 @@ def find_caller_info( ) return None + + +def pformat_boxed_tb( + tb_str: str, + fields_str: str|None = None, + field_prefix: str = ' |_', + + tb_box_indent: int|None = None, + tb_body_indent: int = 1, + +) -> str: + ''' + Create a "boxed" looking traceback string. + + Useful for emphasizing traceback text content as being an + embedded attribute of some other object (like + a `RemoteActorError` or other boxing remote error shuttle + container). + + Any other parent/container "fields" can be passed in the + `fields_str` input along with other prefix/indent settings. 
+ + ''' + if ( + fields_str + and + field_prefix + ): + fields: str = textwrap.indent( + fields_str, + prefix=field_prefix, + ) + else: + fields = fields_str or '' + + tb_body = tb_str + if tb_body_indent: + tb_body: str = textwrap.indent( + tb_str, + prefix=tb_body_indent * ' ', + ) + + tb_box: str = ( + + # orig + # f' |\n' + # f' ------ - ------\n\n' + # f'{tb_str}\n' + # f' ------ - ------\n' + # f' _|\n' + + f'|\n' + f' ------ - ------\n\n' + # f'{tb_str}\n' + f'{tb_body}' + f' ------ - ------\n' + f'_|\n' + ) + tb_box_indent: str = ( + tb_box_indent + or + 1 + + # (len(field_prefix)) + # ? ^-TODO-^ ? if you wanted another indent level + ) + if tb_box_indent > 0: + tb_box: str = textwrap.indent( + tb_box, + prefix=tb_box_indent * ' ', + ) + + return ( + fields + + + tb_box + ) + + +def pformat_caller_frame( + stack_limit: int = 1, + box_tb: bool = True, +) -> str: + ''' + Capture and return the traceback text content from + `stack_limit` call frames up. + + ''' + tb_str: str = ( + '\n'.join( + traceback.format_stack(limit=stack_limit) + ) + ) + if box_tb: + tb_str: str = pformat_boxed_tb( + tb_str=tb_str, + field_prefix=' ', + indent='', + ) + return tb_str -- 2.34.1 From 30c5896d26cc4567f9176b586538bded8be67e74 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 12:55:46 -0400 Subject: [PATCH 280/378] Fix attr name error, use public `MsgDec.dec` --- tractor/msg/_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 5a9ab46a..4cf20496 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -229,7 +229,7 @@ class PldRx(Struct): log.runtime( 'Decoded msg payload\n\n' f'{msg}\n' - f'|_pld={pld!r}' + f'|_pld={pld!r}\n' ) return pld @@ -237,7 +237,7 @@ class PldRx(Struct): except ValidationError as src_err: msgterr: MsgTypeError = _mk_msg_type_err( msg=msg, - codec=self._dec, + codec=self.dec, src_validation_error=src_err, ) msg: Error = pack_from_raise( -- 2.34.1 From 338395346d9a35560d94e9b0c6e5d4849a4f9799 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 12:56:29 -0400 Subject: [PATCH 281/378] Tweak `breakpoint()` usage error message --- tractor/_root.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index afe91e7f..de8388d5 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -124,8 +124,9 @@ async def open_root_actor( # usage by a clobbered TTY's stdstreams! def block_bps(*args, **kwargs): raise RuntimeError( - '`tractor` blocks built-in `breakpoint()` calls by default!\n' - 'If you need to us it please install `greenback` and set ' + 'Trying to use `breakpoint()` eh?\n' + 'Welp, `tractor` blocks `breakpoint()` built-in calls by default!\n' + 'If you need to use it please install `greenback` and set ' '`debug_mode=True` when opening the runtime ' '(either via `.open_nursery()` or `open_root_actor()`)\n' ) -- 2.34.1 From 54530dcf949449b4bcfe19073dc9ce007831f6a0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 12:59:38 -0400 Subject: [PATCH 282/378] Type annot the proc from `trio.lowlevel.open_process()` --- tractor/_spawn.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 43814918..06a2bf10 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -451,10 +451,9 @@ async def trio_proc( proc: trio.Process|None = None try: try: - # TODO: needs ``trio_typing`` patch? 
- proc = await trio.lowlevel.open_process(spawn_cmd) + proc: trio.Process = await trio.lowlevel.open_process(spawn_cmd) log.runtime( - 'Started new sub-proc\n' + 'Started new child\n' f'|_{proc}\n' ) -- 2.34.1 From cca3206fd6d924c29b5d30c5535dcf9ed47e7b5e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 13:00:03 -0400 Subject: [PATCH 283/378] Use `log.devx()` for `stackscope` messages --- tractor/devx/_stackscope.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/devx/_stackscope.py b/tractor/devx/_stackscope.py index 38c7af1d..e8e97d1a 100644 --- a/tractor/devx/_stackscope.py +++ b/tractor/devx/_stackscope.py @@ -65,7 +65,7 @@ def dump_task_tree() -> None: level='cancel', ) actor: Actor = _state.current_actor() - log.pdb( + log.devx( f'Dumping `stackscope` tree for actor\n' f'{actor.name}: {actor}\n' f' |_{mp.current_process()}\n\n' @@ -104,7 +104,7 @@ def signal_handler( subproc: ProcessType subactor: Actor for subactor, subproc, _ in an._children.values(): - log.pdb( + log.devx( f'Relaying `SIGUSR1`[{sig}] to sub-actor\n' f'{subactor}\n' f' |_{subproc}\n' -- 2.34.1 From 63c23d6b823d3c910b390942da5932b5fab6aa1d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 30 Apr 2024 13:01:07 -0400 Subject: [PATCH 284/378] Add todo for rigorous struct-type spec of `SpawnSpec` fields --- tractor/msg/types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 63c0a467..7e10dab0 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -144,6 +144,8 @@ class SpawnSpec( `Aid` msg. ''' + # TODO: similar to the `Start` kwargs spec needed below, we need + # a hard `Struct` def for all of these fields! _parent_main_data: dict _runtime_vars: dict[str, Any] -- 2.34.1 From 544ff5ab4cbbc69273b45742fddd06f63bf45fe0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 May 2024 12:55:16 -0400 Subject: [PATCH 285/378] Change to `RemoteActorError.pformat()` For more sane manual calls as needed in logging purposes. Obvi remap the dunder methods to it. Other: - drop `hide_tb: bool` from `unpack_error()`, shouldn't need it since frame won't ever be part of any tb raised from returned error. - add a `is_invalid_payload: bool` to `_raise_from_unexpected_msg()` to be used from `PldRx` where we don't need to decode the IPC msg, just the payload; make the error message reflect this case. - drop commented `._portal._unwrap_msg()` since we've replaced it with `PldRx`'s delegation to newer `._raise_from_unexpected_msg()`. - hide the `Portal.result()` frame by default, again. --- tractor/_exceptions.py | 156 +++++++++++++++++++++++------------------ tractor/_portal.py | 37 +--------- 2 files changed, 89 insertions(+), 104 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index af653f92..83675069 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -46,7 +46,6 @@ from tractor.msg import ( Error, MsgType, Stop, - # Yield, types as msgtypes, MsgCodec, MsgDec, @@ -212,6 +211,8 @@ class RemoteActorError(Exception): ) -> None: super().__init__(message) + # for manual display without having to muck with `Exception.args` + self._message: str = message # TODO: maybe a better name? # - .errtype # - .retype @@ -454,32 +455,46 @@ class RemoteActorError(Exception): _repr ) - def __repr__(self) -> str: + def pformat(self) -> str: ''' - Nicely formatted boxed error meta data + traceback. 
+ Nicely formatted boxed error meta data + traceback, OR just + the normal message from `.args` (for eg. as you'd want shown + by a locally raised `ContextCancelled`). ''' - from tractor.devx._code import pformat_boxed_tb - fields: str = self._mk_fields_str( - _body_fields - + - self.extra_body_fields, - ) - body: str = pformat_boxed_tb( - tb_str=self.tb_str, - fields_str=fields, - field_prefix=' |_', - # ^- is so that it's placed like so, - # just after ' ) + __repr__ = pformat + __str__ = pformat + def unwrap( self, ) -> BaseException: @@ -809,12 +824,9 @@ def pack_error( def unpack_error( msg: Error, - - chan: Channel|None = None, + chan: Channel, box_type: RemoteActorError = RemoteActorError, - hide_tb: bool = True, - ) -> None|Exception: ''' Unpack an 'error' message from the wire @@ -824,12 +836,10 @@ def unpack_error( which is the responsibilitiy of the caller. ''' - __tracebackhide__: bool = hide_tb - if not isinstance(msg, Error): return None - # retrieve the remote error's encoded details from fields + # retrieve the remote error's msg-encoded details tb_str: str = msg.tb_str message: str = ( f'{chan.uid}\n' @@ -858,7 +868,6 @@ def unpack_error( # original source error. elif boxed_type_str == 'RemoteActorError': assert boxed_type is RemoteActorError - # assert len(error_dict['relay_path']) >= 1 assert len(msg.relay_path) >= 1 exc = box_type( @@ -943,8 +952,6 @@ def _raise_from_unexpected_msg( raise unpack_error( msg, ctx.chan, - hide_tb=hide_tb, - ) from src_err # `MsgStream` termination msg. @@ -1014,6 +1021,7 @@ def _mk_msg_type_err( src_validation_error: ValidationError|None = None, src_type_error: TypeError|None = None, + is_invalid_payload: bool = False, ) -> MsgTypeError: ''' @@ -1028,12 +1036,12 @@ def _mk_msg_type_err( '`codec` must be a `MsgCodec` for send-side errors?' ) + from tractor.devx import ( + pformat_caller_frame, + ) # no src error from `msgspec.msgpack.Decoder.decode()` so # prolly a manual type-check on our part. if message is None: - from tractor.devx._code import ( - pformat_caller_frame, - ) tb_fmt: str = pformat_caller_frame(stack_limit=3) message: str = ( f'invalid msg -> {msg}: {type(msg)}\n\n' @@ -1071,47 +1079,57 @@ def _mk_msg_type_err( # `Channel.recv()` case else: - # decode the msg-bytes using the std msgpack - # interchange-prot (i.e. without any - # `msgspec.Struct` handling) so that we can - # determine what `.msg.types.Msg` is the culprit - # by reporting the received value. - msg_dict: dict = msgpack.decode(msg) - msg_type_name: str = msg_dict['msg_type'] - msg_type = getattr(msgtypes, msg_type_name) - message: str = ( - f'invalid `{msg_type_name}` IPC msg\n\n' - ) + if is_invalid_payload: + msg_type: str = type(msg) + message: str = ( + f'invalid `{msg_type.__qualname__}` payload\n\n' + f'<{type(msg).__qualname__}(\n' + f' |_pld: {codec.pld_spec_str} = {msg.pld!r}' + f')>\n' + ) + + else: + # decode the msg-bytes using the std msgpack + # interchange-prot (i.e. without any + # `msgspec.Struct` handling) so that we can + # determine what `.msg.types.Msg` is the culprit + # by reporting the received value. + msg_dict: dict = msgpack.decode(msg) + msg_type_name: str = msg_dict['msg_type'] + msg_type = getattr(msgtypes, msg_type_name) + message: str = ( + f'invalid `{msg_type_name}` IPC msg\n\n' + ) + # XXX be "fancy" and see if we can determine the exact + # invalid field such that we can comprehensively report + # the specific field's type problem. 
+ msgspec_msg: str = src_validation_error.args[0].rstrip('`') + msg, _, maybe_field = msgspec_msg.rpartition('$.') + obj = object() + if (field_val := msg_dict.get(maybe_field, obj)) is not obj: + field_name_expr: str = ( + f' |_{maybe_field}: {codec.pld_spec_str} = ' + ) + fmt_val_lines: list[str] = pformat(field_val).splitlines() + fmt_val: str = ( + f'{fmt_val_lines[0]}\n' + + + textwrap.indent( + '\n'.join(fmt_val_lines[1:]), + prefix=' '*len(field_name_expr), + ) + ) + message += ( + f'{msg.rstrip("`")}\n\n' + f'<{msg_type.__qualname__}(\n' + # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' + f'{field_name_expr}{fmt_val}\n' + f')>' + ) + if verb_header: message = f'{verb_header} ' + message - # XXX see if we can determine the exact invalid field - # such that we can comprehensively report the - # specific field's type problem - msgspec_msg: str = src_validation_error.args[0].rstrip('`') - msg, _, maybe_field = msgspec_msg.rpartition('$.') - obj = object() - if (field_val := msg_dict.get(maybe_field, obj)) is not obj: - field_name_expr: str = ( - f' |_{maybe_field}: {codec.pld_spec_str} = ' - ) - fmt_val_lines: list[str] = pformat(field_val).splitlines() - fmt_val: str = ( - f'{fmt_val_lines[0]}\n' - + - textwrap.indent( - '\n'.join(fmt_val_lines[1:]), - prefix=' '*len(field_name_expr), - ) - ) - message += ( - f'{msg.rstrip("`")}\n\n' - f'<{msg_type.__qualname__}(\n' - # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' - f'{field_name_expr}{fmt_val}\n' - f')>' - ) - msgtyperr = MsgTypeError.from_decode( message=message, msgdict=msg_dict, diff --git a/tractor/_portal.py b/tractor/_portal.py index 806dcc7b..79a9dc5d 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -68,40 +68,6 @@ if TYPE_CHECKING: log = get_logger(__name__) -# TODO: remove and/or rework? -# -[ ] rename to `unwrap_result()` and use -# `._raise_from_unexpected_msg()` (after tweak to accept a `chan: -# Channel` arg) in key block?? -# -[ ] pretty sure this is entirely covered by -# `_exceptions._raise_from_unexpected_msg()` so REMOVE! -# def _unwrap_msg( -# msg: Return|Error, -# ctx: Context, - -# hide_tb: bool = True, - -# ) -> Any: -# ''' -# Unwrap a final result from a `{return: }` IPC msg. - -# ''' -# __tracebackhide__: bool = hide_tb -# try: -# return msg.pld -# except AttributeError as err: - -# # internal error should never get here -# # assert msg.get('cid'), ( -# assert msg.cid, ( -# "Received internal error at portal?" -# ) - -# raise unpack_error( -# msg, -# ctx.chan, -# ) from err - - class Portal: ''' A 'portal' to a memory-domain-separated `Actor`. @@ -173,12 +139,13 @@ class Portal: portal=self, ) + # @api_frame async def result(self) -> Any: ''' Return the result(s) from the remote actor's "main" task. ''' - # __tracebackhide__ = True + __tracebackhide__ = True # Check for non-rpc errors slapped on the # channel for which we always raise exc = self.channel._exc -- 2.34.1 From 523c24eb72d1c708d7b363c38b96665f9e01897a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 May 2024 13:04:58 -0400 Subject: [PATCH 286/378] Move pformatters into new `.devx.pformat` Since `._code` is prolly gonna get renamed (to something "frame & stack tools" related) and to give a bit better organization. Also adds a new `add_div()` helper, factored out of ctxc message creation in `._rpc._invoke()`, for adding a little "header line" divider under a given `message: str` with a little math to center it. 
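For a quick feel of the new divider helper (an illustrative sketch only,
assuming the module is importable as `tractor.devx.pformat` per the new
file added below):

    from tractor.devx.pformat import add_div

    header: str = 'Actor was remotely cancelled by its peer'
    # `add_div()` returns only the (roughly centered) divider line, so
    # the caller concatenates it under the header message itself:
    report: str = (
        header
        + add_div(message=header)
        + 'and the details go here\n'
    )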
--- tractor/devx/pformat.py | 135 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 tractor/devx/pformat.py diff --git a/tractor/devx/pformat.py b/tractor/devx/pformat.py new file mode 100644 index 00000000..0b35feee --- /dev/null +++ b/tractor/devx/pformat.py @@ -0,0 +1,135 @@ +# tractor: structured concurrent "actors". +# Copyright 2018-eternity Tyler Goodlet. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +''' +Pretty formatters for use throughout the code base. +Mostly handy for logging and exception message content. + +''' +import textwrap +import traceback + + +def add_div( + message: str, + div_str: str = '------ - ------', + +) -> str: + ''' + Add a "divider string" to the input `message` with + a little math to center it underneath. + + ''' + div_offset: int = ( + round(len(message)/2)+1 + - + round(len(div_str)/2)+1 + ) + div_str: str = ( + '\n' + ' '*div_offset + f'{div_str}\n' + ) + return div_str + + +def pformat_boxed_tb( + tb_str: str, + fields_str: str|None = None, + field_prefix: str = ' |_', + + tb_box_indent: int|None = None, + tb_body_indent: int = 1, + +) -> str: + ''' + Create a "boxed" looking traceback string. + + Useful for emphasizing traceback text content as being an + embedded attribute of some other object (like + a `RemoteActorError` or other boxing remote error shuttle + container). + + Any other parent/container "fields" can be passed in the + `fields_str` input along with other prefix/indent settings. + + ''' + if ( + fields_str + and + field_prefix + ): + fields: str = textwrap.indent( + fields_str, + prefix=field_prefix, + ) + else: + fields = fields_str or '' + + tb_body = tb_str + if tb_body_indent: + tb_body: str = textwrap.indent( + tb_str, + prefix=tb_body_indent * ' ', + ) + + tb_box: str = ( + f'|\n' + f' ------ - ------\n' + f'{tb_body}' + f' ------ - ------\n' + f'_|\n' + ) + tb_box_indent: str = ( + tb_box_indent + or + 1 + + # (len(field_prefix)) + # ? ^-TODO-^ ? if you wanted another indent level + ) + if tb_box_indent > 0: + tb_box: str = textwrap.indent( + tb_box, + prefix=tb_box_indent * ' ', + ) + + return ( + fields + + + tb_box + ) + + +def pformat_caller_frame( + stack_limit: int = 1, + box_tb: bool = True, +) -> str: + ''' + Capture and return the traceback text content from + `stack_limit` call frames up. 
+ + ''' + tb_str: str = ( + '\n'.join( + traceback.format_stack(limit=stack_limit) + ) + ) + if box_tb: + tb_str: str = pformat_boxed_tb( + tb_str=tb_str, + field_prefix=' ', + indent='', + ) + return tb_str -- 2.34.1 From 7707e0e75aa858bf9e2d9a5077aaa811f08df456 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 May 2024 13:07:53 -0400 Subject: [PATCH 287/378] Woops, make `log.devx()` level 600 --- tractor/log.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 1870d4e1..e85b49cf 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -53,7 +53,7 @@ LEVELS: dict[str, int] = { 'RUNTIME': 15, 'CANCEL': 16, 'PDB': 500, - 'DEVX': 500, + 'DEVX': 600, } # _custom_levels: set[str] = { # lvlname.lower for lvlname in LEVELS.keys() @@ -132,7 +132,7 @@ class StackLevelAdapter(logging.LoggerAdapter): "Developer experience" sub-sys statuses. ''' - return self.log(500, msg) + return self.log(600, msg) def log( self, -- 2.34.1 From 8ffa6a5e68be0267ff2760a0107480a9b3a87932 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 May 2024 13:12:44 -0400 Subject: [PATCH 288/378] "Icons" in `._entry`'s subactor `.info()` messages Add a little `>` or `X` supervision icon indicating the spawning or termination of each sub-actor respectively. --- tractor/_entry.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tractor/_entry.py b/tractor/_entry.py index 78f83283..750dc59f 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -20,6 +20,7 @@ Sub-process entry points. """ from __future__ import annotations from functools import partial +# import textwrap from typing import ( Any, TYPE_CHECKING, @@ -91,7 +92,7 @@ def _mp_main( pass # handle it the same way trio does? finally: - log.info(f"Actor {actor.uid} terminated") + log.info(f"Subactor {actor.uid} terminated") def _trio_main( @@ -125,9 +126,11 @@ def _trio_main( f' loglevel: {actor.loglevel}\n' ) log.info( - 'Started new trio process:\n' + 'Started new trio subactor:\n' + - actor_info + '>\n' # like a "started/play"-icon from super perspective + + + actor_info, ) try: @@ -148,5 +151,7 @@ def _trio_main( log.info( 'Subactor terminated\n' + + 'x\n' # like a "crossed-out/killed" from super perspective + + actor_info ) -- 2.34.1 From b278164f83893ce401c07f457445c92d9e9cd69f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 6 May 2024 13:27:00 -0400 Subject: [PATCH 289/378] Mk `drain_to_final_msg()` never raise from `Error` Since we usually want them raised from some (internal) call to `Context.maybe_raise()` and NOT directly from the drainage call, make it possible via a new `raise_error: bool` to both `PldRx.recv_msg_w_pld()` and `.dec_msg()`. In support, - rename `return_msg` -> `result_msg` since we expect to return `Error`s. - do a `result_msg` assign and `break` in the `case Error()`. - add `**dec_msg_kwargs` passthrough for other `.dec_msg()` calling methods. Other, - drop/aggregate todo-notes around the main loop's `ctx._pld_rx.recv_msg_w_pld()` call. - add (configurable) frame hiding to most payload receive meths. 
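In effect the drain loop's receive now reads like the following
(condensed straight from the hunk below, with the surrounding
match/except machinery elided):

    # never raise a remote/boxed error from the drainage call itself;
    # downstream `Context.maybe_raise()` decides whether/when to raise.
    msg, pld = await ctx._pld_rx.recv_msg_w_pld(
        ipc=ctx,
        expect_msg=Return,
        raise_error=False,
    )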
--- tractor/msg/_ops.py | 146 ++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 74 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 4cf20496..1ba623db 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -161,9 +161,10 @@ class PldRx(Struct): ipc_msg: MsgType|None = None, expect_msg: Type[MsgType]|None = None, - **kwargs, + **dec_msg_kwargs, ) -> Any|Raw: + __tracebackhide__: bool = True msg: MsgType = ( ipc_msg @@ -176,6 +177,7 @@ class PldRx(Struct): msg, ctx=ctx, expect_msg=expect_msg, + **dec_msg_kwargs, ) async def recv_pld( @@ -183,14 +185,16 @@ class PldRx(Struct): ctx: Context, ipc_msg: MsgType|None = None, expect_msg: Type[MsgType]|None = None, + hide_tb: bool = True, - **kwargs + **dec_msg_kwargs, ) -> Any|Raw: ''' Receive a `MsgType`, then decode and return its `.pld` field. ''' + __tracebackhide__: bool = hide_tb msg: MsgType = ( ipc_msg or @@ -199,9 +203,10 @@ class PldRx(Struct): await ctx._rx_chan.receive() ) return self.dec_msg( - msg, + msg=msg, ctx=ctx, expect_msg=expect_msg, + **dec_msg_kwargs, ) def dec_msg( @@ -210,12 +215,16 @@ class PldRx(Struct): ctx: Context, expect_msg: Type[MsgType]|None, + raise_error: bool = True, + hide_tb: bool = True, + ) -> PayloadT|Raw: ''' Decode a msg's payload field: `MsgType.pld: PayloadT|Raw` and return the value or raise an appropriate error. ''' + __tracebackhide__: bool = hide_tb match msg: # payload-data shuttle msg; deliver the `.pld` value # directly to IPC (primitive) client-consumer code. @@ -228,7 +237,8 @@ class PldRx(Struct): pld: PayloadT = self._pldec.decode(pld) log.runtime( 'Decoded msg payload\n\n' - f'{msg}\n' + f'{msg}\n\n' + f'where payload is\n' f'|_pld={pld!r}\n' ) return pld @@ -237,8 +247,9 @@ class PldRx(Struct): except ValidationError as src_err: msgterr: MsgTypeError = _mk_msg_type_err( msg=msg, - codec=self.dec, + codec=self.pld_dec, src_validation_error=src_err, + is_invalid_payload=True, ) msg: Error = pack_from_raise( local_err=msgterr, @@ -263,8 +274,29 @@ class PldRx(Struct): case Error(): src_err = MessagingError( - 'IPC ctx dialog terminated without `Return`-ing a result' + 'IPC ctx dialog terminated without `Return`-ing a result\n' + f'Instead it raised {msg.boxed_type_str!r}!' ) + # XXX NOTE XXX another super subtle runtime-y thing.. + # + # - when user code (transitively) calls into this + # func (usually via a `Context/MsgStream` API) we + # generally want errors to propagate immediately + # and directly so that the user can define how it + # wants to handle them. + # + # HOWEVER, + # + # - for certain runtime calling cases, we don't want to + # directly raise since the calling code might have + # special logic around whether to raise the error + # or supress it silently (eg. a `ContextCancelled` + # received from the far end which was requested by + # this side, aka a self-cancel). + # + # SO, we offer a flag to control this. + if not raise_error: + return src_err case Stop(cid=cid): message: str = ( @@ -305,6 +337,9 @@ class PldRx(Struct): f'{msg}\n' ) + # TODO: maybe use the new `.add_note()` from 3.11? 
+ # |_https://docs.python.org/3.11/library/exceptions.html#BaseException.add_note + # # fallthrough and raise from `src_err` _raise_from_unexpected_msg( ctx=ctx, @@ -312,7 +347,7 @@ class PldRx(Struct): src_err=src_err, log=log, expect_msg=expect_msg, - hide_tb=False, + hide_tb=hide_tb, ) async def recv_msg_w_pld( @@ -320,6 +355,8 @@ class PldRx(Struct): ipc: Context|MsgStream, expect_msg: MsgType, + **kwargs, + ) -> tuple[MsgType, PayloadT]: ''' Retrieve the next avail IPC msg, decode it's payload, and return @@ -335,6 +372,7 @@ class PldRx(Struct): msg, ctx=ipc, expect_msg=expect_msg, + **kwargs, ) return msg, pld @@ -433,70 +471,33 @@ async def drain_to_final_msg( # basically ignoring) any bi-dir-stream msgs still in transit # from the far end. pre_result_drained: list[MsgType] = [] - return_msg: Return|None = None + result_msg: Return|Error|None = None while not ( ctx.maybe_error and not ctx._final_result_is_set() ): try: - # TODO: can remove? - # await trio.lowlevel.checkpoint() - - # NOTE: this REPL usage actually works here dawg! Bo - # from .devx._debug import pause - # await pause() - - # TODO: bad idea? - # -[ ] wrap final outcome channel wait in a scope so - # it can be cancelled out of band if needed? - # - # with trio.CancelScope() as res_cs: - # ctx._res_scope = res_cs - # msg: dict = await ctx._rx_chan.receive() - # if res_cs.cancelled_caught: - - # TODO: ensure there's no more hangs, debugging the - # runtime pretty preaase! - # from .devx._debug import pause - # await pause() - - # TODO: can remove this finally? - # we have no more need for the sync draining right - # since we're can kinda guarantee the async - # `.receive()` below will never block yah? - # - # if ( - # ctx._cancel_called and ( - # ctx.cancel_acked - # # or ctx.chan._cancel_called - # ) - # # or not ctx._final_result_is_set() - # # ctx.outcome is not - # # or ctx.chan._closed - # ): - # try: - # msg: dict = await ctx._rx_chan.receive_nowait()() - # except trio.WouldBlock: - # log.warning( - # 'When draining already `.cancel_called` ctx!\n' - # 'No final msg arrived..\n' - # ) - # break - # else: - # msg: dict = await ctx._rx_chan.receive() - - # TODO: don't need it right jefe? - # with trio.move_on_after(1) as cs: - # if cs.cancelled_caught: - # from .devx._debug import pause - # await pause() - - # pray to the `trio` gawds that we're corrent with this - # msg: dict = await ctx._rx_chan.receive() + # receive all msgs, scanning for either a final result + # or error; the underlying call should never raise any + # remote error directly! msg, pld = await ctx._pld_rx.recv_msg_w_pld( ipc=ctx, expect_msg=Return, + raise_error=False, ) + # ^-TODO-^ some bad ideas? + # -[ ] wrap final outcome .receive() in a scope so + # it can be cancelled out of band if needed? + # |_with trio.CancelScope() as res_cs: + # ctx._res_scope = res_cs + # msg: dict = await ctx._rx_chan.receive() + # if res_cs.cancelled_caught: + # + # -[ ] make sure pause points work here for REPLing + # the runtime itself; i.e. ensure there's no hangs! + # |_from tractor.devx._debug import pause + # await pause() + # NOTE: we get here if the far end was # `ContextCancelled` in 2 cases: @@ -504,7 +505,7 @@ async def drain_to_final_msg( # SHOULD NOT raise that far end error, # 2. WE DID NOT REQUEST that cancel and thus # SHOULD RAISE HERE! 
- except trio.Cancelled: + except trio.Cancelled as taskc: # CASE 2: mask the local cancelled-error(s) # only when we are sure the remote error is @@ -514,7 +515,7 @@ async def drain_to_final_msg( # CASE 1: we DID request the cancel we simply # continue to bubble up as normal. - raise + raise taskc match msg: @@ -534,7 +535,7 @@ async def drain_to_final_msg( # if ctx._rx_chan: # await ctx._rx_chan.aclose() # TODO: ^ we don't need it right? - return_msg = msg + result_msg = msg break # far end task is still streaming to us so discard @@ -565,10 +566,7 @@ async def drain_to_final_msg( f'{pretty_struct.pformat(msg)}\n' ) - return ( - return_msg, - pre_result_drained, - ) + break # drain up to the `msg_limit` hoping to get # a final result or error/ctxc. @@ -604,9 +602,9 @@ async def drain_to_final_msg( case Error(): # TODO: can we replace this with `ctx.maybe_raise()`? # -[ ] would this be handier for this case maybe? - # async with maybe_raise_on_exit() as raises: - # if raises: - # log.error('some msg about raising..') + # |_async with maybe_raise_on_exit() as raises: + # if raises: + # log.error('some msg about raising..') # re: Exception|None = ctx._remote_error if re: @@ -640,7 +638,7 @@ async def drain_to_final_msg( # raise_overrun_from_self=False, raise_overrun_from_self=raise_overrun, ) - + result_msg = msg break # OOOOOF, yeah obvi we need this.. # XXX we should never really get here @@ -686,6 +684,6 @@ async def drain_to_final_msg( ) return ( - return_msg, + result_msg, pre_result_drained, ) -- 2.34.1 From fbc21a1dec09eb2a270020758dbbf799e591ff87 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 7 May 2024 09:20:43 -0400 Subject: [PATCH 290/378] Add a "current IPC `Context`" `ContextVar` Expose it from `._state.current_ipc_ctx()` and set it inside `._rpc._invoke()` for child and inside `Portal.open_context()` for parent. Still need to write a few more tests (particularly demonstrating usage throughout multiple nested nurseries on each side) but this suffices as a proto for testing with some debugger request-from-subactor stuff. Other, - use new `.devx.pformat.add_div()` for ctxc messages. - add a block to always traceback dump on corrupted cs stacks. - better handle non-RAEs exception output-formatting in context termination summary log message. - use a summary for `start_status` for msg logging in RPC loop. --- tests/test_context_stream_semantics.py | 4 + tractor/_rpc.py | 129 +++++++++++++++---------- tractor/_state.py | 23 +++++ 3 files changed, 106 insertions(+), 50 deletions(-) diff --git a/tests/test_context_stream_semantics.py b/tests/test_context_stream_semantics.py index cedddf73..8edea510 100644 --- a/tests/test_context_stream_semantics.py +++ b/tests/test_context_stream_semantics.py @@ -25,6 +25,7 @@ from tractor._exceptions import ( StreamOverrun, ContextCancelled, ) +from tractor._state import current_ipc_ctx from tractor._testing import ( tractor_test, @@ -144,6 +145,8 @@ async def simple_setup_teardown( global _state _state = True + assert current_ipc_ctx() is ctx + # signal to parent that we're up await ctx.started(data + 1) @@ -204,6 +207,7 @@ def test_simple_context( block_forever=callee_blocks_forever, ) as (ctx, sent), ): + assert current_ipc_ctx() is ctx assert sent == 11 if callee_blocks_forever: diff --git a/tractor/_rpc.py b/tractor/_rpc.py index ee3151d3..eed47902 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -57,6 +57,7 @@ from ._exceptions import ( from .devx import ( maybe_wait_for_debugger, _debug, + add_div, ) from . 
import _state from .log import get_logger @@ -250,6 +251,9 @@ async def _errors_relayed_via_ipc( ) -> None: __tracebackhide__: bool = hide_tb + # TODO: a debug nursery when in debug mode! + # async with maybe_open_debugger_nursery() as debug_tn: + # => see matching comment in side `._debug._pause()` try: yield # run RPC invoke body @@ -273,6 +277,8 @@ async def _errors_relayed_via_ipc( # TODO: maybe we'll want different "levels" of debugging # eventualy such as ('app', 'supervisory', 'runtime') ? + # + # -[ ] this if check is duplicate with `._maybe_enter_pm()`.. if not is_multi_cancelled(err): entered_debug: bool = False if ( @@ -296,7 +302,6 @@ async def _errors_relayed_via_ipc( ) ) ): - # await _debug.pause() # XXX QUESTION XXX: is there any case where we'll # want to debug IPC disconnects as a default? # => I can't think of a reason that inspecting this @@ -304,7 +309,14 @@ async def _errors_relayed_via_ipc( # recovery logic - the only case is some kind of # strange bug in our transport layer itself? Going # to keep this open ended for now. - entered_debug = await _debug._maybe_enter_pm(err) + log.debug( + 'RPC task crashed, attempting to enter debugger\n' + f'|_{ctx}' + ) + entered_debug = await _debug._maybe_enter_pm( + err, + api_frame=inspect.currentframe(), + ) if not entered_debug: log.exception( 'RPC task crashed\n' @@ -434,6 +446,8 @@ async def _invoke( ) context: bool = False + assert not _state._ctxvar_Context.get() + # TODO: deprecate this style.. if getattr(func, '_tractor_stream_function', False): # handle decorated ``@tractor.stream`` async functions @@ -557,6 +571,7 @@ async def _invoke( async with trio.open_nursery() as tn: ctx._scope_nursery = tn ctx._scope = tn.cancel_scope + _state._ctxvar_Context.set(ctx) task_status.started(ctx) # TODO: should would be nice to have our @@ -592,7 +607,6 @@ async def _invoke( cs: CancelScope = ctx._scope if cs.cancel_called: - canceller: tuple = ctx.canceller explain: str = f'{ctx.side!r}-side task was cancelled by ' @@ -621,23 +635,9 @@ async def _invoke( else: explain += 'a remote peer' - # TODO: move this "div centering" into - # a helper for use elsewhere! - div_chars: str = '------ - ------' - div_offset: int = ( - round(len(explain)/2)+1 - + - round(len(div_chars)/2)+1 - ) - div_str: str = ( - '\n' - + - ' '*div_offset - + - f'{div_chars}\n' - ) explain += ( - div_str + + add_div(message=explain) + + f'<= canceller: {canceller}\n' f'=> cancellee: {our_uid}\n' # TODO: better repr for ctx tasks.. @@ -664,10 +664,10 @@ async def _invoke( boxed_type=trio.Cancelled, canceller=canceller, ) - # assign local error so that the `.outcome` - # resolves to an error for both reporting and - # state checks. - ctx._local_error = ctxc + # does this matter other then for + # consistentcy/testing? |_ no user code should be + # in this scope at this point.. + # ctx._local_error = ctxc raise ctxc # XXX: do we ever trigger this block any more? 
@@ -677,6 +677,13 @@ async def _invoke( BaseException, ) as scope_error: + if ( + isinstance(scope_error, RuntimeError) + and scope_error.args + and 'Cancel scope stack corrupted' in scope_error.args[0] + ): + log.exception('Cancel scope stack corrupted!?\n') + # _debug.mk_pdb().set_trace() # always set this (child) side's exception as the # local error on the context @@ -710,17 +717,32 @@ async def _invoke( res_type_str, res_str, ) = ( - ('error', f'{type(merr)}',) - if merr + ('error', f'{type(merr)}',) if merr else ( 'result', f'`{repr(ctx.outcome)}`', ) ) - log.runtime( + message: str = ( f'IPC context terminated with a final {res_type_str}\n\n' f'{ctx}' ) + if merr: + from tractor import RemoteActorError + if not isinstance(merr, RemoteActorError): + fmt_merr: str = ( + f'\n{merr!r}\n' + # f'{merr.args[0]!r}\n' + ) + else: + fmt_merr = f'\n{merr!r}' + log.error( + message + + + fmt_merr + ) + else: + log.runtime(message) async def try_ship_error_to_remote( @@ -955,12 +977,19 @@ async def process_messages( kwargs=kwargs, # type-spec this? see `msg.types` uid=actorid, ): - log.runtime( + start_status: str = ( 'Handling RPC `Start` request\n' - f'<= peer: {actorid}\n' - f' |_{ns}.{funcname}({kwargs})\n\n' + f'<= peer: {actorid}\n\n' + f' |_{chan}\n' + f' |_cid: {cid}\n\n' + # f' |_{ns}.{funcname}({kwargs})\n' + f'>> {actor.uid}\n' + f' |_{actor}\n' + f' -> nsp: `{ns}.{funcname}({kwargs})`\n' - f'{pretty_struct.pformat(msg)}\n' + # f' |_{ns}.{funcname}({kwargs})\n\n' + + # f'{pretty_struct.pformat(msg)}\n' ) # runtime-internal endpoint: `Actor.` @@ -989,6 +1018,10 @@ async def process_messages( await chan.send(err_msg) continue + start_status += ( + f' -> func: {func}\n' + ) + # schedule a task for the requested RPC function # in the actor's main "service nursery". # @@ -996,18 +1029,8 @@ async def process_messages( # supervision isolation? would avoid having to # manage RPC tasks individually in `._rpc_tasks` # table? - log.runtime( - f'Spawning task for RPC request\n' - f'<= caller: {chan.uid}\n' - f' |_{chan}\n\n' - # ^-TODO-^ maddr style repr? - # f' |_@ /ipv4/{chan.raddr}/tcp/{chan.rport}/' - # f'cid="{cid[-16:]} .."\n\n' - - f'=> {actor}\n' - f' |_cid: {cid}\n' - f' |>> {func}()\n' - ) + start_status += ' -> scheduling new task..\n' + log.runtime(start_status) try: ctx: Context = await actor._service_n.start( partial( @@ -1035,8 +1058,9 @@ async def process_messages( # scoped exception from ``_invoke()`` itself. if isinstance(err := ctx, Exception): log.warning( - 'Task for RPC failed?' - f'|_ {func}()\n\n' + start_status + + + ' -> task for RPC failed?\n\n' f'{err}' ) continue @@ -1155,12 +1179,17 @@ async def process_messages( finally: # msg debugging for when he machinery is brokey - log.runtime( - 'Exiting IPC msg loop with final msg\n\n' - f'<= peer: {chan.uid}\n' - f' |_{chan}\n\n' - f'{pretty_struct.pformat(msg)}' - ) + if msg is None: + message: str = 'Exiting IPC msg loop without receiving a msg?' 
+ else: + message: str = ( + 'Exiting IPC msg loop with final msg\n\n' + f'<= peer: {chan.uid}\n' + f' |_{chan}\n\n' + f'{pretty_struct.pformat(msg)}' + ) + + log.runtime(message) # transport **WAS NOT** disconnected return (False, msg) diff --git a/tractor/_state.py b/tractor/_state.py index 30346a6a..a3729833 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -19,13 +19,19 @@ Per process state """ from __future__ import annotations +from contextvars import ( + ContextVar, +) from typing import ( Any, TYPE_CHECKING, ) +from trio.lowlevel import current_task + if TYPE_CHECKING: from ._runtime import Actor + from ._context import Context _current_actor: Actor|None = None # type: ignore # noqa @@ -110,3 +116,20 @@ def debug_mode() -> bool: def is_root_process() -> bool: return _runtime_vars['_is_root'] + + +_ctxvar_Context: ContextVar[Context] = ContextVar( + 'ipc_context', + default=None, +) + + +def current_ipc_ctx() -> Context: + ctx: Context = _ctxvar_Context.get() + if not ctx: + from ._exceptions import InternalError + raise InternalError( + 'No IPC context has been allocated for this task yet?\n' + f'|_{current_task()}\n' + ) + return ctx -- 2.34.1 From a354732a9e1fdd9686376ed9a0ae59f95c691965 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 08:50:16 -0400 Subject: [PATCH 291/378] Allow `Stop` passthrough from `PldRx.recv_msg_w_pld()` Since we need to allow it (at the least) inside `drain_until_final_msg()` for handling stream-phase termination races where we don't want to have to handle a raised error from something like `Context.result()`. Expose the passthrough option via a `passthrough_non_pld_msgs: bool` kwarg. Add comprehensive comment to `current_pldrx()`. --- tractor/msg/_ops.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 1ba623db..3b0b8339 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -355,6 +355,9 @@ class PldRx(Struct): ipc: Context|MsgStream, expect_msg: MsgType, + # NOTE: generally speaking only for handling `Stop`-msgs that + # arrive during a call to `drain_to_final_msg()` above! + passthrough_non_pld_msgs: bool = True, **kwargs, ) -> tuple[MsgType, PayloadT]: @@ -365,6 +368,11 @@ class PldRx(Struct): ''' msg: MsgType = await ipc._rx_chan.receive() + if passthrough_non_pld_msgs: + match msg: + case Stop(): + return msg, None + # TODO: is there some way we can inject the decoded # payload into an existing output buffer for the original # msg instance? @@ -389,15 +397,30 @@ _ctxvar_PldRx: ContextVar[PldRx] = ContextVar( def current_pldrx() -> PldRx: ''' - Return the current `trio.Task.context`'s msg-payload - receiver, the post IPC but pre-app code `MsgType.pld` - filter. + Return the current `trio.Task.context`'s msg-payload-receiver. + + A payload receiver is the IPC-msg processing sub-sys which + filters inter-actor-task communicated payload data, i.e. the + `PayloadMsg.pld: PayloadT` field value, AFTER it's container + shuttlle msg (eg. `Started`/`Yield`/`Return) has been delivered + up from `tractor`'s transport layer but BEFORE the data is + yielded to application code, normally via an IPC primitive API + like, for ex., `pld_data: PayloadT = MsgStream.receive()`. Modification of the current payload spec via `limit_plds()` - allows an application to contextually filter typed IPC msg - content delivered via wire transport. 
+ allows a `tractor` application to contextually filter IPC + payload content with a type specification as supported by + the interchange backend. + + - for `msgspec` see . + + NOTE that the `PldRx` itself is a per-`Context` global sub-system + that normally does not change other then the applied pld-spec + for the current `trio.Task`. ''' + # ctx: context = current_ipc_ctx() + # return ctx._pld_rx return _ctxvar_PldRx.get() -- 2.34.1 From 05b143d9ef874e420d154a56fce00044de1b43a2 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 09:08:01 -0400 Subject: [PATCH 292/378] Big debugger rework, more tolerance for internal err-hangs Since i was running into them (internal errors) during lock request machinery dev and was getting all sorts of difficult to understand hangs whenever i intro-ed a bug to either side of the ipc ctx; this all while trying to get the msg-spec working for `Lock` requesting subactors.. Deats: - hideframes for `@acm`s and `trio.Event.wait()`, `Lock.release()`. - better detail out the `Lock.acquire/release()` impls - drop `Lock.remote_task_in_debug`, use new `.ctx_in_debug`. - add a `Lock.release(force: bool)`. - move most of what was `_acquire_debug_lock_from_root_task()` and some of the `lock_tty_for_child().__a[enter/exit]()` logic into `Lock.[acquire/release]()` including bunch more logging. - move `lock_tty_for_child()` up in the module to below `Lock`, with some rework: - drop `subactor_uid: tuple` arg since we can just use the `ctx`.. - add exception handler blocks for reporting internal (impl) errors and always force release the lock in such cases. - extend `DebugStatus` (prolly will rename to `DebugRequest` btw): - add `.req_ctx: Context` for subactor side. - add `.req_finished: trio.Event` to sub to signal request task exit. - extend `.shield_sigint()` doc-str. - add `.release()` to encaps all the state mgmt previously strewn about inside `._pause()`.. - use new `DebugStatus.release()` to replace all the duplication: - inside `PdbREPL.set_[continue/quit]()`. - inside `._pause()` for the subactor branch on internal repl-invocation error cases, - in the `_enter_repl_sync()` closure on error, - replace `apply_debug_codec()` -> `apply_debug_pldec()` in tandem with the new `PldRx` sub-sys which handles the new `__pld_spec__`. - add a new `pformat_cs()` helper orig to help debug cs stack a corruption; going to move to `.devx.pformat` obvi. - rename `wait_for_parent_stdin_hijack()` -> `request_root_stdio_lock()` with improvements: - better doc-str and add todos, - use `DebugStatus` more stringently to encaps all subactor req state. - error handling blocks for cancellation and straight up impl errors directly around the `.open_context()` block with the latter doing a `ctx.cancel()` to avoid hanging in the shielded `.req_cs` scope. - similar exc blocks for the func's overall body with explicit `log.exception()` reporting. - only set the new `DebugStatus.req_finished: trio.Event` in `finally`. - rename `mk_mpdb()` -> `mk_pdb()` and don't cal `.shield_sigint()` implicitly since the caller usage does matter for this. - factor out `any_connected_locker_child()` from the SIGINT handler. - rework SIGINT handler to better handle any stale-lock/hang cases: - use new `Lock.ctx_in_debug: Context` to detect subactor-in-debug. and use it to cancel any lock request instead of the lower level - use `problem: str` summary approach to log emissions. 
- rework `_pause()` given all of the above, stuff not yet mentioned: - don't take `shield: bool` input and proxy to `debug_func()` (for now). - drop `extra_frames_up_when_async: int` usage, expect `**debug_func_kwargs` to passthrough an `api_frame: Frametype` (more on this later). - lotsa asserts around the request ctx vs. task-in-debug ctx using new `current_ipc_ctx()`. - asserts around `DebugStatus` state. - rework and simplify the `debug_func` hooks, `_set_trace()`/`_post_mortem()`: - make them accept a non-optional `repl: PdbRepl` and `api_frame: FrameType` which should be used to set the current frame when the REPL engages. - always hide the hook frames. - always accept a `tb: TracebackType` to `_post_mortem()`. |_ copy and re-impl what was the delegation to `pdbp.xpm()`/`pdbp.post_mortem()` and instead call the underlying `Pdb.interaction()` ourselves with a `caller_frame` and tb instance. - adjust the public `.pause()` impl: - accept optional `hide_tb` and `api_frame` inputs. - mask opening a cancel-scope for now (can cause `trio` stack corruption, see notes) and thus don't use the `shield` input other then to eventually passthrough to `_post_mortem()`? |_ thus drop `task_status` support for now as well. |_ pretty sure correct soln is a debug-nursery around `._invoke()`. - since no longer using `extra_frames_up_when_async` inside `debug_func()`s ensure all public apis pass a `api_frame`. - re-impl our `tractor.post_mortem()` to directly call into `._pause()` instead of binding in via `partial` and mk it take similar input as `.pause()`. - drop `Lock.release()` from `_maybe_enter_pm()`, expose and pass expected frame and tb. - use necessary changes from all the above within `maybe_wait_for_debugger()` and `acquire_debug_lock()`. Lel, sorry thought that would be shorter.. There's still a lot more re-org to do particularly with `DebugStatus` encapsulation but it's coming in follow up. 
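For orientation, the shape the whole lock flow reduces to is: a FIFO mutex that lives only in the root, plus SIGINT shielding around the live REPL session. A stripped-down, single-process sketch of just that shape (the module-level `_debug_lock`, `acquire_tty_and_repl()` and the `SIG_IGN` shield are illustrative stand-ins, NOT the actual `Lock`/`DebugStatus` machinery which syncs over an IPC ctx):

    import signal
    import trio

    # stand-in for the root-actor-only singleton; in `tractor` this
    # state exists ONLY in the root process and is reached over IPC.
    _debug_lock = trio.StrictFIFOLock()

    async def acquire_tty_and_repl(name: str) -> None:
        # FIFO-queue for exclusive access to the (one) tty/REPL
        async with _debug_lock:
            # shield SIGINT only while "in the REPL" so a stray
            # ctrl-c can't cancel the task mid-interaction; the
            # real impl installs a smarter handler than SIG_IGN.
            orig = signal.signal(signal.SIGINT, signal.SIG_IGN)
            try:
                print(f'{name}: exclusive REPL access')
                await trio.sleep(0.1)  # stand-in for `Pdb.interaction()`
            finally:
                signal.signal(signal.SIGINT, orig)

    async def main():
        async with trio.open_nursery() as tn:
            for name in ('sub-a', 'sub-b'):
                tn.start_soon(acquire_tty_and_repl, name)

    if __name__ == '__main__':
        trio.run(main)

The real thing additionally needs the `no_remote_has_tty` event and per-request ctx bookkeeping described above so a stale or uncontactable holder can be detected and force-released.
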
--- tractor/devx/__init__.py | 6 + tractor/devx/_debug.py | 1729 ++++++++++++++++++++++---------------- 2 files changed, 992 insertions(+), 743 deletions(-) diff --git a/tractor/devx/__init__.py b/tractor/devx/__init__.py index 7ea2b25c..ab9d2d1a 100644 --- a/tractor/devx/__init__.py +++ b/tractor/devx/__init__.py @@ -30,7 +30,13 @@ from ._debug import ( open_crash_handler as open_crash_handler, maybe_open_crash_handler as maybe_open_crash_handler, post_mortem as post_mortem, + mk_pdb as mk_pdb, ) from ._stackscope import ( enable_stack_on_sig as enable_stack_on_sig, ) +from .pformat import ( + add_div as add_div, + pformat_caller_frame as pformat_caller_frame, + pformat_boxed_tb as pformat_boxed_tb, +) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index e4ab7d83..0567e42a 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -26,11 +26,13 @@ from contextlib import ( contextmanager as cm, nullcontext, _GeneratorContextManager, + _AsyncGeneratorContextManager, ) from functools import ( partial, cached_property, ) +import inspect import os import signal import sys @@ -48,13 +50,14 @@ from typing import ( from types import ( FrameType, ModuleType, + TracebackType, ) from msgspec import Struct import pdbp import sniffio -import tractor import trio +from trio import CancelScope from trio.lowlevel import ( current_task, Task, @@ -62,26 +65,25 @@ from trio.lowlevel import ( from trio import ( TaskStatus, ) - +import tractor from tractor.log import get_logger -from tractor.msg import ( - _codec, -) from tractor._state import ( current_actor, is_root_process, debug_mode, + current_ipc_ctx, ) -from tractor._exceptions import ( - is_multi_cancelled, - ContextCancelled, -) -from tractor._ipc import Channel +# from .pformat import pformat_caller_frame if TYPE_CHECKING: + from tractor._ipc import Channel + from tractor._context import Context from tractor._runtime import ( Actor, ) + from tractor.msg import ( + _codec, + ) log = get_logger(__name__) @@ -115,6 +117,8 @@ log = get_logger(__name__) pdbp.hideframe(trio._core._run.NurseryManager.__aexit__) pdbp.hideframe(trio._core._run.CancelScope.__exit__) pdbp.hideframe(_GeneratorContextManager.__exit__) +pdbp.hideframe(_AsyncGeneratorContextManager.__aexit__) +pdbp.hideframe(trio.Event.wait) __all__ = [ 'breakpoint', @@ -141,14 +145,14 @@ class LockRelease( cid: str -__msg_spec__: TypeAlias = LockStatus|LockRelease +__pld_spec__: TypeAlias = LockStatus|LockRelease class Lock: ''' - Actor global debug lock state. + Actor-tree-global debug lock state, exists only in a root process. - Mostly to avoid a lot of ``global`` declarations for now XD. + Mostly to avoid a lot of global declarations for now XD. ''' # XXX local ref to the `Pbp` instance, ONLY set in the @@ -157,30 +161,17 @@ class Lock: # that does not have this lock acquired in the root proc. repl: PdbREPL|None = None - # placeholder for function to set a ``trio.Event`` on debugger exit - # pdb_release_hook: Callable | None = None - - remote_task_in_debug: str|None = None - @staticmethod - def get_locking_task_cs() -> trio.CancelScope|None: - if is_root_process(): - return Lock._locking_task_cs - - raise RuntimeError( - '`Lock.locking_task_cs` is invalid in subactors!' - ) - - @staticmethod - def set_locking_task_cs( - cs: trio.CancelScope, - ) -> None: + def get_locking_task_cs() -> CancelScope|None: if not is_root_process(): raise RuntimeError( '`Lock.locking_task_cs` is invalid in subactors!' 
) - Lock._locking_task_cs = cs + if ctx := Lock.ctx_in_debug: + return ctx._scope + + return None # ROOT ONLY # ------ - ------- @@ -195,12 +186,14 @@ class Lock: # * in case it needs to be manually cancelled in root due to # a stale lock condition (eg. IPC failure with the locking # child - global_actor_in_debug: tuple[str, str]|None = None - no_remote_has_tty: trio.Event|None = None - _locking_task_cs: trio.CancelScope|None = None + ctx_in_debug: Context|None = None + no_remote_has_tty: trio.Event|None = None _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock() - _blocked: set[tuple[str, str]] = set() # `Actor.uid` block list + _blocked: set[ + tuple[str, str] # `Actor.uid` for per actor + |str # Context.cid for per task + ] = set() @classmethod def repr(cls) -> str: @@ -213,12 +206,11 @@ class Lock: if is_root_process(): lock_stats: trio.LockStatistics = cls._debug_lock.statistics() fields += ( - f'global_actor_in_debug: {cls.global_actor_in_debug}\n' f'no_remote_has_tty: {cls.no_remote_has_tty}\n' - f'remote_task_in_debug: {cls.remote_task_in_debug}\n' - f'_locking_task_cs: {cls.get_locking_task_cs()}\n' f'_blocked: {cls._blocked}\n\n' + f'ctx_in_debug: {cls.ctx_in_debug}\n\n' + f'_debug_lock: {cls._debug_lock}\n' f'lock_stats: {lock_stats}\n' ) @@ -234,16 +226,29 @@ class Lock: ) @classmethod - def release(cls): + @pdbp.hideframe + def release( + cls, + force: bool = False, + ): + lock: trio.StrictFIFOLock = cls._debug_lock try: - if not DebugStatus.is_main_trio_thread(): - trio.from_thread.run_sync( - cls._debug_lock.release - ) + if lock.locked(): + if not DebugStatus.is_main_trio_thread(): + trio.from_thread.run_sync( + cls._debug_lock.release + ) + else: + cls._debug_lock.release() + + message: str = 'TTY lock released for child\n' else: - cls._debug_lock.release() + message: str = 'TTY lock not held by any child\n' except RuntimeError as rte: + message: str = 'TTY lock FAILED to release for child??\n' + log.exception(message) + # uhhh makes no sense but been seeing the non-owner # release error even though this is definitely the task # that locked? @@ -256,7 +261,7 @@ class Lock: # raise RuntimeError( # 'Stale `Lock` detected, no remote task active!?\n' # f'|_{owner}\n' - # # f'{Lock}' + # # f'{cls}' # ) from rte if owner: @@ -266,23 +271,265 @@ class Lock: # something somethin corrupts a cancel-scope # somewhere.. + finally: + # IFF there are no more requesting tasks queued up fire, the + # "tty-unlocked" event thereby alerting any monitors of the lock that + # we are now back in the "tty unlocked" state. This is basically + # and edge triggered signal around an empty queue of sub-actor + # tasks that may have tried to acquire the lock. + stats = cls._debug_lock.statistics() + if ( + not stats.owner + or force + # and cls.no_remote_has_tty is not None + ): + message += '-> No more child ctx tasks hold the TTY lock!\n' + + # set and release + if cls.no_remote_has_tty is not None: + cls.no_remote_has_tty.set() + cls.no_remote_has_tty = None + + # cls.remote_task_in_debug = None + + else: + message += ( + f'-> Not signalling `Lock.no_remote_has_tty` since it has value:{cls.no_remote_has_tty}\n' + ) + + else: + # wakeup any waiters since the lock was released + # (presumably) temporarily. 
+ if no_remote_has_tty := cls.no_remote_has_tty: + no_remote_has_tty.set() + no_remote_has_tty = trio.Event() + + message += ( + f'-> A child ctx task still owns the `Lock` ??\n' + f' |_owner task: {stats.owner}\n' + ) + + cls.ctx_in_debug = None + + @classmethod + @acm + async def acquire( + cls, + ctx: Context, + # subactor_uid: tuple[str, str], + # remote_task_uid: str, + + ) -> AsyncIterator[trio.StrictFIFOLock]: + ''' + Acquire a root-actor local FIFO lock which tracks mutex access of + the process tree's global debugger breakpoint. + + This lock avoids tty clobbering (by preventing multiple processes + reading from stdstreams) and ensures multi-actor, sequential access + to the ``pdb`` repl. + + ''' + if not is_root_process(): + raise RuntimeError('Only callable by a root actor task!') + + # subactor_uid: tuple[str, str] = ctx.chan.uid + we_acquired: bool = False + log.runtime( + f'Attempting to acquire TTY lock for sub-actor\n' + f'{ctx}' + ) try: - # sometimes the ``trio`` might already be terminated in - # which case this call will raise. - if DebugStatus.repl_release is not None: - DebugStatus.repl_release.set() + pre_msg: str = ( + f'Entering lock checkpoint for sub-actor\n' + f'{ctx}' + ) + stats = cls._debug_lock.statistics() + if owner := stats.owner: + # and cls.no_remote_has_tty is not None + pre_msg += ( + f'\n' + f'`Lock` already held by local task?\n' + f'{owner}\n\n' + # f'On behalf of task: {cls.remote_task_in_debug!r}\n' + f'On behalf of IPC ctx\n' + f'{ctx}' + ) + log.runtime(pre_msg) + + # NOTE: if the surrounding cancel scope from the + # `lock_tty_for_child()` caller is cancelled, this line should + # unblock and NOT leave us in some kind of + # a "child-locked-TTY-but-child-is-uncontactable-over-IPC" + # condition. + await cls._debug_lock.acquire() + cls.ctx_in_debug = ctx + we_acquired = True + if cls.no_remote_has_tty is None: + # mark the tty lock as being in use so that the runtime + # can try to avoid clobbering any connection from a child + # that's currently relying on it. + cls.no_remote_has_tty = trio.Event() + # cls.remote_task_in_debug = remote_task_uid + + log.runtime( + f'TTY lock acquired for sub-actor\n' + f'{ctx}' + ) + + # NOTE: critical section: this yield is unshielded! + + # IF we received a cancel during the shielded lock entry of some + # next-in-queue requesting task, then the resumption here will + # result in that ``trio.Cancelled`` being raised to our caller + # (likely from ``lock_tty_for_child()`` below)! In + # this case the ``finally:`` below should trigger and the + # surrounding caller side context should cancel normally + # relaying back to the caller. + + yield cls._debug_lock finally: - cls.repl = None - cls.global_actor_in_debug = None + message :str = 'Exiting `Lock.acquire()` on behalf of sub-actor\n' + if ( + we_acquired + # and + # cls._debug_lock.locked() + ): + message += '-> TTY lock released by child\n' + cls.release() - # restore original sigint handler - DebugStatus.unshield_sigint() - # actor-local state, irrelevant for non-root. - DebugStatus.repl_task = None + else: + message += '-> TTY lock never acquired by child??\n' + + log.runtime( + f'{message}\n' + f'{ctx}' + ) -# TODO: actually use this instead throughout for subs! 
+@tractor.context +async def lock_tty_for_child( + + ctx: Context, + subactor_task_uid: tuple[str, int], + +) -> LockStatus|LockRelease: + ''' + Lock the TTY in the root process of an actor tree in a new + inter-actor-context-task such that the ``pdbp`` debugger console + can be mutex-allocated to the calling sub-actor for REPL control + without interference by other processes / threads. + + NOTE: this task must be invoked in the root process of the actor + tree. It is meant to be invoked as an rpc-task and should be + highly reliable at releasing the mutex complete! + + ''' + subactor_uid: tuple[str, str] = ctx.chan.uid + # NOTE: we use the IPC ctx's cancel scope directly in order to + # ensure that on any transport failure, or cancellation request + # from the child we expect + # `Context._maybe_cancel_and_set_remote_error()` to cancel this + # scope despite the shielding we apply below. + debug_lock_cs: CancelScope = ctx._scope + + try: + if ctx.cid in Lock._blocked: + raise RuntimeError( + f'Double lock request!?\n' + f'The same remote task already has an active request for TTY lock ??\n\n' + f'subactor uid: {subactor_uid}\n\n' + + 'This might be mean that the requesting task ' + 'in `request_root_stdio_lock()` may have crashed?\n' + 'Consider that an internal bug exists given the TTY ' + '`Lock`ing IPC dialog..\n' + ) + + root_task_name: str = current_task().name + if tuple(subactor_uid) in Lock._blocked: + log.warning( + f'Subactor is blocked from acquiring debug lock..\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {subactor_task_uid}\n' + ) + ctx._enter_debugger_on_cancel: bool = False + await ctx.cancel(f'Debug lock blocked for {subactor_uid}') + # TODO: remove right? + # return LockStatus( + # subactor_uid=subactor_uid, + # cid=ctx.cid, + # locked=False, + # ) + + # TODO: when we get to true remote debugging + # this will deliver stdin data? + + log.debug( + 'Subactor attempting to acquire TTY lock\n' + f'root task: {root_task_name}\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {subactor_task_uid}\n' + ) + DebugStatus.shield_sigint() + Lock._blocked.add(ctx.cid) + with ( + # enable the locking msgspec + apply_debug_pldec(), + ): + async with Lock.acquire(ctx=ctx): + debug_lock_cs.shield = True + + # indicate to child that we've locked stdio + await ctx.started( + LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=True, + ) + ) + + log.debug( f'Actor {subactor_uid} acquired TTY lock') + + # wait for unlock pdb by child + async with ctx.open_stream() as stream: + release_msg: LockRelease = await stream.receive() + + # TODO: security around only releasing if + # these match? + log.pdb( + f'TTY lock released requested\n\n' + f'{release_msg}\n' + ) + assert release_msg.cid == ctx.cid + assert release_msg.subactor_uid == tuple(subactor_uid) + + log.debug(f'Actor {subactor_uid} released TTY lock') + + return LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=False, + ) + + except BaseException: + log.exception( + 'Errored during root TTY-lock dialog?\n' + 'Forcing release since an internal error caused this!\n' + ) + Lock.release(force=True) + raise + + finally: + Lock._blocked.remove(ctx.cid) + if (no_locker := Lock.no_remote_has_tty): + no_locker.set() + + DebugStatus.unshield_sigint() + + +# TODO: rename to ReplState or somethin? +# DebugRequest, make it a singleton instance? class DebugStatus: ''' Singleton-state for debugging machinery in a subactor. 
@@ -297,26 +544,26 @@ class DebugStatus: ''' repl: PdbREPL|None = None repl_task: Task|None = None - req_cs: trio.CancelScope|None = None + req_ctx: Context|None = None + req_cs: CancelScope|None = None repl_release: trio.Event|None = None - + req_finished: trio.Event|None = None lock_status: LockStatus|None = None - _orig_sigint_handler: Callable | None = None + _orig_sigint_handler: Callable|None = None _trio_handler: ( Callable[[int, FrameType|None], Any] |int | None ) = None - @classmethod def repr(cls) -> str: fields: str = ( f'repl: {cls.repl}\n' f'repl_task: {cls.repl_task}\n' f'repl_release: {cls.repl_release}\n' - f'req_cs: {cls.req_cs}\n' + f'req_ctx: {cls.req_ctx}\n' ) body: str = textwrap.indent( fields, @@ -328,19 +575,37 @@ class DebugStatus: ')>' ) + # TODO: how do you get this to work on a non-inited class? + # __repr__ = classmethod(repr) + # __str__ = classmethod(repr) + @classmethod def shield_sigint(cls): ''' Shield out SIGINT handling (which by default triggers - `trio.Task` cancellation) in subactors when the `pdb` REPL + `trio.Task` cancellation) in subactors when a `pdb` REPL is active. - Avoids cancellation of the current actor (task) when the - user mistakenly sends ctl-c or a signal is received from - an external request; explicit runtime cancel requests are - allowed until the use exits the REPL session using - 'continue' or 'quit', at which point the orig SIGINT - handler is restored. + Avoids cancellation of the current actor (task) when the user + mistakenly sends ctl-c or via a recevied signal (from an + external request). Explicit runtime cancel requests are + allowed until the current REPL-session (the blocking call + `Pdb.interaction()`) exits, normally via the 'continue' or + 'quit' command - at which point the orig SIGINT handler is + restored via `.unshield_sigint()` below. + + Impl notes: + ----------- + - we prefer that `trio`'s default handler is always used when + SIGINT is unshielded (hence disabling the `pdb.Pdb` + defaults in `mk_pdb()`) such that reliable KBI cancellation + is always enforced. + + - we always detect whether we're running from a non-main + thread, in which case schedule the SIGINT shielding override + to in the main thread as per, + + https://docs.python.org/3/library/signal.html#signals-and-threads ''' # @@ -364,6 +629,12 @@ class DebugStatus: @classmethod @pdbp.hideframe # XXX NOTE XXX see below in `.pause_from_sync()` def unshield_sigint(cls): + ''' + Un-shield SIGINT for REPL-active (su)bactor. + + See details in `.shield_sigint()`. + + ''' # always restore ``trio``'s sigint handler. see notes below in # the pdb factory about the nightmare that is that code swapping # out the handler when the repl activates... @@ -374,6 +645,11 @@ class DebugStatus: cls._trio_handler, ) else: + trio_h: Callable = cls._trio_handler + # XXX should never really happen XXX + if not trio_h: + mk_pdb().set_trace() + signal.signal( signal.SIGINT, cls._trio_handler, @@ -411,6 +687,36 @@ class DebugStatus: # is not threading.main_thread() # ) + @classmethod + @pdbp.hideframe + def release( + cls, + cancel_req_task: bool = True, + ): + try: + # sometimes the task might already be terminated in + # which case this call will raise an RTE? 
+ if cls.repl_release is not None: + cls.repl_release.set() + + finally: + # if req_ctx := cls.req_ctx: + # req_ctx._scope.cancel() + + if ( + cancel_req_task + and + (req_cs := cls.req_cs) + ): + req_cs.cancel() + + # restore original sigint handler + cls.unshield_sigint() + + # actor-local state, irrelevant for non-root. + cls.repl_task = None + cls.repl = None + class TractorConfig(pdbp.DefaultConfig): ''' @@ -466,13 +772,24 @@ class PdbREPL(pdbp.Pdb): try: super().set_continue() finally: - Lock.release() + DebugStatus.release() + + # NOTE: for subactors the stdio lock is released via the + # allocated RPC locker task, so for root we have to do it + # manually. + if is_root_process(): + Lock.release() def set_quit(self): try: super().set_quit() finally: - Lock.release() + DebugStatus.release( + cancel_req_task=False, + ) + + if is_root_process(): + Lock.release() # TODO: special handling where we just want the next LOC and # not to resume to the next pause/crash point? @@ -515,413 +832,297 @@ class PdbREPL(pdbp.Pdb): return None -@acm -async def _acquire_debug_lock_from_root_task( - subactor_uid: tuple[str, str], - remote_task_uid: str, - -) -> AsyncIterator[trio.StrictFIFOLock]: - ''' - Acquire a root-actor local FIFO lock which tracks mutex access of - the process tree's global debugger breakpoint. - - This lock avoids tty clobbering (by preventing multiple processes - reading from stdstreams) and ensures multi-actor, sequential access - to the ``pdb`` repl. - - ''' - # task_name: str = current_task().name - we_acquired: bool = False - - log.runtime( - f'Attempting to acquire TTY lock for,\n' - f'subactor_uid: {subactor_uid}\n' - f'remote task: {remote_task_uid}\n' - ) - try: - pre_msg: str = ( - f'Entering lock checkpoint for sub-actor\n' - f'subactor_uid: {subactor_uid}\n' - f'remote task: {remote_task_uid}\n' - ) - stats = Lock._debug_lock.statistics() - if owner := stats.owner: - # and Lock.no_remote_has_tty is not None - pre_msg += ( - f'\n' - f'`Lock` already held by local task\n' - f'{owner}\n\n' - f'On behalf of remote task: {Lock.remote_task_in_debug!r}\n' - ) - log.runtime(pre_msg) - - # NOTE: if the surrounding cancel scope from the - # `lock_tty_for_child()` caller is cancelled, this line should - # unblock and NOT leave us in some kind of - # a "child-locked-TTY-but-child-is-uncontactable-over-IPC" - # condition. - await Lock._debug_lock.acquire() - we_acquired = True - - if Lock.no_remote_has_tty is None: - # mark the tty lock as being in use so that the runtime - # can try to avoid clobbering any connection from a child - # that's currently relying on it. - Lock.no_remote_has_tty = trio.Event() - Lock.remote_task_in_debug = remote_task_uid - - Lock.global_actor_in_debug = subactor_uid - log.runtime( - f'TTY lock acquired for,\n' - f'subactor_uid: {subactor_uid}\n' - f'remote task: {remote_task_uid}\n' - ) - - # NOTE: critical section: this yield is unshielded! - - # IF we received a cancel during the shielded lock entry of some - # next-in-queue requesting task, then the resumption here will - # result in that ``trio.Cancelled`` being raised to our caller - # (likely from ``lock_tty_for_child()`` below)! In - # this case the ``finally:`` below should trigger and the - # surrounding caller side context should cancel normally - # relaying back to the caller. 
- - yield Lock._debug_lock - - finally: - if ( - we_acquired - and - Lock._debug_lock.locked() - ): - Lock._debug_lock.release() - - # IFF there are no more requesting tasks queued up fire, the - # "tty-unlocked" event thereby alerting any monitors of the lock that - # we are now back in the "tty unlocked" state. This is basically - # and edge triggered signal around an empty queue of sub-actor - # tasks that may have tried to acquire the lock. - stats = Lock._debug_lock.statistics() - if ( - not stats.owner - # and Lock.no_remote_has_tty is not None - ): - # log.runtime( - log.info( - f'No more child ctx tasks hold the TTY lock!\n' - f'last subactor: {subactor_uid}\n' - f'remote task: {remote_task_uid}\n' - ) - if Lock.no_remote_has_tty is not None: - # set and release - Lock.no_remote_has_tty.set() - Lock.no_remote_has_tty = None - Lock.remote_task_in_debug = None - else: - log.warning( - 'Not signalling `Lock.no_remote_has_tty` since it has value:\n' - f'{Lock.no_remote_has_tty}\n' - ) - else: - log.info( - f'A child ctx tasks still holds the TTY lock ??\n' - f'last subactor: {subactor_uid}\n' - f'remote task: {remote_task_uid}\n' - f'current local owner task: {stats.owner}\n' - ) - - Lock.global_actor_in_debug = None - log.runtime( - 'TTY lock released by child\n' - f'last subactor: {subactor_uid}\n' - f'remote task: {remote_task_uid}\n' - ) - - -@tractor.context -async def lock_tty_for_child( - - ctx: tractor.Context, - - # TODO: when we finally get a `Start.params: ParamSpec` - # working it'd sure be nice to have `msgspec` auto-decode this - # to an actual tuple XD - subactor_uid: tuple[str, str], - subactor_task_uid: tuple[str, int], - -) -> LockStatus|LockRelease: - ''' - Lock the TTY in the root process of an actor tree in a new - inter-actor-context-task such that the ``pdbp`` debugger console - can be mutex-allocated to the calling sub-actor for REPL control - without interference by other processes / threads. - - NOTE: this task must be invoked in the root process of the actor - tree. It is meant to be invoked as an rpc-task and should be - highly reliable at releasing the mutex complete! - - ''' - req_task_uid: tuple = tuple(subactor_task_uid) - if req_task_uid in Lock._blocked: - raise RuntimeError( - f'Double lock request!?\n' - f'The same remote task already has an active request for TTY lock ??\n\n' - f'task uid: {req_task_uid}\n' - f'subactor uid: {subactor_uid}\n\n' - - 'This might be mean that the requesting task ' - 'in `wait_for_parent_stdin_hijack()` may have crashed?\n' - 'Consider that an internal bug exists given the TTY ' - '`Lock`ing IPC dialog..\n' - ) - - root_task_name: str = current_task().name - if tuple(subactor_uid) in Lock._blocked: - log.warning( - f'Subactor is blocked from acquiring debug lock..\n' - f'subactor_uid: {subactor_uid}\n' - f'remote task: {subactor_task_uid}\n' - ) - ctx._enter_debugger_on_cancel: bool = False - await ctx.cancel(f'Debug lock blocked for {subactor_uid}') - return LockStatus( - subactor_uid=subactor_uid, - cid=ctx.cid, - locked=False, - ) - - # TODO: when we get to true remote debugging - # this will deliver stdin data? 
- - log.debug( - 'Subactor attempting to acquire TTY lock\n' - f'root task: {root_task_name}\n' - f'subactor_uid: {subactor_uid}\n' - f'remote task: {subactor_task_uid}\n' - ) - DebugStatus.shield_sigint() - try: - Lock._blocked.add(req_task_uid) - with ( - # NOTE: though a cs is created for every subactor lock - # REQUEST in this ctx-child task, only the root-task - # holding the `Lock` (on behalf of the ctx parent task - # in a subactor) will set - # `Lock._locking_task_cs` such that if the - # lock holdingn task ever needs to be cancelled (since - # it's shielded by default) that global ref can be - # used to do so! - trio.CancelScope(shield=True) as debug_lock_cs, - - # TODO: make this ONLY limit the pld_spec such that we - # can on-error-decode-`.pld: Raw` fields in - # `Context._deliver_msg()`? - _codec.limit_msg_spec( - payload_spec=__msg_spec__, - ) as codec, - ): - # sanity? - # TODO: don't need the ref right? - assert codec is _codec.current_codec() - - async with _acquire_debug_lock_from_root_task( - subactor_uid, - subactor_task_uid, - ): - # XXX SUPER IMPORTANT BELOW IS ON THIS LINE XXX - # without that the root cs might be, - # - set and then removed in the finally block by - # a task that never acquired the lock, leaving - # - the task that DID acquire the lock STUCK since - # it's original cs was GC-ed bc the first task - # already set the global ref to `None` - Lock.set_locking_task_cs(debug_lock_cs) - - # indicate to child that we've locked stdio - await ctx.started( - LockStatus( - subactor_uid=subactor_uid, - cid=ctx.cid, - locked=True, - ) - ) - - log.debug( f'Actor {subactor_uid} acquired TTY lock') - - # wait for unlock pdb by child - async with ctx.open_stream() as stream: - release_msg: LockRelease = await stream.receive() - - # TODO: security around only releasing if - # these match? - log.pdb( - f'TTY lock released requested\n\n' - f'{release_msg}\n' - ) - assert release_msg.cid == ctx.cid - assert release_msg.subactor_uid == tuple(subactor_uid) - - log.debug(f'Actor {subactor_uid} released TTY lock') - - return LockStatus( - subactor_uid=subactor_uid, - cid=ctx.cid, - locked=False, - ) - - finally: - debug_lock_cs.cancel() - Lock._blocked.remove(req_task_uid) - Lock.set_locking_task_cs(None) - DebugStatus.unshield_sigint() - - @cm -def apply_debug_codec() -> _codec.MsgCodec: +def apply_debug_pldec() -> _codec.MsgCodec: ''' Apply the subactor TTY `Lock`-ing protocol's msgspec temporarily (only in the current task). ''' - with ( - _codec.limit_msg_spec( - payload_spec=__msg_spec__, - ) as debug_codec, - ): - assert debug_codec is _codec.current_codec() - log.pdb( - 'Applied `.devx._debug` msg-spec via codec\n' - f'{debug_codec}\n' - ) - yield debug_codec - log.pdb( - 'REMOVED `.devx._debug` msg-spec via codec\n' - f'{debug_codec}\n' + from tractor.msg import ( + _ops as msgops, + ) + orig_plrx: msgops.PldRx = msgops.current_pldrx() + orig_pldec: msgops.MsgDec = orig_plrx.pld_dec + + try: + with msgops.limit_plds( + spec=__pld_spec__, + ) as debug_dec: + assert debug_dec is msgops.current_pldrx().pld_dec + log.runtime( + 'Applied `.devx._debug` pld-spec\n\n' + f'{debug_dec}\n' + ) + yield debug_dec + + finally: + assert ( + (plrx := msgops.current_pldrx()) is orig_plrx + and + plrx.pld_dec is orig_pldec + ) + log.runtime( + 'Reverted to previous pld-spec\n\n' + f'{orig_pldec}\n' + ) + +# TODO: add this formatter to `.devx.pformat()`! 
+def pformat_cs( + cs: CancelScope, + var_name: str = 'cs', +) -> str: + return ( + f'{var_name}: {cs}\n' + f'{var_name}.cancel_called = {cs.cancel_called}\n' + f'{var_name}.cancelled_caught = {cs.cancelled_caught}\n' + f'{var_name}._cancel_status = {cs._cancel_status}\n' + f'{var_name}.shield = {cs.shield}\n' ) -async def wait_for_parent_stdin_hijack( +async def request_root_stdio_lock( actor_uid: tuple[str, str], task_uid: tuple[str, int], - task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED + task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED ): ''' - Connect to the root actor via a ``Context`` and invoke a task which - locks a root-local TTY lock: ``lock_tty_for_child()``; this func - should be called in a new task from a child actor **and never the - root*. + Connect to the root actor of this process tree and RPC-invoke + a task which acquires a std-streams global `Lock`: a actor tree + global mutex which prevents other subactors from entering + a `PdbREPL` at the same time as any other. - This function is used by any sub-actor to acquire mutex access to - the ``pdb`` REPL and thus the root's TTY for interactive debugging - (see below inside ``pause()``). It can be used to ensure that - an intermediate nursery-owning actor does not clobber its children - if they are in debug (see below inside - ``maybe_wait_for_debugger()``). + The actual `Lock` singleton exists ONLY in the root actor's + memory and does nothing more then set process-tree global state. + The actual `PdbREPL` interaction is completely isolated to each + sub-actor and with the `Lock` merely providing the multi-process + syncing mechanism to avoid any subactor (or the root itself) from + entering the REPL at the same time. ''' - from .._discovery import get_root + # TODO: likely we can implement this mutex more generally as + # a `._sync.Lock`? + # -[ ] simply add the wrapping needed for the debugger specifics? + # - the `__pld_spec__` impl and maybe better APIs for the client + # vs. server side state tracking? (`Lock` + `DebugStatus`) + # -[ ] for eg. `mp` has a multi-proc lock via the manager + # - https://docs.python.org/3.8/library/multiprocessing.html#synchronization-primitives + # -[ ] technically we need a `RLock` since re-acquire should be a noop + # - https://docs.python.org/3.8/library/multiprocessing.html#multiprocessing.RLock + DebugStatus.req_finished = trio.Event() + try: + from tractor._discovery import get_root + with ( + # NOTE: we need this to ensure that this task exits + # BEFORE the REPl instance raises an error like + # `bdb.BdbQuit` directly, OW you get a trio cs stack + # corruption! + # Further, the since this task is spawned inside the + # `Context._scope_nursery: trio.Nursery`, once an RPC + # task errors that cs is cancel_called and so if we want + # to debug the TPC task that failed we need to shield + # against that expected `.cancel()` call and instead + # expect all of the `PdbREPL`.set_[continue/quit/]()` + # methods to unblock this task by setting the + # `.repl_release: # trio.Event`. + trio.CancelScope(shield=True) as req_cs, - with ( - trio.CancelScope(shield=True) as cs, - apply_debug_codec(), - ): - DebugStatus.req_cs = cs - try: - # TODO: merge into sync async with ? - async with get_root() as portal: - # this syncs to child's ``Context.started()`` call. 
- async with portal.open_context( - lock_tty_for_child, - subactor_uid=actor_uid, - subactor_task_uid=task_uid, + # NOTE: set it here in the locker request task bc it's + # possible for multiple such requests for the lock in any + # single sub-actor AND there will be a race between when the + # root locking task delivers the `Started(pld=LockStatus)` + # and when the REPL is actually entered by the requesting + # application task who called + # `.pause()`/`.post_mortem()`. + # + # SO, applying the pld-spec here means it is only applied to + # this IPC-ctx request task, NOT any other task(s) + # including the one that actually enters the REPL. This + # is oc desired bc ow the debugged task will msg-type-error. + # + apply_debug_pldec() as debug_dec, + ): + log.critical( + 'Request cancel-scope is:\n\n' + f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' - ) as (ctx, resp): - log.pdb( - 'Subactor locked TTY with msg\n\n' - f'{resp}\n' - ) - assert resp.subactor_uid == actor_uid - assert resp.cid + ) + DebugStatus.req_cs = req_cs + try: + # TODO: merge into single async with ? + async with get_root() as portal: - async with ctx.open_stream() as stream: - try: # to unblock local caller + async with portal.open_context( + lock_tty_for_child, + subactor_task_uid=task_uid, + ) as (ctx, status): + + DebugStatus.req_ctx = ctx + + from tractor.msg import ( + _ops as msgops, + ) + assert ( + msgops.current_pldrx().pld_dec is debug_dec + ) + log.debug( + 'Subactor locked TTY with msg\n\n' + f'{status}\n' + ) + + # mk_pdb().set_trace() + assert status.subactor_uid == actor_uid + assert status.cid + + # set last rxed lock dialog status. + DebugStatus.lock_status = status + + async with ctx.open_stream() as stream: assert DebugStatus.repl_release - task_status.started(cs) + task_status.started(ctx) - # wait for local task to exit and - # release the REPL + # wait for local task to exit its + # `PdbREPL.interaction()`, call + # `DebugStatus.release()` and then + # unblock here. 
await DebugStatus.repl_release.wait() - - finally: await stream.send( LockRelease( subactor_uid=actor_uid, - cid=resp.cid, + cid=status.cid, ) ) - # sync with callee termination - status: LockStatus = await ctx.result() - assert not status.locked + # sync with child-side root locker task + # completion + status: LockStatus = await ctx.result() + assert not status.locked + DebugStatus.lock_status = status - log.pdb( - 'TTY lock was released for subactor with msg\n\n' - f'{status}\n\n' - 'Exitting {ctx.side!r} side locking of locking ctx' + log.pdb( + 'TTY lock was released for subactor with msg\n\n' + f'{status}\n\n' + f'Exitting {ctx.side!r}-side of locking ctx' + ) + + except ( + tractor.ContextCancelled, + trio.Cancelled, + ): + log.exception( + 'Debug lock request CANCELLED?\n\n' + f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' + f'{pformat_cs(ctx._scope, var_name="ctx._scope")}\n\n' + f'{ctx}' ) + raise - except ContextCancelled: - log.warning('Root actor cancelled debug lock') - raise + except ( + BaseException, + ): + log.exception( + 'Failed during root TTY-lock dialog?\n' + f'{ctx}\n' - finally: - DebugStatus.repl_task = None - log.debug('Exiting debugger TTY lock request func from child') + f'Cancelling IPC ctx!\n' + ) + await ctx.cancel() + raise - log.cancel('Reverting SIGINT handler!') - DebugStatus.unshield_sigint() + except ( + tractor.ContextCancelled, + trio.Cancelled, + ): + log.cancel( + 'Debug lock request CANCELLED?\n' + f'{ctx}\n' + ) + raise + + except BaseException: + log.exception('Errored during root TTY-lock dialog?') + raise + + finally: + log.debug('Exiting debugger TTY lock request func from child') + # signal request task exit + DebugStatus.req_finished.set() - -def mk_mpdb() -> PdbREPL: +def mk_pdb() -> PdbREPL: ''' - Deliver a new `PdbREPL`: a multi-process safe `pdbp` - REPL using the magic of SC! + Deliver a new `PdbREPL`: a multi-process safe `pdbp.Pdb`-variant + using the magic of `tractor`'s SC-safe IPC. + + B) Our `pdb.Pdb` subtype accomplishes multi-process safe debugging by: - - mutexing access to the root process' TTY & stdstreams - via an IPC managed `Lock` singleton per process tree. + - mutexing access to the root process' std-streams (& thus parent + process TTY) via an IPC managed `Lock` singleton per + actor-process tree. - - temporarily overriding any subactor's SIGINT handler to shield during - live REPL sessions in sub-actors such that cancellation is - never (mistakenly) triggered by a ctrl-c and instead only - by either explicit requests in the runtime or + - temporarily overriding any subactor's SIGINT handler to shield + during live REPL sessions in sub-actors such that cancellation + is never (mistakenly) triggered by a ctrl-c and instead only by + explicit runtime API requests or after the + `pdb.Pdb.interaction()` call has returned. + + FURTHER, the `pdbp.Pdb` instance is configured to be `trio` + "compatible" from a SIGINT handling perspective; we mask out + the default `pdb` handler and instead apply `trio`s default + which mostly addresses all issues described in: + + - https://github.com/python-trio/trio/issues/1155 + + The instance returned from this factory should always be + preferred over the default `pdb[p].set_trace()` whenever using + a `pdb` REPL inside a `trio` based runtime. ''' pdb = PdbREPL() - # Always shield out SIGINTs for subactors when REPL is active. - # - # XXX detect whether we're running from a non-main thread - # in which case schedule the SIGINT shielding override - # to in the main thread. 
- # https://docs.python.org/3/library/signal.html#signals-and-threads - DebugStatus.shield_sigint() - # XXX: These are the important flags mentioned in # https://github.com/python-trio/trio/issues/1155 # which resolve the traceback spews to console. pdb.allow_kbdint = True pdb.nosigint = True - return pdb +def any_connected_locker_child() -> bool: + ''' + Predicate to determine if a reported child subactor in debug + is actually connected. + + Useful to detect stale `Lock` requests after IPC failure. + + ''' + actor: Actor = current_actor() + + if not is_root_process(): + raise RuntimeError('This is a root-actor only API!') + + if ( + (ctx := Lock.ctx_in_debug) + and + (uid_in_debug := ctx.chan.uid) + ): + chans: list[tractor.Channel] = actor._peers.get( + tuple(uid_in_debug) + ) + if chans: + return any( + chan.connected() + for chan in chans + ) + + return False + + def shield_sigint_handler( signum: int, frame: 'frame', # type: ignore # noqa @@ -938,10 +1139,7 @@ def shield_sigint_handler( ''' __tracebackhide__: bool = True - uid_in_debug: tuple[str, str]|None = Lock.global_actor_in_debug - actor: Actor = current_actor() - case_handled: bool = False def do_cancel(): # If we haven't tried to cancel the runtime then do that instead @@ -956,28 +1154,8 @@ def shield_sigint_handler( else: raise KeyboardInterrupt - # try to see if the supposed (sub)actor in debug still - # has an active connection to *this* actor, and if not - # it's likely they aren't using the TTY lock / debugger - # and we should propagate SIGINT normally. - any_connected: bool = False - if uid_in_debug is not None: - chans: list[tractor.Channel] = actor._peers.get( - tuple(uid_in_debug) - ) - if chans: - any_connected = any(chan.connected() for chan in chans) - if not any_connected: - log.warning( - 'A global actor reported to be in debug ' - 'but no connection exists for this child!?\n' - f'subactor_uid: {uid_in_debug}\n\n' - 'Allowing SIGINT propagation..' - ) - return do_cancel() - # only set in the actor actually running the REPL - repl: PdbREPL|None = Lock.repl + repl: PdbREPL|None = DebugStatus.repl # TODO: maybe we should flatten out all these cases using # a match/case? @@ -985,98 +1163,102 @@ def shield_sigint_handler( # root actor branch that reports whether or not a child # has locked debugger. if is_root_process(): - lock_cs: trio.CancelScope = Lock.get_locking_task_cs() + # try to see if the supposed (sub)actor in debug still + # has an active connection to *this* actor, and if not + # it's likely they aren't using the TTY lock / debugger + # and we should propagate SIGINT normally. + any_connected: bool = any_connected_locker_child() + # if not any_connected: + # return do_cancel() - log.warning( + problem = ( f'root {actor.uid} handling SIGINT\n' f'any_connected: {any_connected}\n\n' f'{Lock.repr()}\n' ) - maybe_stale_lock_cs: bool = ( - lock_cs is not None - # and not lock_cs.cancel_called - and uid_in_debug is None - ) - if maybe_stale_lock_cs: - log.warning( - 'Stale `Lock._locking_task_cs: CancelScope` DETECTED?\n' - f'|_{lock_cs}\n\n' - ) - lock_cs.cancel() - - if uid_in_debug: # "someone" is (ostensibly) using debug `Lock` + if ( + (ctx := Lock.ctx_in_debug) + and + (uid_in_debug := ctx.chan.uid) # "someone" is (ostensibly) using debug `Lock` + ): name_in_debug: str = uid_in_debug[0] - if ( - not repl # but it's NOT us, the root actor. - ): - # sanity: since no repl ref is set, we def shouldn't - # be the lock owner! 
- assert name_in_debug != 'root' + assert not repl + # if not repl: # but it's NOT us, the root actor. + # sanity: since no repl ref is set, we def shouldn't + # be the lock owner! + assert name_in_debug != 'root' + # IDEAL CASE: child has REPL as expected + if any_connected: # there are subactors we can contact # XXX: only if there is an existing connection to the # (sub-)actor in debug do we ignore SIGINT in this # parent! Otherwise we may hang waiting for an actor # which has already terminated to unlock. - if any_connected: # there are subactors we can contact - # NOTE: don't emit this with `.pdb()` level in - # root without a higher level. - log.debug( - f'Ignoring SIGINT while debug REPL in use by child\n' - f'subactor: {uid_in_debug}\n' - ) - # returns here minus tail logic - case_handled = True - - else: - message: str = ( - f'Ignoring SIGINT while debug REPL SUPPOSEDLY in use by child\n' - f'subactor: {uid_in_debug}\n\n' - f'BUT, no child actors are contactable!?!?\n\n' - - # f'Reverting to def `trio` SIGINT handler..\n' - ) - - if maybe_stale_lock_cs: - lock_cs.cancel() - message += ( - 'Maybe `Lock._locking_task_cs: CancelScope` is stale?\n' - f'|_{lock_cs}\n\n' - ) - - log.warning(message) - # Lock.unshield_sigint() - DebugStatus.unshield_sigint() - case_handled = True + # + # NOTE: don't emit this with `.pdb()` level in + # root without a higher level. + log.runtime( + f'Ignoring SIGINT while debug REPL in use by child ' + f'{uid_in_debug}\n' + ) + problem = None else: - assert name_in_debug == 'root' # we are the registered locker - assert repl # we have a pdb REPL engaged - log.pdb( - f'Ignoring SIGINT while debug REPL in use\n' - f'root actor: {uid_in_debug}\n' + problem += ( + '\n' + f'A `pdb` REPL is SUPPOSEDLY in use by child {uid_in_debug}\n' + f'BUT, no child actors are IPC contactable!?!?\n' ) - # returns here minus tail logic - case_handled = True - # root actor still has this SIGINT handler active without - # an actor using the `Lock` (a bug state) ?? - # => so immediately cancel any stale lock cs and revert - # the handler! + # IDEAL CASE: root has REPL as expected else: - # XXX revert back to ``trio`` handler since this handler shouldn't - # be enabled withtout an actor using a debug REPL! - log.warning( - 'Ignoring SIGINT in root actor but no actor using a `pdb` REPL?\n' - 'Reverting SIGINT handler to `trio` default!\n' - ) + # root actor still has this SIGINT handler active without + # an actor using the `Lock` (a bug state) ?? + # => so immediately cancel any stale lock cs and revert + # the handler! + if not repl: + # TODO: WHEN should we revert back to ``trio`` + # handler if this one is stale? + # -[ ] maybe after a counts work of ctl-c mashes? + # -[ ] use a state var like `stale_handler: bool`? + problem += ( + '\n' + 'No subactor is using a `pdb` REPL according `Lock.ctx_in_debug`?\n' + 'BUT, the root should be using it, WHY this handler ??\n' + ) + else: + log.pdb( + 'Ignoring SIGINT while pdb REPL in use by root actor..\n' + ) + problem = None + # XXX if one is set it means we ARE NOT operating an ideal + # case where a child subactor or us (the root) has the + # lock without any other detected problems. + if problem: + + # detect, report and maybe clear a stale lock request + # cancel scope. 
+ lock_cs: trio.CancelScope = Lock.get_locking_task_cs() + maybe_stale_lock_cs: bool = ( + lock_cs is not None + and not lock_cs.cancel_called + ) if maybe_stale_lock_cs: + problem += ( + '\n' + 'Stale `Lock.ctx_in_debug._scope: CancelScope` detected?\n' + f'{Lock.ctx_in_debug}\n\n' + + '-> Calling ctx._scope.cancel()!\n' + ) lock_cs.cancel() - DebugStatus.unshield_sigint() - case_handled = True + # TODO: wen do we actually want/need this, see above. + # DebugStatus.unshield_sigint() + log.warning(problem) # child actor that has locked the debugger elif not is_root_process(): @@ -1092,14 +1274,13 @@ def shield_sigint_handler( not rent_chan.connected() ): log.warning( - 'A global sub-actor reported to be in debug ' + 'This sub-actor thinks it is debugging ' 'but it has no connection to its parent ??\n' - f'{uid_in_debug}\n' + f'{actor.uid}\n' 'Allowing SIGINT propagation..' ) DebugStatus.unshield_sigint() # do_cancel() - case_handled = True task: str|None = DebugStatus.repl_task if ( @@ -1107,13 +1288,11 @@ def shield_sigint_handler( and repl ): - # if repl: log.pdb( f'Ignoring SIGINT while local task using debug REPL\n' f'|_{task}\n' f' |_{repl}\n' ) - case_handled = True else: msg: str = ( 'SIGINT shield handler still active BUT, \n\n' @@ -1136,7 +1315,6 @@ def shield_sigint_handler( 'Reverting handler to `trio` default!\n' ) DebugStatus.unshield_sigint() - case_handled = True # XXX ensure that the reverted-to-handler actually is # able to rx what should have been **this** KBI ;) @@ -1156,7 +1334,7 @@ def shield_sigint_handler( # we want to alert the user that more input is expect since # nothing has been done dur to ignoring sigint. if ( - repl # only when this actor has a REPL engaged + repl # only when current actor has a REPL engaged ): # XXX: yah, mega hack, but how else do we catch this madness XD if repl.shname == 'xonsh': @@ -1174,72 +1352,19 @@ def shield_sigint_handler( # https://github.com/goodboy/tractor/issues/130#issuecomment-663752040 # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py - if not case_handled: - log.critical( - f'{actor.uid} UNHANDLED SIGINT !?!?\n' - # TODO: pprint for `Lock`? - ) + # XXX only for tracing this handler + # log.warning('exiting SIGINT') _pause_msg: str = 'Attaching to pdb REPL in actor' -def _set_trace( - actor: tractor.Actor|None = None, - pdb: PdbREPL|None = None, - shield: bool = False, - - extra_frames_up_when_async: int = 1, - hide_tb: bool = True, -): - __tracebackhide__: bool = hide_tb - - actor: tractor.Actor = ( - actor - or - current_actor() - ) - - # always start 1 level up from THIS in user code. - frame: FrameType|None - if frame := sys._getframe(): - frame: FrameType = frame.f_back # type: ignore - - if ( - frame - and ( - pdb - and actor is not None - ) - ): - # TODO: maybe print the actor supervion tree up to the - # root here? Bo - - log.pdb( - f'{_pause_msg}\n' - '|\n' - # TODO: make an `Actor.__repr()__` - f'|_ {current_task()} @ {actor.uid}\n' - ) - # no f!#$&* idea, but when we're in async land - # we need 2x frames up? - for i in range(extra_frames_up_when_async): - frame: FrameType = frame.f_back - log.debug( - f'Going up frame_{i}:\n|_{frame}\n' - ) - - # engage ze REPL - # B~() - pdb.set_trace(frame=frame) - - async def _pause( - debug_func: Callable = _set_trace, + debug_func: Callable|None, # NOTE: must be passed in the `.pause_from_sync()` case! 
- pdb: PdbREPL|None = None, + repl: PdbREPL|None = None, # TODO: allow caller to pause despite task cancellation, # exactly the same as wrapping with: @@ -1249,11 +1374,15 @@ async def _pause( # is always show in the debugger on entry.. and there seems to # be no way to override it?.. # - shield: bool = False, + # shield: bool = False, hide_tb: bool = True, - extra_frames_up_when_async: int = 4, - task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED + # bc, `debug_func()`, `_enter_repl_sync()` and `_pause()` + # extra_frames_up_when_async: int = 3, + + task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED, + + **debug_func_kwargs, ) -> None: ''' @@ -1277,8 +1406,9 @@ async def _pause( 'for infected `asyncio` mode!' ) from rte - # task_name: str = task.name - + # TODO: this should be created as part of `DebugRequest()` init + # which should instead be a one-shot-use singleton much like + # the `PdbREPL`. if ( not DebugStatus.repl_release or @@ -1289,43 +1419,65 @@ async def _pause( if debug_func is not None: debug_func = partial(debug_func) - if pdb is None: - pdb: PdbREPL = mk_mpdb() + repl: PdbREPL = repl or mk_pdb() + # TODO: maybe make this a `PdbREPL` method or mod func? + # -[ ] factor out better, main reason for it is common logic for + # both root and sub repl entry def _enter_repl_sync( debug_func: Callable, ) -> None: __tracebackhide__: bool = hide_tb - try: - # TODO: do we want to support using this **just** for the - # locking / common code (prolly to help address #320)? - # - if debug_func is None: - task_status.started(Lock) - else: - # block here one (at the appropriate frame *up*) where - # ``breakpoint()`` was awaited and begin handling stdio. - log.debug('Entering sync world of the `pdb` REPL..') - try: - # log.critical( - # f'stack len: {len(pdb.stack)}\n' - # ) - debug_func( - actor, - pdb, - extra_frames_up_when_async=extra_frames_up_when_async, - shield=shield, - ) - except BaseException: - log.exception( - 'Failed to invoke internal `debug_func = ' - f'{debug_func.func.__name__}`\n' - ) - raise - except bdb.BdbQuit: - Lock.release() - raise + # TODO: do we want to support using this **just** for the + # locking / common code (prolly to help address #320)? + # + if debug_func is None: + task_status.started(DebugStatus) + else: + # block here one (at the appropriate frame *up*) where + # ``breakpoint()`` was awaited and begin handling stdio. + log.debug('Entering sync world of the `pdb` REPL..') + + # XXX used by the SIGINT handler to check if + # THIS actor is in REPL interaction + try: + # TODO: move this into a `open_debug_request()` @acm? + # -[ ] prolly makes the most send to do the request + # task spawn as part of an `@acm` api which + # delivers the `DebugRequest` instance and ensures + # encapsing all the pld-spec and debug-nursery? + # + # set local actor task to avoid recurrent + # entries/requests from the same local task + # (to the root process). + DebugStatus.repl_task = task + DebugStatus.repl = repl + DebugStatus.shield_sigint() + + # enter `PdbREPL` specific method + debug_func( + repl=repl, + hide_tb=hide_tb, + **debug_func_kwargs, + ) + except trio.Cancelled: + log.exception( + 'Cancelled during invoke of internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + # NOTE: DON'T release lock yet + raise + + except BaseException: + log.exception( + 'Failed to invoke internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + # NOTE: OW this is ONLY called from the + # `.set_continue/next` hooks! 
+ DebugStatus.release() + raise try: if is_root_process(): @@ -1333,7 +1485,14 @@ async def _pause( # we also wait in the root-parent for any child that # may have the tty locked prior # TODO: wait, what about multiple root tasks acquiring it though? - if Lock.global_actor_in_debug == actor.uid: + ctx: Context|None = Lock.ctx_in_debug + if ( + ctx is None + and + DebugStatus.repl + and + DebugStatus.repl_task is task + ): # re-entrant root process already has it: noop. log.warning( f'{task.name}@{actor.uid} already has TTY lock\n' @@ -1347,8 +1506,8 @@ async def _pause( # callbacks. Can't think of a nicer way then this atm. if Lock._debug_lock.locked(): log.warning( - 'attempting to shield-acquire active TTY lock' - f' owned by {Lock.global_actor_in_debug}' + 'attempting to shield-acquire active TTY lock owned by\n' + f'{ctx}' ) # must shield here to avoid hitting a ``Cancelled`` and @@ -1359,10 +1518,6 @@ async def _pause( # may be cancelled await Lock._debug_lock.acquire() - Lock.global_actor_in_debug = actor.uid - DebugStatus.repl_task = task - DebugStatus.repl = Lock.repl = pdb - # enter REPL from root, no TTY locking IPC ctx necessary _enter_repl_sync(debug_func) return # next branch is mutex and for subactors @@ -1405,10 +1560,6 @@ async def _pause( await DebugStatus.repl_release.wait() await trio.sleep(0.1) - # mark local actor as "in debug mode" to avoid recurrent - # entries/requests to the root process - DebugStatus.repl_task = task - # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without # being restricted by the scope of a new task nursery. @@ -1420,88 +1571,106 @@ async def _pause( # actor._service_n.cancel_scope.shield = shield # ``` # but not entirely sure if that's a sane way to implement it? - - # NOTE: MUST it here bc multiple tasks are spawned by any - # one sub-actor AND there will be a race between when the - # root locking task delivers the `Started(pld=LockStatus)` - # and when the REPL is actually entered here. SO ensure - # the codec is set before either are run! - # - with ( - # _codec.limit_msg_spec( - # payload_spec=__msg_spec__, - # ) as debug_codec, - trio.CancelScope(shield=shield), - ): - # async with trio.open_nursery() as tn: - # tn.cancel_scope.shield = True - try: - # cs: trio.CancelScope = await tn.start( - cs: trio.CancelScope = await actor._service_n.start( - wait_for_parent_stdin_hijack, - actor.uid, - (task.name, id(task)), - ) - # our locker task should be the one in ctx - # with the root actor - assert DebugStatus.req_cs is cs - - # XXX used by the SIGINT handler to check if - # THIS actor is in REPL interaction - Lock.repl = pdb - - except RuntimeError: - Lock.release() - - if actor._cancel_called: - # service nursery won't be usable and we - # don't want to lock up the root either way since - # we're in (the midst of) cancellation. - return - - raise + try: + # NOTE spawn the stdio locker request task inside the + # current `Context._scope_nursery` to entsure that + # the request never can outlive the task's (parent) + # lifetime. + curr_ctx: Context = current_ipc_ctx() + # TODO: see `_errors_relayed_via_ipc()` where we + # should dynamically open a `debug_tn` for use here, + # BUT it needs to be outside the normal error + # catching and `_maybe_enter_debugger()` call! 
+ # ctx: Context = await curr_ctx._debug_tn.start( + ctx: Context = await actor._service_n.start( + request_root_stdio_lock, + actor.uid, + (task.name, id(task)), # task uuid (effectively) + ) + # our locker task should be the one in ctx + # with the root actor + assert ( + ctx + is + DebugStatus.req_ctx + is not + curr_ctx + ) # enter REPL + _enter_repl_sync(debug_func) - try: - _enter_repl_sync(debug_func) - finally: - DebugStatus.unshield_sigint() + except RuntimeError: + if actor._cancel_called: + # service nursery won't be usable and we + # don't want to lock up the root either way since + # we're in (the midst of) cancellation. + return + + raise + + # TODO: prolly factor this plus the similar block from + # `_enter_repl_sync()` into a common @cm? + except BaseException as repl_err: + if isinstance(repl_err, bdb.BdbQuit): + log.devx( + 'REPL for pdb was quit!\n' + ) + else: + log.exception( + 'Failed to engage debugger via `_pause()` ??\n' + ) + + DebugStatus.release() + # sanity checks for ^ on request/status teardown + assert DebugStatus.repl is None + assert DebugStatus.repl_task is None + req_ctx: Context = DebugStatus.req_ctx + if req_ctx: + assert req_ctx._scope.cancel_called - except BaseException: - log.exception( - 'Failed to engage debugger via `_pause()` ??\n' - ) raise -# XXX: apparently we can't do this without showing this frame -# in the backtrace on first entry to the REPL? Seems like an odd -# behaviour that should have been fixed by now. This is also why -# we scrapped all the @cm approaches that were tried previously. -# finally: -# __tracebackhide__ = True -# # frame = sys._getframe() -# # last_f = frame.f_back -# # last_f.f_globals['__tracebackhide__'] = True -# # signal.signal = pdbp.hideframe(signal.signal) +def _set_trace( + repl: PdbREPL, # passed by `_pause()` + hide_tb: bool, + + # partial-ed in by `.pause()` + api_frame: FrameType, +): + __tracebackhide__: bool = hide_tb + actor: tractor.Actor = current_actor() + + # else: + # TODO: maybe print the actor supervion tree up to the + # root here? Bo + log.pdb( + f'{_pause_msg}\n' + '|\n' + # TODO: make an `Actor.__repr()__` + f'|_ {current_task()} @ {actor.uid}\n' + ) + # presuming the caller passed in the "api frame" + # (the last frame before user code - like `.pause()`) + # then we only step up one frame to where the user + # called our API. + caller_frame: FrameType = api_frame.f_back # type: ignore + + # engage ze REPL + # B~() + repl.set_trace(frame=caller_frame) async def pause( + *, + hide_tb: bool = True, + api_frame: FrameType|None = None, - debug_func: Callable|None = _set_trace, - - # TODO: allow caller to pause despite task cancellation, - # exactly the same as wrapping with: - # with CancelScope(shield=True): - # await pause() - # => the REMAINING ISSUE is that the scope's .__exit__() frame - # is always show in the debugger on entry.. and there seems to - # be no way to override it?.. - # + # TODO: figure out how to still make this work: + # -[ ] pass it direct to `_pause()`? + # -[ ] use it to set the `debug_nursery.cancel_scope.shield` shield: bool = False, - task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED, - **_pause_kwargs, ) -> None: @@ -1522,19 +1691,37 @@ async def pause( ''' __tracebackhide__: bool = True - with trio.CancelScope( - shield=shield, - ) as cs: + # always start 1 level up from THIS in user code since normally + # `tractor.pause()` is called explicitly by use-app code thus + # making it the highest up @api_frame. 
+ api_frame: FrameType = api_frame or inspect.currentframe() + # XXX TODO: this was causing cs-stack corruption in trio due to + # usage within the `Context._scope_nursery` (which won't work + # based on scoping of it versus call to `_maybe_enter_debugger()` + # from `._rpc._invoke()`) + # with trio.CancelScope( + # shield=shield, + # ) as cs: # NOTE: so the caller can always manually cancel even # if shielded! - task_status.started(cs) - return await _pause( - debug_func=debug_func, - shield=shield, - task_status=task_status, - **_pause_kwargs - ) + # task_status.started(cs) + # log.critical( + # '`.pause() cancel-scope is:\n\n' + # f'{pformat_cs(cs, var_name="pause_cs")}\n\n' + # ) + await _pause( + debug_func=partial( + _set_trace, + api_frame=api_frame, + ), + + # task_status=task_status, + **_pause_kwargs + ) + # XXX avoid cs stack corruption when `PdbREPL.interaction()` + # raises `BdbQuit`. + # await DebugStatus.req_finished.wait() _gb_mod: None|ModuleType|False = None @@ -1626,7 +1813,7 @@ def pause_from_sync( # raises on not-found by default greenback: ModuleType = maybe_import_greenback() - mdb: PdbREPL = mk_mpdb() + mdb: PdbREPL = mk_pdb() # run async task which will lock out the root proc's TTY. if not Lock.is_main_trio_thread(): @@ -1664,10 +1851,10 @@ def pause_from_sync( # entering the global ``breakpoint()`` built-in from sync # code? _set_trace( + api_frame=inspect.current_frame(), actor=actor, pdb=mdb, hide_tb=hide_tb, - extra_frames_up_when_async=1, # TODO? will we ever need it? # -> the gb._await() won't be affected by cancellation? @@ -1691,8 +1878,8 @@ async def breakpoint(**kwargs): ) __tracebackhide__: bool = True await pause( - # extra_frames_up_when_async=6, - **kwargs + api_frame=inspect.currentframe(), + **kwargs, ) @@ -1702,12 +1889,15 @@ _crash_msg: str = ( def _post_mortem( - actor: tractor.Actor, - pdb: PdbREPL, - shield: bool = False, + # provided and passed by `_pause()` + repl: PdbREPL, - # only for compat with `._set_trace()`.. - extra_frames_up_when_async=1, + # XXX all `partial`-ed in by `post_mortem()` below! + tb: TracebackType, + api_frame: FrameType, + + shield: bool = False, + hide_tb: bool = False, ) -> None: ''' @@ -1715,6 +1905,9 @@ def _post_mortem( debugger instance. ''' + __tracebackhide__: bool = hide_tb + actor: tractor.Actor = current_actor() + # TODO: print the actor supervion tree up to the root # here! Bo log.pdb( @@ -1728,24 +1921,64 @@ def _post_mortem( # f'|_ {current_task()} @ {actor.name}\n' ) - # TODO: only replacing this to add the + # NOTE only replacing this from `pdbp.xpm()` to add the # `end=''` to the print XD - # pdbp.xpm(Pdb=lambda: pdb) - info = sys.exc_info() print(traceback.format_exc(), end='') - pdbp.post_mortem( - t=info[2], - Pdb=lambda: pdb, + + caller_frame: FrameType = api_frame.f_back + + # NOTE: see the impl details of followings to understand usage: + # - `pdbp.post_mortem()` + # - `pdbp.xps()` + # - `bdb.interaction()` + repl.reset() + repl.interaction( + frame=caller_frame, + # frame=None, + traceback=tb, ) -post_mortem = partial( - pause, - debug_func=_post_mortem, -) +async def post_mortem( + *, + tb: TracebackType|None = None, + api_frame: FrameType|None = None, + hide_tb: bool = False, + + # TODO: support shield here just like in `pause()`? 
+ # shield: bool = False, + + **_pause_kwargs, + +) -> None: + __tracebackhide__: bool = hide_tb + + tb: TracebackType = tb or sys.exc_info()[2] + + # TODO: do upward stack scan for highest @api_frame and + # use its parent frame as the expected user-app code + # interact point. + api_frame: FrameType = api_frame or inspect.currentframe() + + await _pause( + debug_func=partial( + _post_mortem, + api_frame=api_frame, + tb=tb, + ), + hide_tb=hide_tb, + **_pause_kwargs + ) -async def _maybe_enter_pm(err): +async def _maybe_enter_pm( + err: BaseException, + *, + tb: TracebackType|None = None, + api_frame: FrameType|None = None, + hide_tb: bool = False, +): + from tractor._exceptions import is_multi_cancelled if ( debug_mode() @@ -1764,12 +1997,13 @@ async def _maybe_enter_pm(err): # might be a simpler check we can do? and not is_multi_cancelled(err) ): - log.debug("Actor crashed, entering debug mode") - try: - await post_mortem() - finally: - Lock.release() - return True + api_frame: FrameType = api_frame or inspect.currentframe() + tb: TracebackType = tb or sys.exc_info()[2] + await post_mortem( + api_frame=api_frame, + tb=tb, + ) + return True else: return False @@ -1796,12 +2030,12 @@ async def acquire_debug_lock( return async with trio.open_nursery() as n: - cs = await n.start( - wait_for_parent_stdin_hijack, + ctx: Context = await n.start( + request_root_stdio_lock, subactor_uid, ) - yield cs - cs.cancel() + yield ctx + ctx.cancel() async def maybe_wait_for_debugger( @@ -1830,8 +2064,8 @@ async def maybe_wait_for_debugger( # will make the pdb repl unusable. # Instead try to wait for pdb to be released before # tearing down. - in_debug: tuple[str, str]|None = Lock.global_actor_in_debug - + ctx_in_debug: Context|None = Lock.ctx_in_debug + in_debug: tuple[str, str]|None = ctx_in_debug.chan.uid if ctx_in_debug else None if in_debug == current_actor().uid: log.debug( msg @@ -1864,17 +2098,26 @@ async def maybe_wait_for_debugger( and not Lock.no_remote_has_tty.is_set() and in_debug is not None ): - log.pdb( + + # caller_frame_info: str = pformat_caller_frame() + log.debug( msg + - '\nRoot is waiting on tty lock to release..\n' + '\nRoot is waiting on tty lock to release from\n\n' + # f'{caller_frame_info}\n' ) + + if not any_connected_locker_child(): + Lock.get_locking_task_cs().cancel() + with trio.CancelScope(shield=True): await Lock.no_remote_has_tty.wait() + log.pdb( - f'Child subactor released debug lock\n' + f'Subactor released debug lock\n' f'|_{in_debug}\n' ) + break # is no subactor locking debugger currently? 
if ( @@ -1900,7 +2143,7 @@ async def maybe_wait_for_debugger( f'poll step: {istep}\n' f'poll delya: {poll_delay}' ) - with trio.CancelScope(shield=True): + with CancelScope(shield=True): await trio.sleep(poll_delay) continue -- 2.34.1 From 4d528b76a0b7ac359785360867bad271d906a604 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 13:30:15 -0400 Subject: [PATCH 293/378] Move `_debug.pformat_cs()` into `devx.pformat` --- tractor/devx/_debug.py | 19 ++++--------------- tractor/devx/pformat.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 0567e42a..da322407 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -73,7 +73,10 @@ from tractor._state import ( debug_mode, current_ipc_ctx, ) -# from .pformat import pformat_caller_frame +from .pformat import ( + # pformat_caller_frame, + pformat_cs, +) if TYPE_CHECKING: from tractor._ipc import Channel @@ -868,20 +871,6 @@ def apply_debug_pldec() -> _codec.MsgCodec: f'{orig_pldec}\n' ) -# TODO: add this formatter to `.devx.pformat()`! -def pformat_cs( - cs: CancelScope, - var_name: str = 'cs', -) -> str: - return ( - f'{var_name}: {cs}\n' - f'{var_name}.cancel_called = {cs.cancel_called}\n' - f'{var_name}.cancelled_caught = {cs.cancelled_caught}\n' - f'{var_name}._cancel_status = {cs._cancel_status}\n' - f'{var_name}.shield = {cs.shield}\n' - ) - - async def request_root_stdio_lock( actor_uid: tuple[str, str], task_uid: tuple[str, int], diff --git a/tractor/devx/pformat.py b/tractor/devx/pformat.py index 0b35feee..5fe9bc62 100644 --- a/tractor/devx/pformat.py +++ b/tractor/devx/pformat.py @@ -22,6 +22,8 @@ Mostly handy for logging and exception message content. import textwrap import traceback +from trio import CancelScope + def add_div( message: str, @@ -133,3 +135,34 @@ def pformat_caller_frame( indent='', ) return tb_str + + +def pformat_cs( + cs: CancelScope, + var_name: str = 'cs', + field_prefix: str = ' |_', +) -> str: + ''' + Pretty format info about a `trio.CancelScope` including most + of its public state and `._cancel_status`. + + The output can be modified to show a "var name" for the + instance as a field prefix, just a simple str before each + line more or less. + + ''' + + fields: str = textwrap.indent( + ( + f'cancel_called = {cs.cancel_called}\n' + f'cancelled_caught = {cs.cancelled_caught}\n' + f'_cancel_status = {cs._cancel_status}\n' + f'shield = {cs.shield}\n' + ), + prefix=field_prefix, + ) + return ( + f'{var_name}: {cs}\n' + + + fields + ) -- 2.34.1 From 45f37870af7644ad00a320bc0fe289a11e66f168 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 13:33:59 -0400 Subject: [PATCH 294/378] Add a `.log.at_least_level()` predicate --- tractor/log.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index e85b49cf..f7d6a32e 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -21,6 +21,11 @@ Log like a forester! from collections.abc import Mapping import sys import logging +from logging import ( + LoggerAdapter, + Logger, + StreamHandler, +) import colorlog # type: ignore import trio @@ -80,7 +85,7 @@ BOLD_PALETTE = { # TODO: this isn't showing the correct '{filename}' # as it did before.. -class StackLevelAdapter(logging.LoggerAdapter): +class StackLevelAdapter(LoggerAdapter): def transport( self, @@ -237,6 +242,7 @@ def get_logger( '''Return the package log or a sub-logger for ``name`` if provided. 
''' + log: Logger log = rlog = logging.getLogger(_root_name) if ( @@ -291,7 +297,7 @@ def get_logger( def get_console_log( level: str | None = None, **kwargs, -) -> logging.LoggerAdapter: +) -> LoggerAdapter: '''Get the package logger and enable a handler which writes to stderr. Yeah yeah, i know we can use ``DictConfig``. You do it. @@ -316,7 +322,7 @@ def get_console_log( None, ) ): - handler = logging.StreamHandler() + handler = StreamHandler() formatter = colorlog.ColoredFormatter( LOG_FORMAT, datefmt=DATE_FORMAT, @@ -336,3 +342,19 @@ def get_loglevel() -> str: # global module logger for tractor itself log = get_logger('tractor') + + +def at_least_level( + log: Logger|LoggerAdapter, + level: int|str, +) -> bool: + ''' + Predicate to test if a given level is active. + + ''' + if isinstance(level, str): + level: int = LEVELS[level.upper()] + + if log.getEffectiveLevel() <= level: + return True + return False -- 2.34.1 From 343b7c971249b25480c6a59d8974856107e736b7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 13:35:29 -0400 Subject: [PATCH 295/378] Even moar bitty `Context` refinements - set `._state._ctxvar_Context` just after `StartAck` inside `open_context_from_portal()` so that `current_ipc_ctx()` always works on the 'parent' side. - always set `.canceller` to any `MsgTypeError.src_uid` and otherwise to any maybe-detected `.src_uid` (i.e. for RAEs). - always set `.canceller` to us when we rx a ctxc which reports us as its canceller; this is a sanity check on definite "self cancellation". - adjust `._is_self_cancelled()` logic to only be `True` when `._remote_error` is both a ctxc with a `.canceller` set to us AND when `Context.canceller` is also set to us (since the change above) as a little bit of extra rigor. - fill-in/fix some `.repr_state` edge cases: - merge self-vs.-peer ctxc cases to one block and distinguish via nested `._is_self_cancelled()` check. - set 'errored' for all exception matched cases despite `.canceller`. - add pre-`Return` phase statuses: |_'pre-started' and 'syncing-to-child' depending on side and when `._stream` has not (yet) been set. |_'streaming' and 'streaming-finished' depending on side when `._stream` is set and whether it was stopped/closed. - tweak drainage log-message to use "outcome" instead of "result". - use new `.devx.pformat.pformat_cs()` inside `_maybe_cancel_and_set_remote_error()` but, IFF the log level is at least 'cancel'. --- tractor/_context.py | 197 ++++++++++++++++++++++++++++---------------- 1 file changed, 126 insertions(+), 71 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index f333c9ee..3dcf8151 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -37,8 +37,9 @@ import inspect from pprint import pformat from typing import ( Any, - Callable, AsyncGenerator, + Callable, + Mapping, Type, TYPE_CHECKING, Union, @@ -59,7 +60,10 @@ from ._exceptions import ( pack_from_raise, unpack_error, ) -from .log import get_logger +from .log import ( + get_logger, + at_least_level, +) from .msg import ( Error, MsgType, @@ -83,6 +87,7 @@ from ._streaming import MsgStream from ._state import ( current_actor, debug_mode, + _ctxvar_Context, ) if TYPE_CHECKING: @@ -204,7 +209,7 @@ class Context: # cancelled that the other side is as well, so maybe we should # instead just have a `.canceller` pulled from the # `ContextCancelled`? 
- _canceller: tuple[str, str] | None = None + _canceller: tuple[str, str]|None = None # NOTE: we try to ensure assignment of a "cancel msg" since # there's always going to be an "underlying reason" that any @@ -384,8 +389,12 @@ class Context: re: BaseException|None = ( remote_error - or self._remote_error + or + self._remote_error ) + # XXX we only report "this context" as self-cancelled + # once we've received a ctxc from our direct-peer task + # (aka we're `.cancel_acked`). if not re: return False @@ -396,10 +405,10 @@ class Context: our_canceller = self.canceller return bool( - isinstance(re, ContextCancelled) + isinstance((ctxc := re), ContextCancelled) and from_uid == self.chan.uid - and re.canceller == our_uid - and our_canceller == from_uid + and ctxc.canceller == our_uid + and our_canceller == our_uid ) @property @@ -619,15 +628,27 @@ class Context: ) self._remote_error: BaseException = error + msgerr: bool = False + # self-cancel (ack) or, # peer propagated remote cancellation. - msgerr: bool = False if isinstance(error, ContextCancelled): + # NOTE in the case error is a ctxc the canceller will + # either be another peer or us. in the case where it's us + # we mark ourself as the canceller of ourselves (a ctx + # "self cancel" from this side's perspective), if instead + # the far end was cancelled by some other (inter-) peer, + # we want to mark our canceller as the actor that was + # cancelled, NOT their reported canceller. IOW in the + # latter case we're cancelled by someone else getting + # cancelled. + if (canc := error.canceller) == self._actor.uid: + whom: str = 'us' + self._canceller = canc + else: + whom = 'a remote peer (not us)' + self._canceller = error.src_uid - whom: str = ( - 'us' if error.canceller == self._actor.uid - else 'a remote peer (not us)' - ) log.cancel( f'IPC context was cancelled by {whom}!\n\n' f'{error}' @@ -635,6 +656,7 @@ class Context: elif isinstance(error, MsgTypeError): msgerr = True + self._canceller = error.src_uid log.error( f'IPC dialog error due to msg-type caused by {self.peer_side!r} side\n\n' f'{error}\n' @@ -642,28 +664,25 @@ class Context: ) else: + # always record the cancelling actor's uid since its + # cancellation state is linked and we want to know + # which process was the cause / requester of the + # cancellation. + maybe_error_src_uid: tuple = getattr( + error, + 'src_uid', + None, + ) + # we mark the source actor as our canceller + self._canceller = maybe_error_src_uid log.error( f'Remote context error:\n\n' # f'{pformat(self)}\n' f'{error}\n' ) - # always record the cancelling actor's uid since its - # cancellation state is linked and we want to know - # which process was the cause / requester of the - # cancellation. - maybe_error_src: tuple = getattr( - error, - 'src_uid', - None, - ) - self._canceller = ( - maybe_error_src - or - # XXX: in the case we get a non-boxed error? - # -> wait but this should never happen right? - self.chan.uid - ) + if self._canceller is None: + log.error('Ctx has no canceller set!?') # Cancel the local `._scope`, catch that # `._scope.cancelled_caught` and re-raise any remote error @@ -707,27 +726,34 @@ class Context: message: str = 'NOT cancelling `Context._scope` !\n\n' fmt_str: str = 'No `self._scope: CancelScope` was set/used ?' 
- if cs: + if ( + cs + and + at_least_level(log=log, level='cancel') + ): fmt_str: str = self.pformat( extra_fields={ '._is_self_cancelled()': self._is_self_cancelled(), '._cancel_on_msgerr': self._cancel_on_msgerr, - - '._scope': cs, - '._scope.cancel_called': cs.cancel_called, - '._scope.cancelled_caught': cs.cancelled_caught, - '._scope._cancel_status': cs._cancel_status, } ) + from .devx.pformat import pformat_cs + cs_fmt: str = pformat_cs( + cs, + var_name='Context._scope', + ) + fmt_str += ( + '\n' + + + cs_fmt + ) log.cancel( message + fmt_str ) - # TODO: maybe we should also call `._res_scope.cancel()` if it - # exists to support cancelling any drain loop hangs? - # TODO: add to `Channel`? + # TODO: also add to `Channel`? @property def dst_maddr(self) -> str: chan: Channel = self.chan @@ -1100,7 +1126,8 @@ class Context: f'ctx id: {self.cid}' ) - # TODO: replace all the instances of this!! XD + # TODO: replace all the `._maybe_raise_remote_err()` usage + # with instances of this!! def maybe_raise( self, hide_tb: bool = True, @@ -1111,6 +1138,7 @@ class Context: if re := self._remote_error: return self._maybe_raise_remote_err( re, + hide_tb=hide_tb, **kwargs, ) @@ -1212,7 +1240,6 @@ class Context: # runtime frames from the tb explicitly? # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement # https://stackoverflow.com/a/24752607 - __tracebackhide__: bool = True raise remote_error # from None # TODO: change to `.wait_for_result()`? @@ -1263,8 +1290,15 @@ class Context: # wait for a final context result/error by "draining" # (by more or less ignoring) any bi-dir-stream "yield" # msgs still in transit from the far end. + # + # XXX NOTE XXX: this call shouldn't really ever raise + # (other then internal error), instead delivering an + # `Error`-msg and that being `.maybe_raise()`-ed below + # since every message should be delivered via the normal + # `._deliver_msg()` route which will appropriately set + # any `.maybe_error`. 
( - return_msg, + outcome_msg, drained_msgs, ) = await msgops.drain_to_final_msg( ctx=self, @@ -1282,13 +1316,18 @@ class Context: f'{msg}\n' ) - log.cancel( - 'Ctx drained to final result msgs\n' - f'{return_msg}\n\n' - - f'pre-result drained msgs:\n' - f'{pformat(drained_msgs)}\n' + drained_status: str = ( + 'Ctx drained to final outcome msg\n\n' + f'{outcome_msg}\n' ) + if drained_msgs: + drained_status += ( + '\n' + f'The pre-drained msgs are\n' + f'{pformat(drained_msgs)}\n' + ) + + log.cancel(drained_status) self.maybe_raise( # NOTE: obvi we don't care if we @@ -1319,7 +1358,7 @@ class Context: @property def maybe_error(self) -> BaseException|None: - le: Exception|None = self._local_error + le: BaseException|None = self._local_error re: RemoteActorError|ContextCancelled|None = self._remote_error match (le, re): @@ -1347,7 +1386,7 @@ class Context: # ContextCancelled(canceller=), # ): - error: Exception|None = le or re + error: BaseException|None = le or re if error: return error @@ -1462,52 +1501,63 @@ class Context: ''' merr: Exception|None = self.maybe_error outcome: Unresolved|Exception|Any = self.outcome - + status: str|None = None match ( outcome, merr, ): + # "graceful" ctx cancellation case ( Unresolved, ContextCancelled(), - ) if self.cancel_acked: - status = 'self-cancelled' - - case ( - Unresolved, - ContextCancelled(), - ) if ( - self.canceller - and not self._cancel_called ): - status = 'peer-cancelled' + if self._is_self_cancelled(): + status = 'self-cancelled' + elif ( + self.canceller + and not self._cancel_called + ): + status = 'peer-cancelled' + # (remote) error condition case ( Unresolved, - BaseException(), - ) if self.canceller: + BaseException(), # any error-type + ): status = 'errored' + # result already returned case ( _, # any non-unresolved value None, ) if self._final_result_is_set(): status = 'returned' + # normal operation but still in a pre-`Return`-result + # dialog phase case ( - Unresolved, # noqa (weird.. ruff) - None, + Unresolved, # noqa (ruff, you so weird..) + None, # no (remote) error set ): if stream := self._stream: if stream.closed: status = 'streaming-finished' else: status = 'streaming' + elif self._started_called: status = 'started' - case _: - status = 'unknown!?' + else: + if self.side == 'child': + status = 'pre-started' + else: + status = 'syncing-to-child' + + if status is None: + status = '??unknown??' + # from tractor.devx import mk_pdb + # mk_pdb().set_trace() return status @@ -1738,7 +1788,6 @@ class Context: f'Delivering IPC ctx error from {self.peer_side!r} to {side!r} task\n\n' f'{flow_body}' - f'{pformat(re)}\n' ) self._cancel_msg: dict = msg @@ -2003,6 +2052,7 @@ async def open_context_from_portal( ) assert ctx._remote_func_type == 'context' assert ctx._caller_info + _ctxvar_Context.set(ctx) # XXX NOTE since `._scope` is NOT set BEFORE we retreive the # `Started`-msg any cancellation triggered @@ -2156,7 +2206,7 @@ async def open_context_from_portal( # CASE 2: context was cancelled by local task calling # `.cancel()`, we don't raise and the exit block should - # exit silently. + # finish silently. if ( ctx._cancel_called and @@ -2281,6 +2331,11 @@ async def open_context_from_portal( try: result_or_err: Exception|Any = await ctx.result() except BaseException as berr: + # cancelled before (or maybe during?) 
final result capture + # if isinstance(trio.Cancelled, berr): + # from .devx import mk_pdb + # mk_pdb.set_trace() + # on normal teardown, if we get some error # raised in `Context.result()` we still want to # save that error on the ctx's state to @@ -2476,12 +2531,12 @@ def mk_context( _caller_info=caller_info, **kwargs, ) - # TODO: we can drop the old placeholder yah? - # ctx._result: int | Any = id(ctx) ctx._result = Unresolved return ctx +# TODO: use the new type-parameters to annotate this in 3.13? +# -[ ] https://peps.python.org/pep-0718/#unknown-types def context(func: Callable) -> Callable: ''' Mark an (async) function as an SC-supervised, inter-`Actor`, @@ -2495,8 +2550,8 @@ def context(func: Callable) -> Callable: # https://github.com/python/mypy/issues/2087#issuecomment-769266912 func._tractor_context_function = True # type: ignore - sig = inspect.signature(func) - params = sig.parameters + sig: inspect.Signature = inspect.signature(func) + params: Mapping = sig.parameters if 'ctx' not in params: raise TypeError( "The first argument to the context function " -- 2.34.1 From 6690968236aba51349fa1462f0b67b08336cdd3a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 14:24:25 -0400 Subject: [PATCH 296/378] Rework and first draft of `.devx._frame_stack.py` Proto-ing a little suite of call-stack-frame annotation-for-scanning sub-systems for the purposes of both, - the `.devx._debug`er and its traceback and frame introspection needs when entering the REPL, - detailed trace-style logging such that we can explicitly report on "which and where" `tractor`'s APIs are used in the "app" code. Deats: - change mod name obvi from `._code` and adjust client mod imports. - using `wrapt` (for perf) implement a `@api_frame` annot decorator which both stashes per-call-stack-frame instances of `CallerInfo` in a table and marks the function such that API endpoints can be easily found via runtime stack scanning despite any internal impl changes. - add a global `_frame2callerinfo_cache: dict[FrameType, CallerInfo]` table for providing the per func-frame info caching. - Re-implement `CallerInfo` to require less (types of) inputs: |_ `_api_func: Callable`, a ref to the (singleton) func def. |_ `_api_frame: FrameType` taken from the `@api_frame` marked `tractor`-API func's runtime call-stack, from which we can determine the app code's `.caller_frame`. |_`_caller_frames_up: int|None` allowing the specific `@api_frame` to determine "how many frames up" the application / calling code is. And, a better set of derived attrs: |_`caller_frame: FrameType` which finds and caches the API-eps calling frame. |_`caller_frame: FrameType` which finds and caches the API-eps calling - add a new attempt at "getting a method ref from its runtime frame" with `get_ns_and_func_from_frame()` using a heuristic that the `CodeType.co_qualname: str` should have a "." in it for methods. - main issue is still that the func-ref lookup will require searching for the method's instance type by name, and that name isn't guaranteed to be defined in any particular ns.. |_rn we try to read it from the `FrameType.f_locals` but that is going to obvi fail any time the method is called in a module where it's type is not also defined/imported. - returns both the ns and the func ref FYI. 
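
As a rough usage sketch of that last helper (hedged: assumes this patch is
applied, a plain module-level function so `co_qualname` has no '.', and
py3.11+ for `co_qualname` itself; `an_app_func()` is just a made-up stand-in):

    import inspect

    from tractor.devx._frame_stack import get_ns_and_func_from_frame


    def an_app_func() -> None:
        frame = inspect.currentframe()
        ns, func = get_ns_and_func_from_frame(frame)
        # plain module-level funcs resolve via the frame's globals;
        # the method branch instead needs the owning type reachable
        # from the parent frame's `f_locals` (see caveat above).
        assert func is an_app_func
        assert ns is frame.f_globals


    an_app_func()
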
--- tractor/_context.py | 4 +- tractor/devx/{_code.py => _frame_stack.py} | 229 +++++++++++---------- 2 files changed, 121 insertions(+), 112 deletions(-) rename tractor/devx/{_code.py => _frame_stack.py} (53%) diff --git a/tractor/_context.py b/tractor/_context.py index 3dcf8151..fe5d6543 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -94,7 +94,7 @@ if TYPE_CHECKING: from ._portal import Portal from ._runtime import Actor from ._ipc import MsgTransport - from .devx._code import ( + from .devx._frame_stack import ( CallerInfo, ) @@ -2513,7 +2513,7 @@ def mk_context( send_chan, recv_chan = trio.open_memory_channel(msg_buffer_size) # TODO: only scan caller-info if log level so high! - from .devx._code import find_caller_info + from .devx._frame_stack import find_caller_info caller_info: CallerInfo|None = find_caller_info() # TODO: when/how do we apply `.limit_plds()` from here? diff --git a/tractor/devx/_code.py b/tractor/devx/_frame_stack.py similarity index 53% rename from tractor/devx/_code.py rename to tractor/devx/_frame_stack.py index 8d55212b..89a9e849 100644 --- a/tractor/devx/_code.py +++ b/tractor/devx/_frame_stack.py @@ -20,11 +20,8 @@ as it pertains to improving the grok-ability of our runtime! ''' from __future__ import annotations +from functools import partial import inspect -# import msgspec -# from pprint import pformat -import textwrap -import traceback from types import ( FrameType, FunctionType, @@ -32,9 +29,8 @@ from types import ( # CodeType, ) from typing import ( - # Any, + Any, Callable, - # TYPE_CHECKING, Type, ) @@ -42,6 +38,7 @@ from tractor.msg import ( pretty_struct, NamespacePath, ) +import wrapt # TODO: yeah, i don't love this and we should prolly just @@ -83,6 +80,31 @@ def get_class_from_frame(fr: FrameType) -> ( return None +def get_ns_and_func_from_frame( + frame: FrameType, +) -> Callable: + ''' + Return the corresponding function object reference from + a `FrameType`, and return it and it's parent namespace `dict`. + + ''' + ns: dict[str, Any] + + # for a method, go up a frame and lookup the name in locals() + if '.' in (qualname := frame.f_code.co_qualname): + cls_name, _, func_name = qualname.partition('.') + ns = frame.f_back.f_locals[cls_name].__dict__ + + else: + func_name: str = frame.f_code.co_name + ns = frame.f_globals + + return ( + ns, + ns[func_name], + ) + + def func_ref_from_frame( frame: FrameType, ) -> Callable: @@ -98,34 +120,63 @@ def func_ref_from_frame( ) -# TODO: move all this into new `.devx._code`! -# -[ ] prolly create a `@runtime_api` dec? -# -[ ] ^- make it capture and/or accept buncha optional -# meta-data like a fancier version of `@pdbp.hideframe`. -# class CallerInfo(pretty_struct.Struct): - rt_fi: inspect.FrameInfo - call_frame: FrameType + # https://docs.python.org/dev/reference/datamodel.html#frame-objects + # https://docs.python.org/dev/library/inspect.html#the-interpreter-stack + _api_frame: FrameType @property - def api_func_ref(self) -> Callable|None: - return func_ref_from_frame(self.rt_fi.frame) + def api_frame(self) -> FrameType: + try: + self._api_frame.clear() + except RuntimeError: + # log.warning( + print( + f'Frame {self._api_frame} for {self.api_func} is still active!' 
+ ) + + return self._api_frame + + _api_func: Callable + + @property + def api_func(self) -> Callable: + return self._api_func + + _caller_frames_up: int|None = 1 + _caller_frame: FrameType|None = None # cached after first stack scan @property def api_nsp(self) -> NamespacePath|None: - func: FunctionType = self.api_func_ref + func: FunctionType = self.api_func if func: return NamespacePath.from_ref(func) return '' @property - def caller_func_ref(self) -> Callable|None: - return func_ref_from_frame(self.call_frame) + def caller_frame(self) -> FrameType: + + # if not already cached, scan up stack explicitly by + # configured count. + if not self._caller_frame: + if self._caller_frames_up: + for _ in range(self._caller_frames_up): + caller_frame: FrameType|None = self.api_frame.f_back + + if not caller_frame: + raise ValueError( + 'No frame exists {self._caller_frames_up} up from\n' + f'{self.api_frame} @ {self.api_nsp}\n' + ) + + self._caller_frame = caller_frame + + return self._caller_frame @property def caller_nsp(self) -> NamespacePath|None: - func: FunctionType = self.caller_func_ref + func: FunctionType = self.api_func if func: return NamespacePath.from_ref(func) @@ -172,108 +223,66 @@ def find_caller_info( call_frame = call_frame.f_back return CallerInfo( - rt_fi=fi, - call_frame=call_frame, + _api_frame=rt_frame, + _api_func=func_ref_from_frame(rt_frame), + _caller_frames_up=go_up_iframes, ) return None -def pformat_boxed_tb( - tb_str: str, - fields_str: str|None = None, - field_prefix: str = ' |_', +_frame2callerinfo_cache: dict[FrameType, CallerInfo] = {} - tb_box_indent: int|None = None, - tb_body_indent: int = 1, -) -> str: - ''' - Create a "boxed" looking traceback string. +# TODO: -[x] move all this into new `.devx._code`! +# -[ ] consider rename to _callstack? +# -[ ] prolly create a `@runtime_api` dec? +# |_ @api_frame seems better? +# -[ ] ^- make it capture and/or accept buncha optional +# meta-data like a fancier version of `@pdbp.hideframe`. +# +def api_frame( + wrapped: Callable|None = None, + *, + caller_frames_up: int = 1, - Useful for emphasizing traceback text content as being an - embedded attribute of some other object (like - a `RemoteActorError` or other boxing remote error shuttle - container). +) -> Callable: - Any other parent/container "fields" can be passed in the - `fields_str` input along with other prefix/indent settings. + # handle the decorator called WITHOUT () case, + # i.e. 
just @api_frame, NOT @api_frame(extra=) + if wrapped is None: + return partial( + api_frame, + caller_frames_up=caller_frames_up, + ) - ''' - if ( - fields_str - and - field_prefix + @wrapt.decorator + async def wrapper( + wrapped: Callable, + instance: object, + args: tuple, + kwargs: dict, ): - fields: str = textwrap.indent( - fields_str, - prefix=field_prefix, - ) - else: - fields = fields_str or '' + # maybe cache the API frame for this call + global _frame2callerinfo_cache + this_frame: FrameType = inspect.currentframe() + api_frame: FrameType = this_frame.f_back - tb_body = tb_str - if tb_body_indent: - tb_body: str = textwrap.indent( - tb_str, - prefix=tb_body_indent * ' ', - ) + if not _frame2callerinfo_cache.get(api_frame): + _frame2callerinfo_cache[api_frame] = CallerInfo( + _api_frame=api_frame, + _api_func=wrapped, + _caller_frames_up=caller_frames_up, + ) - tb_box: str = ( + return wrapped(*args, **kwargs) - # orig - # f' |\n' - # f' ------ - ------\n\n' - # f'{tb_str}\n' - # f' ------ - ------\n' - # f' _|\n' - - f'|\n' - f' ------ - ------\n\n' - # f'{tb_str}\n' - f'{tb_body}' - f' ------ - ------\n' - f'_|\n' - ) - tb_box_indent: str = ( - tb_box_indent - or - 1 - - # (len(field_prefix)) - # ? ^-TODO-^ ? if you wanted another indent level - ) - if tb_box_indent > 0: - tb_box: str = textwrap.indent( - tb_box, - prefix=tb_box_indent * ' ', - ) - - return ( - fields - + - tb_box - ) - - -def pformat_caller_frame( - stack_limit: int = 1, - box_tb: bool = True, -) -> str: - ''' - Capture and return the traceback text content from - `stack_limit` call frames up. - - ''' - tb_str: str = ( - '\n'.join( - traceback.format_stack(limit=stack_limit) - ) - ) - if box_tb: - tb_str: str = pformat_boxed_tb( - tb_str=tb_str, - field_prefix=' ', - indent='', - ) - return tb_str + # annotate the function as a "api function", meaning it is + # a function for which the function above it in the call stack should be + # non-`tractor` code aka "user code". + # + # in the global frame cache for easy lookup from a given + # func-instance + wrapped._call_infos: dict[FrameType, CallerInfo] = _frame2callerinfo_cache + wrapped.__api_func__: bool = True + return wrapper(wrapped) -- 2.34.1 From c929bc15c91380dc9ea39e82529202a20b1ce0df Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 14:53:10 -0400 Subject: [PATCH 297/378] Add `pexpect` to dev deps for testing --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c1064744..c163c7f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ xontrib-vox = "^0.0.1" optional = false [tool.poetry.group.dev.dependencies] pytest = "^8.2.0" +pexpect = "^4.9.0" # only for xonsh as sh.. xontrib-vox = "^0.0.1" -- 2.34.1 From f85314ecabd539f99939102ce9cb810d7630b3c8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 14:53:45 -0400 Subject: [PATCH 298/378] Adjust `._runtime` to report `DebugStatus.req_ctx` - inside the `Actor.cancel()`'s maybe-wait-on-debugger delay, report the full debug request status and it's affiliated lock request IPC ctx. - use the new `.req_ctx.chan.uid` to do the local nursery lookup during channel teardown handling. - another couple log fmt tweaks. 
--- tractor/_runtime.py | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index d28f4906..23c1c6f5 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -644,7 +644,7 @@ class Actor: peers_str: str = '' for uid, chans in self._peers.items(): peers_str += ( - f'|_ uid: {uid}\n' + f'uid: {uid}\n' ) for i, chan in enumerate(chans): peers_str += ( @@ -678,10 +678,12 @@ class Actor: # XXX => YES IT DOES, when i was testing ctl-c # from broken debug TTY locking due to # msg-spec races on application using RunVar... - pdb_user_uid: tuple = pdb_lock.global_actor_in_debug if ( - pdb_user_uid - and local_nursery + (ctx_in_debug := pdb_lock.ctx_in_debug) + and + (pdb_user_uid := ctx_in_debug.chan.uid) + and + local_nursery ): entry: tuple|None = local_nursery._children.get( tuple(pdb_user_uid) @@ -1169,13 +1171,17 @@ class Actor: # kill any debugger request task to avoid deadlock # with the root actor in this tree - dbcs = _debug.DebugStatus.req_cs - if dbcs is not None: + debug_req = _debug.DebugStatus + lock_req_ctx: Context = debug_req.req_ctx + if lock_req_ctx is not None: msg += ( '-> Cancelling active debugger request..\n' - f'|_{_debug.Lock.pformat()}' + f'|_{_debug.Lock.repr()}\n\n' + f'|_{lock_req_ctx}\n\n' ) - dbcs.cancel() + # lock_req_ctx._scope.cancel() + # TODO: wrap this in a method-API.. + debug_req.req_cs.cancel() # self-cancel **all** ongoing RPC tasks await self.cancel_rpc_tasks( @@ -1375,15 +1381,17 @@ class Actor: "IPC channel's " ) rent_chan_repr: str = ( - f'|_{parent_chan}' + f' |_{parent_chan}\n\n' if parent_chan else '' ) log.cancel( - f'Cancelling {descr} {len(tasks)} rpc tasks\n\n' - f'<= `Actor.cancel_rpc_tasks()`: {req_uid}\n' - f' {rent_chan_repr}\n' - # f'{self}\n' + f'Cancelling {descr} RPC tasks\n\n' + f'<= canceller: {req_uid}\n' + f'{rent_chan_repr}' + f'=> cancellee: {self.uid}\n' + f' |_{self}.cancel_rpc_tasks()\n' + f' |_tasks: {len(tasks)}\n' # f'{tasks_str}' ) for ( @@ -1413,7 +1421,7 @@ class Actor: if tasks: log.cancel( 'Waiting for remaining rpc tasks to complete\n' - f'|_{tasks}' + f'|_{tasks_str}' ) await self._ongoing_rpc_tasks.wait() @@ -1466,7 +1474,10 @@ class Actor: assert self._parent_chan, "No parent channel for this actor?" return Portal(self._parent_chan) - def get_chans(self, uid: tuple[str, str]) -> list[Channel]: + def get_chans( + self, + uid: tuple[str, str], + ) -> list[Channel]: ''' Return all IPC channels to the actor with provided `uid`. 
@@ -1626,7 +1637,9 @@ async def async_main( # tranport address bind errors - normally it's # something silly like the wrong socket-address # passed via a config or CLI Bo - entered_debug = await _debug._maybe_enter_pm(oserr) + entered_debug = await _debug._maybe_enter_pm( + oserr, + ) if entered_debug: log.runtime('Exited debug REPL..') raise -- 2.34.1 From c5a0cfc6399530a4b0aecb93c62db02eb9c0120e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 8 May 2024 15:07:34 -0400 Subject: [PATCH 299/378] Rename `.msg.types.Msg` -> `PayloadMsg` --- tractor/msg/types.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 7e10dab0..1b3733cb 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -56,8 +56,7 @@ log = get_logger('tractor.msgspec') PayloadT = TypeVar('PayloadT') -# TODO: PayloadMsg -class Msg( +class PayloadMsg( Struct, Generic[PayloadT], @@ -110,6 +109,10 @@ class Msg( pld: Raw +# TODO: complete rename +Msg = PayloadMsg + + class Aid( Struct, tag=True, -- 2.34.1 From d6ca4771ce7d9e5137db030d2179d3fc395464e3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 9 May 2024 09:37:47 -0400 Subject: [PATCH 300/378] Use `.recv_msg_w_pld()` for final `Portal.result()` Woops, due to a `None` test against the `._final_result`, any actual final `None` result would be received but not acked as such causing a spawning test to hang. Fix it by instead receiving and assigning both a `._final_result_msg: PayloadMsg` and `._final_result_pld`. NB: as mentioned in many recent comments surrounding this API layer, really this whole `Portal`-has-final-result interface/semantics should be entirely removed as should the `ActorNursery.run_in_actor()` API(s). Instead it should all be replaced by a wrapping "high level" API (`tractor.hilevel` ?) which combines a task nursery, `Portal.open_context()` and underlying `Context` APIs + an `outcome.Outcome` to accomplish the same "run a single task in a spawned actor and return it's result"; aka a "one-shot-task-actor". 
--- tractor/_portal.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index 79a9dc5d..700f2fdc 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -47,6 +47,7 @@ from ._ipc import Channel from .log import get_logger from .msg import ( # Error, + PayloadMsg, NamespacePath, Return, ) @@ -98,7 +99,8 @@ class Portal: self.chan = channel # during the portal's lifetime - self._final_result: Any|None = None + self._final_result_pld: Any|None = None + self._final_result_msg: PayloadMsg|None = None # When set to a ``Context`` (when _submit_for_result is called) # it is expected that ``result()`` will be awaited at some @@ -132,7 +134,7 @@ class Portal: 'A pending main result has already been submitted' ) - self._expect_result_ctx = await self.actor.start_remote_task( + self._expect_result_ctx: Context = await self.actor.start_remote_task( self.channel, nsf=NamespacePath(f'{ns}:{func}'), kwargs=kwargs, @@ -163,13 +165,16 @@ class Portal: # expecting a "main" result assert self._expect_result_ctx - if self._final_result is None: - self._final_result: Any = await self._expect_result_ctx._pld_rx.recv_pld( - ctx=self._expect_result_ctx, + if self._final_result_msg is None: + ( + self._final_result_msg, + self._final_result_pld, + ) = await self._expect_result_ctx._pld_rx.recv_msg_w_pld( + ipc=self._expect_result_ctx, expect_msg=Return, ) - return self._final_result + return self._final_result_pld async def _cancel_streams(self): # terminate all locally running async generator -- 2.34.1 From fc075e96c6c1c3e31604ae709e0a18b985ff71ec Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 9 May 2024 15:20:03 -0400 Subject: [PATCH 301/378] Hide some API frames, port to new `._debug` apis - start tossing in `__tracebackhide__`s to various eps which don't need to show in tbs or in the pdb REPL. - port final `._maybe_enter_pm()` to pass a `api_frame`. - start comment-marking up some API eps with `@api_frame` in prep for actually using the new frame-stack tracing. --- tractor/_root.py | 11 ++++++++--- tractor/_spawn.py | 19 ++++++++----------- tractor/_supervise.py | 4 ++-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index de8388d5..77806992 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -18,7 +18,7 @@ Root actor runtime ignition(s). ''' -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager as acm from functools import partial import importlib import logging @@ -60,7 +60,7 @@ _default_lo_addrs: list[tuple[str, int]] = [( logger = log.get_logger('tractor') -@asynccontextmanager +@acm async def open_root_actor( *, @@ -96,6 +96,7 @@ async def open_root_actor( Runtime init entry point for ``tractor``. ''' + __tracebackhide__ = True # TODO: stick this in a `@cm` defined in `devx._debug`? 
# # Override the global debugger hook to make it play nice with @@ -358,7 +359,11 @@ async def open_root_actor( BaseExceptionGroup, ) as err: - entered: bool = await _debug._maybe_enter_pm(err) + import inspect + entered: bool = await _debug._maybe_enter_pm( + err, + api_frame=inspect.currentframe(), + ) if ( not entered diff --git a/tractor/_spawn.py b/tractor/_spawn.py index 06a2bf10..b234099f 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -142,7 +142,9 @@ async def exhaust_portal( ''' __tracebackhide__ = True try: - log.debug(f"Waiting on final result from {actor.uid}") + log.debug( + f'Waiting on final result from {actor.uid}' + ) # XXX: streams should never be reaped here since they should # always be established and shutdown using a context manager api @@ -195,7 +197,10 @@ async def cancel_on_completion( # if this call errors we store the exception for later # in ``errors`` which will be reraised inside # an exception group and we still send out a cancel request - result: Any|Exception = await exhaust_portal(portal, actor) + result: Any|Exception = await exhaust_portal( + portal, + actor, + ) if isinstance(result, Exception): errors[actor.uid]: Exception = result log.cancel( @@ -503,14 +508,6 @@ async def trio_proc( ) ) - # await chan.send({ - # '_parent_main_data': subactor._parent_main_data, - # 'enable_modules': subactor.enable_modules, - # 'reg_addrs': subactor.reg_addrs, - # 'bind_addrs': bind_addrs, - # '_runtime_vars': _runtime_vars, - # }) - # track subactor in current nursery curr_actor: Actor = current_actor() curr_actor._actoruid2nursery[subactor.uid] = actor_nursery @@ -554,8 +551,8 @@ async def trio_proc( # killing the process too early. if proc: log.cancel(f'Hard reap sequence starting for {subactor.uid}') - with trio.CancelScope(shield=True): + with trio.CancelScope(shield=True): # don't clobber an ongoing pdb if cancelled_during_spawn: # Try again to avoid TTY clobbering. diff --git a/tractor/_supervise.py b/tractor/_supervise.py index dc65cc65..59ec728b 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -346,8 +346,6 @@ async def _open_and_supervise_one_cancels_all_nursery( actor: Actor, ) -> typing.AsyncGenerator[ActorNursery, None]: - - # TODO: yay or nay? __tracebackhide__ = True # the collection of errors retreived from spawned sub-actors @@ -519,6 +517,7 @@ async def _open_and_supervise_one_cancels_all_nursery( @acm +# @api_frame async def open_nursery( **kwargs, @@ -538,6 +537,7 @@ async def open_nursery( which cancellation scopes correspond to each spawned subactor set. ''' + __tracebackhide__ = True implicit_runtime: bool = False actor: Actor = current_actor(err_on_no_runtime=False) an: ActorNursery|None = None -- 2.34.1 From 5cb0cc0f0bd218fbcc0ae57be88a63fafde1d5fd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 9 May 2024 16:31:23 -0400 Subject: [PATCH 302/378] Update tests for `PldRx` and `Context` changes Mostly adjustments for the new pld-receiver semantics/shim-layer which results more often in the direct delivery of `RemoteActorError`s from IPC API primitives (like `Portal.result()`) instead of being embedded in an `ExceptionGroup` bundled from an embedded nursery. Tossed usage of the `debug_mode: bool` fixture to a couple problematic tests while i was working on them. Also includes detailed assertion updates to the inter-peer cancellation suite in terms of, - `Context.canceller` state correctly matching the true src actor when expecting a ctxc. 
- any rxed `ContextCancelled` should instance match the `Context._local/remote_error` as should the `.msgdata` and `._ipc_msg`. --- tests/test_advanced_faults.py | 1 + tests/test_cancellation.py | 25 +++++-- tests/test_infected_asyncio.py | 22 +++--- tests/test_inter_peer_cancellation.py | 99 +++++++++++++++++++-------- tests/test_spawning.py | 20 +++--- 5 files changed, 114 insertions(+), 53 deletions(-) diff --git a/tests/test_advanced_faults.py b/tests/test_advanced_faults.py index 45c0aa36..dfaeb68e 100644 --- a/tests/test_advanced_faults.py +++ b/tests/test_advanced_faults.py @@ -97,6 +97,7 @@ def test_ipc_channel_break_during_stream( examples_dir() / 'advanced_faults' / 'ipc_failure_during_stream.py', root=examples_dir(), + consider_namespace_packages=False, ) # by def we expect KBI from user after a simulated "hang diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py index 5fd58fbc..bf41ddd1 100644 --- a/tests/test_cancellation.py +++ b/tests/test_cancellation.py @@ -89,17 +89,30 @@ def test_remote_error(reg_addr, args_err): assert excinfo.value.boxed_type == errtype else: - # the root task will also error on the `.result()` call - # so we expect an error from there AND the child. - with pytest.raises(BaseExceptionGroup) as excinfo: + # the root task will also error on the `Portal.result()` + # call so we expect an error from there AND the child. + # |_ tho seems like on new `trio` this doesn't always + # happen? + with pytest.raises(( + BaseExceptionGroup, + tractor.RemoteActorError, + )) as excinfo: trio.run(main) - # ensure boxed errors - for exc in excinfo.value.exceptions: + # ensure boxed errors are `errtype` + err: BaseException = excinfo.value + if isinstance(err, BaseExceptionGroup): + suberrs: list[BaseException] = err.exceptions + else: + suberrs: list[BaseException] = [err] + + for exc in suberrs: assert exc.boxed_type == errtype -def test_multierror(reg_addr): +def test_multierror( + reg_addr: tuple[str, int], +): ''' Verify we raise a ``BaseExceptionGroup`` out of a nursery where more then one actor errors. diff --git a/tests/test_infected_asyncio.py b/tests/test_infected_asyncio.py index 8d34bef4..45722a63 100644 --- a/tests/test_infected_asyncio.py +++ b/tests/test_infected_asyncio.py @@ -444,6 +444,7 @@ def test_basic_interloop_channel_stream(reg_addr, fan_out): infect_asyncio=True, fan_out=fan_out, ) + # should raise RAE diectly await portal.result() trio.run(main) @@ -461,12 +462,11 @@ def test_trio_error_cancels_intertask_chan(reg_addr): # should trigger remote actor error await portal.result() - with pytest.raises(BaseExceptionGroup) as excinfo: + with pytest.raises(RemoteActorError) as excinfo: trio.run(main) - # ensure boxed errors - for exc in excinfo.value.exceptions: - assert exc.boxed_type == Exception + # ensure boxed error type + excinfo.value.boxed_type == Exception def test_trio_closes_early_and_channel_exits(reg_addr): @@ -477,7 +477,7 @@ def test_trio_closes_early_and_channel_exits(reg_addr): exit_early=True, infect_asyncio=True, ) - # should trigger remote actor error + # should raise RAE diectly await portal.result() # should be a quiet exit on a simple channel exit @@ -492,15 +492,17 @@ def test_aio_errors_and_channel_propagates_and_closes(reg_addr): aio_raise_err=True, infect_asyncio=True, ) - # should trigger remote actor error + # should trigger RAE directly, not an eg. 
await portal.result() - with pytest.raises(BaseExceptionGroup) as excinfo: + with pytest.raises( + # NOTE: bc we directly wait on `Portal.result()` instead + # of capturing it inside the `ActorNursery` machinery. + expected_exception=RemoteActorError, + ) as excinfo: trio.run(main) - # ensure boxed errors - for exc in excinfo.value.exceptions: - assert exc.boxed_type == Exception + excinfo.value.boxed_type == Exception @tractor.context diff --git a/tests/test_inter_peer_cancellation.py b/tests/test_inter_peer_cancellation.py index aa05e3c8..7bf9a2bd 100644 --- a/tests/test_inter_peer_cancellation.py +++ b/tests/test_inter_peer_cancellation.py @@ -55,9 +55,10 @@ from tractor._testing import ( @tractor.context -async def sleep_forever( +async def open_stream_then_sleep_forever( ctx: Context, expect_ctxc: bool = False, + ) -> None: ''' Sync the context, open a stream then just sleep. @@ -67,6 +68,10 @@ async def sleep_forever( ''' try: await ctx.started() + + # NOTE: the below means this child will send a `Stop` + # to it's parent-side task despite that side never + # opening a stream itself. async with ctx.open_stream(): await trio.sleep_forever() @@ -100,7 +105,7 @@ async def error_before_started( ''' async with tractor.wait_for_actor('sleeper') as p2: async with ( - p2.open_context(sleep_forever) as (peer_ctx, first), + p2.open_context(open_stream_then_sleep_forever) as (peer_ctx, first), peer_ctx.open_stream(), ): # NOTE: this WAS inside an @acm body but i factored it @@ -204,9 +209,13 @@ async def stream_ints( @tractor.context async def stream_from_peer( ctx: Context, + debug_mode: bool, peer_name: str = 'sleeper', ) -> None: + # sanity + assert tractor._state.debug_mode() == debug_mode + peer: Portal try: async with ( @@ -240,26 +249,54 @@ async def stream_from_peer( assert msg is not None print(msg) - # NOTE: cancellation of the (sleeper) peer should always - # cause a `ContextCancelled` raise in this streaming - # actor. - except ContextCancelled as ctxc: - ctxerr = ctxc + # NOTE: cancellation of the (sleeper) peer should always cause + # a `ContextCancelled` raise in this streaming actor. + except ContextCancelled as _ctxc: + ctxc = _ctxc - assert peer_ctx._remote_error is ctxerr - assert peer_ctx._remote_error.msgdata == ctxerr.msgdata + # print("TRYING TO ENTER PAUSSE!!!") + # await tractor.pause(shield=True) + re: ContextCancelled = peer_ctx._remote_error - # XXX YES, bc exact same msg instances - assert peer_ctx._remote_error._ipc_msg is ctxerr._ipc_msg + # XXX YES XXX, remote error should be unpacked only once! + assert ( + re + is + peer_ctx.maybe_error + is + ctxc + is + peer_ctx._local_error + ) + # NOTE: these errors should all match! + # ------ - ------ + # XXX [2024-05-03] XXX + # ------ - ------ + # broke this due to a re-raise inside `.msg._ops.drain_to_final_msg()` + # where the `Error()` msg was directly raising the ctxc + # instead of just returning up to the caller inside + # `Context.return()` which would results in a diff instance of + # the same remote error bubbling out above vs what was + # already unpacked and set inside `Context. + assert ( + peer_ctx._remote_error.msgdata + == + ctxc.msgdata + ) + # ^-XXX-^ notice the data is of course the exact same.. so + # the above larger assert makes sense to also always be true! 
- # XXX NO, bc new one always created for property accesss - assert peer_ctx._remote_error.ipc_msg != ctxerr.ipc_msg + # XXX YES XXX, bc should be exact same msg instances + assert peer_ctx._remote_error._ipc_msg is ctxc._ipc_msg + + # XXX NO XXX, bc new one always created for property accesss + assert peer_ctx._remote_error.ipc_msg != ctxc.ipc_msg # the peer ctx is the canceller even though it's canceller # is the "canceller" XD assert peer_name in peer_ctx.canceller - assert "canceller" in ctxerr.canceller + assert "canceller" in ctxc.canceller # caller peer should not be the cancel requester assert not ctx.cancel_called @@ -283,12 +320,13 @@ async def stream_from_peer( # TODO / NOTE `.canceller` won't have been set yet # here because that machinery is inside - # `.open_context().__aexit__()` BUT, if we had + # `Portal.open_context().__aexit__()` BUT, if we had # a way to know immediately (from the last # checkpoint) that cancellation was due to # a remote, we COULD assert this here..see, # https://github.com/goodboy/tractor/issues/368 # + # await tractor.pause() # assert 'canceller' in ctx.canceller # root/parent actor task should NEVER HAVE cancelled us! @@ -392,12 +430,13 @@ def test_peer_canceller( try: async with ( sleeper.open_context( - sleep_forever, + open_stream_then_sleep_forever, expect_ctxc=True, ) as (sleeper_ctx, sent), just_caller.open_context( stream_from_peer, + debug_mode=debug_mode, ) as (caller_ctx, sent), canceller.open_context( @@ -423,10 +462,11 @@ def test_peer_canceller( # should always raise since this root task does # not request the sleeper cancellation ;) - except ContextCancelled as ctxerr: + except ContextCancelled as _ctxc: + ctxc = _ctxc print( 'CAUGHT REMOTE CONTEXT CANCEL\n\n' - f'{ctxerr}\n' + f'{ctxc}\n' ) # canceller and caller peers should not @@ -437,7 +477,7 @@ def test_peer_canceller( # we were not the actor, our peer was assert not sleeper_ctx.cancel_acked - assert ctxerr.canceller[0] == 'canceller' + assert ctxc.canceller[0] == 'canceller' # XXX NOTE XXX: since THIS `ContextCancelled` # HAS NOT YET bubbled up to the @@ -448,7 +488,7 @@ def test_peer_canceller( # CASE_1: error-during-ctxc-handling, if error_during_ctxerr_handling: - raise RuntimeError('Simulated error during teardown') + raise RuntimeError('Simulated RTE re-raise during ctxc handling') # CASE_2: standard teardown inside in `.open_context()` block raise @@ -513,6 +553,9 @@ def test_peer_canceller( # should be cancelled by US. # if error_during_ctxerr_handling: + print(f'loc_err: {_loc_err}\n') + assert isinstance(loc_err, RuntimeError) + # since we do a rte reraise above, the # `.open_context()` error handling should have # raised a local rte, thus the internal @@ -521,9 +564,6 @@ def test_peer_canceller( # a `trio.Cancelled` due to a local # `._scope.cancel()` call. 
assert not sleeper_ctx._scope.cancelled_caught - - assert isinstance(loc_err, RuntimeError) - print(f'_loc_err: {_loc_err}\n') # assert sleeper_ctx._local_error is _loc_err # assert sleeper_ctx._local_error is _loc_err assert not ( @@ -560,9 +600,12 @@ def test_peer_canceller( else: # the other 2 ctxs assert ( - re.canceller - == - canceller.channel.uid + isinstance(re, ContextCancelled) + and ( + re.canceller + == + canceller.channel.uid + ) ) # since the sleeper errors while handling a @@ -811,8 +854,7 @@ async def serve_subactors( async with open_nursery() as an: # sanity - if debug_mode: - assert tractor._state.debug_mode() + assert tractor._state.debug_mode() == debug_mode await ctx.started(peer_name) async with ctx.open_stream() as ipc: @@ -1091,7 +1133,6 @@ def test_peer_spawns_and_cancels_service_subactor( '-> root checking `client_ctx.result()`,\n' f'-> checking that sub-spawn {peer_name} is down\n' ) - # else: try: res = await client_ctx.result(hide_tb=False) diff --git a/tests/test_spawning.py b/tests/test_spawning.py index 5995ed2d..99ec9abc 100644 --- a/tests/test_spawning.py +++ b/tests/test_spawning.py @@ -2,7 +2,9 @@ Spawning basics """ -from typing import Optional +from typing import ( + Any, +) import pytest import trio @@ -25,13 +27,11 @@ async def spawn( async with tractor.open_root_actor( arbiter_addr=reg_addr, ): - actor = tractor.current_actor() assert actor.is_arbiter == is_arbiter data = data_to_pass_down if actor.is_arbiter: - async with tractor.open_nursery() as nursery: # forks here @@ -95,7 +95,9 @@ async def test_movie_theatre_convo(start_method): await portal.cancel_actor() -async def cellar_door(return_value: Optional[str]): +async def cellar_door( + return_value: str|None, +): return return_value @@ -105,16 +107,18 @@ async def cellar_door(return_value: Optional[str]): ) @tractor_test async def test_most_beautiful_word( - start_method, - return_value + start_method: str, + return_value: Any, + debug_mode: bool, ): ''' The main ``tractor`` routine. ''' with trio.fail_after(1): - async with tractor.open_nursery() as n: - + async with tractor.open_nursery( + debug_mode=debug_mode, + ) as n: portal = await n.run_in_actor( cellar_door, return_value=return_value, -- 2.34.1 From d2dee87b369a445dc753bd11c4817ea1ca262987 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 9 May 2024 16:51:51 -0400 Subject: [PATCH 303/378] Modernize streaming example script - add typing, - apply multi-line call style, - use 'cancel' log level, - enable debug mode. --- examples/full_fledged_streaming_service.py | 45 +++++++++++++++------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/examples/full_fledged_streaming_service.py b/examples/full_fledged_streaming_service.py index c93df242..be4c372e 100644 --- a/examples/full_fledged_streaming_service.py +++ b/examples/full_fledged_streaming_service.py @@ -1,6 +1,11 @@ import time import trio import tractor +from tractor import ( + ActorNursery, + MsgStream, + Portal, +) # this is the first 2 actors, streamer_1 and streamer_2 @@ -12,14 +17,18 @@ async def stream_data(seed): # this is the third actor; the aggregator async def aggregate(seed): - """Ensure that the two streams we receive match but only stream + ''' + Ensure that the two streams we receive match but only stream a single set of values to the parent. 
- """ - async with tractor.open_nursery() as nursery: - portals = [] + + ''' + an: ActorNursery + async with tractor.open_nursery() as an: + portals: list[Portal] = [] for i in range(1, 3): - # fork point - portal = await nursery.start_actor( + + # fork/spawn call + portal = await an.start_actor( name=f'streamer_{i}', enable_modules=[__name__], ) @@ -43,7 +52,11 @@ async def aggregate(seed): async with trio.open_nursery() as n: for portal in portals: - n.start_soon(push_to_chan, portal, send_chan.clone()) + n.start_soon( + push_to_chan, + portal, + send_chan.clone(), + ) # close this local task's reference to send side await send_chan.aclose() @@ -60,7 +73,7 @@ async def aggregate(seed): print("FINISHED ITERATING in aggregator") - await nursery.cancel() + await an.cancel() print("WAITING on `ActorNursery` to finish") print("AGGREGATOR COMPLETE!") @@ -75,18 +88,21 @@ async def main() -> list[int]: ''' # yes, a nursery which spawns `trio`-"actors" B) - nursery: tractor.ActorNursery - async with tractor.open_nursery() as nursery: + an: ActorNursery + async with tractor.open_nursery( + loglevel='cancel', + debug_mode=True, + ) as an: seed = int(1e3) pre_start = time.time() - portal: tractor.Portal = await nursery.start_actor( + portal: Portal = await an.start_actor( name='aggregator', enable_modules=[__name__], ) - stream: tractor.MsgStream + stream: MsgStream async with portal.open_stream_from( aggregate, seed=seed, @@ -95,11 +111,12 @@ async def main() -> list[int]: start = time.time() # the portal call returns exactly what you'd expect # as if the remote "aggregate" function was called locally - result_stream = [] + result_stream: list[int] = [] async for value in stream: result_stream.append(value) - await portal.cancel_actor() + cancelled: bool = await portal.cancel_actor() + assert cancelled print(f"STREAM TIME = {time.time() - start}") print(f"STREAM + SPAWN TIME = {time.time() - pre_start}") -- 2.34.1 From 236083b6e41a98414f1b8c016742fa5a29897f96 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 10 May 2024 13:15:45 -0400 Subject: [PATCH 304/378] Rename `.msg.types.Msg` -> `PayloadMsg` --- tractor/msg/__init__.py | 2 +- tractor/msg/_codec.py | 2 +- tractor/msg/types.py | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tractor/msg/__init__.py b/tractor/msg/__init__.py index 13739cdb..44586f2d 100644 --- a/tractor/msg/__init__.py +++ b/tractor/msg/__init__.py @@ -44,7 +44,7 @@ from ._codec import ( # ) from .types import ( - Msg as Msg, + PayloadMsg as PayloadMsg, Aid as Aid, SpawnSpec as SpawnSpec, diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 901c0da1..6ba23b78 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -432,7 +432,7 @@ class MsgCodec(Struct): # ) -> Any|Struct: - # msg: Msg = codec.dec.decode(msg) + # msg: PayloadMsg = codec.dec.decode(msg) # payload_tag: str = msg.header.payload_tag # payload_dec: msgpack.Decoder = codec._payload_decs[payload_tag] # return payload_dec.decode(msg.pld) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 1b3733cb..f8205c23 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -302,7 +302,7 @@ class StartAck( class Started( - Msg, + PayloadMsg, Generic[PayloadT], ): ''' @@ -316,12 +316,12 @@ class Started( # TODO: instead of using our existing `Start` # for this (as we did with the original `{'cmd': ..}` style) -# class Cancel(Msg): +# class Cancel: # cid: str class Yield( - Msg, + PayloadMsg, Generic[PayloadT], ): ''' @@ -348,7 +348,7 @@ 
class Stop( # TODO: is `Result` or `Out[come]` a better name? class Return( - Msg, + PayloadMsg, Generic[PayloadT], ): ''' @@ -360,7 +360,7 @@ class Return( class CancelAck( - Msg, + PayloadMsg, Generic[PayloadT], ): ''' @@ -466,14 +466,14 @@ def from_dict_msg( # TODO: should be make a msg version of `ContextCancelled?` # and/or with a scope field or a full `ActorCancelled`? -# class Cancelled(Msg): +# class Cancelled(MsgType): # cid: str # TODO what about overruns? -# class Overrun(Msg): +# class Overrun(MsgType): # cid: str -_runtime_msgs: list[Msg] = [ +_runtime_msgs: list[Struct] = [ # identity handshake on first IPC `Channel` contact. Aid, @@ -499,9 +499,9 @@ _runtime_msgs: list[Msg] = [ ] # the no-outcome-yet IAC (inter-actor-communication) sub-set which -# can be `Msg.pld` payload field type-limited by application code +# can be `PayloadMsg.pld` payload field type-limited by application code # using `apply_codec()` and `limit_msg_spec()`. -_payload_msgs: list[Msg] = [ +_payload_msgs: list[PayloadMsg] = [ # first from `Context.started()` Started, @@ -544,8 +544,8 @@ def mk_msg_spec( ] = 'indexed_generics', ) -> tuple[ - Union[Type[Msg]], - list[Type[Msg]], + Union[MsgType], + list[MsgType], ]: ''' Create a payload-(data-)type-parameterized IPC message specification. @@ -557,7 +557,7 @@ def mk_msg_spec( determined by the input `payload_type_union: Union[Type]`. ''' - submsg_types: list[Type[Msg]] = Msg.__subclasses__() + submsg_types: list[MsgType] = Msg.__subclasses__() bases: tuple = ( # XXX NOTE XXX the below generic-parameterization seems to # be THE ONLY way to get this to work correctly in terms -- 2.34.1 From 31de5f6648a506294d6d307ab95cf7c2468e9b9a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 14 May 2024 11:39:04 -0400 Subject: [PATCH 305/378] Always release debug request from `._post_mortem()` Since obviously the thread is likely expected to halt and raise after the REPL session exits; this was a regression from the prior impl. The main reason for this is that otherwise the request task will never unblock if the user steps through the crashed task using 'next' since the `.do_next()` handler doesn't by default release the request since in the `.pause()` case this would end the session too early. Other, - toss in draft `Pdb.user_exception()`, though doesn't seem to ever trigger? - only release `Lock._debug_lock` when already locked. --- tractor/devx/_debug.py | 43 +++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index da322407..a789c6ce 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -249,7 +249,10 @@ class Lock: message: str = 'TTY lock not held by any child\n' except RuntimeError as rte: - message: str = 'TTY lock FAILED to release for child??\n' + message: str = ( + 'TTY lock FAILED to release for child??\n' + f'{current_task()}\n' + ) log.exception(message) # uhhh makes no sense but been seeing the non-owner @@ -755,6 +758,16 @@ class PdbREPL(pdbp.Pdb): status = DebugStatus + # NOTE: see details in stdlib's `bdb.py` + def user_exception(self, frame, exc_info): + ''' + Called when we stop on an exception. + ''' + log.warning( + 'Exception during REPL sesh\n\n' + f'{frame}\n\n' + f'{exc_info}\n\n' + ) # def preloop(self): # print('IN PRELOOP') @@ -780,7 +793,11 @@ class PdbREPL(pdbp.Pdb): # NOTE: for subactors the stdio lock is released via the # allocated RPC locker task, so for root we have to do it # manually. 
- if is_root_process(): + if ( + is_root_process() + and + Lock._debug_lock.locked() + ): Lock.release() def set_quit(self): @@ -791,7 +808,11 @@ class PdbREPL(pdbp.Pdb): cancel_req_task=False, ) - if is_root_process(): + if ( + is_root_process() + and + Lock._debug_lock.locked() + ): Lock.release() # TODO: special handling where we just want the next LOC and @@ -803,7 +824,7 @@ class PdbREPL(pdbp.Pdb): # try: # super().set_next(frame) # finally: - # Lock.release() + # pdbp.set_trace() # XXX NOTE: we only override this because apparently the stdlib pdb # bois likes to touch the SIGINT handler as much as i like to touch @@ -1251,7 +1272,7 @@ def shield_sigint_handler( # child actor that has locked the debugger elif not is_root_process(): - log.warning( + log.debug( f'Subactor {actor.uid} handling SIGINT\n\n' f'{Lock.repr()}\n' ) @@ -1484,8 +1505,11 @@ async def _pause( ): # re-entrant root process already has it: noop. log.warning( - f'{task.name}@{actor.uid} already has TTY lock\n' - f'ignoring..' + f'This root actor task is already within an active REPL session\n' + f'Ignoring this re-entered `tractor.pause()`\n' + f'task: {task.name}\n' + f'REPL: {Lock.repl}\n' + # TODO: use `._frame_stack` scanner to find the @api_frame ) await trio.lowlevel.checkpoint() return @@ -1609,6 +1633,7 @@ async def _pause( log.exception( 'Failed to engage debugger via `_pause()` ??\n' ) + mk_pdb().set_trace() DebugStatus.release() # sanity checks for ^ on request/status teardown @@ -1926,6 +1951,10 @@ def _post_mortem( # frame=None, traceback=tb, ) + # Since we presume the post-mortem was enaged to a task-ending + # error, we MUST release the local REPL request so that not other + # local task nor the root remains blocked! + DebugStatus.release() async def post_mortem( -- 2.34.1 From b23780c102c89d369285026c8c625e17cdbe2634 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 14 May 2024 15:22:13 -0400 Subject: [PATCH 306/378] Make `request_root_stdio_lock()` post-mortem-able Finally got this working so that if/when an internal bug is introduced to this request task-func, we can actually REPL-debug the lock request task itself B) As in, if the subactor's lock request task internally errors we, - ensure the task always terminates (by calling `DebugStatus.release()`) and explicitly reports (via a `log.exception()`) the internal error. - capture the error instance and set as a new `DebugStatus.req_err` and always check for it on final teardown - in which case we also, - ensure it's reraised from a new `DebugRequestError`. - unhide the stack frames for `_pause()`, `_enter_repl_sync()` so that the dev can upward inspect the `_pause()` call stack sanely. Supporting internal impl changes, - add `DebugStatus.cancel()` and `.req_err`. - don't ever cancel the request task from `PdbREPL.set_[continue/quit]()` only when there's some internal error that would likely result in a hang and stale lock state with the root. - only release the root's lock when the current ask is also the owner (avoids bad release errors). - also show internal `._pause()`-related frames on any `repl_err`. Other temp-dev-tweaks, - make pld-dec change log msgs info level again while solving this final context-vars race stuff.. - drop the debug pld-dec instance match asserts for now since the problem is already caught (and now debug-able B) by an attr-error on the decoded-as-`dict` started msg, and instead add in a `log.exception()` trace to see which task is triggering the case where the debug `MsgDec` isn't set correctly vs. 
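
Here is a rough sketch of the capture-release-and-rewrap flow described
above; the `Status` class and `do_request` callable are hypothetical
stand-ins for the real `DebugStatus`/IPC-request machinery, while only
the `DebugRequestError` type (and its use as a wrapper) mirrors the
actual change:

    import trio

    class DebugRequestError(RuntimeError):
        '''
        Failed to request stdio lock from root actor!

        '''

    class Status:
        '''
        Hypothetical stand-in for the real `DebugStatus`.

        '''
        req_err: BaseException|None = None
        repl_release: trio.Event|None = None

        @classmethod
        def release(cls) -> None:
            # ALWAYS unblock any local task waiting on the REPL session
            if cls.repl_release is not None:
                cls.repl_release.set()

    async def request_stdio_lock(do_request) -> None:
        Status.repl_release = trio.Event()
        try:
            await do_request()
        except trio.Cancelled:
            # cancellation is not an "internal error"; propagate as-is
            # but still terminate the request dialog.
            Status.release()
            raise
        except BaseException as req_err:
            # stash for teardown-time sanity checks / frame un-hiding
            Status.req_err = req_err
            # ensure the request task terminates and the waiter unblocks
            Status.release()
            raise DebugRequestError(
                'Failed to lock stdio from subactor IPC ctx!'
            ) from req_err

    async def _demo():
        async def broken_request():
            raise RuntimeError('simulated internal bug')

        try:
            await request_stdio_lock(broken_request)
        except DebugRequestError as err:
            assert isinstance(err.__cause__, RuntimeError)
            assert Status.req_err is err.__cause__
            assert Status.repl_release.is_set()

    if __name__ == '__main__':
        trio.run(_demo)

I.e. cancellation still propagates untouched while any internal bug
both unblocks the local REPL waiter and re-raises as
a `DebugRequestError` chained from the original error.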
when we think it's being applied. --- tractor/devx/_debug.py | 341 +++++++++++++++++++++++++---------------- 1 file changed, 207 insertions(+), 134 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index a789c6ce..1e82122c 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -234,49 +234,26 @@ class Lock: cls, force: bool = False, ): - lock: trio.StrictFIFOLock = cls._debug_lock try: - if lock.locked(): + lock: trio.StrictFIFOLock = cls._debug_lock + owner: Task = lock.statistics().owner + if ( + lock.locked() + and + owner is current_task() + # ^-NOTE-^ if not will raise a RTE.. + ): if not DebugStatus.is_main_trio_thread(): trio.from_thread.run_sync( cls._debug_lock.release ) else: cls._debug_lock.release() + message: str = 'TTY lock released for child\n' - message: str = 'TTY lock released for child\n' else: message: str = 'TTY lock not held by any child\n' - except RuntimeError as rte: - message: str = ( - 'TTY lock FAILED to release for child??\n' - f'{current_task()}\n' - ) - log.exception(message) - - # uhhh makes no sense but been seeing the non-owner - # release error even though this is definitely the task - # that locked? - owner = cls._debug_lock.statistics().owner - # if ( - # owner - # and - # cls.remote_task_in_debug is None - # ): - # raise RuntimeError( - # 'Stale `Lock` detected, no remote task active!?\n' - # f'|_{owner}\n' - # # f'{cls}' - # ) from rte - - if owner: - raise rte - - # OW suppress, can't member why tho .. XD - # something somethin corrupts a cancel-scope - # somewhere.. - finally: # IFF there are no more requesting tasks queued up fire, the # "tty-unlocked" event thereby alerting any monitors of the lock that @@ -518,11 +495,23 @@ async def lock_tty_for_child( locked=False, ) - except BaseException: - log.exception( - 'Errored during root TTY-lock dialog?\n' - 'Forcing release since an internal error caused this!\n' + except BaseException as req_err: + message: str = ( + 'Forcing `Lock.release()` since likely an internal error!\n' ) + if isinstance(req_err, trio.Cancelled): + log.cancel( + 'Cancelled during root TTY-lock dialog?\n' + + + message + ) + else: + log.exception( + 'Errored during root TTY-lock dialog?\n' + + + message + ) + Lock.release(force=True) raise @@ -555,6 +544,7 @@ class DebugStatus: repl_release: trio.Event|None = None req_finished: trio.Event|None = None lock_status: LockStatus|None = None + req_err: BaseException|None = None _orig_sigint_handler: Callable|None = None _trio_handler: ( @@ -693,28 +683,37 @@ class DebugStatus: # is not threading.main_thread() # ) + @classmethod + def cancel(cls) -> bool: + if (req_cs := cls.req_cs): + req_cs.cancel() + return True + + return False + @classmethod @pdbp.hideframe def release( cls, - cancel_req_task: bool = True, + cancel_req_task: bool = False, ): + repl_release: trio.Event = cls.repl_release try: # sometimes the task might already be terminated in # which case this call will raise an RTE? 
- if cls.repl_release is not None: - cls.repl_release.set() + if repl_release is not None: + repl_release.set() finally: # if req_ctx := cls.req_ctx: # req_ctx._scope.cancel() - - if ( - cancel_req_task - and - (req_cs := cls.req_cs) - ): - req_cs.cancel() + if cancel_req_task: + cancelled: bool = cls.cancel() + if not cancelled: + log.warning( + 'Failed to cancel request task!?\n' + f'{cls.repl_task}\n' + ) # restore original sigint handler cls.unshield_sigint() @@ -759,16 +758,19 @@ class PdbREPL(pdbp.Pdb): status = DebugStatus # NOTE: see details in stdlib's `bdb.py` - def user_exception(self, frame, exc_info): - ''' - Called when we stop on an exception. - ''' - log.warning( - 'Exception during REPL sesh\n\n' - f'{frame}\n\n' - f'{exc_info}\n\n' - ) + # def user_exception(self, frame, exc_info): + # ''' + # Called when we stop on an exception. + # ''' + # log.warning( + # 'Exception during REPL sesh\n\n' + # f'{frame}\n\n' + # f'{exc_info}\n\n' + # ) + # NOTE: this actually hooks but i don't see anyway to detect + # if an error was caught.. this is why currently we just always + # call `DebugStatus.release` inside `_post_mortem()`. # def preloop(self): # print('IN PRELOOP') # super().preloop() @@ -804,10 +806,7 @@ class PdbREPL(pdbp.Pdb): try: super().set_quit() finally: - DebugStatus.release( - cancel_req_task=False, - ) - + DebugStatus.release() if ( is_root_process() and @@ -863,7 +862,6 @@ def apply_debug_pldec() -> _codec.MsgCodec: (only in the current task). ''' - from tractor.msg import ( _ops as msgops, ) @@ -874,8 +872,12 @@ def apply_debug_pldec() -> _codec.MsgCodec: with msgops.limit_plds( spec=__pld_spec__, ) as debug_dec: - assert debug_dec is msgops.current_pldrx().pld_dec - log.runtime( + assert ( + debug_dec + is + msgops.current_pldrx().pld_dec + ) + log.info( 'Applied `.devx._debug` pld-spec\n\n' f'{debug_dec}\n' ) @@ -887,11 +889,12 @@ def apply_debug_pldec() -> _codec.MsgCodec: and plrx.pld_dec is orig_pldec ) - log.runtime( + log.info( 'Reverted to previous pld-spec\n\n' f'{orig_pldec}\n' ) + async def request_root_stdio_lock( actor_uid: tuple[str, str], task_uid: tuple[str, int], @@ -911,6 +914,10 @@ async def request_root_stdio_lock( entering the REPL at the same time. ''' + + log.pdb( + 'Initing stdio-lock request task with root actor' + ) # TODO: likely we can implement this mutex more generally as # a `._sync.Lock`? # -[ ] simply add the wrapping needed for the debugger specifics? @@ -923,6 +930,8 @@ async def request_root_stdio_lock( DebugStatus.req_finished = trio.Event() try: from tractor._discovery import get_root + from tractor.msg import _ops as msgops + debug_dec: msgops.MsgDec with ( # NOTE: we need this to ensure that this task exits # BEFORE the REPl instance raises an error like @@ -953,12 +962,13 @@ async def request_root_stdio_lock( # apply_debug_pldec() as debug_dec, ): - log.critical( - 'Request cancel-scope is:\n\n' - f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' - - ) + # XXX: was orig for debugging cs stack corruption.. + # log.info( + # 'Request cancel-scope is:\n\n' + # f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' + # ) DebugStatus.req_cs = req_cs + req_ctx: Context|None = None try: # TODO: merge into single async with ? 
async with get_root() as portal: @@ -966,31 +976,37 @@ async def request_root_stdio_lock( async with portal.open_context( lock_tty_for_child, subactor_task_uid=task_uid, - ) as (ctx, status): + ) as (req_ctx, status): - DebugStatus.req_ctx = ctx + DebugStatus.req_ctx = req_ctx + + # sanity checks on pld-spec limit state + assert debug_dec + # curr_pldrx: msgops.PldRx = msgops.current_pldrx() + # assert ( + # curr_pldrx.pld_dec is debug_dec + # ) - from tractor.msg import ( - _ops as msgops, - ) - assert ( - msgops.current_pldrx().pld_dec is debug_dec - ) log.debug( 'Subactor locked TTY with msg\n\n' f'{status}\n' ) # mk_pdb().set_trace() - assert status.subactor_uid == actor_uid - assert status.cid + try: + assert status.subactor_uid == actor_uid + assert status.cid + except AttributeError: + log.exception('failed pldspec asserts!') + raise # set last rxed lock dialog status. DebugStatus.lock_status = status - async with ctx.open_stream() as stream: + async with req_ctx.open_stream() as stream: + assert DebugStatus.repl_release - task_status.started(ctx) + task_status.started(req_ctx) # wait for local task to exit its # `PdbREPL.interaction()`, call @@ -1006,25 +1022,25 @@ async def request_root_stdio_lock( # sync with child-side root locker task # completion - status: LockStatus = await ctx.result() + status: LockStatus = await req_ctx.result() assert not status.locked DebugStatus.lock_status = status log.pdb( 'TTY lock was released for subactor with msg\n\n' f'{status}\n\n' - f'Exitting {ctx.side!r}-side of locking ctx' + f'Exitting {req_ctx.side!r}-side of locking req_ctx' ) except ( tractor.ContextCancelled, trio.Cancelled, ): - log.exception( - 'Debug lock request CANCELLED?\n\n' - f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' - f'{pformat_cs(ctx._scope, var_name="ctx._scope")}\n\n' - f'{ctx}' + log.cancel( + 'Debug lock request was CANCELLED?\n\n' + f'{req_ctx}\n' + # f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' + # f'{pformat_cs(req_ctx._scope, var_name="req_ctx._scope")}\n\n' ) raise @@ -1033,11 +1049,11 @@ async def request_root_stdio_lock( ): log.exception( 'Failed during root TTY-lock dialog?\n' - f'{ctx}\n' + f'{req_ctx}\n' f'Cancelling IPC ctx!\n' ) - await ctx.cancel() + await req_ctx.cancel() raise @@ -1047,13 +1063,26 @@ async def request_root_stdio_lock( ): log.cancel( 'Debug lock request CANCELLED?\n' - f'{ctx}\n' + f'{req_ctx}\n' ) raise - except BaseException: - log.exception('Errored during root TTY-lock dialog?') - raise + except BaseException as req_err: + # log.error('Failed to request root stdio-lock?') + DebugStatus.req_err = req_err + DebugStatus.release() + + # TODO: how to dev a test that ensures we actually drop + # into THIS internal frame on any internal error in the above + # code? + # -[ ] eg. on failed pld_dec assert above we should be able + # to REPL pm it. + # -[ ]FURTHER, after we 'continue', we should be able to + # ctl-c out of the currently hanging task! + raise DebugRequestError( + 'Failed to lock stdio from subactor IPC ctx!\n\n' + f'req_ctx: {req_ctx}\n' + ) from req_err finally: log.debug('Exiting debugger TTY lock request func from child') @@ -1369,6 +1398,13 @@ def shield_sigint_handler( _pause_msg: str = 'Attaching to pdb REPL in actor' +class DebugRequestError(RuntimeError): + ''' + Failed to request stdio lock from root actor! 
+ + ''' + + async def _pause( debug_func: Callable|None, @@ -1480,15 +1516,18 @@ async def _pause( raise except BaseException: + __tracebackhide__: bool = False log.exception( 'Failed to invoke internal `debug_func = ' f'{debug_func.func.__name__}`\n' ) # NOTE: OW this is ONLY called from the # `.set_continue/next` hooks! - DebugStatus.release() + DebugStatus.release(cancel_req_task=True) + raise + repl_err: BaseException|None = None try: if is_root_process(): @@ -1584,43 +1623,45 @@ async def _pause( # actor._service_n.cancel_scope.shield = shield # ``` # but not entirely sure if that's a sane way to implement it? - try: - # NOTE spawn the stdio locker request task inside the - # current `Context._scope_nursery` to entsure that - # the request never can outlive the task's (parent) - # lifetime. - curr_ctx: Context = current_ipc_ctx() - # TODO: see `_errors_relayed_via_ipc()` where we - # should dynamically open a `debug_tn` for use here, - # BUT it needs to be outside the normal error - # catching and `_maybe_enter_debugger()` call! - # ctx: Context = await curr_ctx._debug_tn.start( - ctx: Context = await actor._service_n.start( - request_root_stdio_lock, - actor.uid, - (task.name, id(task)), # task uuid (effectively) - ) - # our locker task should be the one in ctx - # with the root actor - assert ( - ctx - is - DebugStatus.req_ctx - is not - curr_ctx - ) - # enter REPL - _enter_repl_sync(debug_func) + # NOTE currently we spawn the lock request task inside this + # subactor's global `Actor._service_n` so that the + # lifetime of the lock-request can outlive the current + # `._pause()` scope while the user steps through their + # application code and when they finally exit the + # session, via 'continue' or 'quit' cmds, the `PdbREPL` + # will manually call `DebugStatus.release()` to release + # the lock session with the root actor. + # + # TODO: ideally we can add a tighter scope for this + # request task likely by conditionally opening a "debug + # nursery" inside `_errors_relayed_via_ipc()`, see the + # todo in tht module, but + # -[ ] it needs to be outside the normal crash handling + # `_maybe_enter_debugger()` block-call. + # -[ ] we probably only need to allocate the nursery when + # we detect the runtime is already in debug mode. + # + # ctx: Context = await curr_ctx._debug_tn.start( + req_ctx: Context = await actor._service_n.start( + request_root_stdio_lock, + actor.uid, + (task.name, id(task)), # task uuid (effectively) + ) + # XXX sanity, our locker task should be the one which + # entered a new IPC ctx with the root actor, NOT the one + # that exists around the task calling into `._pause()`. + curr_ctx: Context = current_ipc_ctx() + assert ( + req_ctx + is + DebugStatus.req_ctx + is not + curr_ctx + ) - except RuntimeError: - if actor._cancel_called: - # service nursery won't be usable and we - # don't want to lock up the root either way since - # we're in (the midst of) cancellation. - return - - raise + # enter REPL + _enter_repl_sync(debug_func) # TODO: prolly factor this plus the similar block from # `_enter_repl_sync()` into a common @cm? @@ -1629,13 +1670,31 @@ async def _pause( log.devx( 'REPL for pdb was quit!\n' ) + + # when the actor is mid-runtime cancellation the + # `Actor._service_n` might get closed before we can spawn + # the request task, so just ignore expected RTE. 
+ elif ( + isinstance(repl_err, RuntimeError) + and + actor._cancel_called + ): + # service nursery won't be usable and we + # don't want to lock up the root either way since + # we're in (the midst of) cancellation. + log.warning( + 'Service nursery likely closed due to actor-runtime cancellation..\n' + 'Ignoring failed debugger lock request task spawn..\n' + ) + return + else: log.exception( 'Failed to engage debugger via `_pause()` ??\n' ) - mk_pdb().set_trace() - DebugStatus.release() + DebugStatus.release(cancel_req_task=True) + # sanity checks for ^ on request/status teardown assert DebugStatus.repl is None assert DebugStatus.repl_task is None @@ -1645,6 +1704,16 @@ async def _pause( raise + finally: + # always show frame when request fails due to internal + # failure in the above code (including an `BdbQuit`). + if ( + DebugStatus.req_err + or + repl_err + ): + __tracebackhide__: bool = False + def _set_trace( repl: PdbREPL, # passed by `_pause()` @@ -1703,7 +1772,7 @@ async def pause( https://en.wikipedia.org/wiki/Breakpoint ''' - __tracebackhide__: bool = True + __tracebackhide__: bool = hide_tb # always start 1 level up from THIS in user code since normally # `tractor.pause()` is called explicitly by use-app code thus @@ -1885,12 +1954,15 @@ def pause_from_sync( # NOTE prefer a new "pause" semantic since it better describes # "pausing the actor's runtime" for this particular # paralell task to do debugging in a REPL. -async def breakpoint(**kwargs): +async def breakpoint( + hide_tb: bool = True, + **kwargs, +): log.warning( '`tractor.breakpoint()` is deprecated!\n' 'Please use `tractor.pause()` instead!\n' ) - __tracebackhide__: bool = True + __tracebackhide__: bool = hide_tb await pause( api_frame=inspect.currentframe(), **kwargs, @@ -1951,6 +2023,7 @@ def _post_mortem( # frame=None, traceback=tb, ) + # XXX NOTE XXX: absolutely required to avoid hangs! # Since we presume the post-mortem was enaged to a task-ending # error, we MUST release the local REPL request so that not other # local task nor the root remains blocked! -- 2.34.1 From d93135acd813c8e467938afcbb535ccdd614947e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 15 May 2024 09:36:22 -0400 Subject: [PATCH 307/378] Include truncated `id(trio.Task)` for task info in log header --- tractor/log.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index f7d6a32e..78611f2e 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -202,8 +202,19 @@ class StackLevelAdapter(LoggerAdapter): ) +def pformat_task_uid(): + ''' + Return `str`-ified unique for a `trio.Task` via a combo of its + `.name: str` and `id()` truncated output. + + ''' + task: trio.Task = trio.lowlevel.current_task() + tid: str = str(id(task)) + return f'{task.name}[{tid[:6]}]' + + _conc_name_getters = { - 'task': lambda: trio.lowlevel.current_task().name, + 'task': pformat_task_uid, 'actor': lambda: current_actor(), 'actor_name': lambda: current_actor().name, 'actor_uid': lambda: current_actor().uid[1][:6], @@ -211,7 +222,10 @@ _conc_name_getters = { class ActorContextInfo(Mapping): - "Dyanmic lookup for local actor and task names" + ''' + Dyanmic lookup for local actor and task names. 
+ + ''' _context_keys = ( 'task', 'actor', -- 2.34.1 From 262a0e36c6f073f49d27482db64de5adac1d9ded Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 14:34:50 -0400 Subject: [PATCH 308/378] Allocate a `PldRx` per `Context`, new pld-spec API Since the state mgmt becomes quite messy with multiple sub-tasks inside an IPC ctx, AND bc generally speaking the payload-type-spec should map 1-to-1 with the `Context`, it doesn't make a lot of sense to be using `ContextVar`s to modify the `Context.pld_rx: PldRx` instance. Instead, always allocate a full instance inside `mk_context()` with the default `.pld_rx: PldRx` set to use the `msg._ops._def_any_pldec: MsgDec` In support, simplify the `.msg._ops` impl and APIs: - drop `_ctxvar_PldRx`, `_def_pld_rx` and `current_pldrx()`. - rename `PldRx._pldec` -> `._pld_dec`. - rename the unused `PldRx.apply_to_ipc()` -> `.wraps_ipc()`. - add a required `PldRx._ctx: Context` attr since it is needed internally in some meths and each pld-rx now maps to a specific ctx. - modify all recv methods to accept a `ipc: Context|MsgStream` (instead of a `ctx` arg) since both have a ref to the same `._rx_chan` and there are only a couple spots (in `.dec_msg()`) where we need the `ctx` explicitly (which can now be easily accessed via a new `MsgStream.ctx` property, see below). - always show the `.dec_msg()` frame in tbs if there's a reference error when calling `_raise_from_unexpected_msg()` in the fallthrough case. - implement `limit_plds()` as light wrapper around getting the `current_ipc_ctx()` and mutating its `MsgDec` via `Context.pld_rx.limit_plds()`. - add a `maybe_limit_plds()` which just provides an `@acm` equivalent of `limit_plds()` handy for composing in a `async with ():` style block (avoiding additional indent levels in the body of async funcs). Obvi extend the `Context` and `MsgStream` interfaces as needed to match the above: - add a `Context.pld_rx` pub prop. - new private refs to `Context._started_msg: Started` and a `._started_pld` (mostly for internal debugging / testing / logging) and set inside `.open_context()` immediately after the syncing phase. - a `Context.has_outcome() -> bool:` predicate which can be used to more easily determine if the ctx errored or has a final result. - pub props for `MsgStream.ctx: Context` and `.chan: Channel` providing full `ipc`-arg compat with the `PldRx` method signatures. --- tractor/_context.py | 145 ++++++++++++++++++++---------- tractor/_streaming.py | 25 +++++- tractor/msg/_ops.py | 205 +++++++++++++++++++----------------------- 3 files changed, 212 insertions(+), 163 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index fe5d6543..ed720a2d 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -41,6 +41,7 @@ from typing import ( Callable, Mapping, Type, + TypeAlias, TYPE_CHECKING, Union, ) @@ -155,6 +156,41 @@ class Context: # payload receiver _pld_rx: msgops.PldRx + @property + def pld_rx(self) -> msgops.PldRx: + ''' + The current `tractor.Context`'s msg-payload-receiver. + + A payload receiver is the IPC-msg processing sub-sys which + filters inter-actor-task communicated payload data, i.e. the + `PayloadMsg.pld: PayloadT` field value, AFTER its container + shuttlle msg (eg. `Started`/`Yield`/`Return) has been + delivered up from `tractor`'s transport layer but BEFORE the + data is yielded to `tractor` application code. + + The "IPC-primitive API" is normally one of a `Context` (this)` or a `MsgStream` + or some higher level API using one of them. + + For ex. 
`pld_data: PayloadT = MsgStream.receive()` implicitly + calls into the stream's parent `Context.pld_rx.recv_pld().` to + receive the latest `PayloadMsg.pld` value. + + Modification of the current payload spec via `limit_plds()` + allows a `tractor` application to contextually filter IPC + payload content with a type specification as supported by the + interchange backend. + + - for `msgspec` see . + + Note that the `PldRx` itself is a per-`Context` instance that + normally only changes when some (sub-)task, on a given "side" + of the IPC ctx (either a "child"-side RPC or inside + a "parent"-side `Portal.open_context()` block), modifies it + using the `.msg._ops.limit_plds()` API. + + ''' + return self._pld_rx + # full "namespace-path" to target RPC function _nsf: NamespacePath @@ -231,6 +267,8 @@ class Context: # init and streaming state _started_called: bool = False + _started_msg: MsgType|None = None + _started_pld: Any = None _stream_opened: bool = False _stream: MsgStream|None = None @@ -623,7 +661,7 @@ class Context: log.runtime( 'Setting remote error for ctx\n\n' f'<= {self.peer_side!r}: {self.chan.uid}\n' - f'=> {self.side!r}\n\n' + f'=> {self.side!r}: {self._actor.uid}\n\n' f'{error}' ) self._remote_error: BaseException = error @@ -678,7 +716,7 @@ class Context: log.error( f'Remote context error:\n\n' # f'{pformat(self)}\n' - f'{error}\n' + f'{error}' ) if self._canceller is None: @@ -724,8 +762,10 @@ class Context: ) else: message: str = 'NOT cancelling `Context._scope` !\n\n' + # from .devx import mk_pdb + # mk_pdb().set_trace() - fmt_str: str = 'No `self._scope: CancelScope` was set/used ?' + fmt_str: str = 'No `self._scope: CancelScope` was set/used ?\n' if ( cs and @@ -805,6 +845,7 @@ class Context: # f'{ci.api_nsp}()\n' # ) + # TODO: use `.dev._frame_stack` scanning to find caller! return 'Portal.open_context()' async def cancel( @@ -1304,17 +1345,6 @@ class Context: ctx=self, hide_tb=hide_tb, ) - for msg in drained_msgs: - - # TODO: mask this by default.. - if isinstance(msg, Return): - # from .devx import pause - # await pause() - # raise InternalError( - log.warning( - 'Final `return` msg should never be drained !?!?\n\n' - f'{msg}\n' - ) drained_status: str = ( 'Ctx drained to final outcome msg\n\n' @@ -1435,6 +1465,10 @@ class Context: self._result ) + @property + def has_outcome(self) -> bool: + return bool(self.maybe_error) or self._final_result_is_set() + # @property def repr_outcome( self, @@ -1637,8 +1671,6 @@ class Context: ) if rt_started != started_msg: - # TODO: break these methods out from the struct subtype? - # TODO: make that one a mod func too.. diff = pretty_struct.Struct.__sub__( rt_started, @@ -1674,6 +1706,8 @@ class Context: ) from verr self._started_called = True + self._started_msg = started_msg + self._started_pld = value async def _drain_overflows( self, @@ -1961,6 +1995,7 @@ async def open_context_from_portal( portal: Portal, func: Callable, + pld_spec: TypeAlias|None = None, allow_overruns: bool = False, # TODO: if we set this the wrapping `@acm` body will @@ -2026,7 +2061,7 @@ async def open_context_from_portal( # XXX NOTE XXX: currenly we do NOT allow opening a contex # with "self" since the local feeder mem-chan processing # is not built for it. - if portal.channel.uid == portal.actor.uid: + if (uid := portal.channel.uid) == portal.actor.uid: raise RuntimeError( '** !! Invalid Operation !! 
**\n' 'Can not open an IPC ctx with the local actor!\n' @@ -2054,32 +2089,45 @@ async def open_context_from_portal( assert ctx._caller_info _ctxvar_Context.set(ctx) - # XXX NOTE since `._scope` is NOT set BEFORE we retreive the - # `Started`-msg any cancellation triggered - # in `._maybe_cancel_and_set_remote_error()` will - # NOT actually cancel the below line! - # -> it's expected that if there is an error in this phase of - # the dialog, the `Error` msg should be raised from the `msg` - # handling block below. - first: Any = await ctx._pld_rx.recv_pld( - ctx=ctx, - expect_msg=Started, - ) - ctx._started_called: bool = True - - uid: tuple = portal.channel.uid - cid: str = ctx.cid - # placeholder for any exception raised in the runtime # or by user tasks which cause this context's closure. scope_err: BaseException|None = None ctxc_from_callee: ContextCancelled|None = None try: - async with trio.open_nursery() as nurse: + async with ( + trio.open_nursery() as tn, + msgops.maybe_limit_plds( + ctx=ctx, + spec=pld_spec, + ) as maybe_msgdec, + ): + if maybe_msgdec: + assert maybe_msgdec.pld_spec == pld_spec - # NOTE: used to start overrun queuing tasks - ctx._scope_nursery: trio.Nursery = nurse - ctx._scope: trio.CancelScope = nurse.cancel_scope + # XXX NOTE since `._scope` is NOT set BEFORE we retreive the + # `Started`-msg any cancellation triggered + # in `._maybe_cancel_and_set_remote_error()` will + # NOT actually cancel the below line! + # -> it's expected that if there is an error in this phase of + # the dialog, the `Error` msg should be raised from the `msg` + # handling block below. + started_msg, first = await ctx._pld_rx.recv_msg_w_pld( + ipc=ctx, + expect_msg=Started, + passthrough_non_pld_msgs=False, + ) + + # from .devx import pause + # await pause() + ctx._started_called: bool = True + ctx._started_msg: bool = started_msg + ctx._started_pld: bool = first + + # NOTE: this in an implicit runtime nursery used to, + # - start overrun queuing tasks when as well as + # for cancellation of the scope opened by the user. + ctx._scope_nursery: trio.Nursery = tn + ctx._scope: trio.CancelScope = tn.cancel_scope # deliver context instance and .started() msg value # in enter tuple. @@ -2126,13 +2174,13 @@ async def open_context_from_portal( # when in allow_overruns mode there may be # lingering overflow sender tasks remaining? - if nurse.child_tasks: + if tn.child_tasks: # XXX: ensure we are in overrun state # with ``._allow_overruns=True`` bc otherwise # there should be no tasks in this nursery! if ( not ctx._allow_overruns - or len(nurse.child_tasks) > 1 + or len(tn.child_tasks) > 1 ): raise InternalError( 'Context has sub-tasks but is ' @@ -2304,8 +2352,8 @@ async def open_context_from_portal( ): log.warning( 'IPC connection for context is broken?\n' - f'task:{cid}\n' - f'actor:{uid}' + f'task: {ctx.cid}\n' + f'actor: {uid}' ) raise # duh @@ -2455,9 +2503,8 @@ async def open_context_from_portal( and ctx.cancel_acked ): log.cancel( - 'Context cancelled by {ctx.side!r}-side task\n' + f'Context cancelled by {ctx.side!r}-side task\n' f'|_{ctx._task}\n\n' - f'{repr(scope_err)}\n' ) @@ -2485,7 +2532,7 @@ async def open_context_from_portal( f'cid: {ctx.cid}\n' ) portal.actor._contexts.pop( - (uid, cid), + (uid, ctx.cid), None, ) @@ -2516,8 +2563,9 @@ def mk_context( from .devx._frame_stack import find_caller_info caller_info: CallerInfo|None = find_caller_info() - # TODO: when/how do we apply `.limit_plds()` from here? 
- pld_rx: msgops.PldRx = msgops.current_pldrx() + pld_rx = msgops.PldRx( + _pld_dec=msgops._def_any_pldec, + ) ctx = Context( chan=chan, @@ -2531,13 +2579,16 @@ def mk_context( _caller_info=caller_info, **kwargs, ) + pld_rx._ctx = ctx ctx._result = Unresolved return ctx # TODO: use the new type-parameters to annotate this in 3.13? # -[ ] https://peps.python.org/pep-0718/#unknown-types -def context(func: Callable) -> Callable: +def context( + func: Callable, +) -> Callable: ''' Mark an (async) function as an SC-supervised, inter-`Actor`, child-`trio.Task`, IPC endpoint otherwise known more diff --git a/tractor/_streaming.py b/tractor/_streaming.py index dd4cd0e1..a008eaf5 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -52,6 +52,7 @@ from tractor.msg import ( if TYPE_CHECKING: from ._context import Context + from ._ipc import Channel log = get_logger(__name__) @@ -65,10 +66,10 @@ log = get_logger(__name__) class MsgStream(trio.abc.Channel): ''' A bidirectional message stream for receiving logically sequenced - values over an inter-actor IPC ``Channel``. + values over an inter-actor IPC `Channel`. This is the type returned to a local task which entered either - ``Portal.open_stream_from()`` or ``Context.open_stream()``. + `Portal.open_stream_from()` or `Context.open_stream()`. Termination rules: @@ -95,6 +96,22 @@ class MsgStream(trio.abc.Channel): self._eoc: bool|trio.EndOfChannel = False self._closed: bool|trio.ClosedResourceError = False + @property + def ctx(self) -> Context: + ''' + This stream's IPC `Context` ref. + + ''' + return self._ctx + + @property + def chan(self) -> Channel: + ''' + Ref to the containing `Context`'s transport `Channel`. + + ''' + return self._ctx.chan + # TODO: could we make this a direct method bind to `PldRx`? # -> receive_nowait = PldRx.recv_pld # |_ means latter would have to accept `MsgStream`-as-`self`? @@ -109,7 +126,7 @@ class MsgStream(trio.abc.Channel): ): ctx: Context = self._ctx return ctx._pld_rx.recv_pld_nowait( - ctx=ctx, + ipc=self, expect_msg=expect_msg, ) @@ -148,7 +165,7 @@ class MsgStream(trio.abc.Channel): try: ctx: Context = self._ctx - return await ctx._pld_rx.recv_pld(ctx=ctx) + return await ctx._pld_rx.recv_pld(ipc=self) # XXX: the stream terminates on either of: # - via `self._rx_chan.receive()` raising after manual closure diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 3b0b8339..3014c15b 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -22,10 +22,9 @@ operational helpers for processing transaction flows. ''' from __future__ import annotations from contextlib import ( - # asynccontextmanager as acm, + asynccontextmanager as acm, contextmanager as cm, ) -from contextvars import ContextVar from typing import ( Any, Type, @@ -50,6 +49,7 @@ from tractor._exceptions import ( _mk_msg_type_err, pack_from_raise, ) +from tractor._state import current_ipc_ctx from ._codec import ( mk_dec, MsgDec, @@ -75,7 +75,7 @@ if TYPE_CHECKING: log = get_logger(__name__) -_def_any_pldec: MsgDec = mk_dec() +_def_any_pldec: MsgDec[Any] = mk_dec() class PldRx(Struct): @@ -104,15 +104,19 @@ class PldRx(Struct): ''' # TODO: better to bind it here? # _rx_mc: trio.MemoryReceiveChannel - _pldec: MsgDec + _pld_dec: MsgDec + _ctx: Context|None = None _ipc: Context|MsgStream|None = None @property def pld_dec(self) -> MsgDec: - return self._pldec + return self._pld_dec + # TODO: a better name? 
+ # -[ ] when would this be used as it avoids needingn to pass the + # ipc prim to every method @cm - def apply_to_ipc( + def wraps_ipc( self, ipc_prim: Context|MsgStream, @@ -140,49 +144,50 @@ class PldRx(Struct): exit. ''' - orig_dec: MsgDec = self._pldec + orig_dec: MsgDec = self._pld_dec limit_dec: MsgDec = mk_dec(spec=spec) try: - self._pldec = limit_dec + self._pld_dec = limit_dec yield limit_dec finally: - self._pldec = orig_dec + self._pld_dec = orig_dec @property def dec(self) -> msgpack.Decoder: - return self._pldec.dec + return self._pld_dec.dec def recv_pld_nowait( self, # TODO: make this `MsgStream` compat as well, see above^ # ipc_prim: Context|MsgStream, - ctx: Context, + ipc: Context|MsgStream, ipc_msg: MsgType|None = None, expect_msg: Type[MsgType]|None = None, - + hide_tb: bool = False, **dec_msg_kwargs, ) -> Any|Raw: - __tracebackhide__: bool = True + __tracebackhide__: bool = hide_tb msg: MsgType = ( ipc_msg or # sync-rx msg from underlying IPC feeder (mem-)chan - ctx._rx_chan.receive_nowait() + ipc._rx_chan.receive_nowait() ) return self.dec_msg( msg, - ctx=ctx, + ipc=ipc, expect_msg=expect_msg, + hide_tb=hide_tb, **dec_msg_kwargs, ) async def recv_pld( self, - ctx: Context, + ipc: Context|MsgStream, ipc_msg: MsgType|None = None, expect_msg: Type[MsgType]|None = None, hide_tb: bool = True, @@ -200,11 +205,11 @@ class PldRx(Struct): or # async-rx msg from underlying IPC feeder (mem-)chan - await ctx._rx_chan.receive() + await ipc._rx_chan.receive() ) return self.dec_msg( msg=msg, - ctx=ctx, + ipc=ipc, expect_msg=expect_msg, **dec_msg_kwargs, ) @@ -212,7 +217,7 @@ class PldRx(Struct): def dec_msg( self, msg: MsgType, - ctx: Context, + ipc: Context|MsgStream, expect_msg: Type[MsgType]|None, raise_error: bool = True, @@ -225,6 +230,9 @@ class PldRx(Struct): ''' __tracebackhide__: bool = hide_tb + + _src_err = None + src_err: BaseException|None = None match msg: # payload-data shuttle msg; deliver the `.pld` value # directly to IPC (primitive) client-consumer code. @@ -234,7 +242,7 @@ class PldRx(Struct): |Return(pld=pld) # termination phase ): try: - pld: PayloadT = self._pldec.decode(pld) + pld: PayloadT = self._pld_dec.decode(pld) log.runtime( 'Decoded msg payload\n\n' f'{msg}\n\n' @@ -243,25 +251,30 @@ class PldRx(Struct): ) return pld - # XXX pld-type failure - except ValidationError as src_err: + # XXX pld-value type failure + except ValidationError as valerr: + # pack mgterr into error-msg for + # reraise below; ensure remote-actor-err + # info is displayed nicely? msgterr: MsgTypeError = _mk_msg_type_err( msg=msg, codec=self.pld_dec, - src_validation_error=src_err, + src_validation_error=valerr, is_invalid_payload=True, ) msg: Error = pack_from_raise( local_err=msgterr, cid=msg.cid, - src_uid=ctx.chan.uid, + src_uid=ipc.chan.uid, ) + src_err = valerr # XXX some other decoder specific failure? # except TypeError as src_error: # from .devx import mk_pdb # mk_pdb().set_trace() # raise src_error + # ^-TODO-^ can remove? # a runtime-internal RPC endpoint response. 
# always passthrough since (internal) runtime @@ -299,6 +312,7 @@ class PldRx(Struct): return src_err case Stop(cid=cid): + ctx: Context = getattr(ipc, 'ctx', ipc) message: str = ( f'{ctx.side!r}-side of ctx received stream-`Stop` from ' f'{ctx.peer_side!r} peer ?\n' @@ -341,14 +355,21 @@ class PldRx(Struct): # |_https://docs.python.org/3.11/library/exceptions.html#BaseException.add_note # # fallthrough and raise from `src_err` - _raise_from_unexpected_msg( - ctx=ctx, - msg=msg, - src_err=src_err, - log=log, - expect_msg=expect_msg, - hide_tb=hide_tb, - ) + try: + _raise_from_unexpected_msg( + ctx=getattr(ipc, 'ctx', ipc), + msg=msg, + src_err=src_err, + log=log, + expect_msg=expect_msg, + hide_tb=hide_tb, + ) + except UnboundLocalError: + # XXX if there's an internal lookup error in the above + # code (prolly on `src_err`) we want to show this frame + # in the tb! + __tracebackhide__: bool = False + raise async def recv_msg_w_pld( self, @@ -378,52 +399,13 @@ class PldRx(Struct): # msg instance? pld: PayloadT = self.dec_msg( msg, - ctx=ipc, + ipc=ipc, expect_msg=expect_msg, **kwargs, ) return msg, pld -# Always maintain a task-context-global `PldRx` -_def_pld_rx: PldRx = PldRx( - _pldec=_def_any_pldec, -) -_ctxvar_PldRx: ContextVar[PldRx] = ContextVar( - 'pld_rx', - default=_def_pld_rx, -) - - -def current_pldrx() -> PldRx: - ''' - Return the current `trio.Task.context`'s msg-payload-receiver. - - A payload receiver is the IPC-msg processing sub-sys which - filters inter-actor-task communicated payload data, i.e. the - `PayloadMsg.pld: PayloadT` field value, AFTER it's container - shuttlle msg (eg. `Started`/`Yield`/`Return) has been delivered - up from `tractor`'s transport layer but BEFORE the data is - yielded to application code, normally via an IPC primitive API - like, for ex., `pld_data: PayloadT = MsgStream.receive()`. - - Modification of the current payload spec via `limit_plds()` - allows a `tractor` application to contextually filter IPC - payload content with a type specification as supported by - the interchange backend. - - - for `msgspec` see . - - NOTE that the `PldRx` itself is a per-`Context` global sub-system - that normally does not change other then the applied pld-spec - for the current `trio.Task`. - - ''' - # ctx: context = current_ipc_ctx() - # return ctx._pld_rx - return _ctxvar_PldRx.get() - - @cm def limit_plds( spec: Union[Type[Struct]], @@ -439,29 +421,55 @@ def limit_plds( ''' __tracebackhide__: bool = True try: - # sanity on orig settings - orig_pldrx: PldRx = current_pldrx() - orig_pldec: MsgDec = orig_pldrx.pld_dec + curr_ctx: Context = current_ipc_ctx() + rx: PldRx = curr_ctx._pld_rx + orig_pldec: MsgDec = rx.pld_dec - with orig_pldrx.limit_plds( + with rx.limit_plds( spec=spec, **kwargs, ) as pldec: - log.info( + log.runtime( 'Applying payload-decoder\n\n' f'{pldec}\n' ) yield pldec finally: - log.info( + log.runtime( 'Reverted to previous payload-decoder\n\n' f'{orig_pldec}\n' ) - assert ( - (pldrx := current_pldrx()) is orig_pldrx - and - pldrx.pld_dec is orig_pldec - ) + # sanity on orig settings + assert rx.pld_dec is orig_pldec + + +@acm +async def maybe_limit_plds( + ctx: Context, + spec: Union[Type[Struct]]|None = None, + **kwargs, +) -> MsgDec|None: + ''' + Async compat maybe-payload type limiter. + + Mostly for use inside other internal `@acm`s such that a separate + indent block isn't needed when an async one is already being + used. 
+ + ''' + if spec is None: + yield None + return + + # sanity on scoping + curr_ctx: Context = current_ipc_ctx() + assert ctx is curr_ctx + + with ctx._pld_rx.limit_plds(spec=spec) as msgdec: + yield msgdec + + curr_ctx: Context = current_ipc_ctx() + assert ctx is curr_ctx async def drain_to_final_msg( @@ -543,21 +551,12 @@ async def drain_to_final_msg( match msg: # final result arrived! - case Return( - # cid=cid, - # pld=res, - ): - # ctx._result: Any = res - ctx._result: Any = pld + case Return(): log.runtime( 'Context delivered final draining msg:\n' f'{pretty_struct.pformat(msg)}' ) - # XXX: only close the rx mem chan AFTER - # a final result is retreived. - # if ctx._rx_chan: - # await ctx._rx_chan.aclose() - # TODO: ^ we don't need it right? + ctx._result: Any = pld result_msg = msg break @@ -664,24 +663,6 @@ async def drain_to_final_msg( result_msg = msg break # OOOOOF, yeah obvi we need this.. - # XXX we should never really get here - # right! since `._deliver_msg()` should - # always have detected an {'error': ..} - # msg and already called this right!?! - # elif error := unpack_error( - # msg=msg, - # chan=ctx._portal.channel, - # hide_tb=False, - # ): - # log.critical('SHOULD NEVER GET HERE!?') - # assert msg is ctx._cancel_msg - # assert error.msgdata == ctx._remote_error.msgdata - # assert error.ipc_msg == ctx._remote_error.ipc_msg - # from .devx._debug import pause - # await pause() - # ctx._maybe_cancel_and_set_remote_error(error) - # ctx._maybe_raise_remote_err(error) - else: # bubble the original src key error raise -- 2.34.1 From c80f020ebc99cbb0c4e78da3d3325cecb89cf5cd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 15:47:01 -0400 Subject: [PATCH 309/378] Expose `tractor.current_ipc_ctx()` at pkg level --- tractor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tractor/__init__.py b/tractor/__init__.py index 0f2bdd65..ad3144dc 100644 --- a/tractor/__init__.py +++ b/tractor/__init__.py @@ -42,6 +42,7 @@ from ._supervise import ( from ._state import ( current_actor as current_actor, is_root_process as is_root_process, + current_ipc_ctx as current_ipc_ctx, ) from ._exceptions import ( ContextCancelled as ContextCancelled, -- 2.34.1 From 30afcd2b6bba02c95e3b54d6aa8355d6b252cc33 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 16:07:57 -0400 Subject: [PATCH 310/378] Adjust `Portal` usage of `Context.pld_rx` Pass the new `ipc` arg and try to show api frames when an unexpected internal error is detected. --- tractor/_portal.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tractor/_portal.py b/tractor/_portal.py index 700f2fdc..2c676e12 100644 --- a/tractor/_portal.py +++ b/tractor/_portal.py @@ -166,13 +166,19 @@ class Portal: assert self._expect_result_ctx if self._final_result_msg is None: - ( - self._final_result_msg, - self._final_result_pld, - ) = await self._expect_result_ctx._pld_rx.recv_msg_w_pld( - ipc=self._expect_result_ctx, - expect_msg=Return, - ) + try: + ( + self._final_result_msg, + self._final_result_pld, + ) = await self._expect_result_ctx._pld_rx.recv_msg_w_pld( + ipc=self._expect_result_ctx, + expect_msg=Return, + ) + except BaseException as err: + # TODO: wrap this into `@api_frame` optionally with + # some kinda filtering mechanism like log levels? 
+ __tracebackhide__: bool = False + raise err return self._final_result_pld @@ -306,7 +312,7 @@ class Portal: portal=self, ) return await ctx._pld_rx.recv_pld( - ctx=ctx, + ipc=ctx, expect_msg=Return, ) @@ -325,6 +331,8 @@ class Portal: remote rpc task or a local async generator instance. ''' + __runtimeframe__: int = 1 # noqa + if isinstance(func, str): warnings.warn( "`Portal.run(namespace: str, funcname: str)` is now" @@ -358,7 +366,7 @@ class Portal: portal=self, ) return await ctx._pld_rx.recv_pld( - ctx=ctx, + ipc=ctx, expect_msg=Return, ) -- 2.34.1 From 60fc43e530aed4eb4dc932f140518df62461d2fd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 16:11:59 -0400 Subject: [PATCH 311/378] Shield channel closing in `_connect_chan()` --- tractor/_ipc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 70774bed..511a053c 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -716,4 +716,5 @@ async def _connect_chan( chan = Channel((host, port)) await chan.connect() yield chan - await chan.aclose() + with trio.CancelScope(shield=True): + await chan.aclose() -- 2.34.1 From 13bc3c308dcf63d6c9e50417ba5c4f1110d8d04f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 16:12:51 -0400 Subject: [PATCH 312/378] Add error suppress flag to `current_ipc_ctx()` --- tractor/_state.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tractor/_state.py b/tractor/_state.py index a3729833..8c5cca14 100644 --- a/tractor/_state.py +++ b/tractor/_state.py @@ -124,9 +124,15 @@ _ctxvar_Context: ContextVar[Context] = ContextVar( ) -def current_ipc_ctx() -> Context: +def current_ipc_ctx( + error_on_not_set: bool = False, +) -> Context|None: ctx: Context = _ctxvar_Context.get() - if not ctx: + + if ( + not ctx + and error_on_not_set + ): from ._exceptions import InternalError raise InternalError( 'No IPC context has been allocated for this task yet?\n' -- 2.34.1 From e78fdf2f6975e69f1d5c2cb8dbab9b95f2eaf8c9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 16:13:57 -0400 Subject: [PATCH 313/378] Make `log.devx()` level below `.pdb()` Kinda like a "runtime"-y level for `.pdb()` (which is more or less like an `.info()` for our debugger subsys) which can be used to report internals info for those hacking on `.devx` tools. Also, inject only the *last* 6 digits of the `id(Task)` in `pformat_task_uid()` output by default. --- tractor/log.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 78611f2e..3621fc15 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -57,8 +57,8 @@ LEVELS: dict[str, int] = { 'TRANSPORT': 5, 'RUNTIME': 15, 'CANCEL': 16, + 'DEVX': 400, 'PDB': 500, - 'DEVX': 600, } # _custom_levels: set[str] = { # lvlname.lower for lvlname in LEVELS.keys() @@ -137,7 +137,7 @@ class StackLevelAdapter(LoggerAdapter): "Developer experience" sub-sys statuses. ''' - return self.log(600, msg) + return self.log(400, msg) def log( self, @@ -202,7 +202,12 @@ class StackLevelAdapter(LoggerAdapter): ) -def pformat_task_uid(): +# TODO IDEA: +# -[ ] do per task-name and actor-name color coding +# -[ ] unique color per task-id and actor-uuid +def pformat_task_uid( + id_part: str = 'tail' +): ''' Return `str`-ified unique for a `trio.Task` via a combo of its `.name: str` and `id()` truncated output. 
@@ -210,7 +215,12 @@ def pformat_task_uid(): ''' task: trio.Task = trio.lowlevel.current_task() tid: str = str(id(task)) - return f'{task.name}[{tid[:6]}]' + if id_part == 'tail': + tid_part: str = tid[-6:] + else: + tid_part: str = tid[:6] + + return f'{task.name}[{tid_part}]' _conc_name_getters = { -- 2.34.1 From 4ef77bb64f38d46e60f7fb931d96939106001a95 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 16:18:42 -0400 Subject: [PATCH 314/378] Set `_ctxvar_Context` for child-side RPC tasks Just inside `._invoke()` after the `ctx: Context` is retrieved. Also try our best to *not hide* internal frames when a non-user-code crash happens, normally either due to a runtime RPC EP bug or a transport failure. --- tractor/_rpc.py | 55 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index eed47902..df79c653 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -70,7 +70,6 @@ from .msg import ( from tractor.msg.types import ( CancelAck, Error, - Msg, MsgType, Return, Start, @@ -250,10 +249,17 @@ async def _errors_relayed_via_ipc( ] = trio.TASK_STATUS_IGNORED, ) -> None: + # NOTE: we normally always hide this frame in call-stack tracebacks + # if the crash originated from an RPC task (since normally the + # user is only going to care about their own code not this + # internal runtime frame) and we DID NOT + # fail due to an IPC transport error! __tracebackhide__: bool = hide_tb + # TODO: a debug nursery when in debug mode! # async with maybe_open_debugger_nursery() as debug_tn: # => see matching comment in side `._debug._pause()` + rpc_err: BaseException|None = None try: yield # run RPC invoke body @@ -264,16 +270,7 @@ async def _errors_relayed_via_ipc( BaseExceptionGroup, KeyboardInterrupt, ) as err: - - # NOTE: always hide this frame from debug REPL call stack - # if the crash originated from an RPC task and we DID NOT - # fail due to an IPC transport error! - if ( - is_rpc - and - chan.connected() - ): - __tracebackhide__: bool = hide_tb + rpc_err = err # TODO: maybe we'll want different "levels" of debugging # eventualy such as ('app', 'supervisory', 'runtime') ? @@ -318,11 +315,19 @@ async def _errors_relayed_via_ipc( api_frame=inspect.currentframe(), ) if not entered_debug: + # if we prolly should have entered the REPL but + # didn't, maybe there was an internal error in + # the above code and we do want to show this + # frame! + if _state.debug_mode(): + __tracebackhide__: bool = False + log.exception( 'RPC task crashed\n' f'|_{ctx}' ) + # ALWAYS try to ship RPC errors back to parent/caller task if is_rpc: @@ -355,6 +360,20 @@ async def _errors_relayed_via_ipc( # `Actor._service_n`, we add "handles" to each such that # they can be individually ccancelled. finally: + + # if the error is not from user code and instead a failure + # of a runtime RPC or transport failure we do prolly want to + # show this frame + if ( + rpc_err + and ( + not is_rpc + or + not chan.connected() + ) + ): + __tracebackhide__: bool = False + try: ctx: Context func: Callable @@ -444,9 +463,10 @@ async def _invoke( # open the stream with this option. # allow_overruns=True, ) - context: bool = False + context_ep_func: bool = False - assert not _state._ctxvar_Context.get() + # set the current IPC ctx var for this RPC task + _state._ctxvar_Context.set(ctx) # TODO: deprecate this style.. 
if getattr(func, '_tractor_stream_function', False): @@ -475,7 +495,7 @@ async def _invoke( # handle decorated ``@tractor.context`` async function elif getattr(func, '_tractor_context_function', False): kwargs['ctx'] = ctx - context = True + context_ep_func = True # errors raised inside this block are propgated back to caller async with _errors_relayed_via_ipc( @@ -501,7 +521,7 @@ async def _invoke( raise # TODO: impl all these cases in terms of the `Context` one! - if not context: + if not context_ep_func: await _invoke_non_context( actor, cancel_scope, @@ -571,7 +591,6 @@ async def _invoke( async with trio.open_nursery() as tn: ctx._scope_nursery = tn ctx._scope = tn.cancel_scope - _state._ctxvar_Context.set(ctx) task_status.started(ctx) # TODO: should would be nice to have our @@ -831,7 +850,7 @@ async def process_messages( (as utilized inside `Portal.cancel_actor()` ). ''' - assert actor._service_n # state sanity + assert actor._service_n # runtime state sanity # TODO: once `trio` get's an "obvious way" for req/resp we # should use it? @@ -844,7 +863,7 @@ async def process_messages( # - https://github.com/aiortc/aioquic/blob/main/src/aioquic/quic/connection.py#L1175 # - https://github.com/aiortc/aioquic/blob/main/src/aioquic/quic/connection.py#L659 nursery_cancelled_before_task: bool = False - msg: Msg|None = None + msg: MsgType|None = None try: # NOTE: this internal scope allows for keeping this # message loop running despite the current task having -- 2.34.1 From fde62c72be13c5c424d2c9017041fc92d11137cb Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 20 May 2024 17:04:30 -0400 Subject: [PATCH 315/378] Show runtime nursery frames on internal errors Much like other recent changes attempt to detect runtime-bug-causing crashes and only show the runtime-endpoint frame when present. Adds a `ActorNursery._scope_error: BaseException|None` attr to aid with detection. Also toss in some todo notes for removing and replacing the `.run_in_actor()` method API. --- tractor/_supervise.py | 50 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/tractor/_supervise.py b/tractor/_supervise.py index 59ec728b..8f3574bb 100644 --- a/tractor/_supervise.py +++ b/tractor/_supervise.py @@ -84,6 +84,7 @@ class ActorNursery: ria_nursery: trio.Nursery, da_nursery: trio.Nursery, errors: dict[tuple[str, str], BaseException], + ) -> None: # self.supervisor = supervisor # TODO self._actor: Actor = actor @@ -105,6 +106,7 @@ class ActorNursery: self._at_least_one_child_in_debug: bool = False self.errors = errors self.exited = trio.Event() + self._scope_error: BaseException|None = None # NOTE: when no explicit call is made to # `.open_root_actor()` by application code, @@ -117,7 +119,9 @@ class ActorNursery: async def start_actor( self, name: str, + *, + bind_addrs: list[tuple[str, int]] = [_default_bind_addr], rpc_module_paths: list[str]|None = None, enable_modules: list[str]|None = None, @@ -125,6 +129,7 @@ class ActorNursery: nursery: trio.Nursery|None = None, debug_mode: bool|None = None, infect_asyncio: bool = False, + ) -> Portal: ''' Start a (daemon) actor: an process that has no designated @@ -189,6 +194,13 @@ class ActorNursery: ) ) + # TODO: DEPRECATE THIS: + # -[ ] impl instead as a hilevel wrapper on + # top of a `@context` style invocation. + # |_ dynamic @context decoration on child side + # |_ implicit `Portal.open_context() as (ctx, first):` + # and `return first` on parent side. 
+ # -[ ] use @api_frame on the wrapper async def run_in_actor( self, @@ -221,7 +233,7 @@ class ActorNursery: # use the explicit function name if not provided name = fn.__name__ - portal = await self.start_actor( + portal: Portal = await self.start_actor( name, enable_modules=[mod_path] + ( enable_modules or rpc_module_paths or [] @@ -250,6 +262,7 @@ class ActorNursery: ) return portal + # @api_frame async def cancel( self, hard_kill: bool = False, @@ -346,7 +359,12 @@ async def _open_and_supervise_one_cancels_all_nursery( actor: Actor, ) -> typing.AsyncGenerator[ActorNursery, None]: - __tracebackhide__ = True + + # normally don't need to show user by default + __tracebackhide__: bool = True + + outer_err: BaseException|None = None + inner_err: BaseException|None = None # the collection of errors retreived from spawned sub-actors errors: dict[tuple[str, str], BaseException] = {} @@ -356,7 +374,7 @@ async def _open_and_supervise_one_cancels_all_nursery( # handling errors that are generated by the inner nursery in # a supervisor strategy **before** blocking indefinitely to wait for # actors spawned in "daemon mode" (aka started using - # ``ActorNursery.start_actor()``). + # `ActorNursery.start_actor()`). # errors from this daemon actor nursery bubble up to caller async with trio.open_nursery() as da_nursery: @@ -391,7 +409,8 @@ async def _open_and_supervise_one_cancels_all_nursery( ) an._join_procs.set() - except BaseException as inner_err: + except BaseException as _inner_err: + inner_err = _inner_err errors[actor.uid] = inner_err # If we error in the root but the debugger is @@ -469,8 +488,10 @@ async def _open_and_supervise_one_cancels_all_nursery( Exception, BaseExceptionGroup, trio.Cancelled + ) as _outer_err: + outer_err = _outer_err - ) as err: + an._scope_error = outer_err or inner_err # XXX: yet another guard before allowing the cancel # sequence in case a (single) child is in debug. @@ -485,7 +506,7 @@ async def _open_and_supervise_one_cancels_all_nursery( if an._children: log.cancel( 'Actor-nursery cancelling due error type:\n' - f'{err}\n' + f'{outer_err}\n' ) with trio.CancelScope(shield=True): await an.cancel() @@ -512,6 +533,13 @@ async def _open_and_supervise_one_cancels_all_nursery( else: raise list(errors.values())[0] + # show frame on any (likely) internal error + if ( + not an.cancelled + and an._scope_error + ): + __tracebackhide__: bool = False + # da_nursery scope end - nursery checkpoint # final exit @@ -537,7 +565,7 @@ async def open_nursery( which cancellation scopes correspond to each spawned subactor set. 
''' - __tracebackhide__ = True + __tracebackhide__: bool = True implicit_runtime: bool = False actor: Actor = current_actor(err_on_no_runtime=False) an: ActorNursery|None = None @@ -588,6 +616,14 @@ async def open_nursery( an.exited.set() finally: + # show frame on any internal runtime-scope error + if ( + an + and not an.cancelled + and an._scope_error + ): + __tracebackhide__: bool = False + msg: str = ( 'Actor-nursery exited\n' f'|_{an}\n' -- 2.34.1 From b22f7dcae042dae0a9d068021a76f2c818489d7d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 21 May 2024 09:19:56 -0400 Subject: [PATCH 316/378] Resolve remaining debug-request race causing hangs More or less by pedantically separating and managing root and subactor request syncing events to always be managed by the locking IPC context task-funcs: - for the root's "child"-side, `lock_tty_for_child()` directly creates and sets a new `Lock.req_handler_finished` inside a `finally:` - for the sub's "parent"-side, `request_root_stdio_lock()` does the same with a new `DebugStatus.req_finished` event and separates it from the `.repl_release` event (which indicates a "c" or "q" from user and thus exit of the REPL session) as well as sets a new `.req_task: trio.Task` to explicitly distinguish from the app-user-task that enters the REPL vs. the paired bg task used to request the global root's stdio mutex alongside it. - apply the `__pld_spec__` on "child"-side of the ctx using the new `Portal.open_context(pld_spec)` parameter support; drops use of any `ContextVar` malarky used prior for `PldRx` mgmt. - removing `Lock.no_remote_has_tty` since it was a nebulous name and from the prior "everything is in a `Lock`" design.. ------ - ------ More rigorous impl to handle various edge cases in `._pause()`: - rejig `_enter_repl_sync()` to wrap the `debug_func == None` case inside maybe-internal-error handler blocks. - better logic for recurrent vs. multi-task contention for REPL entry in subactors, by guarding using `DebugStatus.req_task` and by now waiting on the new `DebugStatus.req_finished` for the multi-task contention case. - even better internal error handling and reporting for when this code is hacked on and possibly broken ;p ------ - ------ Updates to `.pause_from_sync()` support: - add optional `actor`, `task` kwargs to `_set_trace()` to allow compat with the new explicit `debug_func` calling in `._pause()` and pass a `threading.Thread` for `task` in the `.to_thread()` usage case. - add an `except` block that tries to show the frame on any internal error. ------ - ------ Relatedly includes a buncha cleanups/simplifications somewhat in prep for some coming refinements (around `DebugStatus`): - use all the new attrs mentioned above as needed in the SIGINT shielder. - wait on `Lock.req_handler_finished` in `maybe_wait_for_debugger()`. - dropping a ton of masked legacy code left in during the recent reworks. - better comments, like on the use of `Context._scope` for shielding on the "child"-side to avoid the need to manage yet another cs. - add/change-to lotsa `log.devx()` level emissions for those infos which are handy while hacking on the debugger but not ideal/necessary to be user visible. - obvi add lotsa follow up todo notes! 
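------ - ------
Illustrative child-side usage sketch (editorial, NOT part of this
patch): it shows the new per-request payload-spec limiting via
`Portal.open_context(pld_spec=...)` as used by the stdio-lock request
dialog. The import paths and the exact `__pld_spec__` union are
assumptions here, written out as `LockStatus|LockRelease`:

    from tractor._discovery import get_root
    from tractor.devx._debug import (
        lock_tty_for_child,
        LockStatus,
        LockRelease,
    )

    async def request_stdio_lock(
        actor_uid: tuple[str, str],
        task_uid: tuple[str, int],
    ):
        async with get_root() as portal:
            async with portal.open_context(
                lock_tty_for_child,
                subactor_task_uid=task_uid,

                # apply the pld-spec only to this request ctx;
                # no task-global `ContextVar` juggling required.
                pld_spec=LockStatus|LockRelease,

            ) as (req_ctx, status):
                # `.started()` value decoded per the above spec
                assert status.subactor_uid == actor_uid
                assert status.cid

                async with req_ctx.open_stream() as stream:
                    # ..REPL session runs in the app task..
                    # then signal the root to drop its stdio lock.
                    await stream.send(
                        LockRelease(
                            subactor_uid=actor_uid,
                            cid=req_ctx.cid,
                        )
                    )
                # (the real impl then syncs on the ctx's final
                # `LockStatus` result before teardown)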
--- tractor/devx/_debug.py | 824 ++++++++++++++++++++++------------------- 1 file changed, 446 insertions(+), 378 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 1e82122c..877d2de6 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -73,10 +73,10 @@ from tractor._state import ( debug_mode, current_ipc_ctx, ) -from .pformat import ( - # pformat_caller_frame, - pformat_cs, -) +# from .pformat import ( +# pformat_caller_frame, +# pformat_cs, +# ) if TYPE_CHECKING: from tractor._ipc import Channel @@ -190,8 +190,8 @@ class Lock: # a stale lock condition (eg. IPC failure with the locking # child ctx_in_debug: Context|None = None + req_handler_finished: trio.Event|None = None - no_remote_has_tty: trio.Event|None = None _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock() _blocked: set[ tuple[str, str] # `Actor.uid` for per actor @@ -209,13 +209,12 @@ class Lock: if is_root_process(): lock_stats: trio.LockStatistics = cls._debug_lock.statistics() fields += ( - f'no_remote_has_tty: {cls.no_remote_has_tty}\n' + f'req_handler_finished: {cls.req_handler_finished}\n' + f'_blocked: {cls._blocked}\n\n' - - f'ctx_in_debug: {cls.ctx_in_debug}\n\n' - f'_debug_lock: {cls._debug_lock}\n' f'lock_stats: {lock_stats}\n' + ) body: str = textwrap.indent( @@ -225,7 +224,9 @@ class Lock: return ( f'<{cls.__name__}(\n' f'{body}' - ')>' + ')>\n\n' + + f'{cls.ctx_in_debug}\n' ) @classmethod @@ -234,16 +235,23 @@ class Lock: cls, force: bool = False, ): + message: str = 'TTY lock not held by any child\n' + + if not (is_trio_main := DebugStatus.is_main_trio_thread()): + task: threading.Thread = threading.current_thread() + else: + task: trio.Task = current_task() + try: lock: trio.StrictFIFOLock = cls._debug_lock owner: Task = lock.statistics().owner if ( lock.locked() and - owner is current_task() + owner is task # ^-NOTE-^ if not will raise a RTE.. ): - if not DebugStatus.is_main_trio_thread(): + if not is_trio_main: trio.from_thread.run_sync( cls._debug_lock.release ) @@ -251,45 +259,27 @@ class Lock: cls._debug_lock.release() message: str = 'TTY lock released for child\n' - else: - message: str = 'TTY lock not held by any child\n' - finally: # IFF there are no more requesting tasks queued up fire, the # "tty-unlocked" event thereby alerting any monitors of the lock that # we are now back in the "tty unlocked" state. This is basically # and edge triggered signal around an empty queue of sub-actor # tasks that may have tried to acquire the lock. - stats = cls._debug_lock.statistics() + lock_stats = cls._debug_lock.statistics() + req_handler_finished: trio.Event|None = Lock.req_handler_finished if ( - not stats.owner + not lock_stats.owner or force - # and cls.no_remote_has_tty is not None + and req_handler_finished is None ): message += '-> No more child ctx tasks hold the TTY lock!\n' - # set and release - if cls.no_remote_has_tty is not None: - cls.no_remote_has_tty.set() - cls.no_remote_has_tty = None - - # cls.remote_task_in_debug = None - - else: - message += ( - f'-> Not signalling `Lock.no_remote_has_tty` since it has value:{cls.no_remote_has_tty}\n' - ) - - else: - # wakeup any waiters since the lock was released - # (presumably) temporarily. 
- if no_remote_has_tty := cls.no_remote_has_tty: - no_remote_has_tty.set() - no_remote_has_tty = trio.Event() - + elif req_handler_finished: + req_stats = req_handler_finished.statistics() message += ( f'-> A child ctx task still owns the `Lock` ??\n' - f' |_owner task: {stats.owner}\n' + f' |_lock_stats: {lock_stats}\n' + f' |_req_stats: {req_stats}\n' ) cls.ctx_in_debug = None @@ -299,8 +289,6 @@ class Lock: async def acquire( cls, ctx: Context, - # subactor_uid: tuple[str, str], - # remote_task_uid: str, ) -> AsyncIterator[trio.StrictFIFOLock]: ''' @@ -328,7 +316,6 @@ class Lock: ) stats = cls._debug_lock.statistics() if owner := stats.owner: - # and cls.no_remote_has_tty is not None pre_msg += ( f'\n' f'`Lock` already held by local task?\n' @@ -347,12 +334,6 @@ class Lock: await cls._debug_lock.acquire() cls.ctx_in_debug = ctx we_acquired = True - if cls.no_remote_has_tty is None: - # mark the tty lock as being in use so that the runtime - # can try to avoid clobbering any connection from a child - # that's currently relying on it. - cls.no_remote_has_tty = trio.Event() - # cls.remote_task_in_debug = remote_task_uid log.runtime( f'TTY lock acquired for sub-actor\n' @@ -373,11 +354,7 @@ class Lock: finally: message :str = 'Exiting `Lock.acquire()` on behalf of sub-actor\n' - if ( - we_acquired - # and - # cls._debug_lock.locked() - ): + if we_acquired: message += '-> TTY lock released by child\n' cls.release() @@ -392,7 +369,6 @@ class Lock: @tractor.context async def lock_tty_for_child( - ctx: Context, subactor_task_uid: tuple[str, int], @@ -409,13 +385,11 @@ async def lock_tty_for_child( ''' subactor_uid: tuple[str, str] = ctx.chan.uid - # NOTE: we use the IPC ctx's cancel scope directly in order to - # ensure that on any transport failure, or cancellation request - # from the child we expect - # `Context._maybe_cancel_and_set_remote_error()` to cancel this - # scope despite the shielding we apply below. - debug_lock_cs: CancelScope = ctx._scope + # mark the tty lock as being in use so that the runtime + # can try to avoid clobbering any connection from a child + # that's currently relying on it. + we_finished = Lock.req_handler_finished = trio.Event() try: if ctx.cid in Lock._blocked: raise RuntimeError( @@ -437,18 +411,15 @@ async def lock_tty_for_child( f'remote task: {subactor_task_uid}\n' ) ctx._enter_debugger_on_cancel: bool = False - await ctx.cancel(f'Debug lock blocked for {subactor_uid}') - # TODO: remove right? - # return LockStatus( - # subactor_uid=subactor_uid, - # cid=ctx.cid, - # locked=False, - # ) + message: str = ( + f'Debug lock blocked for {subactor_uid}\n' + 'Cancelling debug request!\n' + ) + log.cancel(message) + await ctx.cancel() + raise DebugRequestError(message) - # TODO: when we get to true remote debugging - # this will deliver stdin data? - - log.debug( + log.devx( 'Subactor attempting to acquire TTY lock\n' f'root task: {root_task_name}\n' f'subactor_uid: {subactor_uid}\n' @@ -456,13 +427,33 @@ async def lock_tty_for_child( ) DebugStatus.shield_sigint() Lock._blocked.add(ctx.cid) - with ( - # enable the locking msgspec - apply_debug_pldec(), - ): + + # NOTE: we use the IPC ctx's cancel scope directly in order to + # ensure that on any transport failure, or cancellation request + # from the child we expect + # `Context._maybe_cancel_and_set_remote_error()` to cancel this + # scope despite the shielding we apply below. 
+ debug_lock_cs: CancelScope = ctx._scope + + # TODO: use `.msg._ops.maybe_limit_plds()` here instead so we + # can merge into a single async with, with the + # `Lock.acquire()` enter below? + # + # enable the locking msgspec + with apply_debug_pldec(): async with Lock.acquire(ctx=ctx): debug_lock_cs.shield = True + log.devx( + 'Subactor acquired debugger request lock!\n' + f'root task: {root_task_name}\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {subactor_task_uid}\n\n' + + 'Sending `ctx.started(LockStatus)`..\n' + + ) + # indicate to child that we've locked stdio await ctx.started( LockStatus( @@ -472,7 +463,9 @@ async def lock_tty_for_child( ) ) - log.debug( f'Actor {subactor_uid} acquired TTY lock') + log.devx( + f'Actor {subactor_uid} acquired `Lock` via debugger request' + ) # wait for unlock pdb by child async with ctx.open_stream() as stream: @@ -480,14 +473,16 @@ async def lock_tty_for_child( # TODO: security around only releasing if # these match? - log.pdb( + log.devx( f'TTY lock released requested\n\n' f'{release_msg}\n' ) assert release_msg.cid == ctx.cid assert release_msg.subactor_uid == tuple(subactor_uid) - log.debug(f'Actor {subactor_uid} released TTY lock') + log.devx( + f'Actor {subactor_uid} released TTY lock' + ) return LockStatus( subactor_uid=subactor_uid, @@ -497,29 +492,33 @@ async def lock_tty_for_child( except BaseException as req_err: message: str = ( - 'Forcing `Lock.release()` since likely an internal error!\n' + 'Forcing `Lock.release()` for req-ctx since likely an ' + 'internal error!\n\n' + f'{ctx}' ) if isinstance(req_err, trio.Cancelled): - log.cancel( + message = ( 'Cancelled during root TTY-lock dialog?\n' + message ) else: - log.exception( + message = ( 'Errored during root TTY-lock dialog?\n' + message ) + log.exception(message) Lock.release(force=True) raise finally: Lock._blocked.remove(ctx.cid) - if (no_locker := Lock.no_remote_has_tty): - no_locker.set() + # wakeup any waiters since the lock was (presumably) + # released, possibly only temporarily. + we_finished.set() DebugStatus.unshield_sigint() @@ -538,14 +537,23 @@ class DebugStatus: ''' repl: PdbREPL|None = None + + # TODO: yet again this looks like a task outcome where we need + # to sync to the completion of one task (and get its result) + # being used everywhere for syncing.. + # -[ ] see if we can get our proto oco task-mngr to work for + # this? repl_task: Task|None = None + repl_release: trio.Event|None = None + + req_task: Task|None = None req_ctx: Context|None = None req_cs: CancelScope|None = None - repl_release: trio.Event|None = None req_finished: trio.Event|None = None - lock_status: LockStatus|None = None req_err: BaseException|None = None + lock_status: LockStatus|None = None + _orig_sigint_handler: Callable|None = None _trio_handler: ( Callable[[int, FrameType|None], Any] @@ -715,13 +723,13 @@ class DebugStatus: f'{cls.repl_task}\n' ) - # restore original sigint handler - cls.unshield_sigint() - # actor-local state, irrelevant for non-root. cls.repl_task = None cls.repl = None + # restore original sigint handler + cls.unshield_sigint() + class TractorConfig(pdbp.DefaultConfig): ''' @@ -814,17 +822,6 @@ class PdbREPL(pdbp.Pdb): ): Lock.release() - # TODO: special handling where we just want the next LOC and - # not to resume to the next pause/crash point? 
- # def set_next( - # self, - # frame: FrameType - # ) -> None: - # try: - # super().set_next(frame) - # finally: - # pdbp.set_trace() - # XXX NOTE: we only override this because apparently the stdlib pdb # bois likes to touch the SIGINT handler as much as i like to touch # my d$%&. @@ -855,6 +852,9 @@ class PdbREPL(pdbp.Pdb): return None +# TODO: prolly remove this and instead finally get our @context API +# supporting a msg/pld-spec via type annots as per, +# https://github.com/goodboy/tractor/issues/365 @cm def apply_debug_pldec() -> _codec.MsgCodec: ''' @@ -865,8 +865,9 @@ def apply_debug_pldec() -> _codec.MsgCodec: from tractor.msg import ( _ops as msgops, ) - orig_plrx: msgops.PldRx = msgops.current_pldrx() - orig_pldec: msgops.MsgDec = orig_plrx.pld_dec + cctx: Context = current_ipc_ctx() + rx: msgops.PldRx = cctx.pld_rx + orig_pldec: msgops.MsgDec = rx.pld_dec try: with msgops.limit_plds( @@ -875,9 +876,9 @@ def apply_debug_pldec() -> _codec.MsgCodec: assert ( debug_dec is - msgops.current_pldrx().pld_dec + rx.pld_dec ) - log.info( + log.runtime( 'Applied `.devx._debug` pld-spec\n\n' f'{debug_dec}\n' ) @@ -885,11 +886,9 @@ def apply_debug_pldec() -> _codec.MsgCodec: finally: assert ( - (plrx := msgops.current_pldrx()) is orig_plrx - and - plrx.pld_dec is orig_pldec + rx.pld_dec is orig_pldec ) - log.info( + log.runtime( 'Reverted to previous pld-spec\n\n' f'{orig_pldec}\n' ) @@ -898,7 +897,9 @@ def apply_debug_pldec() -> _codec.MsgCodec: async def request_root_stdio_lock( actor_uid: tuple[str, str], task_uid: tuple[str, int], - task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED + + shield: bool = False, + task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED, ): ''' Connect to the root actor of this process tree and RPC-invoke @@ -915,7 +916,7 @@ async def request_root_stdio_lock( ''' - log.pdb( + log.devx( 'Initing stdio-lock request task with root actor' ) # TODO: likely we can implement this mutex more generally as @@ -928,40 +929,22 @@ async def request_root_stdio_lock( # -[ ] technically we need a `RLock` since re-acquire should be a noop # - https://docs.python.org/3.8/library/multiprocessing.html#multiprocessing.RLock DebugStatus.req_finished = trio.Event() + DebugStatus.req_task = current_task() try: from tractor._discovery import get_root - from tractor.msg import _ops as msgops - debug_dec: msgops.MsgDec - with ( - # NOTE: we need this to ensure that this task exits - # BEFORE the REPl instance raises an error like - # `bdb.BdbQuit` directly, OW you get a trio cs stack - # corruption! - # Further, the since this task is spawned inside the - # `Context._scope_nursery: trio.Nursery`, once an RPC - # task errors that cs is cancel_called and so if we want - # to debug the TPC task that failed we need to shield - # against that expected `.cancel()` call and instead - # expect all of the `PdbREPL`.set_[continue/quit/]()` - # methods to unblock this task by setting the - # `.repl_release: # trio.Event`. - trio.CancelScope(shield=True) as req_cs, - - # NOTE: set it here in the locker request task bc it's - # possible for multiple such requests for the lock in any - # single sub-actor AND there will be a race between when the - # root locking task delivers the `Started(pld=LockStatus)` - # and when the REPL is actually entered by the requesting - # application task who called - # `.pause()`/`.post_mortem()`. 
- # - # SO, applying the pld-spec here means it is only applied to - # this IPC-ctx request task, NOT any other task(s) - # including the one that actually enters the REPL. This - # is oc desired bc ow the debugged task will msg-type-error. - # - apply_debug_pldec() as debug_dec, - ): + # NOTE: we need this to ensure that this task exits + # BEFORE the REPl instance raises an error like + # `bdb.BdbQuit` directly, OW you get a trio cs stack + # corruption! + # Further, the since this task is spawned inside the + # `Context._scope_nursery: trio.Nursery`, once an RPC + # task errors that cs is cancel_called and so if we want + # to debug the TPC task that failed we need to shield + # against that expected `.cancel()` call and instead + # expect all of the `PdbREPL`.set_[continue/quit/]()` + # methods to unblock this task by setting the + # `.repl_release: # trio.Event`. + with trio.CancelScope(shield=shield) as req_cs: # XXX: was orig for debugging cs stack corruption.. # log.info( # 'Request cancel-scope is:\n\n' @@ -972,46 +955,49 @@ async def request_root_stdio_lock( try: # TODO: merge into single async with ? async with get_root() as portal: - async with portal.open_context( lock_tty_for_child, subactor_task_uid=task_uid, + # NOTE: set it here in the locker request task bc it's + # possible for multiple such requests for the lock in any + # single sub-actor AND there will be a race between when the + # root locking task delivers the `Started(pld=LockStatus)` + # and when the REPL is actually entered by the requesting + # application task who called + # `.pause()`/`.post_mortem()`. + # + # SO, applying the pld-spec here means it is only applied to + # this IPC-ctx request task, NOT any other task(s) + # including the one that actually enters the REPL. This + # is oc desired bc ow the debugged task will msg-type-error. + pld_spec=__pld_spec__, + ) as (req_ctx, status): DebugStatus.req_ctx = req_ctx - - # sanity checks on pld-spec limit state - assert debug_dec - # curr_pldrx: msgops.PldRx = msgops.current_pldrx() - # assert ( - # curr_pldrx.pld_dec is debug_dec - # ) - - log.debug( + log.devx( 'Subactor locked TTY with msg\n\n' f'{status}\n' ) - # mk_pdb().set_trace() - try: - assert status.subactor_uid == actor_uid - assert status.cid - except AttributeError: - log.exception('failed pldspec asserts!') - raise + # try: + assert status.subactor_uid == actor_uid + assert status.cid + # except AttributeError: + # log.exception('failed pldspec asserts!') + # mk_pdb().set_trace() + # raise # set last rxed lock dialog status. DebugStatus.lock_status = status async with req_ctx.open_stream() as stream: - - assert DebugStatus.repl_release task_status.started(req_ctx) - # wait for local task to exit its - # `PdbREPL.interaction()`, call - # `DebugStatus.release()` and then - # unblock here. + # wait for local task to exit + # `PdbREPL.interaction()`, normally via + # a `DebugStatus.release()`call, and + # then unblock us here. await DebugStatus.repl_release.wait() await stream.send( LockRelease( @@ -1026,10 +1012,10 @@ async def request_root_stdio_lock( assert not status.locked DebugStatus.lock_status = status - log.pdb( + log.devx( 'TTY lock was released for subactor with msg\n\n' f'{status}\n\n' - f'Exitting {req_ctx.side!r}-side of locking req_ctx' + f'Exitting {req_ctx.side!r}-side of locking req_ctx\n' ) except ( @@ -1081,13 +1067,14 @@ async def request_root_stdio_lock( # ctl-c out of the currently hanging task! 
raise DebugRequestError( 'Failed to lock stdio from subactor IPC ctx!\n\n' - f'req_ctx: {req_ctx}\n' + f'req_ctx: {DebugStatus.req_ctx}\n' ) from req_err finally: - log.debug('Exiting debugger TTY lock request func from child') + log.devx('Exiting debugger TTY lock request func from child') # signal request task exit DebugStatus.req_finished.set() + DebugStatus.req_task = None def mk_pdb() -> PdbREPL: @@ -1321,31 +1308,40 @@ def shield_sigint_handler( DebugStatus.unshield_sigint() # do_cancel() - task: str|None = DebugStatus.repl_task + repl_task: str|None = DebugStatus.repl_task + req_task: str|None = DebugStatus.req_task if ( - task + repl_task and repl ): log.pdb( f'Ignoring SIGINT while local task using debug REPL\n' - f'|_{task}\n' + f'|_{repl_task}\n' f' |_{repl}\n' ) + elif req_task: + log.pdb( + f'Ignoring SIGINT while debug request task is open\n' + f'|_{req_task}\n' + ) else: msg: str = ( 'SIGINT shield handler still active BUT, \n\n' ) - if task is None: + if repl_task is None: msg += ( - f'- No local task claims to be in debug?\n' - f' |_{task}\n\n' + '- No local task claims to be in debug?\n' ) if repl is None: msg += ( - f'- No local REPL is currently active?\n' - f' |_{repl}\n\n' + '- No local REPL is currently active?\n' + ) + + if req_task is None: + msg += ( + '- No debug request task is active?\n' ) log.warning( @@ -1358,7 +1354,6 @@ def shield_sigint_handler( # XXX ensure that the reverted-to-handler actually is # able to rx what should have been **this** KBI ;) do_cancel() - # raise KeyboardInterrupt # TODO: how to handle the case of an intermediary-child actor # that **is not** marked in debug mode? See oustanding issue: @@ -1392,7 +1387,7 @@ def shield_sigint_handler( # https://github.com/prompt-toolkit/python-prompt-toolkit/blob/c2c6af8a0308f9e5d7c0e28cb8a02963fe0ce07a/prompt_toolkit/patch_stdout.py # XXX only for tracing this handler - # log.warning('exiting SIGINT') + log.devx('exiting SIGINT') _pause_msg: str = 'Attaching to pdb REPL in actor' @@ -1420,14 +1415,9 @@ async def _pause( # is always show in the debugger on entry.. and there seems to # be no way to override it?.. # - # shield: bool = False, - hide_tb: bool = True, - - # bc, `debug_func()`, `_enter_repl_sync()` and `_pause()` - # extra_frames_up_when_async: int = 3, - + shield: bool = False, + hide_tb: bool = False, task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED, - **debug_func_kwargs, ) -> None: @@ -1452,6 +1442,87 @@ async def _pause( 'for infected `asyncio` mode!' ) from rte + if debug_func is not None: + debug_func = partial(debug_func) + + repl: PdbREPL = repl or mk_pdb() + + # XXX NOTE XXX set it here to avoid ctl-c from cancelling a debug + # request from a subactor BEFORE the REPL is entered by that + # process. + DebugStatus.shield_sigint() + + # TODO: move this into a `open_debug_request()` @acm? + # -[ ] prolly makes the most sense to do the request + # task spawn as part of an `@acm` api which delivers the + # `DebugRequest` instance and ensures encapsing all the + # pld-spec and debug-nursery? + # -[ ] maybe make this a `PdbREPL` method or mod func? + # -[ ] factor out better, main reason for it is common logic for + # both root and sub repl entry + def _enter_repl_sync( + debug_func: Callable, + ) -> None: + __tracebackhide__: bool = hide_tb + + try: + # set local actor task to avoid recurrent + # entries/requests from the same local task (to the root + # process). 
+ DebugStatus.repl_task = task + DebugStatus.repl = repl + + # TODO: do we want to support using this **just** for the + # locking / common code (prolly to help address #320)? + if debug_func is None: + task_status.started(DebugStatus) + + else: + log.warning( + 'Entering REPL for task fuck you!\n' + f'{task}\n' + ) + # block here one (at the appropriate frame *up*) where + # ``breakpoint()`` was awaited and begin handling stdio. + log.devx( + 'Entering sync world of the `pdb` REPL for task..\n' + f'{repl}\n' + f' |_{task}\n' + ) + + # invoke the low-level REPL activation routine which itself + # should call into a `Pdb.set_trace()` of some sort. + debug_func( + repl=repl, + hide_tb=hide_tb, + **debug_func_kwargs, + ) + + except trio.Cancelled: + log.exception( + 'Cancelled during invoke of internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + # XXX NOTE: DON'T release lock yet + raise + + except BaseException: + __tracebackhide__: bool = False + log.exception( + 'Failed to invoke internal `debug_func = ' + f'{debug_func.func.__name__}`\n' + ) + # NOTE: OW this is ONLY called from the + # `.set_continue/next` hooks! + DebugStatus.release(cancel_req_task=True) + + raise + + log.devx( + 'Entering `._pause()` for requesting task\n' + f'|_{task}\n' + ) + # TODO: this should be created as part of `DebugRequest()` init # which should instead be a one-shot-use singleton much like # the `PdbREPL`. @@ -1461,71 +1532,9 @@ async def _pause( DebugStatus.repl_release.is_set() ): DebugStatus.repl_release = trio.Event() - - if debug_func is not None: - debug_func = partial(debug_func) - - repl: PdbREPL = repl or mk_pdb() - - # TODO: maybe make this a `PdbREPL` method or mod func? - # -[ ] factor out better, main reason for it is common logic for - # both root and sub repl entry - def _enter_repl_sync( - debug_func: Callable, - ) -> None: - __tracebackhide__: bool = hide_tb - - # TODO: do we want to support using this **just** for the - # locking / common code (prolly to help address #320)? - # - if debug_func is None: - task_status.started(DebugStatus) - else: - # block here one (at the appropriate frame *up*) where - # ``breakpoint()`` was awaited and begin handling stdio. - log.debug('Entering sync world of the `pdb` REPL..') - - # XXX used by the SIGINT handler to check if - # THIS actor is in REPL interaction - try: - # TODO: move this into a `open_debug_request()` @acm? - # -[ ] prolly makes the most send to do the request - # task spawn as part of an `@acm` api which - # delivers the `DebugRequest` instance and ensures - # encapsing all the pld-spec and debug-nursery? - # - # set local actor task to avoid recurrent - # entries/requests from the same local task - # (to the root process). - DebugStatus.repl_task = task - DebugStatus.repl = repl - DebugStatus.shield_sigint() - - # enter `PdbREPL` specific method - debug_func( - repl=repl, - hide_tb=hide_tb, - **debug_func_kwargs, - ) - except trio.Cancelled: - log.exception( - 'Cancelled during invoke of internal `debug_func = ' - f'{debug_func.func.__name__}`\n' - ) - # NOTE: DON'T release lock yet - raise - - except BaseException: - __tracebackhide__: bool = False - log.exception( - 'Failed to invoke internal `debug_func = ' - f'{debug_func.func.__name__}`\n' - ) - # NOTE: OW this is ONLY called from the - # `.set_continue/next` hooks! 
- DebugStatus.release(cancel_req_task=True) - - raise + # ^-NOTE-^ this must be created BEFORE scheduling any subactor + # debug-req task since it needs to wait on it just after + # `.started()`-ing back its wrapping `.req_cs: CancelScope`. repl_err: BaseException|None = None try: @@ -1579,38 +1588,61 @@ async def _pause( not is_root_process() and actor._parent_chan # a connected child ): - if DebugStatus.repl_task: + repl_task: Task|None = DebugStatus.repl_task + req_task: Task|None = DebugStatus.req_task + if req_task: + log.warning( + f'Already an ongoing repl request?\n' + f'|_{req_task}\n\n' - # Recurrence entry case: this task already has the lock and - # is likely recurrently entering a breakpoint + f'REPL task is\n' + f'|_{repl_task}\n\n' + + ) + # Recurrent entry case. + # this task already has the lock and is likely + # recurrently entering a `.pause()`-point either bc, + # - someone is hacking on runtime internals and put + # one inside code that get's called on the way to + # this code, + # - a legit app task uses the 'next' command while in + # a REPL sesh, and actually enters another + # `.pause()` (in a loop or something). # - # NOTE: noop on recurrent entry case but we want to trigger - # a checkpoint to allow other actors error-propagate and - # potetially avoid infinite re-entries in some - # subactor that would otherwise not bubble until the - # next checkpoint was hit. + # XXX Any other cose is likely a bug. if ( - (repl_task := DebugStatus.repl_task) - and - repl_task is task + repl_task + ): + if repl_task is task: + log.warning( + f'{task.name}@{actor.uid} already has TTY lock\n' + f'ignoring..' + ) + await trio.lowlevel.checkpoint() + return + + else: + # if **this** actor is already in debug REPL we want + # to maintain actor-local-task mutex access, so block + # here waiting for the control to be released - this + # -> allows for recursive entries to `tractor.pause()` + log.warning( + f'{task}@{actor.uid} already has TTY lock\n' + f'waiting for release..' + ) + await DebugStatus.repl_release.wait() + await trio.sleep(0.1) + + elif ( + req_task ): log.warning( - f'{task.name}@{actor.uid} already has TTY lock\n' - f'ignoring..' - ) - await trio.lowlevel.checkpoint() - return + 'Local task already has active debug request\n' + f'|_{task}\n\n' - # if **this** actor is already in debug REPL we want - # to maintain actor-local-task mutex access, so block - # here waiting for the control to be released - this - # -> allows for recursive entries to `tractor.pause()` - log.warning( - f'{task.name}@{actor.uid} already has TTY lock\n' - f'waiting for release..' - ) - await DebugStatus.repl_release.wait() - await trio.sleep(0.1) + 'Waiting for previous request to complete..\n' + ) + await DebugStatus.req_finished.wait() # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without @@ -1642,16 +1674,23 @@ async def _pause( # -[ ] we probably only need to allocate the nursery when # we detect the runtime is already in debug mode. 
# - # ctx: Context = await curr_ctx._debug_tn.start( + curr_ctx: Context = current_ipc_ctx() + # req_ctx: Context = await curr_ctx._debug_tn.start( + log.devx( + 'Starting request task\n' + f'|_{task}\n' + ) req_ctx: Context = await actor._service_n.start( - request_root_stdio_lock, - actor.uid, - (task.name, id(task)), # task uuid (effectively) + partial( + request_root_stdio_lock, + actor_uid=actor.uid, + task_uid=(task.name, id(task)), # task uuid (effectively) + shield=shield, + ) ) # XXX sanity, our locker task should be the one which # entered a new IPC ctx with the root actor, NOT the one # that exists around the task calling into `._pause()`. - curr_ctx: Context = current_ipc_ctx() assert ( req_ctx is @@ -1665,8 +1704,8 @@ async def _pause( # TODO: prolly factor this plus the similar block from # `_enter_repl_sync()` into a common @cm? - except BaseException as repl_err: - if isinstance(repl_err, bdb.BdbQuit): + except BaseException as pause_err: + if isinstance(pause_err, bdb.BdbQuit): log.devx( 'REPL for pdb was quit!\n' ) @@ -1675,7 +1714,7 @@ async def _pause( # `Actor._service_n` might get closed before we can spawn # the request task, so just ignore expected RTE. elif ( - isinstance(repl_err, RuntimeError) + isinstance(pause_err, RuntimeError) and actor._cancel_called ): @@ -1698,13 +1737,22 @@ async def _pause( # sanity checks for ^ on request/status teardown assert DebugStatus.repl is None assert DebugStatus.repl_task is None - req_ctx: Context = DebugStatus.req_ctx - if req_ctx: - assert req_ctx._scope.cancel_called + + # sanity, for when hackin on all this? + if not isinstance(pause_err, trio.Cancelled): + req_ctx: Context = DebugStatus.req_ctx + if req_ctx: + # XXX, bc the child-task in root might cancel it? + # assert req_ctx._scope.cancel_called + assert req_ctx.maybe_error raise finally: + # set in finally block of func.. this can be synced-to + # eventually with a debug_nursery somehow? + # assert DebugStatus.req_task is None + # always show frame when request fails due to internal # failure in the above code (including an `BdbQuit`). if ( @@ -1721,9 +1769,15 @@ def _set_trace( # partial-ed in by `.pause()` api_frame: FrameType, + + # optionally passed in to provide support for + # `pause_from_sync()` where + actor: tractor.Actor|None = None, + task: trio.Task|None = None, ): __tracebackhide__: bool = hide_tb - actor: tractor.Actor = current_actor() + actor: tractor.Actor = actor or current_actor() + task: task or current_task() # else: # TODO: maybe print the actor supervion tree up to the @@ -1731,8 +1785,10 @@ def _set_trace( log.pdb( f'{_pause_msg}\n' '|\n' - # TODO: make an `Actor.__repr()__` - f'|_ {current_task()} @ {actor.uid}\n' + # TODO: more compact pformating? + # -[ ] make an `Actor.__repr()__` + # -[ ] should we use `log.pformat_task_uid()`? + f'|_ {task} @ {actor.uid}\n' ) # presuming the caller passed in the "api frame" # (the last frame before user code - like `.pause()`) @@ -1747,7 +1803,7 @@ def _set_trace( async def pause( *, - hide_tb: bool = True, + hide_tb: bool = False, api_frame: FrameType|None = None, # TODO: figure out how to still make this work: @@ -1798,8 +1854,7 @@ async def pause( _set_trace, api_frame=api_frame, ), - - # task_status=task_status, + shield=shield, **_pause_kwargs ) # XXX avoid cs stack corruption when `PdbREPL.interaction()` @@ -1867,88 +1922,97 @@ async def maybe_init_greenback( # normally by remapping python's builtin breakpoint() hook to this # runtime aware version which takes care of all . 
def pause_from_sync( + hide_tb: bool = False, + # proxied to `_pause()` + + **_pause_kwargs, + # for eg. + # shield: bool = False, + # api_frame: FrameType|None = None, + ) -> None: __tracebackhide__: bool = hide_tb - actor: tractor.Actor = current_actor( - err_on_no_runtime=False, - ) - log.debug( - f'{actor.uid}: JUST ENTERED `tractor.pause_from_sync()`' - f'|_{actor}\n' - ) - if not actor: - raise RuntimeError( - 'Not inside the `tractor`-runtime?\n' - '`tractor.pause_from_sync()` is not functional without a wrapping\n' - '- `async with tractor.open_nursery()` or,\n' - '- `async with tractor.open_root_actor()`\n' + try: + actor: tractor.Actor = current_actor( + err_on_no_runtime=False, ) - - # NOTE: once supported, remove this AND the one - # inside `._pause()`! - if actor.is_infected_aio(): - raise RuntimeError( - '`tractor.pause[_from_sync]()` not yet supported ' - 'for infected `asyncio` mode!' + log.debug( + f'{actor.uid}: JUST ENTERED `tractor.pause_from_sync()`' + f'|_{actor}\n' ) - - # raises on not-found by default - greenback: ModuleType = maybe_import_greenback() - mdb: PdbREPL = mk_pdb() - - # run async task which will lock out the root proc's TTY. - if not Lock.is_main_trio_thread(): - - # TODO: we could also check for a non-`.to_thread` context - # using `trio.from_thread.check_cancelled()` (says - # oremanj) wherein we get the following outputs: - # - # `RuntimeError`: non-`.to_thread` spawned thread - # noop: non-cancelled `.to_thread` - # `trio.Cancelled`: cancelled `.to_thread` - # - trio.from_thread.run( - partial( - pause, - debug_func=None, - pdb=mdb, - hide_tb=hide_tb, + if not actor: + raise RuntimeError( + 'Not inside the `tractor`-runtime?\n' + '`tractor.pause_from_sync()` is not functional without a wrapping\n' + '- `async with tractor.open_nursery()` or,\n' + '- `async with tractor.open_root_actor()`\n' ) - ) - # TODO: maybe the `trio.current_task()` id/name if avail? - DebugStatus.repl_task: str = str(threading.current_thread()) - else: # we are presumably the `trio.run()` + main thread - greenback.await_( - pause( - debug_func=None, - pdb=mdb, - hide_tb=hide_tb, + # NOTE: once supported, remove this AND the one + # inside `._pause()`! + if actor.is_infected_aio(): + raise RuntimeError( + '`tractor.pause[_from_sync]()` not yet supported ' + 'for infected `asyncio` mode!' ) + + # raises on not-found by default + greenback: ModuleType = maybe_import_greenback() + mdb: PdbREPL = mk_pdb() + + # run async task which will lock out the root proc's TTY. + if not DebugStatus.is_main_trio_thread(): + + # TODO: we could also check for a non-`.to_thread` context + # using `trio.from_thread.check_cancelled()` (says + # oremanj) wherein we get the following outputs: + # + # `RuntimeError`: non-`.to_thread` spawned thread + # noop: non-cancelled `.to_thread` + # `trio.Cancelled`: cancelled `.to_thread` + # + trio.from_thread.run( + partial( + _pause, + debug_func=None, + repl=mdb, + **_pause_kwargs + ), + ) + task: threading.Thread = threading.current_thread() + + else: # we are presumably the `trio.run()` + main thread + task: trio.Task = current_task() + greenback.await_( + _pause( + debug_func=None, + repl=mdb, + **_pause_kwargs, + ) + ) + DebugStatus.repl_task: str = current_task() + + # TODO: ensure we aggressively make the user aware about + # entering the global ``breakpoint()`` built-in from sync + # code? 
+ _set_trace( + api_frame=inspect.currentframe(), + repl=mdb, + hide_tb=hide_tb, + actor=actor, + task=task, ) - DebugStatus.repl_task: str = current_task() - - # TODO: ensure we aggressively make the user aware about - # entering the global ``breakpoint()`` built-in from sync - # code? - _set_trace( - api_frame=inspect.current_frame(), - actor=actor, - pdb=mdb, - hide_tb=hide_tb, - - # TODO? will we ever need it? - # -> the gb._await() won't be affected by cancellation? - # shield=shield, - ) - # LEGACY NOTE on next LOC's frame showing weirdness.. - # - # XXX NOTE XXX no other LOC can be here without it - # showing up in the REPL's last stack frame !?! - # -[ ] tried to use `@pdbp.hideframe` decoration but - # still doesn't work + # LEGACY NOTE on next LOC's frame showing weirdness.. + # + # XXX NOTE XXX no other LOC can be here without it + # showing up in the REPL's last stack frame !?! + # -[ ] tried to use `@pdbp.hideframe` decoration but + # still doesn't work + except BaseException as err: + __tracebackhide__: bool = False + raise err # NOTE prefer a new "pause" semantic since it better describes @@ -2135,6 +2199,7 @@ async def maybe_wait_for_debugger( child_in_debug: bool = False, header_msg: str = '', + _ll: str = 'devx', ) -> bool: # was locked and we polled? @@ -2144,6 +2209,7 @@ async def maybe_wait_for_debugger( ): return False + logmeth: Callable = getattr(log, _ll) msg: str = header_msg if ( @@ -2156,7 +2222,11 @@ async def maybe_wait_for_debugger( # Instead try to wait for pdb to be released before # tearing down. ctx_in_debug: Context|None = Lock.ctx_in_debug - in_debug: tuple[str, str]|None = ctx_in_debug.chan.uid if ctx_in_debug else None + in_debug: tuple[str, str]|None = ( + ctx_in_debug.chan.uid + if ctx_in_debug + else None + ) if in_debug == current_actor().uid: log.debug( msg @@ -2176,7 +2246,7 @@ async def maybe_wait_for_debugger( # XXX => but it doesn't seem to work.. # await trio.testing.wait_all_tasks_blocked(cushion=0) else: - log.debug( + logmeth( msg + 'Root immediately acquired debug TTY LOCK' @@ -2185,13 +2255,13 @@ async def maybe_wait_for_debugger( for istep in range(poll_steps): if ( - Lock.no_remote_has_tty is not None - and not Lock.no_remote_has_tty.is_set() + Lock.req_handler_finished is not None + and not Lock.req_handler_finished.is_set() and in_debug is not None ): # caller_frame_info: str = pformat_caller_frame() - log.debug( + logmeth( msg + '\nRoot is waiting on tty lock to release from\n\n' @@ -2202,7 +2272,7 @@ async def maybe_wait_for_debugger( Lock.get_locking_task_cs().cancel() with trio.CancelScope(shield=True): - await Lock.no_remote_has_tty.wait() + await Lock.req_handler_finished.wait() log.pdb( f'Subactor released debug lock\n' @@ -2214,11 +2284,11 @@ async def maybe_wait_for_debugger( if ( in_debug is None and ( - Lock.no_remote_has_tty is None - or Lock.no_remote_has_tty.is_set() + Lock.req_handler_finished is None + or Lock.req_handler_finished.is_set() ) ): - log.pdb( + logmeth( msg + 'Root acquired tty lock!' @@ -2226,13 +2296,11 @@ async def maybe_wait_for_debugger( break else: - # TODO: don't need this right? 
- # await trio.lowlevel.checkpoint() - - log.debug( + logmeth( 'Root polling for debug:\n' f'poll step: {istep}\n' - f'poll delya: {poll_delay}' + f'poll delya: {poll_delay}\n\n' + f'{Lock.repr()}\n' ) with CancelScope(shield=True): await trio.sleep(poll_delay) -- 2.34.1 From 3538ccd7992282a1b81e3560282a4e03b9e667d2 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 10:22:51 -0400 Subject: [PATCH 317/378] Better context aware `RemoteActorError.pformat()` Such that when displaying with `.__str__()` we do not show the type header (style) since normally python's raising machinery already prints the type path like `'tractor._exceptions.RemoteActorError:'`, so doing it 2x is a bit ugly ;p In support, - include `.relay_uid` in `RemoteActorError.extra_body_fields`. - offer a `with_type_header: bool` to `.pformat()` and only put the opening type path and closing `')>'` tail line when `True`. - add `.is_inception() -> bool:` for an easy way to determine if the error is multi-hop relayed. - only repr the `'|_relay_uid='` field when an error is an inception. - tweak the invalid-payload case in `_mk_msg_type_err()` to explicitly state in the `message` how the `any_pld` value does not match the `MsgDec.pld_spec` by decoding the invalid `.pld` with an any-dec. - allow `_mk_msg_type_err(**mte_kwargs)` passthrough. - pass `boxed_type=cls` inside `MsgTypeError.from_decode()`. --- tractor/_exceptions.py | 101 +++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 14 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 83675069..179b49a1 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -187,6 +187,9 @@ class RemoteActorError(Exception): ] extra_body_fields: list[str] = [ 'cid', + # NOTE: we only show this on relayed errors (aka + # "inceptions"). + 'relay_uid', 'boxed_type', ] @@ -273,7 +276,7 @@ class RemoteActorError(Exception): @property def ipc_msg(self) -> Struct: ''' - Re-render the underlying `._ipc_msg: Msg` as + Re-render the underlying `._ipc_msg: MsgType` as a `pretty_struct.Struct` for introspection such that the returned value is a read-only copy of the original. @@ -344,7 +347,7 @@ class RemoteActorError(Exception): return str(bt.__name__) @property - def boxed_type(self) -> str: + def boxed_type(self) -> Type[BaseException]: ''' Error type boxed by last actor IPC hop. @@ -409,7 +412,14 @@ class RemoteActorError(Exception): end_char: str = '\n', ) -> str: _repr: str = '' + for key in fields: + if ( + key == 'relay_uid' + and not self.is_inception() + ): + continue + val: Any|None = ( getattr(self, key, None) or @@ -427,6 +437,7 @@ class RemoteActorError(Exception): if val: _repr += f'{key}={val_str}{end_char}' + return _repr def reprol(self) -> str: @@ -455,15 +466,45 @@ class RemoteActorError(Exception): _repr ) - def pformat(self) -> str: + def is_inception(self) -> bool: + ''' + Predicate which determines if the shuttled error type + is the same as the container error type; IOW is this + an "error within and error" which points to some original + source error that was relayed through multiple + actor hops. + + Ex. a relayed remote error will generally be some form of + `RemoteActorError[RemoteActorError]` with a `.src_type` which + is not of that same type. + + ''' + # if a single hop boxed error it was not relayed + # more then one hop directly from the src actor. 
+ if ( + self.boxed_type + is + self.src_type + ): + return False + + return True + + def pformat( + self, + with_type_header: bool = True, + ) -> str: ''' Nicely formatted boxed error meta data + traceback, OR just the normal message from `.args` (for eg. as you'd want shown by a locally raised `ContextCancelled`). ''' - tb_str: str = self.tb_str - if tb_str: + header: str = '' + if with_type_header: + header: str = f'<{type(self).__name__}(\n' + + if tb_str := self.tb_str: fields: str = self._mk_fields_str( _body_fields + @@ -481,19 +522,35 @@ class RemoteActorError(Exception): # |___ .. tb_body_indent=1, ) + if not with_type_header: + body = '\n' + body else: body: str = textwrap.indent( self._message, prefix=' ', ) + '\n' + + if with_type_header: + tail: str = ')>' + else: + tail = '' + return ( - f'<{type(self).__name__}(\n' + header + + f'{body}' - ')>' + + + tail ) __repr__ = pformat - __str__ = pformat + + # NOTE: apparently we need this so that + # the full fields show in debugger tests? + # |_ i guess `pexepect` relies on `str`-casing + # of output? + def __str__(self) -> str: + return self.pformat(with_type_header=False) def unwrap( self, @@ -682,6 +739,7 @@ class MsgTypeError( ) -> MsgTypeError: return cls( message=message, + boxed_type=cls, # NOTE: original "vanilla decode" of the msg-bytes # is placed inside a value readable from @@ -949,10 +1007,11 @@ def _raise_from_unexpected_msg( if isinstance(msg, Error): # match msg: # case Error(): - raise unpack_error( + exc: RemoteActorError = unpack_error( msg, ctx.chan, - ) from src_err + ) + raise exc from src_err # `MsgStream` termination msg. # TODO: does it make more sense to pack @@ -966,10 +1025,11 @@ def _raise_from_unexpected_msg( or isinstance(msg, Stop) ): - log.debug( + message: str = ( f'Context[{cid}] stream was stopped by remote side\n' f'cid: {cid}\n' ) + log.debug(message) # TODO: if the a local task is already blocking on # a `Context.result()` and thus a `.receive()` on the @@ -983,6 +1043,8 @@ def _raise_from_unexpected_msg( f'Context stream ended due to msg:\n\n' f'{pformat(msg)}\n' ) + eoc.add_note(message) + # XXX: important to set so that a new `.receive()` # call (likely by another task using a broadcast receiver) # doesn't accidentally pull the `return` message @@ -1007,6 +1069,7 @@ def _raise_from_unexpected_msg( ' BUT received a non-error msg:\n\n' f'{struct_format(msg)}' ) from src_err + # ^-TODO-^ maybe `MsgDialogError` is better? _raise_from_no_key_in_msg = _raise_from_unexpected_msg @@ -1023,6 +1086,8 @@ def _mk_msg_type_err( src_type_error: TypeError|None = None, is_invalid_payload: bool = False, + **mte_kwargs, + ) -> MsgTypeError: ''' Compose a `MsgTypeError` from an input runtime context. @@ -1081,12 +1146,20 @@ def _mk_msg_type_err( else: if is_invalid_payload: msg_type: str = type(msg) + any_pld: Any = msgpack.decode(msg.pld) message: str = ( f'invalid `{msg_type.__qualname__}` payload\n\n' - f'<{type(msg).__qualname__}(\n' - f' |_pld: {codec.pld_spec_str} = {msg.pld!r}' - f')>\n' + f'value: `{any_pld!r}` does not match type-spec: ' #\n' + f'`{type(msg).__qualname__}.pld: {codec.pld_spec_str}`' + # f'<{type(msg).__qualname__}(\n' + # f' |_pld: {codec.pld_spec_str}\n'# != {any_pld!r}\n' + # f')>\n\n' ) + # TODO: should we just decode the msg to a dict despite + # only the payload being wrong? + # -[ ] maybe the better design is to break this construct + # logic into a separate explicit helper raiser-func? 
+ msg_dict: dict = {} else: # decode the msg-bytes using the std msgpack -- 2.34.1 From 74d4b5280a918b514f668c358d0c94456c0d759f Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 14:56:18 -0400 Subject: [PATCH 318/378] Woops, make `log.devx()` level less `.error()` --- tractor/log.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 3621fc15..41a910e8 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -53,17 +53,14 @@ LOG_FORMAT = ( DATE_FORMAT = '%b %d %H:%M:%S' -LEVELS: dict[str, int] = { +# FYI, ERROR is 40 +CUSTOM_LEVELS: dict[str, int] = { 'TRANSPORT': 5, 'RUNTIME': 15, 'CANCEL': 16, - 'DEVX': 400, + 'DEVX': 17, 'PDB': 500, } -# _custom_levels: set[str] = { -# lvlname.lower for lvlname in LEVELS.keys() -# } - STD_PALETTE = { 'CRITICAL': 'red', 'ERROR': 'red', @@ -137,7 +134,7 @@ class StackLevelAdapter(LoggerAdapter): "Developer experience" sub-sys statuses. ''' - return self.log(400, msg) + return self.log(17, msg) def log( self, @@ -154,8 +151,7 @@ class StackLevelAdapter(LoggerAdapter): if self.isEnabledFor(level): stacklevel: int = 3 if ( - level in LEVELS.values() - # or level in _custom_levels + level in CUSTOM_LEVELS.values() ): stacklevel: int = 4 @@ -202,7 +198,8 @@ class StackLevelAdapter(LoggerAdapter): ) -# TODO IDEA: +# TODO IDEAs: +# -[ ] move to `.devx.pformat`? # -[ ] do per task-name and actor-name color coding # -[ ] unique color per task-id and actor-uuid def pformat_task_uid( @@ -309,7 +306,7 @@ def get_logger( logger = StackLevelAdapter(log, ActorContextInfo()) # additional levels - for name, val in LEVELS.items(): + for name, val in CUSTOM_LEVELS.items(): logging.addLevelName(val, name) # ensure customs levels exist as methods @@ -377,7 +374,7 @@ def at_least_level( ''' if isinstance(level, str): - level: int = LEVELS[level.upper()] + level: int = CUSTOM_LEVELS[level.upper()] if log.getEffectiveLevel() <= level: return True -- 2.34.1 From d15e73557ac12673ccd6a9c1bbd1e5a1db52679a Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 14:56:54 -0400 Subject: [PATCH 319/378] Move runtime frame hiding into helper func Call it `hide_runtime_frames()` and stick all the lines from the top of the `._debug` mod in there along with a little `log.devx()` emission on what gets hidden by default ;) Other, - fix ref-error where internal-error handler might trigger despite the debug `req_ctx` not yet having init-ed, such that we don't try to cancel or log about it when it never was fully created/initialize.. - fix assignment typo iniside `_set_trace()` for `task`.. lel --- tractor/devx/_debug.py | 131 +++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 45 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 877d2de6..753c1985 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -48,9 +48,11 @@ from typing import ( TYPE_CHECKING, ) from types import ( + FunctionType, FrameType, ModuleType, TracebackType, + CodeType, ) from msgspec import Struct @@ -90,43 +92,72 @@ if TYPE_CHECKING: log = get_logger(__name__) -# XXX HACKZONE XXX -# hide exit stack frames on nurseries and cancel-scopes! -# |_ so avoid seeing it when the `pdbp` REPL is first engaged from -# inside a `trio.open_nursery()` scope (with no line after it -# in before the block end??). -# -# TODO: FINALLY got this workin originally with -# `@pdbp.hideframe` around the `wrapper()` def embedded inside -# `_ki_protection_decoratior()`.. 
which is in the module: -# /home/goodboy/.virtualenvs/tractor311/lib/python3.11/site-packages/trio/_core/_ki.py -# -# -[ ] make an issue and patch for `trio` core? maybe linked -# to the long outstanding `pdb` one below? -# |_ it's funny that there's frame hiding throughout `._run.py` -# but not where it matters on the below exit funcs.. -# -# -[ ] provide a patchset for the lonstanding -# |_ https://github.com/python-trio/trio/issues/1155 -# -# -[ ] make a linked issue to ^ and propose allowing all the -# `._core._run` code to have their `__tracebackhide__` value -# configurable by a `RunVar` to allow getting scheduler frames -# if desired through configuration? -# -# -[ ] maybe dig into the core `pdb` issue why the extra frame is shown -# at all? -# -pdbp.hideframe(trio._core._run.NurseryManager.__aexit__) -pdbp.hideframe(trio._core._run.CancelScope.__exit__) -pdbp.hideframe(_GeneratorContextManager.__exit__) -pdbp.hideframe(_AsyncGeneratorContextManager.__aexit__) -pdbp.hideframe(trio.Event.wait) -__all__ = [ - 'breakpoint', - 'post_mortem', -] +def hide_runtime_frames() -> dict[FunctionType, CodeType]: + ''' + Hide call-stack frames for various std-lib and `trio`-API primitives + such that the tracebacks presented from our runtime are as minimized + as possible, particularly from inside a `PdbREPL`. + + ''' + # XXX HACKZONE XXX + # hide exit stack frames on nurseries and cancel-scopes! + # |_ so avoid seeing it when the `pdbp` REPL is first engaged from + # inside a `trio.open_nursery()` scope (with no line after it + # in before the block end??). + # + # TODO: FINALLY got this workin originally with + # `@pdbp.hideframe` around the `wrapper()` def embedded inside + # `_ki_protection_decoratior()`.. which is in the module: + # /home/goodboy/.virtualenvs/tractor311/lib/python3.11/site-packages/trio/_core/_ki.py + # + # -[ ] make an issue and patch for `trio` core? maybe linked + # to the long outstanding `pdb` one below? + # |_ it's funny that there's frame hiding throughout `._run.py` + # but not where it matters on the below exit funcs.. + # + # -[ ] provide a patchset for the lonstanding + # |_ https://github.com/python-trio/trio/issues/1155 + # + # -[ ] make a linked issue to ^ and propose allowing all the + # `._core._run` code to have their `__tracebackhide__` value + # configurable by a `RunVar` to allow getting scheduler frames + # if desired through configuration? + # + # -[ ] maybe dig into the core `pdb` issue why the extra frame is shown + # at all? + # + funcs: list[FunctionType] = [ + trio._core._run.NurseryManager.__aexit__, + trio._core._run.CancelScope.__exit__, + _GeneratorContextManager.__exit__, + _AsyncGeneratorContextManager.__aexit__, + _AsyncGeneratorContextManager.__aenter__, + trio.Event.wait, + ] + func_list_str: str = textwrap.indent( + "\n".join(f.__qualname__ for f in funcs), + prefix=' |_ ', + ) + log.devx( + 'Hiding the following runtime frames by default:\n' + f'{func_list_str}\n' + ) + + codes: dict[FunctionType, CodeType] = {} + for ref in funcs: + # stash a pre-modified version of each ref's code-obj + # so it can be reverted later if needed. 
+ codes[ref] = ref.__code__ + pdbp.hideframe(ref) + # + # pdbp.hideframe(trio._core._run.NurseryManager.__aexit__) + # pdbp.hideframe(trio._core._run.CancelScope.__exit__) + # pdbp.hideframe(_GeneratorContextManager.__exit__) + # pdbp.hideframe(_AsyncGeneratorContextManager.__aexit__) + # pdbp.hideframe(_AsyncGeneratorContextManager.__aenter__) + # pdbp.hideframe(trio.Event.wait) + return codes class LockStatus( @@ -1032,15 +1063,24 @@ async def request_root_stdio_lock( except ( BaseException, - ): - log.exception( - 'Failed during root TTY-lock dialog?\n' - f'{req_ctx}\n' - - f'Cancelling IPC ctx!\n' + ) as ctx_err: + message: str = ( + 'Failed during debug request dialog with root actor?\n\n' ) - await req_ctx.cancel() - raise + + if req_ctx: + message += ( + f'{req_ctx}\n' + f'Cancelling IPC ctx!\n' + ) + await req_ctx.cancel() + + else: + message += 'Failed during `Portal.open_context()` ?\n' + + log.exception(message) + ctx_err.add_note(message) + raise ctx_err except ( @@ -1067,6 +1107,7 @@ async def request_root_stdio_lock( # ctl-c out of the currently hanging task! raise DebugRequestError( 'Failed to lock stdio from subactor IPC ctx!\n\n' + f'req_ctx: {DebugStatus.req_ctx}\n' ) from req_err @@ -1777,7 +1818,7 @@ def _set_trace( ): __tracebackhide__: bool = hide_tb actor: tractor.Actor = actor or current_actor() - task: task or current_task() + task: trio.Task = task or current_task() # else: # TODO: maybe print the actor supervion tree up to the -- 2.34.1 From 702dfe47d50ad6eb5d270b490725f7d17a931c8d Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 15:01:31 -0400 Subject: [PATCH 320/378] Update debugger tests to expect new pformatting Mostly the result of the `RemoteActorError.pformat()` and our new `_pause/crash_msg: str`s which include the `trio.Task.__repr__()` in the `log.pdb()` message. Obvi use the `in_prompt_msg()` to accomplish where not used prior. ToDo later: -[ ] still some outstanding questions on how detailed inceptions should look, eg. in `test_multi_nested_subactors_error_through_nurseries()` |_maybe we should be more pedantic at checking `.src_uid` vs. `.relay_uid` fields? -[ ] staged a placeholder test for verifying correct call-stack frame on crash handler REPL entry. -[ ] also need a test to verify that you can't pause from an already paused actor task such as can happen if you try to step through runtime code that has a recurrent entry to `._debug.pause()`. --- tests/test_debugger.py | 108 +++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 38a3bc2c..9d159ffe 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -146,9 +146,10 @@ def in_prompt_msg( log/REPL output for a given `pdb` interact point. ''' + __tracebackhide__: bool = False + for part in parts: if part not in prompt: - if pause_on_false: import pdbp pdbp.set_trace() @@ -167,6 +168,7 @@ def assert_before( **kwargs, ) -> None: + __tracebackhide__: bool = False # as in before the prompt end before: str = str(child.before.decode()) @@ -219,7 +221,10 @@ def ctlc( ], ids=lambda item: f'{item[0]} -> {item[1]}', ) -def test_root_actor_error(spawn, user_in_out): +def test_root_actor_error( + spawn, + user_in_out, +): ''' Demonstrate crash handler entering pdb from basic error in root actor. 
@@ -465,8 +470,12 @@ def test_subactor_breakpoint( child.expect(PROMPT) before = str(child.before.decode()) - assert "RemoteActorError: ('breakpoint_forever'" in before - assert 'bdb.BdbQuit' in before + assert in_prompt_msg( + before, + ['RemoteActorError:', + "('breakpoint_forever'", + 'bdb.BdbQuit',] + ) if ctlc: do_ctlc(child) @@ -478,8 +487,12 @@ def test_subactor_breakpoint( child.expect(pexpect.EOF) before = str(child.before.decode()) - assert "RemoteActorError: ('breakpoint_forever'" in before - assert 'bdb.BdbQuit' in before + assert in_prompt_msg( + before, + ['RemoteActorError:', + "('breakpoint_forever'", + 'bdb.BdbQuit',] + ) @has_nested_actors @@ -747,8 +760,9 @@ def test_multi_daemon_subactors( # boxed error raised in root task # "Attaching to pdb in crashed actor: ('root'", _crash_msg, - "('root'", - "_exceptions.RemoteActorError: ('name_error'", + "('root'", # should attach in root + "_exceptions.RemoteActorError:", # with an embedded RAE for.. + "('name_error'", # the src subactor which raised ] ) @@ -849,10 +863,11 @@ def test_multi_nested_subactors_error_through_nurseries( # https://github.com/goodboy/tractor/issues/320 # ctlc: bool, ): - """Verify deeply nested actors that error trigger debugger entries + ''' + Verify deeply nested actors that error trigger debugger entries at each actor nurserly (level) all the way up the tree. - """ + ''' # NOTE: previously, inside this script was a bug where if the # parent errors before a 2-levels-lower actor has released the lock, # the parent tries to cancel it but it's stuck in the debugger? @@ -872,22 +887,31 @@ def test_multi_nested_subactors_error_through_nurseries( except EOF: break - assert_before(child, [ + assert_before( + child, + [ # boxed source errors + "NameError: name 'doggypants' is not defined", + "tractor._exceptions.RemoteActorError:", + "('name_error'", + "bdb.BdbQuit", - # boxed source errors - "NameError: name 'doggypants' is not defined", - "tractor._exceptions.RemoteActorError: ('name_error'", - "bdb.BdbQuit", + # first level subtrees + # "tractor._exceptions.RemoteActorError: ('spawner0'", + "src_uid=('spawner0'", - # first level subtrees - "tractor._exceptions.RemoteActorError: ('spawner0'", - # "tractor._exceptions.RemoteActorError: ('spawner1'", + # "tractor._exceptions.RemoteActorError: ('spawner1'", - # propagation of errors up through nested subtrees - "tractor._exceptions.RemoteActorError: ('spawn_until_0'", - "tractor._exceptions.RemoteActorError: ('spawn_until_1'", - "tractor._exceptions.RemoteActorError: ('spawn_until_2'", - ]) + # propagation of errors up through nested subtrees + # "tractor._exceptions.RemoteActorError: ('spawn_until_0'", + # "tractor._exceptions.RemoteActorError: ('spawn_until_1'", + # "tractor._exceptions.RemoteActorError: ('spawn_until_2'", + # ^-NOTE-^ old RAE repr, new one is below with a field + # showing the src actor's uid. + "src_uid=('spawn_until_0'", + "relay_uid=('spawn_until_1'", + "src_uid=('spawn_until_2'", + ] + ) @pytest.mark.timeout(15) @@ -1021,13 +1045,16 @@ def test_different_debug_mode_per_actor( # msg reported back from the debug mode actor is processed. 
# assert "tractor._exceptions.RemoteActorError: ('debugged_boi'" in before - assert "tractor._exceptions.RemoteActorError: ('crash_boi'" in before - # the crash boi should not have made a debugger request but # instead crashed completely - assert "tractor._exceptions.RemoteActorError: ('crash_boi'" in before - assert "RuntimeError" in before - + assert_before( + child, + [ + "tractor._exceptions.RemoteActorError:", + "src_uid=('crash_boi'", + "RuntimeError", + ] + ) def test_pause_from_sync( @@ -1046,13 +1073,15 @@ def test_pause_from_sync( assert_before( child, [ - '`greenback` portal opened!', # pre-prompt line - _pause_msg, "('root'", + _pause_msg, + " Date: Wed, 22 May 2024 15:10:39 -0400 Subject: [PATCH 321/378] Port `Actor._stream_handler()` to use `.has_outcome`, fix indent bug.. --- tractor/_runtime.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 23c1c6f5..1f81c74d 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -692,21 +692,21 @@ class Actor: proc: trio.Process _, proc, _ = entry - if ( - (poll := getattr(proc, 'poll', None)) - and poll() is None - ): - log.cancel( - 'Root actor reports no-more-peers, BUT\n' - 'a DISCONNECTED child still has the debug ' - 'lock!\n\n' - # f'root uid: {self.uid}\n' - f'last disconnected child uid: {uid}\n' - f'locking child uid: {pdb_user_uid}\n' - ) - await maybe_wait_for_debugger( - child_in_debug=True - ) + if ( + (poll := getattr(proc, 'poll', None)) + and poll() is None + ): + log.cancel( + 'Root actor reports no-more-peers, BUT\n' + 'a DISCONNECTED child still has the debug ' + 'lock!\n\n' + # f'root uid: {self.uid}\n' + f'last disconnected child uid: {uid}\n' + f'locking child uid: {pdb_user_uid}\n' + ) + await maybe_wait_for_debugger( + child_in_debug=True + ) # TODO: just bc a child's transport dropped # doesn't mean it's not still using the pdb @@ -1140,7 +1140,6 @@ class Actor: requester_type, req_chan, log_meth, - ) = ( req_chan.uid, 'peer', @@ -1173,7 +1172,11 @@ class Actor: # with the root actor in this tree debug_req = _debug.DebugStatus lock_req_ctx: Context = debug_req.req_ctx - if lock_req_ctx is not None: + if ( + lock_req_ctx + and + lock_req_ctx.has_outcome + ): msg += ( '-> Cancelling active debugger request..\n' f'|_{_debug.Lock.repr()}\n\n' -- 2.34.1 From c6f599b1beb4b5927cf69d051e908870908729f8 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 15:11:21 -0400 Subject: [PATCH 322/378] Call `.devx._debug.hide_runtime_frames()` by default From both `open_root_actor()` and `._entry._trio_main()`. Other `breakpoint()`-from-sync-func fixes: - properly disable the default hook using `"0"` XD - offer a `hide_tb: bool` from `open_root_actor()`. - disable hiding the `._trio_main()` frame, bc pretty sure it doesn't help anyone (either way) when REPL-ing/tb-ing from a subactor..? --- tractor/_entry.py | 7 ++++--- tractor/_root.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tractor/_entry.py b/tractor/_entry.py index 750dc59f..e22a4f1f 100644 --- a/tractor/_entry.py +++ b/tractor/_entry.py @@ -33,6 +33,7 @@ from .log import ( get_logger, ) from . import _state +from .devx import _debug from .to_asyncio import run_as_asyncio_guest from ._runtime import ( async_main, @@ -96,7 +97,6 @@ def _mp_main( def _trio_main( - actor: Actor, *, parent_addr: tuple[str, int] | None = None, @@ -107,7 +107,9 @@ def _trio_main( Entry point for a `trio_run_in_process` subactor. 
''' - __tracebackhide__: bool = True + # __tracebackhide__: bool = True + _debug.hide_runtime_frames() + _state._current_actor = actor trio_main = partial( async_main, @@ -146,7 +148,6 @@ def _trio_main( + actor_info ) - finally: log.info( 'Subactor terminated\n' diff --git a/tractor/_root.py b/tractor/_root.py index 77806992..a01ecd01 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -91,12 +91,16 @@ async def open_root_actor( # and that this call creates it. ensure_registry: bool = False, + hide_tb: bool = True, + ) -> Actor: ''' Runtime init entry point for ``tractor``. ''' - __tracebackhide__ = True + __tracebackhide__: bool = hide_tb + _debug.hide_runtime_frames() + # TODO: stick this in a `@cm` defined in `devx._debug`? # # Override the global debugger hook to make it play nice with @@ -125,7 +129,7 @@ async def open_root_actor( # usage by a clobbered TTY's stdstreams! def block_bps(*args, **kwargs): raise RuntimeError( - 'Trying to use `breakpoint()` eh?\n' + 'Trying to use `breakpoint()` eh?\n\n' 'Welp, `tractor` blocks `breakpoint()` built-in calls by default!\n' 'If you need to use it please install `greenback` and set ' '`debug_mode=True` when opening the runtime ' @@ -133,7 +137,9 @@ async def open_root_actor( ) sys.breakpointhook = block_bps - # os.environ['PYTHONBREAKPOINT'] = None + # lol ok, + # https://docs.python.org/3/library/sys.html#sys.breakpointhook + os.environ['PYTHONBREAKPOINT'] = "0" # attempt to retreive ``trio``'s sigint handler and stash it # on our debugger lock state. @@ -203,6 +209,7 @@ async def open_root_actor( ): loglevel = 'PDB' + elif debug_mode: raise RuntimeError( "Debug mode is only supported for the `trio` backend!" -- 2.34.1 From ce4d64ed2fb8681d519b28df3e4c64f659a2a084 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 15:18:45 -0400 Subject: [PATCH 323/378] Mk `MsgDec.spec_str` have a more compact ` --- tractor/msg/_codec.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index 6ba23b78..cd86552f 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -162,7 +162,10 @@ class MsgDec(Struct): # TODO: would get moved into `FieldSpec.__str__()` right? @property def spec_str(self) -> str: - return pformat_msgspec(codec=self) + return pformat_msgspec( + codec=self, + join_char='|', + ) pld_spec_str = spec_str @@ -211,7 +214,7 @@ def mk_msgspec_table( msgtypes = [msgspec] msgt_table: dict[str, MsgType] = { - msgt: str(msgt) + msgt: str(msgt.__name__) for msgt in msgtypes } if msg: -- 2.34.1 From 9ce958cb4a6ec3ee55e7468fc8efe6c39a2550ac Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 15:21:01 -0400 Subject: [PATCH 324/378] Add debug check-n-wait inside `._spawn.soft_kill()` And IFF the `await wait_func(proc)` is cancelled such that we avoid clobbering some subactor that might be REPL-ing even though its parent actor is in the midst of (gracefully) cancelling it. 
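
In effect the guard reads like the sketch below (condensed; the
surrounding `soft_kill()` body and its spawn-backend specific
`proc`/`wait_func` args are elided here, see the diff for the real
thing):

    try:
        # graceful-exit wait on the child proc
        await wait_func(proc)

    except trio.Cancelled:
        # IFF cancelled during the soft wait, don't fall through to
        # the hard-reap sequence while some (grand)child still holds
        # the root's TTY lock for its REPL; instead block (shielded
        # from this cancellation) until the lock is released..
        with trio.CancelScope(shield=True):
            await maybe_wait_for_debugger(
                child_in_debug=_runtime_vars.get('_debug_mode', False),
            )
        # ..then continue with the usual cancel-the-child and
        # hard-reap teardown.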
--- tractor/_spawn.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tractor/_spawn.py b/tractor/_spawn.py index b234099f..aeb7a7c9 100644 --- a/tractor/_spawn.py +++ b/tractor/_spawn.py @@ -43,6 +43,7 @@ from tractor._state import ( is_main_process, is_root_process, debug_mode, + _runtime_vars, ) from tractor.log import get_logger from tractor._portal import Portal @@ -299,7 +300,6 @@ async def hard_kill( async def soft_kill( - proc: ProcessType, wait_func: Callable[ [ProcessType], @@ -329,6 +329,18 @@ async def soft_kill( await wait_func(proc) except trio.Cancelled: + with trio.CancelScope(shield=True): + await maybe_wait_for_debugger( + child_in_debug=_runtime_vars.get( + '_debug_mode', False + ), + header_msg=( + 'Delaying `soft_kill()` subproc reaper while debugger locked..\n' + ), + # TODO: need a diff value then default? + # poll_steps=9999999, + ) + # if cancelled during a soft wait, cancel the child # actor before entering the hard reap sequence # below. This means we try to do a graceful teardown -- 2.34.1 From e4ec6b7b0c3401b336e03a4ea7b42015480b7677 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Wed, 22 May 2024 15:26:48 -0400 Subject: [PATCH 325/378] Even smarter `RemoteActorError.pformat()`-ing Related to the prior patch, re the new `with_type_header: bool`: - in the `with_type_header == True` use case make sure we keep the first `._message: str` line non-indented since it'll show just after the header-line's type path with ':'. - when `False` drop the `)>` `repr()`-instance style as well so that we just get the ascii boxed traceback as though it's the error message-`str` not the `repr()` of the error obj. Other, - hide `pack_from_raise()` call frame since it'll show in debug mode crash handling.. - mk `MsgTypeError.from_decode()` explicitly accept and proxy an optional `ipc_msg` and change `msgdict` to also be optional, only reading out the `**extra_msgdata` when provided. - expose a `_mk_msg_type_err(src_err_msg: Error|None = None,)` for callers who which to inject a `._ipc_msg: Msgtype` to the MTE. |_ add a note how we can't use it due to a causality-dilemma when pld validating `Started` on the send side.. --- tractor/_exceptions.py | 84 +++++++++++++++++++++++++++++++----------- 1 file changed, 62 insertions(+), 22 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 179b49a1..9a94bbdb 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -35,7 +35,6 @@ import trio from msgspec import ( defstruct, msgpack, - Raw, structs, ValidationError, ) @@ -44,11 +43,12 @@ from tractor._state import current_actor from tractor.log import get_logger from tractor.msg import ( Error, + PayloadMsg, MsgType, - Stop, - types as msgtypes, MsgCodec, MsgDec, + Stop, + types as msgtypes, ) from tractor.msg.pretty_struct import ( iter_fields, @@ -156,6 +156,7 @@ def pack_from_raise( `Error`-msg using `pack_error()` to extract the tb info. ''' + __tracebackhide__: bool = True try: raise local_err except type(local_err) as local_err: @@ -525,10 +526,26 @@ class RemoteActorError(Exception): if not with_type_header: body = '\n' + body else: - body: str = textwrap.indent( - self._message, - prefix=' ', - ) + '\n' + first: str = '' + message: str = self._message + + # split off the first line so it isn't indented + # the same like the "boxed content". 
+ if not with_type_header: + lines: list[str] = message.splitlines() + first = lines[0] + message = ''.join(lines[1:]) + + body: str = ( + first + + + textwrap.indent( + message, + prefix=' ', + ) + + + '\n' + ) if with_type_header: tail: str = ')>' @@ -734,25 +751,38 @@ class MsgTypeError( def from_decode( cls, message: str, - msgdict: dict, + + ipc_msg: PayloadMsg|None = None, + msgdict: dict|None = None, ) -> MsgTypeError: - return cls( - message=message, - boxed_type=cls, + ''' + Constuctor for easy creation from (presumably) catching + the backend interchange lib's underlying validation error + and passing context-specific meta-data to `_mk_msg_type_err()` + (which is normally the caller of this). - # NOTE: original "vanilla decode" of the msg-bytes - # is placed inside a value readable from - # `.msgdata['_msg_dict']` - _msg_dict=msgdict, - - # expand and pack all RAE compat fields - # into the `._extra_msgdata` aux `dict`. - **{ + ''' + # if provided, expand and pack all RAE compat fields into the + # `._extra_msgdata` auxillary data `dict` internal to + # `RemoteActorError`. + extra_msgdata: dict = {} + if msgdict: + extra_msgdata: dict = { k: v for k, v in msgdict.items() if k in _ipcmsg_keys - }, + } + # NOTE: original "vanilla decode" of the msg-bytes + # is placed inside a value readable from + # `.msgdata['_msg_dict']` + extra_msgdata['_msg_dict'] = msgdict + + return cls( + message=message, + boxed_type=cls, + ipc_msg=ipc_msg, + **extra_msgdata, ) @@ -1076,7 +1106,7 @@ _raise_from_no_key_in_msg = _raise_from_unexpected_msg def _mk_msg_type_err( - msg: Any|bytes|Raw, + msg: Any|bytes|MsgType, codec: MsgCodec|MsgDec, message: str|None = None, @@ -1085,6 +1115,7 @@ def _mk_msg_type_err( src_validation_error: ValidationError|None = None, src_type_error: TypeError|None = None, is_invalid_payload: bool = False, + src_err_msg: Error|None = None, **mte_kwargs, @@ -1159,9 +1190,10 @@ def _mk_msg_type_err( # only the payload being wrong? # -[ ] maybe the better design is to break this construct # logic into a separate explicit helper raiser-func? - msg_dict: dict = {} + msg_dict = None else: + msg: bytes # decode the msg-bytes using the std msgpack # interchange-prot (i.e. without any # `msgspec.Struct` handling) so that we can @@ -1206,6 +1238,14 @@ def _mk_msg_type_err( msgtyperr = MsgTypeError.from_decode( message=message, msgdict=msg_dict, + + # NOTE: for the send-side `.started()` pld-validate + # case we actually set the `._ipc_msg` AFTER we return + # from here inside `Context.started()` since we actually + # want to emulate the `Error` from the mte we build here + # Bo + # so by default in that case this is set to `None` + ipc_msg=src_err_msg, ) msgtyperr.__cause__ = src_validation_error return msgtyperr -- 2.34.1 From c2cc12e14ff725dc326c65ef70794ff2f8adebc5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 27 May 2024 13:52:35 -0400 Subject: [PATCH 326/378] Add basic payload-spec test suite Starts with some very basic cases: - verify both subactor-as-child-ctx-task send side validation (failures) as well as relay and raise on root-parent-side-task. - wrap failure expectation cases that bubble out of `@acm`s with a `maybe_expect_raises()` equiv wrapper with an embedded timeout. - add `Return` cases including invalid by `str` and valid by a `None`. Still ToDo: - commit impl changes to make the bulk of this suite pass. 
- adjust how `MsgTypeError`s format the local (`.started()`) send side `.tb_str` such that we don't do a "boxed" error prior to `pack_error()` being called normally prior to `Error` transit. --- tests/test_pldrx_limiting.py | 316 +++++++++++++++++++++++++++++++++++ 1 file changed, 316 insertions(+) create mode 100644 tests/test_pldrx_limiting.py diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py new file mode 100644 index 00000000..d658fb51 --- /dev/null +++ b/tests/test_pldrx_limiting.py @@ -0,0 +1,316 @@ +''' +Audit sub-sys APIs from `.msg._ops` +mostly for ensuring correct `contextvars` +related settings around IPC contexts. + +''' +from contextlib import ( + asynccontextmanager as acm, + contextmanager as cm, +) +# import typing +from typing import ( + # Any, + TypeAlias, + # Union, +) +from contextvars import ( + Context, +) + +from msgspec import ( + # structs, + # msgpack, + Struct, + # ValidationError, +) +import pytest +import trio + +import tractor +from tractor import ( + # _state, + MsgTypeError, + current_ipc_ctx, + Portal, +) +from tractor.msg import ( + _ops as msgops, + Return, +) +from tractor.msg import ( + _codec, + # _ctxvar_MsgCodec, + + # NamespacePath, + # MsgCodec, + # mk_codec, + # apply_codec, + # current_codec, +) +from tractor.msg.types import ( + log, + # _payload_msgs, + # PayloadMsg, + # Started, + # mk_msg_spec, +) + + +class PldMsg(Struct): + field: str + + +maybe_msg_spec = PldMsg|None + + +@cm +def custom_spec( + ctx: Context, + spec: TypeAlias, +) -> _codec.MsgCodec: + ''' + Apply a custom payload spec, remove on exit. + + ''' + rx: msgops.PldRx = ctx._pld_rx + + +@acm +async def maybe_expect_raises( + raises: BaseException|None = None, + ensure_in_message: list[str]|None = None, + + reraise: bool = False, + timeout: int = 3, +) -> None: + ''' + Async wrapper for ensuring errors propagate from the inner scope. + + ''' + with trio.fail_after(timeout): + try: + yield + except BaseException as _inner_err: + inner_err = _inner_err + # wasn't-expected to error.. + if raises is None: + raise + + else: + assert type(inner_err) is raises + + # maybe check for error txt content + if ensure_in_message: + part: str + for part in ensure_in_message: + for i, arg in enumerate(inner_err.args): + if part in arg: + break + # if part never matches an arg, then we're + # missing a match. + else: + raise ValueError( + 'Failed to find error message content?\n\n' + f'expected: {ensure_in_message!r}\n' + f'part: {part!r}\n\n' + f'{inner_err.args}' + ) + + if reraise: + raise inner_err + + else: + if raises: + raise RuntimeError( + f'Expected a {raises.__name__!r} to be raised?' + ) + + +@tractor.context +async def child( + ctx: Context, + started_value: int|PldMsg|None, + return_value: str|None, + validate_pld_spec: bool, + raise_on_started_mte: bool = True, + +) -> None: + ''' + Call ``Context.started()`` more then once (an error). 
+ + ''' + expect_started_mte: bool = started_value == 10 + + # sanaity check that child RPC context is the current one + curr_ctx: Context = current_ipc_ctx() + assert ctx is curr_ctx + + rx: msgops.PldRx = ctx._pld_rx + orig_pldec: _codec.MsgDec = rx.pld_dec + # senity that default pld-spec should be set + assert ( + rx.pld_dec + is + msgops._def_any_pldec + ) + + try: + with msgops.limit_plds( + spec=maybe_msg_spec, + ) as pldec: + # sanity on `MsgDec` state + assert rx.pld_dec is pldec + assert pldec.spec is maybe_msg_spec + + # 2 cases: hdndle send-side and recv-only validation + # - when `raise_on_started_mte == True`, send validate + # - else, parent-recv-side only validation + try: + await ctx.started( + value=started_value, + validate_pld_spec=validate_pld_spec, + ) + + except MsgTypeError: + log.exception('started()` raised an MTE!\n') + if not expect_started_mte: + raise RuntimeError( + 'Child-ctx-task SHOULD NOT HAVE raised an MTE for\n\n' + f'{started_value!r}\n' + ) + + # propagate to parent? + if raise_on_started_mte: + raise + else: + if expect_started_mte: + raise RuntimeError( + 'Child-ctx-task SHOULD HAVE raised an MTE for\n\n' + f'{started_value!r}\n' + ) + + # XXX should always fail on recv side since we can't + # really do much else beside terminate and relay the + # msg-type-error from this RPC task ;) + return return_value + + finally: + # sanity on `limit_plds()` reversion + assert ( + rx.pld_dec + is + msgops._def_any_pldec + ) + log.runtime( + 'Reverted to previous pld-spec\n\n' + f'{orig_pldec}\n' + ) + + +@pytest.mark.parametrize( + 'return_value', + [ + None, + 'yo', + ], + ids=[ + 'return[invalid-"yo"]', + 'return[valid-None]', + ], +) +@pytest.mark.parametrize( + 'started_value', + [ + 10, + PldMsg(field='yo'), + ], + ids=[ + 'Started[invalid-10]', + 'Started[valid-PldMsg]', + ], +) +@pytest.mark.parametrize( + 'pld_check_started_value', + [ + True, + False, + ], + ids=[ + 'check-started-pld', + 'no-started-pld-validate', + ], +) +def test_basic_payload_spec( + debug_mode: bool, + loglevel: str, + return_value: str|None, + started_value: int|PldMsg, + pld_check_started_value: bool, +): + ''' + Validate the most basic `PldRx` msg-type-spec semantics around + a IPC `Context` endpoint start, started-sync, and final return + value depending on set payload types and the currently applied + pld-spec. + + ''' + invalid_return: bool = return_value == 'yo' + invalid_started: bool = started_value == 10 + + async def main(): + async with tractor.open_nursery( + debug_mode=debug_mode, + loglevel=loglevel, + ) as an: + p: Portal = await an.start_actor( + 'child', + enable_modules=[__name__], + ) + + # since not opened yet. 
+ assert current_ipc_ctx() is None + + async with ( + maybe_expect_raises( + raises=MsgTypeError if ( + invalid_return + or + invalid_started + ) else None, + ensure_in_message=[ + "invalid `Return` payload", + "value: `'yo'` does not match type-spec: `Return.pld: PldMsg|NoneType`", + ], + ), + p.open_context( + child, + return_value=return_value, + started_value=started_value, + pld_spec=maybe_msg_spec, + validate_pld_spec=pld_check_started_value, + ) as (ctx, first), + ): + # now opened with 'child' sub + assert current_ipc_ctx() is ctx + + assert type(first) is PldMsg + assert first.field == 'yo' + + try: + assert (await ctx.result()) is None + except MsgTypeError as mte: + if not invalid_return: + raise + + else: # expected this invalid `Return.pld` + assert mte.cid == ctx.cid + + # verify expected remote mte deats + await tractor.pause() + assert ctx._remote_error is mte + assert mte.expected_msg_type is Return + + await p.cancel_actor() + + trio.run(main) -- 2.34.1 From 42ba855d1b98eb410cd78d969fcfe69702e16b42 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 27 May 2024 14:59:40 -0400 Subject: [PATCH 327/378] More correct/explicit `.started()` send-side validation In the sense that we handle it as a special case that exposed through to `RxPld.dec_msg()` with a new `is_started_send_side: bool`. (Non-ideal) `Context.started()` impl deats: - only do send-side pld-spec validation when a new `validate_pld_spec` is set (by default it's not). - call `self.pld_rx.dec_msg(is_started_send_side=True)` to validate the payload field from the just codec-ed `Started` msg's `msg_bytes` by passing the `roundtripped` msg (with it's `.pld: Raw`) directly. - add a `hide_tb: bool` param and proxy it to the `.dec_msg()` call. (Non-ideal) `PldRx.dec_msg()` impl deats: - for now we're packing the MTE inside an `Error` via a manual call to `pack_error()` and then setting that as the `msg` passed to `_raise_from_unexpected_msg()` (though really we should just raise inline?). - manually set the `MsgTypeError._ipc_msg` to the above.. Other, - more comprehensive `Context` type doc string. - various `hide_tb: bool` kwarg additions through `._ops.PldRx` meths. - proto a `.msg._ops.validate_payload_msg()` helper planned to get the logic from this version of `.started()`'s send-side validation so as to be useful more generally elsewhere.. (like for raising back `Return` values on the child side?). Warning: this commit may have been made out of order from required changes to `._exceptions` which will come in a follow up! --- tractor/_context.py | 128 +++++++++++++++++++++----------------------- tractor/msg/_ops.py | 70 ++++++++++++++++++++---- 2 files changed, 122 insertions(+), 76 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index ed720a2d..42271b00 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -15,12 +15,22 @@ # along with this program. If not, see . ''' -The fundamental cross process SC abstraction: an inter-actor, -cancel-scope linked task "context". +The fundamental cross-process SC abstraction: an inter-actor, +transitively cancel-scope linked, (dual) task IPC coupled "context". -A ``Context`` is very similar to the ``trio.Nursery.cancel_scope`` built -into each ``trio.Nursery`` except it links the lifetimes of memory space -disjoint, parallel executing tasks in separate actors. 
+A `Context` is very similar to the look and feel of the +`.cancel_scope: trio.CancelScope` built into each `trio.Nursery` +except that it links the lifetimes of 2 memory space disjoint, +parallel executing, tasks scheduled in separate "actors". + +So while a `trio.Nursery` has a `.parent_task` which exists both +before (open) and then inside the body of the `async with` of the +nursery's scope (/block), a `Context` contains 2 tasks, a "parent" +and a "child" side, where both execute independently in separate +memory domains of different (host's) processes linked through +a SC-transitive IPC "shuttle dialog protocol". The underlying IPC +dialog-(un)protocol allows for the maintainance of SC properties +end-2-end between the tasks. ''' from __future__ import annotations @@ -71,13 +81,11 @@ from .msg import ( MsgCodec, NamespacePath, PayloadT, - Return, Started, Stop, Yield, current_codec, pretty_struct, - types as msgtypes, _ops as msgops, ) from ._ipc import ( @@ -90,7 +98,7 @@ from ._state import ( debug_mode, _ctxvar_Context, ) - +# ------ - ------ if TYPE_CHECKING: from ._portal import Portal from ._runtime import Actor @@ -1598,16 +1606,15 @@ class Context: async def started( self, - # TODO: how to type this so that it's the - # same as the payload type? Is this enough? value: PayloadT|None = None, + validate_pld_spec: bool = True, + strict_pld_parity: bool = False, - strict_parity: bool = False, + # TODO: this will always emit for msgpack for any () vs. [] + # inside the value.. do we want to offer warnings on that? + # complain_no_parity: bool = False, - # TODO: this will always emit now that we do `.pld: Raw` - # passthrough.. so maybe just only complain when above strict - # flag is set? - complain_no_parity: bool = False, + hide_tb: bool = True, ) -> None: ''' @@ -1648,63 +1655,54 @@ class Context: # # https://zguide.zeromq.org/docs/chapter7/#The-Cheap-or-Nasty-Pattern # - codec: MsgCodec = current_codec() - msg_bytes: bytes = codec.encode(started_msg) - try: - # be a "cheap" dialog (see above!) - if ( - strict_parity - or - complain_no_parity - ): - rt_started: Started = codec.decode(msg_bytes) - - # XXX something is prolly totes cucked with the - # codec state! - if isinstance(rt_started, dict): - rt_started = msgtypes.from_dict_msg( - dict_msg=rt_started, - ) - raise RuntimeError( - 'Failed to roundtrip `Started` msg?\n' - f'{pretty_struct.pformat(rt_started)}\n' - ) - - if rt_started != started_msg: + __tracebackhide__: bool = hide_tb + if validate_pld_spec: + # __tracebackhide__: bool = False + codec: MsgCodec = current_codec() + msg_bytes: bytes = codec.encode(started_msg) + try: + roundtripped: Started = codec.decode(msg_bytes) + # pld: PayloadT = await self.pld_rx.recv_pld( + pld: PayloadT = self.pld_rx.dec_msg( + msg=roundtripped, + ipc=self, + expect_msg=Started, + hide_tb=hide_tb, + is_started_send_side=True, + ) + if ( + strict_pld_parity + and + pld != value + ): # TODO: make that one a mod func too.. diff = pretty_struct.Struct.__sub__( - rt_started, + roundtripped, started_msg, ) complaint: str = ( 'Started value does not match after roundtrip?\n\n' f'{diff}' ) + raise ValidationError(complaint) - # TODO: rn this will pretty much always fail with - # any other sequence type embeded in the - # payload... - if ( - self._strict_started - or - strict_parity - ): - raise ValueError(complaint) - else: - log.warning(complaint) + # raise any msg type error NO MATTER WHAT! 
+ except ValidationError as verr: + # always show this src frame in the tb + # __tracebackhide__: bool = False + raise _mk_msg_type_err( + msg=roundtripped, + codec=codec, + src_validation_error=verr, + verb_header='Trying to send ', + is_invalid_payload=True, + ) from verr - await self.chan.send(started_msg) - - # raise any msg type error NO MATTER WHAT! - except ValidationError as verr: - raise _mk_msg_type_err( - msg=msg_bytes, - codec=codec, - src_validation_error=verr, - verb_header='Trying to send payload' - # > 'invalid `Started IPC msgs\n' - ) from verr + # TODO: maybe a flag to by-pass encode op if already done + # here in caller? + await self.chan.send(started_msg) + # set msg-related internal runtime-state self._started_called = True self._started_msg = started_msg self._started_pld = value @@ -1997,12 +1995,7 @@ async def open_context_from_portal( pld_spec: TypeAlias|None = None, allow_overruns: bool = False, - - # TODO: if we set this the wrapping `@acm` body will - # still be shown (awkwardly) on pdb REPL entry. Ideally - # we can similarly annotate that frame to NOT show? for now - # we DO SHOW this frame since it's awkward ow.. - hide_tb: bool = False, + hide_tb: bool = True, # proxied to RPC **kwargs, @@ -2115,6 +2108,7 @@ async def open_context_from_portal( ipc=ctx, expect_msg=Started, passthrough_non_pld_msgs=False, + hide_tb=hide_tb, ) # from .devx import pause diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 3014c15b..6faf78ef 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -47,7 +47,7 @@ from tractor._exceptions import ( _raise_from_unexpected_msg, MsgTypeError, _mk_msg_type_err, - pack_from_raise, + pack_error, ) from tractor._state import current_ipc_ctx from ._codec import ( @@ -203,7 +203,6 @@ class PldRx(Struct): msg: MsgType = ( ipc_msg or - # async-rx msg from underlying IPC feeder (mem-)chan await ipc._rx_chan.receive() ) @@ -223,6 +222,10 @@ class PldRx(Struct): raise_error: bool = True, hide_tb: bool = True, + # XXX for special (default?) case of send side call with + # `Context.started(validate_pld_spec=True)` + is_started_send_side: bool = False, + ) -> PayloadT|Raw: ''' Decode a msg's payload field: `MsgType.pld: PayloadT|Raw` and @@ -230,8 +233,6 @@ class PldRx(Struct): ''' __tracebackhide__: bool = hide_tb - - _src_err = None src_err: BaseException|None = None match msg: # payload-data shuttle msg; deliver the `.pld` value @@ -256,18 +257,58 @@ class PldRx(Struct): # pack mgterr into error-msg for # reraise below; ensure remote-actor-err # info is displayed nicely? - msgterr: MsgTypeError = _mk_msg_type_err( + mte: MsgTypeError = _mk_msg_type_err( msg=msg, codec=self.pld_dec, src_validation_error=valerr, is_invalid_payload=True, + expected_msg=expect_msg, + # ipc_msg=msg, ) - msg: Error = pack_from_raise( - local_err=msgterr, + # NOTE: override the `msg` passed to + # `_raise_from_unexpected_msg()` (below) so so that + # we're effectively able to use that same func to + # unpack and raise an "emulated remote `Error`" of + # this local MTE. + err_msg: Error = pack_error( + exc=mte, cid=msg.cid, - src_uid=ipc.chan.uid, + src_uid=( + ipc.chan.uid + if not is_started_send_side + else ipc._actor.uid + ), + # tb=valerr.__traceback__, + tb_str=mte._message, ) + # ^-TODO-^ just raise this inline instead of all the + # pack-unpack-repack non-sense! 
+ + mte._ipc_msg = err_msg + msg = err_msg + + # set emulated remote error more-or-less as the + # runtime would + ctx: Context = getattr(ipc, 'ctx', ipc) + + # TODO: should we instead make this explicit and + # use the above masked `is_started_send_decode`, + # expecting the `Context.started()` caller to set + # it? Rn this is kinda, howyousayyy, implicitly + # edge-case-y.. + if ( + expect_msg is not Started + and not is_started_send_side + ): + ctx._maybe_cancel_and_set_remote_error(mte) + + # XXX NOTE: so when the `_raise_from_unexpected_msg()` + # raises the boxed `err_msg` from above it raises + # it from `None`. src_err = valerr + # if is_started_send_side: + # src_err = None + # XXX some other decoder specific failure? # except TypeError as src_error: @@ -379,6 +420,7 @@ class PldRx(Struct): # NOTE: generally speaking only for handling `Stop`-msgs that # arrive during a call to `drain_to_final_msg()` above! passthrough_non_pld_msgs: bool = True, + hide_tb: bool = True, **kwargs, ) -> tuple[MsgType, PayloadT]: @@ -387,6 +429,7 @@ class PldRx(Struct): the pair of refs. ''' + __tracebackhide__: bool = hide_tb msg: MsgType = await ipc._rx_chan.receive() if passthrough_non_pld_msgs: @@ -401,6 +444,7 @@ class PldRx(Struct): msg, ipc=ipc, expect_msg=expect_msg, + hide_tb=hide_tb, **kwargs, ) return msg, pld @@ -414,7 +458,7 @@ def limit_plds( ) -> MsgDec: ''' Apply a `MsgCodec` that will natively decode the SC-msg set's - `Msg.pld: Union[Type[Struct]]` payload fields using + `PayloadMsg.pld: Union[Type[Struct]]` payload fields using tagged-unions of `msgspec.Struct`s from the `payload_types` for all IPC contexts in use by the current `trio.Task`. @@ -691,3 +735,11 @@ async def drain_to_final_msg( result_msg, pre_result_drained, ) + + +# TODO: factor logic from `.Context.started()` for send-side +# validate raising! +def validate_payload_msg( + msg: Started|Yield|Return, +) -> MsgTypeError|None: + ... -- 2.34.1 From eee4c61b51e6d3053549e67650be04fcd03ab2d5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 27 May 2024 22:36:05 -0400 Subject: [PATCH 328/378] Add `MsgTypeError` "bad msg" capture Such that if caught by user code and/or the runtime we can introspect the original msg which caused the type error. Previously this was kinda half-baked with a `.msg_dict` which was delivered from an `Any`-decode of the shuttle msg in `_mk_msg_type_err()` but now this more explicitly refines the API and supports both `PayloadMsg`-instance or the msg-dict style injection: - allow passing either of `bad_msg: PayloadMsg|None` or `bad_msg_as_dict: dict|None` to `MsgTypeError.from_decode()`. - expose public props for both ^ whilst dropping prior `.msgdict`. - rework `.from_decode()` to explicitly accept `**extra_msgdata: dict` |_ only overriding it from any `bad_msg_as_dict` if the keys are found in `_ipcmsg_keys`, **except** for `_bad_msg` when `bad_msg` is passed. |_ drop `.ipc_msg` passthrough. |_ drop `msgdict` input. - adjust `.cid` to only pull from the `.bad_msg` if set. Related fixes/adjustments: - `pack_from_raise()` should pull `boxed_type_str` from `boxed_type.__name__`, not the `type()` of it.. also add a `hide_tb: bool` flag. - don't include `_msg_dict` and `_bad_msg` in the `_body_fields` set. - allow more granular boxed traceback-str controls: |_ allow passing a `tb_str: str` explicitly in which case we use it verbatim and presume caller knows what they're doing. 
|_ when not provided, use the more explicit `traceback.format_exception(exc)` since the error instance is a required input (we still fail back to the old `.format_exc()` call if for some reason the caller passes `None`; but that should be a bug right?). |_ if a `tb: TracebackType` and a `tb_str` is passed, concat them. - in `RemoteActorError.pformat()` don't indent the `._message` part used for the `body` when `with_type_header == False`. - update `_mk_msg_type_err()` to use `bad_msg`/`bad_msg_as_dict` appropriately and drop passing `ipc_msg`. --- tractor/_exceptions.py | 220 +++++++++++++++++++++++++++-------------- 1 file changed, 148 insertions(+), 72 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 9a94bbdb..85957356 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -22,6 +22,9 @@ from __future__ import annotations import builtins import importlib from pprint import pformat +from types import ( + TracebackType, +) from typing import ( Any, Callable, @@ -92,26 +95,30 @@ _ipcmsg_keys: list[str] = [ fi.name for fi, k, v in iter_fields(Error) - ] _body_fields: list[str] = list( set(_ipcmsg_keys) - # NOTE: don't show fields that either don't provide - # any extra useful info or that are already shown - # as part of `.__repr__()` output. + # XXX NOTE: DON'T-SHOW-FIELDS + # - don't provide any extra useful info or, + # - are already shown as part of `.__repr__()` or, + # - are sub-type specific. - { 'src_type_str', 'boxed_type_str', 'tb_str', 'relay_path', - '_msg_dict', 'cid', - # since only ctxc should show it but `Error` does + # only ctxc should show it but `Error` does # have it as an optional field. 'canceller', + + # only for MTEs and generally only used + # when devving/testing/debugging. + '_msg_dict', + '_bad_msg', } ) @@ -146,6 +153,7 @@ def pack_from_raise( |MsgTypeError ), cid: str, + hide_tb: bool = True, **rae_fields, @@ -156,7 +164,7 @@ def pack_from_raise( `Error`-msg using `pack_error()` to extract the tb info. ''' - __tracebackhide__: bool = True + __tracebackhide__: bool = hide_tb try: raise local_err except type(local_err) as local_err: @@ -231,7 +239,8 @@ class RemoteActorError(Exception): if ( extra_msgdata - and ipc_msg + and + ipc_msg ): # XXX mutate the orig msg directly from # manually provided input params. @@ -261,17 +270,16 @@ class RemoteActorError(Exception): # either by customizing `ContextCancelled.__init__()` or # through a special factor func? 
elif boxed_type: - boxed_type_str: str = type(boxed_type).__name__ + boxed_type_str: str = boxed_type.__name__ if ( ipc_msg - and not self._ipc_msg.boxed_type_str + and + self._ipc_msg.boxed_type_str != boxed_type_str ): self._ipc_msg.boxed_type_str = boxed_type_str assert self.boxed_type_str == self._ipc_msg.boxed_type_str - else: - self._extra_msgdata['boxed_type_str'] = boxed_type_str - + # ensure any roundtripping evals to the input value assert self.boxed_type is boxed_type @property @@ -309,7 +317,9 @@ class RemoteActorError(Exception): if self._ipc_msg else {} ) - return self._extra_msgdata | msgdata + return { + k: v for k, v in self._extra_msgdata.items() + } | msgdata @property def src_type_str(self) -> str: @@ -502,6 +512,8 @@ class RemoteActorError(Exception): ''' header: str = '' + body: str = '' + if with_type_header: header: str = f'<{type(self).__name__}(\n' @@ -525,24 +537,22 @@ class RemoteActorError(Exception): ) if not with_type_header: body = '\n' + body - else: - first: str = '' - message: str = self._message + elif message := self._message: # split off the first line so it isn't indented # the same like the "boxed content". if not with_type_header: lines: list[str] = message.splitlines() - first = lines[0] - message = ''.join(lines[1:]) + first: str = lines[0] + message: str = message.removeprefix(first) + + else: + first: str = '' body: str = ( first + - textwrap.indent( - message, - prefix=' ', - ) + message + '\n' ) @@ -708,52 +718,72 @@ class MsgTypeError( ] @property - def msg_dict(self) -> dict[str, Any]: + def bad_msg(self) -> PayloadMsg|None: ''' - If the underlying IPC `MsgType` was received from a remote - actor but was unable to be decoded to a native - `Yield`|`Started`|`Return` struct, the interchange backend - native format decoder can be used to stash a `dict` - version for introspection by the invalidating RPC task. + Ref to the the original invalid IPC shuttle msg which failed + to decode thus providing for the reason for this error. ''' - return self.msgdata.get('_msg_dict') + if ( + (_bad_msg := self.msgdata.get('_bad_msg')) + and + isinstance(_bad_msg, PayloadMsg) + ): + return _bad_msg - @property - def expected_msg(self) -> MsgType|None: - ''' - Attempt to construct what would have been the original - `MsgType`-with-payload subtype (i.e. an instance from the set - of msgs in `.msg.types._payload_msgs`) which failed - validation. - - ''' - if msg_dict := self.msg_dict.copy(): + elif bad_msg_dict := self.bad_msg_as_dict: return msgtypes.from_dict_msg( - dict_msg=msg_dict, + dict_msg=bad_msg_dict.copy(), # use_pretty=True, # ^-TODO-^ would luv to use this BUT then the # `field_prefix` in `pformat_boxed_tb()` cucks it # all up.. XD ) + return None + @property + def bad_msg_as_dict(self) -> dict[str, Any]: + ''' + If the underlying IPC `MsgType` was received from a remote + actor but was unable to be decoded to a native `PayloadMsg` + (`Yield`|`Started`|`Return`) struct, the interchange backend + native format decoder can be used to stash a `dict` version + for introspection by the invalidating RPC task. + + Optionally when this error is constructed from + `.from_decode()` the caller can attempt to construct what + would have been the original `MsgType`-with-payload subtype + (i.e. an instance from the set of msgs in + `.msg.types._payload_msgs`) which failed validation. 
+ + ''' + return self.msgdata.get('_bad_msg_as_dict') + @property def expected_msg_type(self) -> Type[MsgType]|None: - return type(self.expected_msg) + return type(self.bad_msg) @property def cid(self) -> str: - # pre-packed using `.from_decode()` constructor - return self.msgdata.get('cid') + # pull from required `.bad_msg` ref (or src dict) + if bad_msg := self.bad_msg: + return bad_msg.cid + + return self.msgdata['cid'] @classmethod def from_decode( cls, message: str, - ipc_msg: PayloadMsg|None = None, - msgdict: dict|None = None, + bad_msg: PayloadMsg|None = None, + bad_msg_as_dict: dict|None = None, + + # if provided, expand and pack all RAE compat fields into the + # `._extra_msgdata` auxillary data `dict` internal to + # `RemoteActorError`. + **extra_msgdata, ) -> MsgTypeError: ''' @@ -763,25 +793,44 @@ class MsgTypeError( (which is normally the caller of this). ''' - # if provided, expand and pack all RAE compat fields into the - # `._extra_msgdata` auxillary data `dict` internal to - # `RemoteActorError`. - extra_msgdata: dict = {} - if msgdict: - extra_msgdata: dict = { - k: v - for k, v in msgdict.items() - if k in _ipcmsg_keys - } + if bad_msg_as_dict: # NOTE: original "vanilla decode" of the msg-bytes # is placed inside a value readable from # `.msgdata['_msg_dict']` - extra_msgdata['_msg_dict'] = msgdict + extra_msgdata['_bad_msg_as_dict'] = bad_msg_as_dict + + # scrape out any underlying fields from the + # msg that failed validation. + for k, v in bad_msg_as_dict.items(): + if ( + # always skip a duplicate entry + # if already provided as an arg + k == '_bad_msg' and bad_msg + or + # skip anything not in the default msg-field set. + k not in _ipcmsg_keys + # k not in _body_fields + ): + continue + + extra_msgdata[k] = v + + + elif bad_msg: + if not isinstance(bad_msg, PayloadMsg): + raise TypeError( + 'The provided `bad_msg` is not a `PayloadMsg` type?\n\n' + f'{bad_msg}' + ) + extra_msgdata['_bad_msg'] = bad_msg + extra_msgdata['cid'] = bad_msg.cid + + if 'cid' not in extra_msgdata: + import pdbp; pdbp.set_trace() return cls( message=message, boxed_type=cls, - ipc_msg=ipc_msg, **extra_msgdata, ) @@ -836,9 +885,10 @@ class MessagingError(Exception): def pack_error( exc: BaseException|RemoteActorError, - tb: str|None = None, cid: str|None = None, src_uid: tuple[str, str]|None = None, + tb: TracebackType|None = None, + tb_str: str = '', ) -> Error: ''' @@ -848,10 +898,28 @@ def pack_error( the receiver side using `unpack_error()` below. ''' - if tb: - tb_str = ''.join(traceback.format_tb(tb)) + if not tb_str: + tb_str: str = ( + ''.join(traceback.format_exception(exc)) + + # TODO: can we remove this is `exc` is required? + or + # NOTE: this is just a shorthand for the "last error" as + # provided by `sys.exeception()`, see: + # - https://docs.python.org/3/library/traceback.html#traceback.print_exc + # - https://docs.python.org/3/library/traceback.html#traceback.format_exc + traceback.format_exc() + ) else: - tb_str = traceback.format_exc() + if tb_str[-2:] != '\n': + tb_str += '\n' + + # when caller provides a tb instance (say pulled from some other + # src error's `.__traceback__`) we use that as the "boxed" + # tb-string instead. 
+ if tb: + # https://docs.python.org/3/library/traceback.html#traceback.format_list + tb_str: str = ''.join(traceback.format_tb(tb)) + tb_str error_msg: dict[ # for IPC str, @@ -1115,7 +1183,7 @@ def _mk_msg_type_err( src_validation_error: ValidationError|None = None, src_type_error: TypeError|None = None, is_invalid_payload: bool = False, - src_err_msg: Error|None = None, + # src_err_msg: Error|None = None, **mte_kwargs, @@ -1164,10 +1232,10 @@ def _mk_msg_type_err( '|_ https://jcristharif.com/msgspec/extending.html#defining-a-custom-extension-messagepack-only\n' ) - msgtyperr = MsgTypeError( message=message, ipc_msg=msg, + bad_msg=msg, ) # ya, might be `None` msgtyperr.__cause__ = src_type_error @@ -1175,6 +1243,9 @@ def _mk_msg_type_err( # `Channel.recv()` case else: + msg_dict: dict|None = None + bad_msg: PayloadMsg|None = None + if is_invalid_payload: msg_type: str = type(msg) any_pld: Any = msgpack.decode(msg.pld) @@ -1186,19 +1257,20 @@ def _mk_msg_type_err( # f' |_pld: {codec.pld_spec_str}\n'# != {any_pld!r}\n' # f')>\n\n' ) + # src_err_msg = msg + bad_msg = msg # TODO: should we just decode the msg to a dict despite # only the payload being wrong? # -[ ] maybe the better design is to break this construct # logic into a separate explicit helper raiser-func? - msg_dict = None else: - msg: bytes # decode the msg-bytes using the std msgpack - # interchange-prot (i.e. without any - # `msgspec.Struct` handling) so that we can - # determine what `.msg.types.Msg` is the culprit - # by reporting the received value. + # interchange-prot (i.e. without any `msgspec.Struct` + # handling) so that we can determine what + # `.msg.types.PayloadMsg` is the culprit by reporting the + # received value. + msg: bytes msg_dict: dict = msgpack.decode(msg) msg_type_name: str = msg_dict['msg_type'] msg_type = getattr(msgtypes, msg_type_name) @@ -1235,9 +1307,13 @@ def _mk_msg_type_err( if verb_header: message = f'{verb_header} ' + message + # if not isinstance(bad_msg, PayloadMsg): + # import pdbp; pdbp.set_trace() + msgtyperr = MsgTypeError.from_decode( message=message, - msgdict=msg_dict, + bad_msg=bad_msg, + bad_msg_as_dict=msg_dict, # NOTE: for the send-side `.started()` pld-validate # case we actually set the `._ipc_msg` AFTER we return @@ -1245,7 +1321,7 @@ def _mk_msg_type_err( # want to emulate the `Error` from the mte we build here # Bo # so by default in that case this is set to `None` - ipc_msg=src_err_msg, + # ipc_msg=src_err_msg, ) msgtyperr.__cause__ = src_validation_error return msgtyperr -- 2.34.1 From 27fd96729a171caa852b925ef214e9e00ec61a46 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 09:22:59 -0400 Subject: [PATCH 329/378] Tweaks to debugger examples Light stuff like comments, typing, and a couple API usage updates. --- examples/debugging/multi_daemon_subactors.py | 14 ++++++++++---- ...ted_subactors_error_up_through_nurseries.py | 1 + examples/debugging/per_actor_debug.py | 1 + .../debugging/root_actor_breakpoint_forever.py | 5 ++++- examples/debugging/subactor_error.py | 18 ++++++++++++++---- examples/debugging/sync_bp.py | 10 ++++++---- 6 files changed, 36 insertions(+), 13 deletions(-) diff --git a/examples/debugging/multi_daemon_subactors.py b/examples/debugging/multi_daemon_subactors.py index ea5fe005..80ef933c 100644 --- a/examples/debugging/multi_daemon_subactors.py +++ b/examples/debugging/multi_daemon_subactors.py @@ -4,9 +4,15 @@ import trio async def breakpoint_forever(): "Indefinitely re-enter debugger in child actor." 
- while True: - yield 'yo' - await tractor.breakpoint() + try: + while True: + yield 'yo' + await tractor.breakpoint() + except BaseException: + tractor.log.get_console_log().exception( + 'Cancelled while trying to enter pause point!' + ) + raise async def name_error(): @@ -19,7 +25,7 @@ async def main(): """ async with tractor.open_nursery( debug_mode=True, - loglevel='error', + loglevel='cancel', ) as n: p0 = await n.start_actor('bp_forever', enable_modules=[__name__]) diff --git a/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py index 348a5ee9..8df52e3b 100644 --- a/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py +++ b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py @@ -45,6 +45,7 @@ async def spawn_until(depth=0): ) +# TODO: notes on the new boxed-relayed errors through proxy actors async def main(): """The main ``tractor`` routine. diff --git a/examples/debugging/per_actor_debug.py b/examples/debugging/per_actor_debug.py index 1db56981..c1bf5cab 100644 --- a/examples/debugging/per_actor_debug.py +++ b/examples/debugging/per_actor_debug.py @@ -23,5 +23,6 @@ async def main(): n.start_soon(debug_actor.run, die) n.start_soon(crash_boi.run, die) + if __name__ == '__main__': trio.run(main) diff --git a/examples/debugging/root_actor_breakpoint_forever.py b/examples/debugging/root_actor_breakpoint_forever.py index 3536a751..88a6e0e9 100644 --- a/examples/debugging/root_actor_breakpoint_forever.py +++ b/examples/debugging/root_actor_breakpoint_forever.py @@ -2,10 +2,13 @@ import trio import tractor -async def main(): +async def main( + registry_addrs: tuple[str, int]|None = None +): async with tractor.open_root_actor( debug_mode=True, + # loglevel='runtime', ): while True: await tractor.breakpoint() diff --git a/examples/debugging/subactor_error.py b/examples/debugging/subactor_error.py index e38c1614..d7aee447 100644 --- a/examples/debugging/subactor_error.py +++ b/examples/debugging/subactor_error.py @@ -3,16 +3,26 @@ import tractor async def name_error(): - getattr(doggypants) + getattr(doggypants) # noqa (on purpose) async def main(): async with tractor.open_nursery( debug_mode=True, - ) as n: + # loglevel='transport', + ) as an: - portal = await n.run_in_actor(name_error) - await portal.result() + # TODO: ideally the REPL arrives at this frame in the parent, + # ABOVE the @api_frame of `Portal.run_in_actor()` (which + # should eventually not even be a portal method ... 
XD) + # await tractor.pause() + p: tractor.Portal = await an.run_in_actor(name_error) + + # with this style, should raise on this line + await p.result() + + # with this alt style should raise at `open_nusery()` + # return await p.result() if __name__ == '__main__': diff --git a/examples/debugging/sync_bp.py b/examples/debugging/sync_bp.py index 23469d6c..efa4e405 100644 --- a/examples/debugging/sync_bp.py +++ b/examples/debugging/sync_bp.py @@ -7,7 +7,7 @@ def sync_pause( error: bool = False, ): if use_builtin: - breakpoint() + breakpoint(hide_tb=False) else: tractor.pause_from_sync() @@ -20,18 +20,20 @@ def sync_pause( async def start_n_sync_pause( ctx: tractor.Context, ): - # sync to requesting peer + actor: tractor.Actor = tractor.current_actor() + + # sync to parent-side task await ctx.started() - actor: tractor.Actor = tractor.current_actor() print(f'entering SYNC PAUSE in {actor.uid}') sync_pause() print(f'back from SYNC PAUSE in {actor.uid}') async def main() -> None: - async with tractor.open_nursery( + # NOTE: required for pausing from sync funcs + maybe_enable_greenback=True, debug_mode=True, ) as an: -- 2.34.1 From 8b860f4245114052a707c40695de064c2f2a465e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 09:34:08 -0400 Subject: [PATCH 330/378] Move `.devx` related deps to `dev` group --- pyproject.toml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c163c7f7..0e80e14d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,8 +39,7 @@ msgspec='^0.18.5' # interchange wrapt = "^1.16.0" # decorators colorlog = "^6.8.2" # logging -# .devx tooling -stackscope = "^0.2.2" +# built-in multi-actor `pdb` REPL pdbp = "^1.5.0" @@ -49,15 +48,19 @@ pdbp = "^1.5.0" # 'pyroute2 # ------ - ------ -xontrib-vox = "^0.0.1" [tool.poetry.group.dev] optional = false [tool.poetry.group.dev.dependencies] +# testing pytest = "^8.2.0" pexpect = "^4.9.0" -# only for xonsh as sh.. +# .devx tooling +greenback = "^1.2.1" +stackscope = "^0.2.2" + +# (light) xonsh usage/integration xontrib-vox = "^0.0.1" prompt-toolkit = "^3.0.43" xonsh-vox-tabcomplete = "^0.5" -- 2.34.1 From 582144830f1cdcd4bbec4626cfff50e6f95033f1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 09:36:26 -0400 Subject: [PATCH 331/378] Parameterize the `return_msg_type` in `._invoke()` Since we also handle a runtime-specific `CancelAck`, allow the caller-scheduler to pass in the expected return-type msg per the RPC msg endpoint loop. 
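
As a rough standalone sketch of the pattern (simplified stand-in types only, NOT
the real `._invoke()` machinery nor the actual `Return`/`CancelAck` msg structs),
the idea is that the scheduling msg-loop picks the completion-msg type and the
invoke path just instantiates whatever it was handed:

    from dataclasses import dataclass
    import trio

    @dataclass
    class Return:              # stand-in for the normal result msg
        cid: str
        pld: object

    @dataclass
    class CancelAck(Return):   # stand-in for the runtime-internal ack msg
        pass

    async def invoke(
        func,
        cid: str,
        send,  # async callable emulating a `Channel.send()`
        return_msg_type: type[Return] = Return,
    ) -> None:
        # run the endpoint then wrap its result in whichever
        # completion-msg type the scheduling loop requested.
        res = await func()
        await send(return_msg_type(cid=cid, pld=res))

    async def _demo():
        async def ep():
            return 'done'

        async def send(msg):
            print(msg)

        # normal RPC endpoint -> `Return`
        await invoke(ep, cid='1', send=send)
        # runtime cancel-request endpoint -> `CancelAck`
        await invoke(ep, cid='2', send=send, return_msg_type=CancelAck)

    if __name__ == '__main__':
        trio.run(_demo)
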
--- tractor/_rpc.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index df79c653..9b92d4e4 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -64,6 +64,7 @@ from .log import get_logger from .msg import ( current_codec, MsgCodec, + PayloadT, NamespacePath, pretty_struct, ) @@ -98,7 +99,7 @@ async def _invoke_non_context( treat_as_gen: bool, is_rpc: bool, - return_msg: Return|CancelAck = Return, + return_msg_type: Return|CancelAck = Return, task_status: TaskStatus[ Context | BaseException @@ -220,7 +221,7 @@ async def _invoke_non_context( and chan.connected() ): try: - ret_msg = return_msg( + ret_msg = return_msg_type( cid=cid, pld=result, ) @@ -419,7 +420,7 @@ async def _invoke( is_rpc: bool = True, hide_tb: bool = True, - return_msg: Return|CancelAck = Return, + return_msg_type: Return|CancelAck = Return, task_status: TaskStatus[ Context | BaseException @@ -533,7 +534,7 @@ async def _invoke( kwargs, treat_as_gen, is_rpc, - return_msg, + return_msg_type, task_status, ) # XXX below fallthrough is ONLY for `@context` eps @@ -593,18 +594,21 @@ async def _invoke( ctx._scope = tn.cancel_scope task_status.started(ctx) - # TODO: should would be nice to have our - # `TaskMngr` nursery here! - res: Any = await coro - ctx._result = res - - # deliver final result to caller side. - await chan.send( - return_msg( - cid=cid, - pld=res, - ) + # TODO: better `trionics` tooling: + # -[ ] should would be nice to have our `TaskMngr` + # nursery here! + # -[ ] payload value checking like we do with + # `.started()` such that the debbuger can engage + # here in the child task instead of waiting for the + # parent to crash with it's own MTE.. + res: Any|PayloadT = await coro + return_msg: Return|CancelAck = return_msg_type( + cid=cid, + pld=res, ) + # set and shuttle final result to "parent"-side task. + ctx._result = res + await chan.send(return_msg) # NOTE: this happens IFF `ctx._scope.cancel()` is # called by any of, @@ -940,7 +944,7 @@ async def process_messages( actor.cancel, kwargs, is_rpc=False, - return_msg=CancelAck, + return_msg_type=CancelAck, ) log.runtime( @@ -974,7 +978,7 @@ async def process_messages( actor._cancel_task, kwargs, is_rpc=False, - return_msg=CancelAck, + return_msg_type=CancelAck, ) except BaseException: log.exception( -- 2.34.1 From 7ac730e326e6ae20619d5f3ac3415d08b5b97a50 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 09:55:16 -0400 Subject: [PATCH 332/378] Drop `msg.types.Msg` for new replacement types The `TypeAlias` for the msg type-group is now `MsgType` and any user touching shuttle messages can now be typed as `PayloadMsg`. Relatedly, add MTE specific `Error._bad_msg[_as_dict]` fields which are handy for introspection of remote decode failures. 
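
For a sense of the intended usage, a hypothetical consumer-side helper (the
function itself is made up for illustration; only the `MsgTypeError` accessors
reflect the props/fields added in this change-set) might introspect a decode
failure roughly like so:

    from tractor import MsgTypeError

    def report_bad_msg(mte: MsgTypeError) -> str:
        '''
        Illustration only: summarize an MTE via the new
        `.bad_msg`/`.bad_msg_as_dict` introspection API.

        '''
        if (bad := mte.bad_msg) is not None:
            # the shuttle msg decoded to its typed `PayloadMsg` subtype
            return f'bad {mte.expected_msg_type}: {bad}'

        if as_dict := mte.bad_msg_as_dict:
            # fall back to the `Any`-decoded `dict` form of the wire msg
            return f'bad msg fields: {as_dict}'

        return repr(mte)
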
--- tractor/_ipc.py | 2 +- tractor/msg/_codec.py | 10 +++++----- tractor/msg/types.py | 39 +++++++++++++++++++++++---------------- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/tractor/_ipc.py b/tractor/_ipc.py index 511a053c..ec7d348a 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -291,7 +291,7 @@ class MsgpackTCPStream(MsgTransport): async def send( self, - msg: msgtypes.Msg, + msg: msgtypes.MsgType, strict_types: bool = True, # hide_tb: bool = False, diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index cd86552f..e1c59e94 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -140,7 +140,7 @@ class MsgDec(Struct): # * also a `.__contains__()` for doing `None in # TypeSpec[None|int]` since rn you need to do it on # `.__args__` for unions.. - # - `MsgSpec: Union[Type[Msg]] + # - `MsgSpec: Union[MsgType] # # -[ ] auto-genning this from new (in 3.12) type parameter lists Bo # |_ https://docs.python.org/3/reference/compound_stmts.html#type-params @@ -188,7 +188,7 @@ def mk_dec( return MsgDec( _dec=msgpack.Decoder( - type=spec, # like `Msg[Any]` + type=spec, # like `MsgType[Any]` dec_hook=dec_hook, ) ) @@ -561,7 +561,7 @@ def mk_codec( ''' # (manually) generate a msg-payload-spec for all relevant - # god-boxing-msg subtypes, parameterizing the `Msg.pld: PayloadT` + # god-boxing-msg subtypes, parameterizing the `PayloadMsg.pld: PayloadT` # for the decoder such that all sub-type msgs in our SCIPP # will automatically decode to a type-"limited" payload (`Struct`) # object (set). @@ -607,7 +607,7 @@ _def_msgspec_codec: MsgCodec = mk_codec(ipc_pld_spec=Any) # The built-in IPC `Msg` spec. # Our composing "shuttle" protocol which allows `tractor`-app code -# to use any `msgspec` supported type as the `Msg.pld` payload, +# to use any `msgspec` supported type as the `PayloadMsg.pld` payload, # https://jcristharif.com/msgspec/supported-types.html # _def_tractor_codec: MsgCodec = mk_codec( @@ -743,7 +743,7 @@ def limit_msg_spec( ) -> MsgCodec: ''' Apply a `MsgCodec` that will natively decode the SC-msg set's - `Msg.pld: Union[Type[Struct]]` payload fields using + `PayloadMsg.pld: Union[Type[Struct]]` payload fields using tagged-unions of `msgspec.Struct`s from the `payload_types` for all IPC contexts in use by the current `trio.Task`. diff --git a/tractor/msg/types.py b/tractor/msg/types.py index f8205c23..08511ec0 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -89,11 +89,12 @@ class PayloadMsg( # -[ ] `uuid.UUID` which has multi-protocol support # https://jcristharif.com/msgspec/supported-types.html#uuid - # The msgs "payload" (spelled without vowels): + # The msg's "payload" (spelled without vowels): # https://en.wikipedia.org/wiki/Payload_(computing) - # - # NOTE: inherited from any `Msg` (and maybe overriden - # by use of `limit_msg_spec()`), but by default is + pld: Raw + + # ^-NOTE-^ inherited from any `PayloadMsg` (and maybe type + # overriden via the `._ops.limit_plds()` API), but by default is # parameterized to be `Any`. # # XXX this `Union` must strictly NOT contain `Any` if @@ -106,7 +107,6 @@ class PayloadMsg( # TODO: could also be set to `msgspec.Raw` if the sub-decoders # approach is preferred over the generic parameterization # approach as take by `mk_msg_spec()` below. - pld: Raw # TODO: complete rename @@ -412,19 +412,24 @@ class Error( relay_path: list[tuple[str, str]] tb_str: str - cid: str|None = None - - # TODO: use UNSET or don't include them via + # TODO: only optionally include sub-type specfic fields? 
+ # -[ ] use UNSET or don't include them via `omit_defaults` (see + # inheritance-line options above) # - # `ContextCancelled` + # `ContextCancelled` reports the src cancelling `Actor.uid` canceller: tuple[str, str]|None = None - # `StreamOverrun` + # `StreamOverrun`-specific src `Actor.uid` sender: tuple[str, str]|None = None - # for the `MsgTypeError` case where the receiver side - # decodes the underlying original `Msg`-subtype - _msg_dict: dict|None = None + # `MsgTypeError` meta-data + cid: str|None = None + # when the receiver side fails to decode a delivered + # `PayloadMsg`-subtype; one and/or both the msg-struct instance + # and `Any`-decoded to `dict` of the msg are set and relayed + # (back to the sender) for introspection. + _bad_msg: Started|Yield|Return|None = None + _bad_msg_as_dict: dict|None = None def from_dict_msg( @@ -436,9 +441,11 @@ def from_dict_msg( ) -> MsgType: ''' - Helper to build a specific `MsgType` struct from - a "vanilla" decoded `dict`-ified equivalent of the - msg: i.e. if the `msgpack.Decoder.type == Any`. + Helper to build a specific `MsgType` struct from a "vanilla" + decoded `dict`-ified equivalent of the msg: i.e. if the + `msgpack.Decoder.type == Any`, the default when using + `msgspec.msgpack` and not "typed decoding" using + `msgspec.Struct`. ''' msg_type_tag_field: str = ( -- 2.34.1 From f7fd8278af65e9c826223258c538483181a04709 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 11:05:44 -0400 Subject: [PATCH 333/378] Fix `test_basic_payload_spec` bad msg matching Expecting `Started` or `Return` with respective bad `.pld` values depending on what type of failure is test parametrized. This makes the suite run green it seems B) --- tests/test_pldrx_limiting.py | 50 ++++++++++-------------------------- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py index d658fb51..562164c7 100644 --- a/tests/test_pldrx_limiting.py +++ b/tests/test_pldrx_limiting.py @@ -6,30 +6,19 @@ related settings around IPC contexts. ''' from contextlib import ( asynccontextmanager as acm, - contextmanager as cm, -) -# import typing -from typing import ( - # Any, - TypeAlias, - # Union, ) from contextvars import ( Context, ) from msgspec import ( - # structs, - # msgpack, Struct, - # ValidationError, ) import pytest import trio import tractor from tractor import ( - # _state, MsgTypeError, current_ipc_ctx, Portal, @@ -40,20 +29,9 @@ from tractor.msg import ( ) from tractor.msg import ( _codec, - # _ctxvar_MsgCodec, - - # NamespacePath, - # MsgCodec, - # mk_codec, - # apply_codec, - # current_codec, ) from tractor.msg.types import ( log, - # _payload_msgs, - # PayloadMsg, - # Started, - # mk_msg_spec, ) @@ -64,23 +42,10 @@ class PldMsg(Struct): maybe_msg_spec = PldMsg|None -@cm -def custom_spec( - ctx: Context, - spec: TypeAlias, -) -> _codec.MsgCodec: - ''' - Apply a custom payload spec, remove on exit. - - ''' - rx: msgops.PldRx = ctx._pld_rx - - @acm async def maybe_expect_raises( raises: BaseException|None = None, ensure_in_message: list[str]|None = None, - reraise: bool = False, timeout: int = 3, ) -> None: @@ -271,6 +236,17 @@ def test_basic_payload_spec( # since not opened yet. assert current_ipc_ctx() is None + if invalid_started: + msg_type_str: str = 'Started' + bad_value_str: str = '10' + elif invalid_return: + msg_type_str: str = 'Return' + bad_value_str: str = "'yo'" + else: + # XXX but should never be used below then.. 
+ msg_type_str: str = '' + bad_value_str: str = '' + async with ( maybe_expect_raises( raises=MsgTypeError if ( @@ -279,8 +255,8 @@ def test_basic_payload_spec( invalid_started ) else None, ensure_in_message=[ - "invalid `Return` payload", - "value: `'yo'` does not match type-spec: `Return.pld: PldMsg|NoneType`", + f"invalid `{msg_type_str}` payload", + f"value: `{bad_value_str}` does not match type-spec: `{msg_type_str}.pld: PldMsg|NoneType`", ], ), p.open_context( -- 2.34.1 From 6c2efc96dc102b8348ca6035890db8be2bcaccb7 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 11:08:27 -0400 Subject: [PATCH 334/378] Factor `.started()` validation into `.msg._ops` Filling out the helper `validate_payload_msg()` staged in a prior commit and adjusting all imports to match. Also add a `raise_mte: bool` flag for potential usage where the caller wants to handle the MTE instance themselves. --- tractor/_context.py | 57 +++++++---------------------------------- tractor/msg/_ops.py | 62 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 42271b00..e9730927 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -58,9 +58,6 @@ from typing import ( import warnings # ------ - ------ import trio -from msgspec import ( - ValidationError, -) # ------ - ------ from ._exceptions import ( ContextCancelled, @@ -78,19 +75,16 @@ from .log import ( from .msg import ( Error, MsgType, - MsgCodec, NamespacePath, PayloadT, Started, Stop, Yield, - current_codec, pretty_struct, _ops as msgops, ) from ._ipc import ( Channel, - _mk_msg_type_err, ) from ._streaming import MsgStream from ._state import ( @@ -1657,54 +1651,21 @@ class Context: # __tracebackhide__: bool = hide_tb if validate_pld_spec: - # __tracebackhide__: bool = False - codec: MsgCodec = current_codec() - msg_bytes: bytes = codec.encode(started_msg) - try: - roundtripped: Started = codec.decode(msg_bytes) - # pld: PayloadT = await self.pld_rx.recv_pld( - pld: PayloadT = self.pld_rx.dec_msg( - msg=roundtripped, - ipc=self, - expect_msg=Started, - hide_tb=hide_tb, - is_started_send_side=True, - ) - if ( - strict_pld_parity - and - pld != value - ): - # TODO: make that one a mod func too.. - diff = pretty_struct.Struct.__sub__( - roundtripped, - started_msg, - ) - complaint: str = ( - 'Started value does not match after roundtrip?\n\n' - f'{diff}' - ) - raise ValidationError(complaint) - - # raise any msg type error NO MATTER WHAT! - except ValidationError as verr: - # always show this src frame in the tb - # __tracebackhide__: bool = False - raise _mk_msg_type_err( - msg=roundtripped, - codec=codec, - src_validation_error=verr, - verb_header='Trying to send ', - is_invalid_payload=True, - ) from verr + msgops.validate_payload_msg( + pld_msg=started_msg, + pld_value=value, + ipc=self, + strict_pld_parity=strict_pld_parity, + hide_tb=hide_tb, + ) # TODO: maybe a flag to by-pass encode op if already done # here in caller? 
await self.chan.send(started_msg) # set msg-related internal runtime-state - self._started_called = True - self._started_msg = started_msg + self._started_called: bool = True + self._started_msg: Started = started_msg self._started_pld = value async def _drain_overflows( diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 6faf78ef..e22d39f1 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -53,6 +53,8 @@ from tractor._state import current_ipc_ctx from ._codec import ( mk_dec, MsgDec, + MsgCodec, + current_codec, ) from .types import ( CancelAck, @@ -737,9 +739,61 @@ async def drain_to_final_msg( ) -# TODO: factor logic from `.Context.started()` for send-side -# validate raising! def validate_payload_msg( - msg: Started|Yield|Return, + pld_msg: Started|Yield|Return, + pld_value: PayloadT, + ipc: Context|MsgStream, + + raise_mte: bool = True, + strict_pld_parity: bool = False, + hide_tb: bool = True, + ) -> MsgTypeError|None: - ... + ''' + Validate a `PayloadMsg.pld` value with the current + IPC ctx's `PldRx` and raise an appropriate `MsgTypeError` + on failure. + + ''' + __tracebackhide__: bool = hide_tb + codec: MsgCodec = current_codec() + msg_bytes: bytes = codec.encode(pld_msg) + try: + roundtripped: Started = codec.decode(msg_bytes) + ctx: Context = getattr(ipc, 'ctx', ipc) + pld: PayloadT = ctx.pld_rx.dec_msg( + msg=roundtripped, + ipc=ipc, + expect_msg=Started, + hide_tb=hide_tb, + is_started_send_side=True, + ) + if ( + strict_pld_parity + and + pld != pld_value + ): + # TODO: make that one a mod func too.. + diff = pretty_struct.Struct.__sub__( + roundtripped, + pld_msg, + ) + complaint: str = ( + 'Started value does not match after roundtrip?\n\n' + f'{diff}' + ) + raise ValidationError(complaint) + + # raise any msg type error NO MATTER WHAT! + except ValidationError as verr: + mte: MsgTypeError = _mk_msg_type_err( + msg=roundtripped, + codec=codec, + src_validation_error=verr, + verb_header='Trying to send ', + is_invalid_payload=True, + ) + if not raise_mte: + return mte + + raise mte from verr -- 2.34.1 From 59ca25618346d90c715fc0509f198722a4bef26b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 15:30:30 -0400 Subject: [PATCH 335/378] Set remote errors in `_raise_from_unexpected_msg()` By calling `Context._maybe_cancel_and_set_remote_error(exc)` on any unpacked `Error` msg; provides for `Context.maybe_error` consistency to match all other error delivery cases. --- tractor/_exceptions.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 85957356..0dfaf675 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -902,7 +902,7 @@ def pack_error( tb_str: str = ( ''.join(traceback.format_exception(exc)) - # TODO: can we remove this is `exc` is required? + # TODO: can we remove this since `exc` is required.. right? or # NOTE: this is just a shorthand for the "last error" as # provided by `sys.exeception()`, see: @@ -917,8 +917,8 @@ def pack_error( # when caller provides a tb instance (say pulled from some other # src error's `.__traceback__`) we use that as the "boxed" # tb-string instead. 
+ # https://docs.python.org/3/library/traceback.html#traceback.format_list if tb: - # https://docs.python.org/3/library/traceback.html#traceback.format_list tb_str: str = ''.join(traceback.format_tb(tb)) + tb_str error_msg: dict[ # for IPC @@ -961,15 +961,15 @@ def pack_error( error_msg['src_type_str'] = type(exc).__name__ error_msg['boxed_type_str'] = type(exc).__name__ - # XXX alawys append us the last relay in error propagation path + # XXX always append us the last relay in error propagation path error_msg.setdefault( 'relay_path', [], ).append(our_uid) - # XXX NOTE: always ensure the traceback-str is from the - # locally raised error (**not** the prior relay's boxed - # content's in `._ipc_msg.tb_str`). + # XXX NOTE XXX always ensure the traceback-str content is from + # the locally raised error (so, NOT the prior relay's boxed + # `._ipc_msg.tb_str`). error_msg['tb_str'] = tb_str if cid is not None: @@ -1109,6 +1109,7 @@ def _raise_from_unexpected_msg( msg, ctx.chan, ) + ctx._maybe_cancel_and_set_remote_error(exc) raise exc from src_err # `MsgStream` termination msg. -- 2.34.1 From a1b124b62b4cecf79c3aa1b34b0a967fce8d6459 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 15:52:54 -0400 Subject: [PATCH 336/378] Raise send-side MTEs inline in `PldRx.dec_msg()` So when `is_started_send_side is True` we raise the newly created `MsgTypeError` (MTE) directly instead of doing all the `Error`-msg pack and unpack to raise stuff via `_raise_from_unexpected_msg()` since the raise should happen send side anyway and so doesn't emulate any remote fault like in a bad `Return` or `Started` without send-side pld-spec validation. Oh, and proxy-through the `hide_tb: bool` input from `.drain_to_final_msg()` to `.recv_msg_w_pld()`. --- tractor/msg/_ops.py | 75 ++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index e22d39f1..6f472afd 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -215,6 +215,9 @@ class PldRx(Struct): **dec_msg_kwargs, ) + # TODO: rename to, + # -[ ] `.decode_pld()`? + # -[ ] `.dec_pld()`? def dec_msg( self, msg: MsgType, @@ -248,8 +251,8 @@ class PldRx(Struct): pld: PayloadT = self._pld_dec.decode(pld) log.runtime( 'Decoded msg payload\n\n' - f'{msg}\n\n' - f'where payload is\n' + f'{msg}\n' + f'where payload decoded as\n' f'|_pld={pld!r}\n' ) return pld @@ -265,13 +268,7 @@ class PldRx(Struct): src_validation_error=valerr, is_invalid_payload=True, expected_msg=expect_msg, - # ipc_msg=msg, ) - # NOTE: override the `msg` passed to - # `_raise_from_unexpected_msg()` (below) so so that - # we're effectively able to use that same func to - # unpack and raise an "emulated remote `Error`" of - # this local MTE. err_msg: Error = pack_error( exc=mte, cid=msg.cid, @@ -283,34 +280,55 @@ class PldRx(Struct): # tb=valerr.__traceback__, tb_str=mte._message, ) - # ^-TODO-^ just raise this inline instead of all the - # pack-unpack-repack non-sense! - mte._ipc_msg = err_msg - msg = err_msg - # set emulated remote error more-or-less as the - # runtime would - ctx: Context = getattr(ipc, 'ctx', ipc) + # NOTE: just raise the MTE inline instead of all + # the pack-unpack-repack non-sense when this is + # a "send side" validation error. + if is_started_send_side: + raise mte + + # XXX TODO: remove this right? + # => any bad stated/return values should + # always be treated a remote errors right? 
+ # + # if ( + # expect_msg is Return + # or expect_msg is Started + # ): + # # set emulated remote error more-or-less as the + # # runtime would + # ctx: Context = getattr(ipc, 'ctx', ipc) + # ctx._maybe_cancel_and_set_remote_error(mte) + + + # XXX override the `msg` passed to + # `_raise_from_unexpected_msg()` (below) so so + # that we're effectively able to use that same + # func to unpack and raise an "emulated remote + # `Error`" of this local MTE. + msg = err_msg + # XXX NOTE: so when the `_raise_from_unexpected_msg()` + # raises the boxed `err_msg` from above it raises + # it from the above caught interchange-lib + # validation error. + src_err = valerr # TODO: should we instead make this explicit and # use the above masked `is_started_send_decode`, # expecting the `Context.started()` caller to set # it? Rn this is kinda, howyousayyy, implicitly # edge-case-y.. - if ( - expect_msg is not Started - and not is_started_send_side - ): - ctx._maybe_cancel_and_set_remote_error(mte) - - # XXX NOTE: so when the `_raise_from_unexpected_msg()` - # raises the boxed `err_msg` from above it raises - # it from `None`. - src_err = valerr - # if is_started_send_side: - # src_err = None - + # TODO: remove this since it's been added to + # `_raise_from_unexpected_msg()`..? + # if ( + # expect_msg is not Started + # and not is_started_send_side + # ): + # # set emulated remote error more-or-less as the + # # runtime would + # ctx: Context = getattr(ipc, 'ctx', ipc) + # ctx._maybe_cancel_and_set_remote_error(mte) # XXX some other decoder specific failure? # except TypeError as src_error: @@ -561,6 +579,7 @@ async def drain_to_final_msg( ipc=ctx, expect_msg=Return, raise_error=False, + hide_tb=hide_tb, ) # ^-TODO-^ some bad ideas? # -[ ] wrap final outcome .receive() in a scope so -- 2.34.1 From 2db03444f75c880e485d720d6205ef12abf3c889 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 16:03:36 -0400 Subject: [PATCH 337/378] Don't (noisly) log about runtime cancel RPC tasks Since in the case of the `Actor._cancel_task()` related runtime eps we actually don't EVER register them in `Actor._rpc_tasks`.. logging about them is just needless noise, though maybe we should track them in a diff table; something like a `._runtime_rpc_tasks`? Drop the cancel-request-for-stale-RPC-task (`KeyError` case in `Actor._cancel_task()`) log-emit level in to `.runtime()`; it's generally not useful info other then for granular race condition eval when hacking the runtime. --- tractor/_rpc.py | 18 ++++++++++++------ tractor/_runtime.py | 7 ++++--- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 9b92d4e4..f05a433e 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -397,12 +397,18 @@ async def _errors_relayed_via_ipc( f'|_{ctx._task}\n' f' >> {ctx.repr_rpc}\n' ) - else: - log.cancel( - 'Failed to de-alloc internal runtime cancel task?\n' - f'|_{ctx._task}\n' - f' >> {ctx.repr_rpc}\n' - ) + # TODO: remove this right? rn the only non-`is_rpc` cases + # are cancellation methods and according the RPC loop eps + # for thoses below, nothing is ever registered in + # `Actor._rpc_tasks` for those cases.. but should we? + # + # -[ ] maybe we should have an equiv `Actor._runtime_rpc_tasks`? 
+ # else: + # log.cancel( + # 'Failed to de-alloc internal runtime cancel task?\n' + # f'|_{ctx._task}\n' + # f' >> {ctx.repr_rpc}\n' + # ) finally: if not actor._rpc_tasks: diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 1f81c74d..99a969b5 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -1256,9 +1256,10 @@ class Actor: # - child returns a result before cancel-msg/ctxc-raised # - child self raises ctxc before parent send request, # - child errors prior to cancel req. - log.cancel( - 'Cancel request invalid, RPC task already completed?\n\n' - f'<= canceller: {requesting_uid}\n\n' + log.runtime( + 'Cancel request for invalid RPC task.\n' + 'The task likely already completed or was never started!\n\n' + f'<= canceller: {requesting_uid}\n' f'=> {cid}@{parent_chan.uid}\n' f' |_{parent_chan}\n' ) -- 2.34.1 From 6a4ee461f5268608e8f781032091c2134ef95355 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 16:11:01 -0400 Subject: [PATCH 338/378] Raise remote errors rxed during `Context` child-sync More specifically, if `.open_context()` is cancelled when awaiting the first `Context.started()` during the child task sync phase, check to see if it was due to `._scope.cancel_called` and raise any remote error via `.maybe_raise()` instead the `trio.Cancelled` like in every other remote-error handling case. Ensure we set `._scope[_nursery]` only after the `Started` has arrived and audited. --- tractor/_context.py | 74 ++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index e9730927..142cb1ab 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -664,7 +664,7 @@ class Context: 'Setting remote error for ctx\n\n' f'<= {self.peer_side!r}: {self.chan.uid}\n' f'=> {self.side!r}: {self._actor.uid}\n\n' - f'{error}' + f'{error!r}' ) self._remote_error: BaseException = error @@ -718,7 +718,7 @@ class Context: log.error( f'Remote context error:\n\n' # f'{pformat(self)}\n' - f'{error}' + f'{error!r}' ) if self._canceller is None: @@ -742,26 +742,27 @@ class Context: and not cs.cancel_called and not cs.cancelled_caught ): - if not ( + if ( msgerr # NOTE: we allow user to config not cancelling the # local scope on `MsgTypeError`s - and not self._cancel_on_msgerr + and + not self._cancel_on_msgerr ): - # TODO: it'd sure be handy to inject our own - # `trio.Cancelled` subtype here ;) - # https://github.com/goodboy/tractor/issues/368 - message: str = 'Cancelling `Context._scope` !\n\n' - self._scope.cancel() - - else: message: str = ( 'NOT Cancelling `Context._scope` since,\n' f'Context._cancel_on_msgerr = {self._cancel_on_msgerr}\n\n' f'AND we got a msg-type-error!\n' f'{error}\n' ) + else: + # TODO: it'd sure be handy to inject our own + # `trio.Cancelled` subtype here ;) + # https://github.com/goodboy/tractor/issues/368 + message: str = 'Cancelling `Context._scope` !\n\n' + self._scope.cancel() + else: message: str = 'NOT cancelling `Context._scope` !\n\n' # from .devx import mk_pdb @@ -2058,6 +2059,12 @@ async def open_context_from_portal( if maybe_msgdec: assert maybe_msgdec.pld_spec == pld_spec + # NOTE: this in an implicit runtime nursery used to, + # - start overrun queuing tasks when as well as + # for cancellation of the scope opened by the user. 
+ ctx._scope_nursery: trio.Nursery = tn + ctx._scope: trio.CancelScope = tn.cancel_scope + # XXX NOTE since `._scope` is NOT set BEFORE we retreive the # `Started`-msg any cancellation triggered # in `._maybe_cancel_and_set_remote_error()` will @@ -2065,25 +2072,42 @@ async def open_context_from_portal( # -> it's expected that if there is an error in this phase of # the dialog, the `Error` msg should be raised from the `msg` # handling block below. - started_msg, first = await ctx._pld_rx.recv_msg_w_pld( - ipc=ctx, - expect_msg=Started, - passthrough_non_pld_msgs=False, - hide_tb=hide_tb, - ) + try: + started_msg, first = await ctx._pld_rx.recv_msg_w_pld( + ipc=ctx, + expect_msg=Started, + passthrough_non_pld_msgs=False, + hide_tb=hide_tb, + ) + except trio.Cancelled as taskc: + ctx_cs: trio.CancelScope = ctx._scope + if not ctx_cs.cancel_called: + raise + + # from .devx import pause + # await pause(shield=True) + + log.cancel( + 'IPC ctx was cancelled during "child" task sync due to\n\n' + f'{ctx.maybe_error}\n' + ) + # OW if the ctx's scope was cancelled manually, + # likely the `Context` was cancelled via a call to + # `._maybe_cancel_and_set_remote_error()` so ensure + # we raise the underlying `._remote_error` directly + # instead of bubbling that taskc. + ctx.maybe_raise() + + # OW, some other unexpected cancel condition + # that should prolly never happen right? + raise InternalError( + 'Invalid cancellation during IPC ctx sync phase?\n' + ) from taskc - # from .devx import pause - # await pause() ctx._started_called: bool = True ctx._started_msg: bool = started_msg ctx._started_pld: bool = first - # NOTE: this in an implicit runtime nursery used to, - # - start overrun queuing tasks when as well as - # for cancellation of the scope opened by the user. - ctx._scope_nursery: trio.Nursery = tn - ctx._scope: trio.CancelScope = tn.cancel_scope - # deliver context instance and .started() msg value # in enter tuple. yield ctx, first -- 2.34.1 From 4fa71cc01cdb7b70f51086a1927418f860a3d0b5 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 28 May 2024 19:46:42 -0400 Subject: [PATCH 339/378] Ensure ctx error-state matches the MTE scenario Namely checking that `Context._remote_error` is set to the raised MTE in the invalid started and return value cases since prior to the recent underlying changes to the `Context.result()` impl, it would not match. Further, - do asserts for non-MTE raising cases in both the parent and child. - add todos for testing ctx-outcomes for per-side-validation policies i anticipate supporting and implied msg-dialog race cases therein. --- tests/test_pldrx_limiting.py | 89 ++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py index 562164c7..ee61dca0 100644 --- a/tests/test_pldrx_limiting.py +++ b/tests/test_pldrx_limiting.py @@ -148,12 +148,44 @@ async def child( # propagate to parent? if raise_on_started_mte: raise - else: - if expect_started_mte: - raise RuntimeError( - 'Child-ctx-task SHOULD HAVE raised an MTE for\n\n' - f'{started_value!r}\n' - ) + + # no-send-side-error fallthrough + if ( + validate_pld_spec + and + expect_started_mte + ): + raise RuntimeError( + 'Child-ctx-task SHOULD HAVE raised an MTE for\n\n' + f'{started_value!r}\n' + ) + + assert ( + not expect_started_mte + or + not validate_pld_spec + ) + + # if wait_for_parent_to_cancel: + # ... 
+ # + # ^-TODO-^ logic for diff validation policies on each side: + # + # -[ ] ensure that if we don't validate on the send + # side, that we are eventually error-cancelled by our + # parent due to the bad `Started` payload! + # -[ ] the boxed error should be srced from the parent's + # runtime NOT ours! + # -[ ] we should still error on bad `return_value`s + # despite the parent not yet error-cancelling us? + # |_ how do we want the parent side to look in that + # case? + # -[ ] maybe the equiv of "during handling of the + # above error another occurred" for the case where + # the parent sends a MTE to this child and while + # waiting for the child to terminate it gets back + # the MTE for this case? + # # XXX should always fail on recv side since we can't # really do much else beside terminate and relay the @@ -247,13 +279,17 @@ def test_basic_payload_spec( msg_type_str: str = '' bad_value_str: str = '' + maybe_mte: MsgTypeError|None = None + should_raise: Exception|None = ( + MsgTypeError if ( + invalid_return + or + invalid_started + ) else None + ) async with ( maybe_expect_raises( - raises=MsgTypeError if ( - invalid_return - or - invalid_started - ) else None, + raises=should_raise, ensure_in_message=[ f"invalid `{msg_type_str}` payload", f"value: `{bad_value_str}` does not match type-spec: `{msg_type_str}.pld: PldMsg|NoneType`", @@ -274,18 +310,35 @@ def test_basic_payload_spec( assert first.field == 'yo' try: - assert (await ctx.result()) is None + res: None|PldMsg = await ctx.result(hide_tb=False) + assert res is None except MsgTypeError as mte: + maybe_mte = mte if not invalid_return: raise - else: # expected this invalid `Return.pld` - assert mte.cid == ctx.cid + # expected this invalid `Return.pld` so audit + # the error state + meta-data + assert mte.expected_msg_type is Return + assert mte.cid == ctx.cid - # verify expected remote mte deats - await tractor.pause() - assert ctx._remote_error is mte - assert mte.expected_msg_type is Return + # verify expected remote mte deats + try: + assert ctx._local_error is None + assert ( + mte is + ctx._remote_error is + ctx.maybe_error is + ctx.outcome + ) + except: + # XXX should never get here.. + await tractor.pause(shield=True) + raise + + + if should_raise is None: + assert maybe_mte is None await p.cancel_actor() -- 2.34.1 From 02a7c7c276f1f34172a9ab2d68bb476286498758 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 01:11:29 -0400 Subject: [PATCH 340/378] Ensure only a boxed traceback for MTE on parent side --- tests/test_pldrx_limiting.py | 58 ++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py index ee61dca0..0c5c7ee8 100644 --- a/tests/test_pldrx_limiting.py +++ b/tests/test_pldrx_limiting.py @@ -53,6 +53,9 @@ async def maybe_expect_raises( Async wrapper for ensuring errors propagate from the inner scope. ''' + if tractor._state.debug_mode(): + timeout += 999 + with trio.fail_after(timeout): try: yield @@ -68,9 +71,10 @@ async def maybe_expect_raises( # maybe check for error txt content if ensure_in_message: part: str + err_repr: str = repr(inner_err) for part in ensure_in_message: for i, arg in enumerate(inner_err.args): - if part in arg: + if part in err_repr: break # if part never matches an arg, then we're # missing a match. 
@@ -97,7 +101,7 @@ async def child( ctx: Context, started_value: int|PldMsg|None, return_value: str|None, - validate_pld_spec: bool, + validate_pld_spec: bool, raise_on_started_mte: bool = True, ) -> None: @@ -131,13 +135,15 @@ async def child( # 2 cases: hdndle send-side and recv-only validation # - when `raise_on_started_mte == True`, send validate # - else, parent-recv-side only validation + mte: MsgTypeError|None = None try: await ctx.started( value=started_value, validate_pld_spec=validate_pld_spec, ) - except MsgTypeError: + except MsgTypeError as _mte: + mte = _mte log.exception('started()` raised an MTE!\n') if not expect_started_mte: raise RuntimeError( @@ -145,6 +151,19 @@ async def child( f'{started_value!r}\n' ) + boxed_div: str = '------ - ------' + assert boxed_div not in mte._message + assert boxed_div not in mte.tb_str + assert boxed_div not in repr(mte) + assert boxed_div not in str(mte) + mte_repr: str = repr(mte) + for line in mte.message.splitlines(): + assert line in mte_repr + + # since this is a *local error* there should be no + # boxed traceback content! + assert not mte.tb_str + # propagate to parent? if raise_on_started_mte: raise @@ -208,8 +227,8 @@ async def child( @pytest.mark.parametrize( 'return_value', [ - None, 'yo', + None, ], ids=[ 'return[invalid-"yo"]', @@ -291,8 +310,9 @@ def test_basic_payload_spec( maybe_expect_raises( raises=should_raise, ensure_in_message=[ - f"invalid `{msg_type_str}` payload", - f"value: `{bad_value_str}` does not match type-spec: `{msg_type_str}.pld: PldMsg|NoneType`", + f"invalid `{msg_type_str}` msg payload", + f"value: `{bad_value_str}` does not " + f"match type-spec: `{msg_type_str}.pld: PldMsg|NoneType`", ], ), p.open_context( @@ -321,21 +341,21 @@ def test_basic_payload_spec( # the error state + meta-data assert mte.expected_msg_type is Return assert mte.cid == ctx.cid + mte_repr: str = repr(mte) + for line in mte.message.splitlines(): + assert line in mte_repr + + assert mte.tb_str + # await tractor.pause(shield=True) # verify expected remote mte deats - try: - assert ctx._local_error is None - assert ( - mte is - ctx._remote_error is - ctx.maybe_error is - ctx.outcome - ) - except: - # XXX should never get here.. - await tractor.pause(shield=True) - raise - + assert ctx._local_error is None + assert ( + mte is + ctx._remote_error is + ctx.maybe_error is + ctx.outcome + ) if should_raise is None: assert maybe_mte is None -- 2.34.1 From 28af4749ccc9d49dcf8cb605d1806714e0b370dd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 09:05:23 -0400 Subject: [PATCH 341/378] Don't need to pack an `Error` with send-side MTEs --- tractor/msg/_ops.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 6f472afd..97cd3f29 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -269,19 +269,6 @@ class PldRx(Struct): is_invalid_payload=True, expected_msg=expect_msg, ) - err_msg: Error = pack_error( - exc=mte, - cid=msg.cid, - src_uid=( - ipc.chan.uid - if not is_started_send_side - else ipc._actor.uid - ), - # tb=valerr.__traceback__, - tb_str=mte._message, - ) - mte._ipc_msg = err_msg - # NOTE: just raise the MTE inline instead of all # the pack-unpack-repack non-sense when this is # a "send side" validation error. 
@@ -301,6 +288,22 @@ class PldRx(Struct): # ctx: Context = getattr(ipc, 'ctx', ipc) # ctx._maybe_cancel_and_set_remote_error(mte) + # NOTE: the `.message` is automatically + # transferred into the message as long as we + # define it as a `Error.message` field. + err_msg: Error = pack_error( + exc=mte, + cid=msg.cid, + src_uid=( + ipc.chan.uid + if not is_started_send_side + else ipc._actor.uid + ), + # tb=valerr.__traceback__, + # tb_str=mte._message, + # message=mte._message, + ) + mte._ipc_msg = err_msg # XXX override the `msg` passed to # `_raise_from_unexpected_msg()` (below) so so -- 2.34.1 From 6e54abc56d0721fe1a2d33420d66d41fce2707f6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 09:06:10 -0400 Subject: [PATCH 342/378] Fix missing newline in task-cancel log-message --- tractor/_rpc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index f05a433e..daf1ec82 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -393,7 +393,7 @@ async def _errors_relayed_via_ipc( # cancel scope will not have been inserted yet if is_rpc: log.warning( - 'RPC task likely errored or cancelled before start?' + 'RPC task likely errored or cancelled before start?\n' f'|_{ctx._task}\n' f' >> {ctx.repr_rpc}\n' ) -- 2.34.1 From 1db5d4def29ff8715cafaf1788e9951ced300c5e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 09:14:04 -0400 Subject: [PATCH 343/378] Add `Error.message: str` Allows passing a custom error msg other then the traceback-str over the wire. Make `.tb_str` optional (in the blank `''` sense) since it's treated that way thus far in `._exceptions.pack_error()`. --- tractor/msg/types.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 08511ec0..ad6d6fb8 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -410,7 +410,13 @@ class Error( src_type_str: str boxed_type_str: str relay_path: list[tuple[str, str]] - tb_str: str + + # normally either both are provided or just + # a message for certain special cases where + # we pack a message for a locally raised + # mte or ctxc. + message: str|None = None + tb_str: str = '' # TODO: only optionally include sub-type specfic fields? # -[ ] use UNSET or don't include them via `omit_defaults` (see -- 2.34.1 From 0e8c60ee4aab56c4668f192b541bd7804256e6f1 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 10:04:54 -0400 Subject: [PATCH 344/378] Better RAE `.pformat()`-ing for send-side MTEs Send-side `MsgTypeError`s actually shouldn't have any "boxed" traceback per say since they're raised in the transmitting actor's local task env and we (normally) don't want the ascii decoration added around the error's `._message: str`, that is not until the exc is `pack_error()`-ed before transit. As such, the presentation of an embedded traceback (and its ascii box) gets bypassed when only a `._message: str` is set (as we now do for pld-spec failures in `_mk_msg_type_err()`). Further this tweaks the `.pformat()` output to include the `._message` part to look like ` ) ..` instead of jamming it implicitly to the end of the embedded `.tb_str` (as was done implicitly by `unpack_error()`) and also adds better handling for the `with_type_header == False` case including forcing that case when we detect that the currently handled exc is the RAE in `.pformat()`. Toss in a lengthier doc-str explaining it all. 
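
As a rough toy model of that presentation rule (NOT the real
`RemoteActorError.pformat()` code; names and exact layout below are
illustrative only): only ascii-box an embedded traceback when there actually
is one, and only add the `<TypeName( .. )>` margins when a type header is
requested:

    import textwrap

    def pformat_sketch(
        exc_type_name: str,
        message: str,
        tb_str: str = '',
        with_type_header: bool = True,
    ) -> str:
        # toy model only: send-side MTEs carry just a `message` (no
        # boxed tb-str) so they skip the ascii box entirely.
        header = f'<{exc_type_name}(' if with_type_header else ''
        tail = ' )>' if with_type_header else ''
        body = ''
        if tb_str:
            # emulate the "boxed" remote traceback decoration
            body = '\n' + textwrap.indent(tb_str, prefix=' |_')
        return f'{header} {message}{body}{tail}'

    # local/send-side style: no box, just the message
    print(pformat_sketch(
        'MsgTypeError',
        'invalid `Started` msg payload',
        with_type_header=False,
    ))
    # remote/boxed style: message + boxed tb + type margins
    print(pformat_sketch(
        'MsgTypeError',
        'remote task raised a `ValueError`',
        tb_str='Traceback (most recent call last):\n  ...\n',
    ))
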
Surrounding/supporting changes, - better `unpack_error()` message which just briefly reports the remote task's error type. - add public `.message: str` prop. - always set a `._extra_msgdata: dict` since some MTE props rely on it. - handle `.boxed_type == None` for `.boxed_type_str`. - maybe pack any detected input or `exc.message` in `pack_error()`. - comment cruft cleanup in `_mk_msg_type_err()`. --- tractor/_exceptions.py | 199 +++++++++++++++++++++++++---------------- 1 file changed, 124 insertions(+), 75 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 0dfaf675..52048c17 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -22,6 +22,7 @@ from __future__ import annotations import builtins import importlib from pprint import pformat +import sys from types import ( TracebackType, ) @@ -110,6 +111,7 @@ _body_fields: list[str] = list( 'tb_str', 'relay_path', 'cid', + 'message', # only ctxc should show it but `Error` does # have it as an optional field. @@ -236,6 +238,7 @@ class RemoteActorError(Exception): self._boxed_type: BaseException = boxed_type self._src_type: BaseException|None = None self._ipc_msg: Error|None = ipc_msg + self._extra_msgdata = extra_msgdata if ( extra_msgdata @@ -250,8 +253,6 @@ class RemoteActorError(Exception): k, v, ) - else: - self._extra_msgdata = extra_msgdata # TODO: mask out eventually or place in `pack_error()` # pre-`return` lines? @@ -282,6 +283,17 @@ class RemoteActorError(Exception): # ensure any roundtripping evals to the input value assert self.boxed_type is boxed_type + @property + def message(self) -> str: + ''' + Be explicit, instead of trying to read it from the the parent + type's loosely defined `.args: tuple`: + + https://docs.python.org/3/library/exceptions.html#BaseException.args + + ''' + return self._message + @property def ipc_msg(self) -> Struct: ''' @@ -355,7 +367,10 @@ class RemoteActorError(Exception): ''' bt: Type[BaseException] = self.boxed_type - return str(bt.__name__) + if bt: + return str(bt.__name__) + + return '' @property def boxed_type(self) -> Type[BaseException]: @@ -426,8 +441,7 @@ class RemoteActorError(Exception): for key in fields: if ( - key == 'relay_uid' - and not self.is_inception() + key == 'relay_uid' and not self.is_inception() ): continue @@ -504,19 +518,80 @@ class RemoteActorError(Exception): def pformat( self, with_type_header: bool = True, + # with_ascii_box: bool = True, + ) -> str: ''' - Nicely formatted boxed error meta data + traceback, OR just - the normal message from `.args` (for eg. as you'd want shown - by a locally raised `ContextCancelled`). + Format any boxed remote error by multi-line display of, + + - error's src or relay actor meta-data, + - remote runtime env's traceback, + + With optional control over the format of, + + - whether the boxed traceback is ascii-decorated with + a surrounding "box" annotating the embedded stack-trace. + - if the error's type name should be added as margins + around the field and tb content like: + + `> .. )>` + + - the placement of the `.message: str` (explicit equiv of + `.args[0]`), either placed below the `.tb_str` or in the + first line's header when the error is raised locally (since + the type name is already implicitly shown by python). ''' header: str = '' body: str = '' + message: str = '' + # XXX when the currently raised exception is this instance, + # we do not ever use the "type header" style repr. 
+ is_being_raised: bool = False + if ( + (exc := sys.exception()) + and + exc is self + ): + is_being_raised: bool = True + + with_type_header: bool = ( + with_type_header + and + not is_being_raised + ) + + # style if with_type_header: - header: str = f'<{type(self).__name__}(\n' + header: str = f'<{type(self).__name__}(' + if message := self._message: + + # split off the first line so, if needed, it isn't + # indented the same like the "boxed content" which + # since there is no `.tb_str` is just the `.message`. + lines: list[str] = message.splitlines() + first: str = lines[0] + message: str = message.removeprefix(first) + + # with a type-style header we, + # - have no special message "first line" extraction/handling + # - place the message a space in from the header: + # `MsgTypeError( ..` + # ^-here + # - indent the `.message` inside the type body. + if with_type_header: + first = f' {first} )>' + + message: str = textwrap.indent( + message, + prefix=' '*2, + ) + message: str = first + message + + # IFF there is an embedded traceback-str we always + # draw the ascii-box around it. if tb_str := self.tb_str: fields: str = self._mk_fields_str( _body_fields @@ -535,36 +610,19 @@ class RemoteActorError(Exception): # |___ .. tb_body_indent=1, ) - if not with_type_header: - body = '\n' + body - elif message := self._message: - # split off the first line so it isn't indented - # the same like the "boxed content". - if not with_type_header: - lines: list[str] = message.splitlines() - first: str = lines[0] - message: str = message.removeprefix(first) - - else: - first: str = '' - - body: str = ( - first - + - message - + - '\n' - ) - - if with_type_header: - tail: str = ')>' - else: - tail = '' + tail = '' + if ( + with_type_header + and not message + ): + tail: str = '>' return ( header + + message + + f'{body}' + tail @@ -577,7 +635,9 @@ class RemoteActorError(Exception): # |_ i guess `pexepect` relies on `str`-casing # of output? def __str__(self) -> str: - return self.pformat(with_type_header=False) + return self.pformat( + with_type_header=False + ) def unwrap( self, @@ -825,9 +885,6 @@ class MsgTypeError( extra_msgdata['_bad_msg'] = bad_msg extra_msgdata['cid'] = bad_msg.cid - if 'cid' not in extra_msgdata: - import pdbp; pdbp.set_trace() - return cls( message=message, boxed_type=cls, @@ -889,6 +946,7 @@ def pack_error( src_uid: tuple[str, str]|None = None, tb: TracebackType|None = None, tb_str: str = '', + message: str = '', ) -> Error: ''' @@ -971,7 +1029,7 @@ def pack_error( # the locally raised error (so, NOT the prior relay's boxed # `._ipc_msg.tb_str`). error_msg['tb_str'] = tb_str - + error_msg['message'] = message or getattr(exc, 'message', '') if cid is not None: error_msg['cid'] = cid @@ -995,26 +1053,24 @@ def unpack_error( if not isinstance(msg, Error): return None - # retrieve the remote error's msg-encoded details - tb_str: str = msg.tb_str - message: str = ( - f'{chan.uid}\n' - + - tb_str - ) - # try to lookup a suitable error type from the local runtime # env then use it to construct a local instance. 
# boxed_type_str: str = error_dict['boxed_type_str'] boxed_type_str: str = msg.boxed_type_str boxed_type: Type[BaseException] = get_err_type(boxed_type_str) - if boxed_type_str == 'ContextCancelled': - box_type = ContextCancelled - assert boxed_type is box_type + # retrieve the error's msg-encoded remotoe-env info + message: str = f'remote task raised a {msg.boxed_type_str!r}\n' - elif boxed_type_str == 'MsgTypeError': - box_type = MsgTypeError + # TODO: do we even really need these checks for RAEs? + if boxed_type_str in [ + 'ContextCancelled', + 'MsgTypeError', + ]: + box_type = { + 'ContextCancelled': ContextCancelled, + 'MsgTypeError': MsgTypeError, + }[boxed_type_str] assert boxed_type is box_type # TODO: already included by `_this_mod` in else loop right? @@ -1029,19 +1085,21 @@ def unpack_error( exc = box_type( message, ipc_msg=msg, + tb_str=msg.tb_str, ) return exc -def is_multi_cancelled(exc: BaseException) -> bool: +def is_multi_cancelled( + exc: BaseException|BaseExceptionGroup +) -> bool: ''' Predicate to determine if a possible ``BaseExceptionGroup`` contains only ``trio.Cancelled`` sub-exceptions (and is likely the result of cancelling a collection of subtasks. ''' - # if isinstance(exc, eg.BaseExceptionGroup): if isinstance(exc, BaseExceptionGroup): return exc.subgroup( lambda exc: isinstance(exc, trio.Cancelled) @@ -1184,7 +1242,6 @@ def _mk_msg_type_err( src_validation_error: ValidationError|None = None, src_type_error: TypeError|None = None, is_invalid_payload: bool = False, - # src_err_msg: Error|None = None, **mte_kwargs, @@ -1251,19 +1308,11 @@ def _mk_msg_type_err( msg_type: str = type(msg) any_pld: Any = msgpack.decode(msg.pld) message: str = ( - f'invalid `{msg_type.__qualname__}` payload\n\n' - f'value: `{any_pld!r}` does not match type-spec: ' #\n' + f'invalid `{msg_type.__qualname__}` msg payload\n\n' + f'value: `{any_pld!r}` does not match type-spec: ' f'`{type(msg).__qualname__}.pld: {codec.pld_spec_str}`' - # f'<{type(msg).__qualname__}(\n' - # f' |_pld: {codec.pld_spec_str}\n'# != {any_pld!r}\n' - # f')>\n\n' ) - # src_err_msg = msg bad_msg = msg - # TODO: should we just decode the msg to a dict despite - # only the payload being wrong? - # -[ ] maybe the better design is to break this construct - # logic into a separate explicit helper raiser-func? else: # decode the msg-bytes using the std msgpack @@ -1308,21 +1357,21 @@ def _mk_msg_type_err( if verb_header: message = f'{verb_header} ' + message - # if not isinstance(bad_msg, PayloadMsg): - # import pdbp; pdbp.set_trace() - msgtyperr = MsgTypeError.from_decode( message=message, bad_msg=bad_msg, bad_msg_as_dict=msg_dict, - # NOTE: for the send-side `.started()` pld-validate - # case we actually set the `._ipc_msg` AFTER we return - # from here inside `Context.started()` since we actually - # want to emulate the `Error` from the mte we build here - # Bo - # so by default in that case this is set to `None` - # ipc_msg=src_err_msg, + # NOTE: for pld-spec MTEs we set the `._ipc_msg` manually: + # - for the send-side `.started()` pld-validate + # case we actually raise inline so we don't need to + # set the it at all. + # - for recv side we set it inside `PldRx.decode_pld()` + # after a manual call to `pack_error()` since we + # actually want to emulate the `Error` from the mte we + # build here. So by default in that case, this is left + # as `None` here. 
+ # ipc_msg=src_err_msg, ) msgtyperr.__cause__ = src_validation_error return msgtyperr -- 2.34.1 From bbb4d4e52cdbd6d83c7149961a1aa06969b1f6f9 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 15:24:25 -0400 Subject: [PATCH 345/378] Add `from_src_exc: BaseException` to maybe raisers That is as a control to `Context._maybe_raise_remote_err()` such that if set to anything other then the default (`False` value), we do `raise remote_error from from_src_exc` such that caller can choose to suppress or override the `.__cause__` tb. Also tidy up and old masked TODO regarding calling `.maybe_raise()` after the caller exits from the `yield` in `.open_context()`.. --- tractor/_context.py | 51 ++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 142cb1ab..81db66c3 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1190,6 +1190,7 @@ class Context: self, remote_error: Exception, + from_src_exc: BaseException|None|bool = False, raise_ctxc_from_self_call: bool = False, raise_overrun_from_self: bool = True, hide_tb: bool = True, @@ -1284,7 +1285,10 @@ class Context: # runtime frames from the tb explicitly? # https://docs.python.org/3/reference/simple_stmts.html#the-raise-statement # https://stackoverflow.com/a/24752607 - raise remote_error # from None + if from_src_exc is not False: + raise remote_error from from_src_exc + + raise remote_error # TODO: change to `.wait_for_result()`? async def result( @@ -2096,7 +2100,11 @@ async def open_context_from_portal( # `._maybe_cancel_and_set_remote_error()` so ensure # we raise the underlying `._remote_error` directly # instead of bubbling that taskc. - ctx.maybe_raise() + ctx.maybe_raise( + # mask the above taskc from the tb + from_src_exc=None, + hide_tb=hide_tb, + ) # OW, some other unexpected cancel condition # that should prolly never happen right? @@ -2108,13 +2116,14 @@ async def open_context_from_portal( ctx._started_msg: bool = started_msg ctx._started_pld: bool = first - # deliver context instance and .started() msg value - # in enter tuple. + # deliver context ref and `.started()` msg payload value + # in `__aenter__` tuple. yield ctx, first # ??TODO??: do we still want to consider this or is # the `else:` block handling via a `.result()` # call below enough?? + # # -[ ] pretty sure `.result()` internals do the # same as our ctxc handler below so it ended up # being same (repeated?) behaviour, but ideally we @@ -2123,33 +2132,13 @@ async def open_context_from_portal( # that we can re-use it around the `yield` ^ here # or vice versa? # - # NOTE: between the caller exiting and arriving - # here the far end may have sent a ctxc-msg or - # other error, so check for it here immediately - # and maybe raise so as to engage the ctxc - # handling block below! + # maybe TODO NOTE: between the caller exiting and + # arriving here the far end may have sent a ctxc-msg or + # other error, so the quetion is whether we should check + # for it here immediately and maybe raise so as to engage + # the ctxc handling block below ???? # - # if re := ctx._remote_error: - # maybe_ctxc: ContextCancelled|None = ctx._maybe_raise_remote_err( - # re, - # # TODO: do we want this to always raise? - # # - means that on self-ctxc, if/when the - # # block is exited before the msg arrives - # # but then the msg during __exit__ - # # calling we may not activate the - # # ctxc-handler block below? should we - # # be? 
- # # - if there's a remote error that arrives - # # after the child has exited, we won't - # # handle until the `finally:` block - # # where `.result()` is always called, - # # again in which case we handle it - # # differently then in the handler block - # # that would normally engage from THIS - # # block? - # raise_ctxc_from_self_call=True, - # ) - # ctxc_from_callee = maybe_ctxc + # self.maybe_raise() # when in allow_overruns mode there may be # lingering overflow sender tasks remaining? @@ -2460,7 +2449,7 @@ async def open_context_from_portal( # # NOTE: further, this should be the only place the # underlying feeder channel is - # once-and-only-CLOSED! + # once-forever-and-only-CLOSED! with trio.CancelScope(shield=True): await ctx._rx_chan.aclose() -- 2.34.1 From 993281882b2b76beb511aeb15334c18f71865207 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 15:55:34 -0400 Subject: [PATCH 346/378] Pass `boxed_type` from `_mk_msg_type_err()` Such that we're boxing the interchanged lib's specific error `msgspec.ValidationError` in this case) type much like how a `ContextCancelled[trio.Cancelled]` is composed; allows for seemless multi-backend-codec support later as well B) Pass `ctx.maybe_raise(from_src_exc=src_err)` where needed in a couple spots; as `None` in the send-side `Started` MTE case to avoid showing the `._scope1.cancel_called` result in the traceback from the `.open_context()` child-sync phase. --- tractor/_exceptions.py | 8 ++++---- tractor/_streaming.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 52048c17..92c3fafb 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -518,7 +518,6 @@ class RemoteActorError(Exception): def pformat( self, with_type_header: bool = True, - # with_ascii_box: bool = True, ) -> str: ''' @@ -885,9 +884,9 @@ class MsgTypeError( extra_msgdata['_bad_msg'] = bad_msg extra_msgdata['cid'] = bad_msg.cid + extra_msgdata.setdefault('boxed_type', cls) return cls( message=message, - boxed_type=cls, **extra_msgdata, ) @@ -1111,7 +1110,7 @@ def is_multi_cancelled( def _raise_from_unexpected_msg( ctx: Context, msg: MsgType, - src_err: AttributeError, + src_err: Exception, log: StackLevelAdapter, # caller specific `log` obj expect_msg: Type[MsgType], @@ -1212,7 +1211,7 @@ def _raise_from_unexpected_msg( # in case there already is some underlying remote error # that arrived which is probably the source of this stream # closure - ctx.maybe_raise() + ctx.maybe_raise(from_src_exc=src_err) raise eoc from src_err # TODO: our own transport/IPC-broke error subtype? 
@@ -1361,6 +1360,7 @@ def _mk_msg_type_err( message=message, bad_msg=bad_msg, bad_msg_as_dict=msg_dict, + boxed_type=type(src_validation_error), # NOTE: for pld-spec MTEs we set the `._ipc_msg` manually: # - for the send-side `.started()` pld-validate diff --git a/tractor/_streaming.py b/tractor/_streaming.py index a008eaf5..016577d3 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -233,6 +233,7 @@ class MsgStream(trio.abc.Channel): # ctx: Context = self._ctx ctx.maybe_raise( raise_ctxc_from_self_call=True, + from_src_exc=src_err, ) # propagate any error but hide low-level frame details -- 2.34.1 From fcd089c08f95179da26833ab10921806a586fca6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 16:02:25 -0400 Subject: [PATCH 347/378] Always `.exception()` in `try_ship_error_to_remote()` on internal error --- tractor/_rpc.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index daf1ec82..8a1be7b2 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -80,7 +80,6 @@ from tractor.msg.types import ( Yield, ) - if TYPE_CHECKING: from ._runtime import Actor @@ -328,7 +327,6 @@ async def _errors_relayed_via_ipc( f'|_{ctx}' ) - # ALWAYS try to ship RPC errors back to parent/caller task if is_rpc: @@ -819,6 +817,12 @@ async def try_ship_error_to_remote( # TODO: use `.msg.preetty_struct` for this! f'{msg}\n' ) + except BaseException: + log.exception( + 'Errored while attempting error shipment?' + ) + __tracebackhide__: bool = False + raise async def process_messages( -- 2.34.1 From cdb1311e40c1d771a3b56bc16507bb2b45424baf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 16:02:59 -0400 Subject: [PATCH 348/378] Change `reraise` to `post_mortem: bool` in `maybe_expect_raises()` --- tests/test_pldrx_limiting.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py index 0c5c7ee8..5cb0d357 100644 --- a/tests/test_pldrx_limiting.py +++ b/tests/test_pldrx_limiting.py @@ -46,7 +46,7 @@ maybe_msg_spec = PldMsg|None async def maybe_expect_raises( raises: BaseException|None = None, ensure_in_message: list[str]|None = None, - reraise: bool = False, + post_mortem: bool = False, timeout: int = 3, ) -> None: ''' @@ -86,8 +86,8 @@ async def maybe_expect_raises( f'{inner_err.args}' ) - if reraise: - raise inner_err + if post_mortem: + await tractor.post_mortem() else: if raises: @@ -314,6 +314,8 @@ def test_basic_payload_spec( f"value: `{bad_value_str}` does not " f"match type-spec: `{msg_type_str}.pld: PldMsg|NoneType`", ], + # only for debug + post_mortem=True, ), p.open_context( child, -- 2.34.1 From 2f854a3e86d898045c3bbf093e0df14e89a2b339 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 16:03:28 -0400 Subject: [PATCH 349/378] Add a `tractor.post_mortem()` API test + example Since turns out we didn't have a single example using that API Bo The test granular-ly checks all use cases: - `.post_mortem()` manual calls in both subactor and root. - ensuring built-in RPC crash handling activates after each manual one from ^. - drafted some call-stack frame checking that i commented out for now since we need to first do ANSI escape code removal due to the colorization that `pdbp` does by default. |_ added a TODO with SO link on `assert_before()`. 
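For the record, the stripping itself is just a regex sub with the pattern from
that SO answer; something like the below (hypothetical `strip_ansi()` helper,
not yet added to the test suite) should let `assert_before()` match frame
output from `ll` and friends:

    import re

    # 7-bit C1 ANSI escape sequences, per https://stackoverflow.com/a/14693789
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

    def strip_ansi(text: str) -> str:
        '''
        Drop the color/style codes `pdbp` emits so `pexpect` captures
        can be matched against plain-text patterns.

        '''
        return ansi_escape.sub('', text)

    assert strip_ansi('\x1b[31m-> async with p.open_context(\x1b[0m') == (
        '-> async with p.open_context('
    )
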
Also todo-staged a shielded-pause test to match with the already existing-but-needs-refinement example B) --- examples/debugging/pm_in_subactor.py | 56 ++++++++++++ tests/test_debugger.py | 122 ++++++++++++++++++++++++++- 2 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 examples/debugging/pm_in_subactor.py diff --git a/examples/debugging/pm_in_subactor.py b/examples/debugging/pm_in_subactor.py new file mode 100644 index 00000000..a8f5048e --- /dev/null +++ b/examples/debugging/pm_in_subactor.py @@ -0,0 +1,56 @@ +import trio +import tractor + + +@tractor.context +async def name_error( + ctx: tractor.Context, +): + ''' + Raise a `NameError`, catch it and enter `.post_mortem()`, then + expect the `._rpc._invoke()` crash handler to also engage. + + ''' + try: + getattr(doggypants) # noqa (on purpose) + except NameError: + await tractor.post_mortem() + raise + + +async def main(): + ''' + Test 3 `PdbREPL` entries: + - one in the child due to manual `.post_mortem()`, + - another in the child due to runtime RPC crash handling. + - final one here in parent from the RAE. + + ''' + # XXX NOTE: ideally the REPL arrives at this frame in the parent + # ONE UP FROM the inner ctx block below! + async with tractor.open_nursery( + debug_mode=True, + # loglevel='cancel', + ) as an: + p: tractor.Portal = await an.start_actor( + 'child', + enable_modules=[__name__], + ) + + # XXX should raise `RemoteActorError[NameError]` + # AND be the active frame when REPL enters! + try: + async with p.open_context(name_error) as (ctx, first): + assert first + except tractor.RemoteActorError as rae: + assert rae.boxed_type is NameError + + # manually handle in root's parent task + await tractor.post_mortem() + raise + else: + raise RuntimeError('IPC ctx should have remote errored!?') + + +if __name__ == '__main__': + trio.run(main) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 9d159ffe..a673c5d0 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -161,6 +161,10 @@ def in_prompt_msg( return True + +# TODO: todo support terminal color-chars stripping so we can match +# against call stack frame output from the the 'll' command the like! +# -[ ] SO answer for stipping ANSI codes: https://stackoverflow.com/a/14693789 def assert_before( child, patts: list[str], @@ -1125,7 +1129,112 @@ def test_pause_from_sync( child.expect(pexpect.EOF) -# TODO! +def test_post_mortem_api( + spawn, + ctlc: bool, +): + ''' + Verify the `tractor.post_mortem()` API works in an exception + handler block. + + ''' + child = spawn('pm_in_subactor') + + # First entry is via manual `.post_mortem()` + child.expect(PROMPT) + assert_before( + child, + [ + _crash_msg, + " async with p.open_context(name_error) as (ctx, first):', + # ] + # ) + + # # step up a frame to ensure the it's the root's nursery + # child.sendline('u') + # child.expect(PROMPT) + # assert_before( + # child, + # [ + # # handler block annotation + # '-> async with tractor.open_nursery(', + # ] + # ) + + child.sendline('c') + child.expect(pexpect.EOF) + + +# TODO: needs ANSI code stripping tho, see `assert_before()` # above! def test_correct_frames_below_hidden(): ''' Ensure that once a `tractor.pause()` enages, when the user @@ -1138,4 +1247,15 @@ def test_correct_frames_below_hidden(): def test_cant_pause_from_paused_task(): + ''' + Pausing from with an already paused task should raise an error. 
+ + Normally this should only happen in practise while debugging the call stack of `tractor.pause()` itself, likely + by a `.pause()` line somewhere inside our runtime. + + ''' + ... + + +def test_shield_pause(): ... -- 2.34.1 From 13ea500a44130ae25c8a93e3e9bcc8356625c427 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 16:09:59 -0400 Subject: [PATCH 350/378] Rename `PldRx.dec_msg()` -> `.decode_pld()` Keep the old alias, but i think it's better form to use longer names for internal public APIs and this name better reflects the functionality: decoding and returning a `PayloadMsg.pld` field. --- tractor/msg/_ops.py | 54 +++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 97cd3f29..86f80395 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -167,7 +167,7 @@ class PldRx(Struct): ipc_msg: MsgType|None = None, expect_msg: Type[MsgType]|None = None, hide_tb: bool = False, - **dec_msg_kwargs, + **dec_pld_kwargs, ) -> Any|Raw: __tracebackhide__: bool = hide_tb @@ -179,12 +179,12 @@ class PldRx(Struct): # sync-rx msg from underlying IPC feeder (mem-)chan ipc._rx_chan.receive_nowait() ) - return self.dec_msg( + return self.decode_pld( msg, ipc=ipc, expect_msg=expect_msg, hide_tb=hide_tb, - **dec_msg_kwargs, + **dec_pld_kwargs, ) async def recv_pld( @@ -194,7 +194,7 @@ class PldRx(Struct): expect_msg: Type[MsgType]|None = None, hide_tb: bool = True, - **dec_msg_kwargs, + **dec_pld_kwargs, ) -> Any|Raw: ''' @@ -208,17 +208,14 @@ class PldRx(Struct): # async-rx msg from underlying IPC feeder (mem-)chan await ipc._rx_chan.receive() ) - return self.dec_msg( + return self.decode_pld( msg=msg, ipc=ipc, expect_msg=expect_msg, - **dec_msg_kwargs, + **dec_pld_kwargs, ) - # TODO: rename to, - # -[ ] `.decode_pld()`? - # -[ ] `.dec_pld()`? - def dec_msg( + def decode_pld( self, msg: MsgType, ipc: Context|MsgStream, @@ -299,9 +296,6 @@ class PldRx(Struct): if not is_started_send_side else ipc._actor.uid ), - # tb=valerr.__traceback__, - # tb_str=mte._message, - # message=mte._message, ) mte._ipc_msg = err_msg @@ -317,29 +311,6 @@ class PldRx(Struct): # validation error. src_err = valerr - # TODO: should we instead make this explicit and - # use the above masked `is_started_send_decode`, - # expecting the `Context.started()` caller to set - # it? Rn this is kinda, howyousayyy, implicitly - # edge-case-y.. - # TODO: remove this since it's been added to - # `_raise_from_unexpected_msg()`..? - # if ( - # expect_msg is not Started - # and not is_started_send_side - # ): - # # set emulated remote error more-or-less as the - # # runtime would - # ctx: Context = getattr(ipc, 'ctx', ipc) - # ctx._maybe_cancel_and_set_remote_error(mte) - - # XXX some other decoder specific failure? - # except TypeError as src_error: - # from .devx import mk_pdb - # mk_pdb().set_trace() - # raise src_error - # ^-TODO-^ can remove? - # a runtime-internal RPC endpoint response. # always passthrough since (internal) runtime # responses are generally never exposed to consumer @@ -435,6 +406,8 @@ class PldRx(Struct): __tracebackhide__: bool = False raise + dec_msg = decode_pld + async def recv_msg_w_pld( self, ipc: Context|MsgStream, @@ -463,7 +436,7 @@ class PldRx(Struct): # TODO: is there some way we can inject the decoded # payload into an existing output buffer for the original # msg instance? 
- pld: PayloadT = self.dec_msg( + pld: PayloadT = self.decode_pld( msg, ipc=ipc, expect_msg=expect_msg, @@ -610,7 +583,10 @@ async def drain_to_final_msg( # only when we are sure the remote error is # the source cause of this local task's # cancellation. - ctx.maybe_raise() + ctx.maybe_raise( + # TODO: when use this/ + # from_src_exc=taskc, + ) # CASE 1: we DID request the cancel we simply # continue to bubble up as normal. @@ -783,7 +759,7 @@ def validate_payload_msg( try: roundtripped: Started = codec.decode(msg_bytes) ctx: Context = getattr(ipc, 'ctx', ipc) - pld: PayloadT = ctx.pld_rx.dec_msg( + pld: PayloadT = ctx.pld_rx.decode_pld( msg=roundtripped, ipc=ipc, expect_msg=Started, -- 2.34.1 From 8ea0f08386ec62721581a792156f62d124b0d2aa Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 17:52:24 -0400 Subject: [PATCH 351/378] Finally, officially support shielded REPL-ing! It's been a long time prepped and now finally implemented! Offer a `shield: bool` argument from our async `._debug` APIs: - `await tractor.pause(shield=True)`, - `await tractor.post_mortem(shield=True)` ^-These-^ can now be used inside cancelled `trio.CancelScope`s, something very handy when introspecting complex (distributed) system tear/shut-downs particularly under remote error or (inter-peer) cancellation conditions B) Thanks to previous prepping in a prior attempt and various patches from the rigorous rework of `.devx._debug` internals around typed msg specs, there ain't much that was needed! Impl deats - obvi passthrough `shield` from the public API endpoints (was already done from a prior attempt). - put ad-hoc internal `with trio.CancelScope(shield=shield):` around all checkpoints inside `._pause()` for both the root-process and subactor case branches. Add a fairly rigorous example, `examples/debugging/shielded_pause.py` with a wrapping `pexpect` test, `test_debugger.test_shield_pause()` and ensure it covers as many cases as i can think of offhand: - multiple `.pause()` entries in a loop despite parent scope cancellation in a subactor RPC task which itself spawns a sub-task. - a `trio.Nursery.parent_task` which raises, is handled and tries to enter and unshielded `.post_mortem()`, which of course internally raises `Cancelled` in a `._pause()` checkpoint, so we catch the `Cancelled` again and then debug the debugger's internal cancellation with specific checks for the particular raising checkpoint-LOC. - do ^- the latter -^ for both subactor and root cases to ensure we can debug `._pause()` itself when it tries to REPL engage from a cancelled task scope Bo --- examples/debugging/shielded_pause.py | 88 ++++++++++++++++++++++++++++ tests/test_debugger.py | 79 +++++++++++++++++++++++-- tractor/devx/_debug.py | 59 +++++++++++-------- 3 files changed, 199 insertions(+), 27 deletions(-) create mode 100644 examples/debugging/shielded_pause.py diff --git a/examples/debugging/shielded_pause.py b/examples/debugging/shielded_pause.py new file mode 100644 index 00000000..3e34d8fc --- /dev/null +++ b/examples/debugging/shielded_pause.py @@ -0,0 +1,88 @@ +import trio +import tractor + + +async def cancellable_pause_loop( + task_status: trio.TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED +): + with trio.CancelScope() as cs: + task_status.started(cs) + for _ in range(3): + try: + # ON first entry, there is no level triggered + # cancellation yet, so this cp does a parent task + # ctx-switch so that this scope raises for the NEXT + # checkpoint we hit. 
+ await trio.lowlevel.checkpoint() + await tractor.pause() + + cs.cancel() + + # parent should have called `cs.cancel()` by now + await trio.lowlevel.checkpoint() + + except trio.Cancelled: + print('INSIDE SHIELDED PAUSE') + await tractor.pause(shield=True) + else: + # should raise it again, bubbling up to parent + print('BUBBLING trio.Cancelled to parent task-nursery') + await trio.lowlevel.checkpoint() + + +async def pm_on_cancelled(): + async with trio.open_nursery() as tn: + tn.cancel_scope.cancel() + try: + await trio.sleep_forever() + except trio.Cancelled: + # should also raise `Cancelled` since + # we didn't pass `shield=True`. + try: + await tractor.post_mortem(hide_tb=False) + except trio.Cancelled as taskc: + + # should enter just fine, in fact it should + # be debugging the internals of the previous + # sin-shield call above Bo + await tractor.post_mortem( + hide_tb=False, + shield=True, + ) + raise taskc + + else: + raise RuntimeError('Dint cancel as expected!?') + + +async def cancelled_before_pause( +): + ''' + Verify that using a shielded pause works despite surrounding + cancellation called state in the calling task. + + ''' + async with trio.open_nursery() as tn: + cs: trio.CancelScope = await tn.start(cancellable_pause_loop) + await trio.sleep(0.1) + + assert cs.cancelled_caught + + await pm_on_cancelled() + + +async def main(): + async with tractor.open_nursery( + debug_mode=True, + ) as n: + portal: tractor.Portal = await n.run_in_actor( + cancelled_before_pause, + ) + await portal.result() + + # ensure the same works in the root actor! + await pm_on_cancelled() + + +if __name__ == '__main__': + trio.run(main) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index a673c5d0..72778bda 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -1234,6 +1234,81 @@ def test_post_mortem_api( child.expect(pexpect.EOF) +def test_shield_pause( + spawn, +): + ''' + Verify the `tractor.pause()/.post_mortem()` API works inside an + already cancelled `trio.CancelScope` and that you can step to the + next checkpoint wherein the cancelled will get raised. + + ''' + child = spawn('shielded_pause') + + # First entry is via manual `.post_mortem()` + child.expect(PROMPT) + assert_before( + child, + [ + _pause_msg, + "cancellable_pause_loop'", + "('cancelled_before_pause'", # actor name + ] + ) + + # since 3 tries in ex. shield pause loop + for i in range(3): + child.sendline('c') + child.expect(PROMPT) + assert_before( + child, + [ + _pause_msg, + "INSIDE SHIELDED PAUSE", + "('cancelled_before_pause'", # actor name + ] + ) + + # back inside parent task that opened nursery + child.sendline('c') + child.expect(PROMPT) + assert_before( + child, + [ + _crash_msg, + "('cancelled_before_pause'", # actor name + "Failed to engage debugger via `_pause()`", + "trio.Cancelled", + "raise Cancelled._create()", + + # we should be handling a taskc inside + # the first `.port_mortem()` sin-shield! + 'await DebugStatus.req_finished.wait()', + ] + ) + + # same as above but in the root actor's task + child.sendline('c') + child.expect(PROMPT) + assert_before( + child, + [ + _crash_msg, + "('root'", # actor name + "Failed to engage debugger via `_pause()`", + "trio.Cancelled", + "raise Cancelled._create()", + + # handling a taskc inside the first unshielded + # `.port_mortem()`. 
+ # BUT in this case in the root-proc path ;) + 'wait Lock._debug_lock.acquire()', + ] + ) + child.sendline('c') + child.expect(pexpect.EOF) + + # TODO: needs ANSI code stripping tho, see `assert_before()` # above! def test_correct_frames_below_hidden(): ''' @@ -1255,7 +1330,3 @@ def test_cant_pause_from_paused_task(): ''' ... - - -def test_shield_pause(): - ... diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 753c1985..2f0e7e12 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -1600,25 +1600,27 @@ async def _pause( f'REPL: {Lock.repl}\n' # TODO: use `._frame_stack` scanner to find the @api_frame ) - await trio.lowlevel.checkpoint() + with trio.CancelScope(shield=shield): + await trio.lowlevel.checkpoint() return # XXX: since we need to enter pdb synchronously below, # we have to release the lock manually from pdb completion # callbacks. Can't think of a nicer way then this atm. - if Lock._debug_lock.locked(): - log.warning( - 'attempting to shield-acquire active TTY lock owned by\n' - f'{ctx}' - ) + with trio.CancelScope(shield=shield): + if Lock._debug_lock.locked(): + log.warning( + 'attempting to shield-acquire active TTY lock owned by\n' + f'{ctx}' + ) - # must shield here to avoid hitting a ``Cancelled`` and - # a child getting stuck bc we clobbered the tty - with trio.CancelScope(shield=True): + # must shield here to avoid hitting a ``Cancelled`` and + # a child getting stuck bc we clobbered the tty + # with trio.CancelScope(shield=True): + await Lock._debug_lock.acquire() + else: + # may be cancelled await Lock._debug_lock.acquire() - else: - # may be cancelled - await Lock._debug_lock.acquire() # enter REPL from root, no TTY locking IPC ctx necessary _enter_repl_sync(debug_func) @@ -1659,7 +1661,8 @@ async def _pause( f'{task.name}@{actor.uid} already has TTY lock\n' f'ignoring..' ) - await trio.lowlevel.checkpoint() + with trio.CancelScope(shield=shield): + await trio.lowlevel.checkpoint() return else: @@ -1671,8 +1674,9 @@ async def _pause( f'{task}@{actor.uid} already has TTY lock\n' f'waiting for release..' ) - await DebugStatus.repl_release.wait() - await trio.sleep(0.1) + with trio.CancelScope(shield=shield): + await DebugStatus.repl_release.wait() + await trio.sleep(0.1) elif ( req_task @@ -1683,7 +1687,8 @@ async def _pause( 'Waiting for previous request to complete..\n' ) - await DebugStatus.req_finished.wait() + with trio.CancelScope(shield=shield): + await DebugStatus.req_finished.wait() # this **must** be awaited by the caller and is done using the # root nursery so that the debugger can continue to run without @@ -1721,14 +1726,15 @@ async def _pause( 'Starting request task\n' f'|_{task}\n' ) - req_ctx: Context = await actor._service_n.start( - partial( - request_root_stdio_lock, - actor_uid=actor.uid, - task_uid=(task.name, id(task)), # task uuid (effectively) - shield=shield, + with trio.CancelScope(shield=shield): + req_ctx: Context = await actor._service_n.start( + partial( + request_root_stdio_lock, + actor_uid=actor.uid, + task_uid=(task.name, id(task)), # task uuid (effectively) + shield=shield, + ) ) - ) # XXX sanity, our locker task should be the one which # entered a new IPC ctx with the root actor, NOT the one # that exists around the task calling into `._pause()`. @@ -2147,6 +2153,13 @@ async def post_mortem( **_pause_kwargs, ) -> None: + ''' + `tractor`'s builtin async equivalient of `pdb.post_mortem()` + which can be used inside exception handlers. 
+ + It's also used for the crash handler when `debug_mode == True` ;) + + ''' __tracebackhide__: bool = hide_tb tb: TracebackType = tb or sys.exc_info()[2] -- 2.34.1 From d802c8aa90d63c99930279d86f065a5d927f2989 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 30 May 2024 18:33:25 -0400 Subject: [PATCH 352/378] Woops, set `post_mortem=False` by default again! --- tests/test_pldrx_limiting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py index 5cb0d357..ddf2a234 100644 --- a/tests/test_pldrx_limiting.py +++ b/tests/test_pldrx_limiting.py @@ -315,7 +315,7 @@ def test_basic_payload_spec( f"match type-spec: `{msg_type_str}.pld: PldMsg|NoneType`", ], # only for debug - post_mortem=True, + # post_mortem=True, ), p.open_context( child, -- 2.34.1 From 4a270f85ca940b15a67a5d9496dd88cc6847cd02 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 31 May 2024 12:03:18 -0400 Subject: [PATCH 353/378] Drop sub-decoder proto-cruft from `.msg._codec` It ended up getting necessarily implemented as the `PldRx` though at a different layer and won't be needed as part of `MsgCodec` most likely, though this original idea did provide the source of inspiration for how things work now! Also Move the commented TODO proto for a codec hook factory from `.types` to `._codec` where it prolly better fits and update some msg related todo/questions. --- tractor/msg/_codec.py | 196 ++++++++---------------------------------- tractor/msg/types.py | 62 +++++-------- 2 files changed, 58 insertions(+), 200 deletions(-) diff --git a/tractor/msg/_codec.py b/tractor/msg/_codec.py index e1c59e94..c1301bd2 100644 --- a/tractor/msg/_codec.py +++ b/tractor/msg/_codec.py @@ -52,10 +52,6 @@ from msgspec import ( msgpack, Raw, ) -# from trio.lowlevel import ( -# RunVar, -# RunVarToken, -# ) # TODO: see notes below from @mikenerone.. # from tricycle import TreeVar @@ -368,160 +364,16 @@ class MsgCodec(Struct): # https://jcristharif.com/msgspec/usage.html#typed-decoding return self._dec.decode(msg) - # TODO: a sub-decoder system as well? - # payload_msg_specs: Union[Type[Struct]] = Any - # see related comments in `.msg.types` - # _payload_decs: ( - # dict[ - # str, - # msgpack.Decoder, - # ] - # |None - # ) = None - # OR - # ) = { - # # pre-seed decoders for std-py-type-set for use when - # # `MsgType.pld == None|Any`. - # None: msgpack.Decoder(Any), - # Any: msgpack.Decoder(Any), - # } - # - # -[ ] do we still want to try and support the sub-decoder with - # `.Raw` technique in the case that the `Generic` approach gives - # future grief? - # - # -[ ] - # -> https://jcristharif.com/msgspec/api.html#raw - # - #def mk_pld_subdec( - # self, - # payload_types: Union[Type[Struct]], - #) -> msgpack.Decoder: - # # TODO: sub-decoder suppor for `.pld: Raw`? - # # => see similar notes inside `.msg.types`.. - # # - # # not sure we'll end up needing this though it might have - # # unforeseen advantages in terms of enabling encrypted - # # appliciation layer (only) payloads? - # # - # # register sub-payload decoders to load `.pld: Raw` - # # decoded `Msg`-packets using a dynamic lookup (table) - # # instead of a pre-defined msg-spec via `Generic` - # # parameterization. 
- # # - # ( - # tags, - # payload_dec, - # ) = mk_tagged_union_dec( - # tagged_structs=list(payload_types.__args__), - # ) - # # register sub-decoders by tag - # subdecs: dict[str, msgpack.Decoder]|None = self._payload_decs - # for name in tags: - # subdecs.setdefault( - # name, - # payload_dec, - # ) - - # return payload_dec - - # sub-decoders for retreiving embedded - # payload data and decoding to a sender - # side defined (struct) type. - # def dec_payload( - # codec: MsgCodec, - # msg: Msg, - - # ) -> Any|Struct: - - # msg: PayloadMsg = codec.dec.decode(msg) - # payload_tag: str = msg.header.payload_tag - # payload_dec: msgpack.Decoder = codec._payload_decs[payload_tag] - # return payload_dec.decode(msg.pld) - - # def enc_payload( - # codec: MsgCodec, - # payload: Any, - # cid: str, - - # ) -> bytes: - - # # tag_field: str|None = None - - # plbytes = codec.enc.encode(payload) - # if b'msg_type' in plbytes: - # assert isinstance(payload, Struct) - - # # tag_field: str = type(payload).__name__ - # payload = msgspec.Raw(plbytes) - - # msg = Msg( - # cid=cid, - # pld=payload, - # # Header( - # # payload_tag=tag_field, - # # # dialog_id, - # # ), - # ) - # return codec.enc.encode(msg) - - - -# TODO: sub-decoded `Raw` fields? -# -[ ] see `MsgCodec._payload_decs` notes +# [x] TODO: a sub-decoder system as well? => No! # -# XXX if we wanted something more complex then field name str-keys -# we might need a header field type to describe the lookup sys? -# class Header(Struct, tag=True): -# ''' -# A msg header which defines payload properties - -# ''' -# payload_tag: str|None = None - - - #def mk_tagged_union_dec( - # tagged_structs: list[Struct], - - #) -> tuple[ - # list[str], - # msgpack.Decoder, - #]: - # ''' - # Create a `msgpack.Decoder` for an input `list[msgspec.Struct]` - # and return a `list[str]` of each struct's `tag_field: str` value - # which can be used to "map to" the initialized dec. - - # ''' - # # See "tagged unions" docs: - # # https://jcristharif.com/msgspec/structs.html#tagged-unions - - # # "The quickest way to enable tagged unions is to set tag=True when - # # defining every struct type in the union. In this case tag_field - # # defaults to "type", and tag defaults to the struct class name - # # (e.g. "Get")." - # first: Struct = tagged_structs[0] - # types_union: Union[Type[Struct]] = Union[ - # first - # ]|Any - # tags: list[str] = [first.__name__] - - # for struct in tagged_structs[1:]: - # types_union |= struct - # tags.append( - # getattr( - # struct, - # struct.__struct_config__.tag_field, - # struct.__name__, - # ) - # ) - - # dec = msgpack.Decoder(types_union) - # return ( - # tags, - # dec, - # ) +# -[x] do we still want to try and support the sub-decoder with +# `.Raw` technique in the case that the `Generic` approach gives +# future grief? +# => NO, since we went with the `PldRx` approach instead B) +# +# IF however you want to see the code that was staged for this +# from wayyy back, see the pure removal commit. def mk_codec( @@ -644,10 +496,6 @@ _def_tractor_codec: MsgCodec = mk_codec( # 3. We similarly set the pending values for the child nurseries # of the *current* task. # - -# TODO: STOP USING THIS, since it's basically a global and won't -# allow sub-IPC-ctxs to limit the msg-spec however desired.. 
-# _ctxvar_MsgCodec: MsgCodec = RunVar( _ctxvar_MsgCodec: ContextVar[MsgCodec] = ContextVar( 'msgspec_codec', default=_def_tractor_codec, @@ -782,3 +630,31 @@ def limit_msg_spec( # # import pdbp; pdbp.set_trace() # assert ext_codec.pld_spec == extended_spec # yield ext_codec + + +# TODO: make something similar to this inside `._codec` such that +# user can just pass a type table of some sort? +# -[ ] we would need to decode all msgs to `pretty_struct.Struct` +# and then call `.to_dict()` on them? +# -[x] we're going to need to re-impl all the stuff changed in the +# runtime port such that it can handle dicts or `Msg`s? +# +# def mk_dict_msg_codec_hooks() -> tuple[Callable, Callable]: +# ''' +# Deliver a `enc_hook()`/`dec_hook()` pair which does +# manual convertion from our above native `Msg` set +# to `dict` equivalent (wire msgs) in order to keep legacy compat +# with the original runtime implementation. +# +# Note: this is is/was primarly used while moving the core +# runtime over to using native `Msg`-struct types wherein we +# start with the send side emitting without loading +# a typed-decoder and then later flipping the switch over to +# load to the native struct types once all runtime usage has +# been adjusted appropriately. +# +# ''' +# return ( +# # enc_to_dict, +# dec_from_dict, +# ) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index ad6d6fb8..0fc0ee96 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -26,7 +26,6 @@ from __future__ import annotations import types from typing import ( Any, - # Callable, Generic, Literal, Type, @@ -161,7 +160,6 @@ class SpawnSpec( bind_addrs: list[tuple[str, int]] - # TODO: caps based RPC support in the payload? # # -[ ] integration with our ``enable_modules: list[str]`` caps sys. @@ -314,8 +312,9 @@ class Started( pld: PayloadT|Raw -# TODO: instead of using our existing `Start` -# for this (as we did with the original `{'cmd': ..}` style) +# TODO: cancel request dedicated msg? +# -[ ] instead of using our existing `Start`? +# # class Cancel: # cid: str @@ -477,12 +476,16 @@ def from_dict_msg( ) return msgT(**dict_msg) -# TODO: should be make a msg version of `ContextCancelled?` -# and/or with a scope field or a full `ActorCancelled`? +# TODO: should be make a set of cancel msgs? +# -[ ] a version of `ContextCancelled`? +# |_ and/or with a scope field? +# -[ ] or, a full `ActorCancelled`? +# # class Cancelled(MsgType): # cid: str - -# TODO what about overruns? +# +# -[ ] what about overruns? +# # class Overrun(MsgType): # cid: str @@ -564,10 +567,17 @@ def mk_msg_spec( Create a payload-(data-)type-parameterized IPC message specification. Allows generating IPC msg types from the above builtin set - with a payload (field) restricted data-type via the `Msg.pld: - PayloadT` type var. This allows runtime-task contexts to use - the python type system to limit/filter payload values as - determined by the input `payload_type_union: Union[Type]`. + with a payload (field) restricted data-type, the `Msg.pld: PayloadT`. + + This allows runtime-task contexts to use the python type system + to limit/filter payload values as determined by the input + `payload_type_union: Union[Type]`. + + Notes: originally multiple approaches for constructing the + type-union passed to `msgspec` were attempted as selected via the + `spec_build_method`, but it turns out only the defaul method + 'indexed_generics' seems to work reliably in all use cases. As + such, the others will likely be removed in the near future. 
''' submsg_types: list[MsgType] = Msg.__subclasses__() @@ -707,31 +717,3 @@ def mk_msg_spec( + ipc_msg_types, ) - - -# TODO: make something similar to this inside `._codec` such that -# user can just pass a type table of some sort? -# -[ ] we would need to decode all msgs to `pretty_struct.Struct` -# and then call `.to_dict()` on them? -# -[ ] we're going to need to re-impl all the stuff changed in the -# runtime port such that it can handle dicts or `Msg`s? -# -# def mk_dict_msg_codec_hooks() -> tuple[Callable, Callable]: -# ''' -# Deliver a `enc_hook()`/`dec_hook()` pair which does -# manual convertion from our above native `Msg` set -# to `dict` equivalent (wire msgs) in order to keep legacy compat -# with the original runtime implementation. -# -# Note: this is is/was primarly used while moving the core -# runtime over to using native `Msg`-struct types wherein we -# start with the send side emitting without loading -# a typed-decoder and then later flipping the switch over to -# load to the native struct types once all runtime usage has -# been adjusted appropriately. -# -# ''' -# return ( -# # enc_to_dict, -# dec_from_dict, -# ) -- 2.34.1 From 21f633a9005c9ae3dcda6c98dd3893dde65af3fa Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 31 May 2024 14:40:55 -0400 Subject: [PATCH 354/378] Use `Context` repr APIs for RPC outcome logs Delegate to the new `.repr_state: str` and adjust log level based on error vs. cancel vs. result. --- tractor/_rpc.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index 8a1be7b2..fa615772 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -739,37 +739,24 @@ async def _invoke( cid, )) + logmeth: Callable = log.runtime merr: Exception|None = ctx.maybe_error - ( - res_type_str, - res_str, - ) = ( - ('error', f'{type(merr)}',) if merr - else ( - 'result', - f'`{repr(ctx.outcome)}`', - ) - ) + descr_str: str = 'with final result `{repr(ctx.outcome)}`' message: str = ( - f'IPC context terminated with a final {res_type_str}\n\n' - f'{ctx}' + f'IPC context terminated {descr_str}\n\n' ) if merr: - from tractor import RemoteActorError - if not isinstance(merr, RemoteActorError): - fmt_merr: str = ( - f'\n{merr!r}\n' - # f'{merr.args[0]!r}\n' - ) - else: - fmt_merr = f'\n{merr!r}' - log.error( - message - + - fmt_merr + descr_str: str = ( + f'with ctx having {ctx.repr_state!r}\n' + f'{ctx.repr_outcome()}\n' ) - else: - log.runtime(message) + if isinstance(merr, ContextCancelled): + logmeth: Callable = log.runtime + else: + logmeth: Callable = log.error + message += f'\n{merr!r}\n' + + logmeth(message) async def try_ship_error_to_remote( -- 2.34.1 From f0342d6ae31416da506ef4cbcdbe5baf528f9104 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 31 May 2024 17:32:11 -0400 Subject: [PATCH 355/378] Move `Context.open_stream()` impl to `._streaming` Exactly like how it's organized for `Portal.open_context()`, put the main streaming API `@acm` with the `MsgStream` code and bind the method to the new module func. Other, - rename `Context.result()` -> `.wait_for_result()` to better match the blocking semantics and rebind `.result()` as deprecated. - add doc-str for `Context.maybe_raise()`. 
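A quick hypothetical caller/callee sketch of the post-rename API (actor and
function names made up; only the public endpoints touched here are used:
`Portal.open_context()`, `Context.open_stream()`, `.wait_for_result()`):

    import trio
    import tractor


    @tractor.context
    async def echo(ctx: tractor.Context):
        await ctx.started('ready')
        async with ctx.open_stream() as stream:
            async for msg in stream:
                await stream.send(msg)


    async def main():
        async with tractor.open_nursery() as an:
            portal = await an.start_actor('echoer', enable_modules=[__name__])

            async with portal.open_context(echo) as (ctx, first):
                assert first == 'ready'

                async with ctx.open_stream() as stream:
                    await stream.send('ping')
                    assert await stream.receive() == 'ping'

                # the new explicit name; `.result()` still works but is
                # deprecated and simply delegates here.
                assert await ctx.wait_for_result() is None

            await portal.cancel_actor()


    if __name__ == '__main__':
        trio.run(main)
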
--- tractor/_context.py | 247 +++++++----------------------------------- tractor/_streaming.py | 209 +++++++++++++++++++++++++++++++++++ 2 files changed, 248 insertions(+), 208 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 81db66c3..ec64b157 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -86,7 +86,10 @@ from .msg import ( from ._ipc import ( Channel, ) -from ._streaming import MsgStream +from ._streaming import ( + MsgStream, + open_stream_from_ctx, +) from ._state import ( current_actor, debug_mode, @@ -978,198 +981,6 @@ class Context: assert self._scope self._scope.cancel() - # TODO? should we move this to `._streaming` much like we - # moved `Portal.open_context()`'s def to this mod? - @acm - async def open_stream( - self, - allow_overruns: bool|None = False, - msg_buffer_size: int|None = None, - - ) -> AsyncGenerator[MsgStream, None]: - ''' - Open a ``MsgStream``, a bi-directional stream connected to the - cross-actor (far end) task for this ``Context``. - - This context manager must be entered on both the caller and - callee for the stream to logically be considered "connected". - - A ``MsgStream`` is currently "one-shot" use, meaning if you - close it you can not "re-open" it for streaming and instead you - must re-establish a new surrounding ``Context`` using - ``Portal.open_context()``. In the future this may change but - currently there seems to be no obvious reason to support - "re-opening": - - pausing a stream can be done with a message. - - task errors will normally require a restart of the entire - scope of the inter-actor task context due to the nature of - ``trio``'s cancellation system. - - ''' - actor: Actor = self._actor - - # If the surrounding context has been cancelled by some - # task with a handle to THIS, we error here immediately - # since it likely means the surrounding lexical-scope has - # errored, been `trio.Cancelled` or at the least - # `Context.cancel()` was called by some task. - if self._cancel_called: - - # XXX NOTE: ALWAYS RAISE any remote error here even if - # it's an expected `ContextCancelled` due to a local - # task having called `.cancel()`! - # - # WHY: we expect the error to always bubble up to the - # surrounding `Portal.open_context()` call and be - # absorbed there (silently) and we DO NOT want to - # actually try to stream - a cancel msg was already - # sent to the other side! - self.maybe_raise( - raise_ctxc_from_self_call=True, - ) - # NOTE: this is diff then calling - # `._maybe_raise_remote_err()` specifically - # because we want to raise a ctxc on any task entering this `.open_stream()` - # AFTER cancellation was already been requested, - # we DO NOT want to absorb any ctxc ACK silently! - # if self._remote_error: - # raise self._remote_error - - # XXX NOTE: if no `ContextCancelled` has been responded - # back from the other side (yet), we raise a different - # runtime error indicating that this task's usage of - # `Context.cancel()` and then `.open_stream()` is WRONG! 
- task: str = trio.lowlevel.current_task().name - raise RuntimeError( - 'Stream opened after `Context.cancel()` called..?\n' - f'task: {actor.uid[0]}:{task}\n' - f'{self}' - ) - - if ( - not self._portal - and not self._started_called - ): - raise RuntimeError( - 'Context.started()` must be called before opening a stream' - ) - - # NOTE: in one way streaming this only happens on the - # parent-ctx-task side (on the side that calls - # `Actor.start_remote_task()`) so if you try to send - # a stop from the caller to the callee in the - # single-direction-stream case you'll get a lookup error - # currently. - ctx: Context = actor.get_context( - chan=self.chan, - cid=self.cid, - nsf=self._nsf, - # side=self.side, - - msg_buffer_size=msg_buffer_size, - allow_overruns=allow_overruns, - ) - ctx._allow_overruns: bool = allow_overruns - assert ctx is self - - # XXX: If the underlying channel feeder receive mem chan has - # been closed then likely client code has already exited - # a ``.open_stream()`` block prior or there was some other - # unanticipated error or cancellation from ``trio``. - - if ctx._rx_chan._closed: - raise trio.ClosedResourceError( - 'The underlying channel for this stream was already closed!\n' - ) - - # NOTE: implicitly this will call `MsgStream.aclose()` on - # `.__aexit__()` due to stream's parent `Channel` type! - # - # XXX NOTE XXX: ensures the stream is "one-shot use", - # which specifically means that on exit, - # - signal ``trio.EndOfChannel``/``StopAsyncIteration`` to - # the far end indicating that the caller exited - # the streaming context purposefully by letting - # the exit block exec. - # - this is diff from the cancel/error case where - # a cancel request from this side or an error - # should be sent to the far end indicating the - # stream WAS NOT just closed normally/gracefully. - async with MsgStream( - ctx=self, - rx_chan=ctx._rx_chan, - ) as stream: - - # NOTE: we track all existing streams per portal for - # the purposes of attempting graceful closes on runtime - # cancel requests. - if self._portal: - self._portal._streams.add(stream) - - try: - self._stream_opened: bool = True - self._stream = stream - - # XXX: do we need this? - # ensure we aren't cancelled before yielding the stream - # await trio.lowlevel.checkpoint() - yield stream - - # XXX: (MEGA IMPORTANT) if this is a root opened process we - # wait for any immediate child in debug before popping the - # context from the runtime msg loop otherwise inside - # ``Actor._deliver_ctx_payload()`` the msg will be discarded and in - # the case where that msg is global debugger unlock (via - # a "stop" msg for a stream), this can result in a deadlock - # where the root is waiting on the lock to clear but the - # child has already cleared it and clobbered IPC. - # - # await maybe_wait_for_debugger() - - # XXX TODO: pretty sure this isn't needed (see - # note above this block) AND will result in - # a double `.send_stop()` call. The only reason to - # put it here would be to due with "order" in - # terms of raising any remote error (as per - # directly below) or bc the stream's - # `.__aexit__()` block might not get run - # (doubtful)? Either way if we did put this back - # in we also need a state var to avoid the double - # stop-msg send.. - # - # await stream.aclose() - - # NOTE: absorb and do not raise any - # EoC received from the other side such that - # it is not raised inside the surrounding - # context block's scope! 
- except trio.EndOfChannel as eoc: - if ( - eoc - and - stream.closed - ): - # sanity, can remove? - assert eoc is stream._eoc - - log.warning( - 'Stream was terminated by EoC\n\n' - # NOTE: won't show the error but - # does show txt followed by IPC msg. - f'{str(eoc)}\n' - ) - - finally: - if self._portal: - try: - self._portal._streams.remove(stream) - except KeyError: - log.warning( - f'Stream was already destroyed?\n' - f'actor: {self.chan.uid}\n' - f'ctx id: {self.cid}' - ) - # TODO: replace all the `._maybe_raise_remote_err()` usage # with instances of this!! def maybe_raise( @@ -1178,6 +989,14 @@ class Context: **kwargs, ) -> Exception|None: + ''' + Check for for a remote error delivered by the runtime from + our peer (task); if set immediately raise. + + This is a convenience wrapper for + `._maybe_raise_remote_err(self._remote_error)`. + + ''' __tracebackhide__: bool = hide_tb if re := self._remote_error: return self._maybe_raise_remote_err( @@ -1290,8 +1109,7 @@ class Context: raise remote_error - # TODO: change to `.wait_for_result()`? - async def result( + async def wait_for_result( self, hide_tb: bool = True, @@ -1380,18 +1198,27 @@ class Context: (not self._cancel_called) ) ) + # TODO: eventually make `.outcome: Outcome` and thus return + # `self.outcome.unwrap()` here! return self.outcome # TODO: switch this with above! # -[ ] should be named `.wait_for_outcome()` and instead do # a `.outcome.Outcome.unwrap()` ? # - # @property - # def result(self) -> Any|None: - # if self._final_result_is_set(): - # return self._result - - # raise RuntimeError('No result is available!') + async def result( + self, + *args, + **kwargs, + ) -> Any|Exception: + log.warning( + '`Context.result()` is DEPRECATED!\n' + 'Use `Context.[no]wait_for_result()` instead!\n' + ) + return await self.wait_for_result( + *args, + **kwargs, + ) @property def maybe_error(self) -> BaseException|None: @@ -1447,6 +1274,9 @@ class Context: return self._result is not Unresolved # def get_result_nowait(self) -> Any|None: + # def get_outcome_nowait(self) -> Any|None: + # def recv_result_nowait(self) -> Any|None: + # def receive_outcome_nowait(self) -> Any|None: # TODO: use `outcome.Outcome` here instead? @property def outcome(self) -> ( @@ -1476,7 +1306,6 @@ class Context: def has_outcome(self) -> bool: return bool(self.maybe_error) or self._final_result_is_set() - # @property def repr_outcome( self, show_error_fields: bool = False, @@ -1498,7 +1327,8 @@ class Context: # just deliver the type name. if ( (reprol := getattr(merr, 'reprol', False)) - and show_error_fields + and + show_error_fields ): return reprol() @@ -1515,10 +1345,6 @@ class Context: repr(merr) ) - # just the type name - # else: # but wen? - # return type(merr).__name__ - # for all other errors show their regular output return ( str(merr) @@ -1572,7 +1398,7 @@ class Context: _, # any non-unresolved value None, ) if self._final_result_is_set(): - status = 'returned' + status = 'result-returned' # normal operation but still in a pre-`Return`-result # dialog phase @@ -1940,6 +1766,11 @@ class Context: # ow, indicate unable to deliver by default return False + # NOTE: similar to `Portal.open_context()`, this impl is found in + # the `._streaming`` mod to make reading/groking the details + # simpler code-org-wise. + open_stream = open_stream_from_ctx + # TODO: exception tb masking by using a manual # `.__aexit__()`/.__aenter__()` pair on a type? 
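FWIW the `open_stream = open_stream_from_ctx` class-body assignment above is
just plain function-descriptor binding; a stripped down sketch of the pattern
(toy names, not the real `Context`):

    from contextlib import asynccontextmanager as acm

    import trio


    @acm
    async def open_stream_from_ctx(ctx: 'Context'):
        # stand-in for the real impl which now lives in `._streaming`
        yield f'stream-for-{ctx.cid}'


    class Context:
        def __init__(self, cid: str) -> None:
            self.cid = cid

        # plain assignment in the class body => normal descriptor
        # binding, so `ctx.open_stream()` == `open_stream_from_ctx(ctx)`.
        open_stream = open_stream_from_ctx


    async def _demo():
        async with Context('abc123').open_stream() as stream:
            assert stream == 'stream-for-abc123'


    if __name__ == '__main__':
        trio.run(_demo)
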
diff --git a/tractor/_streaming.py b/tractor/_streaming.py index 016577d3..314a93b8 100644 --- a/tractor/_streaming.py +++ b/tractor/_streaming.py @@ -26,6 +26,7 @@ import inspect from pprint import pformat from typing import ( Any, + AsyncGenerator, Callable, AsyncIterator, TYPE_CHECKING, @@ -51,6 +52,7 @@ from tractor.msg import ( ) if TYPE_CHECKING: + from ._runtime import Actor from ._context import Context from ._ipc import Channel @@ -550,6 +552,213 @@ class MsgStream(trio.abc.Channel): # ... +@acm +async def open_stream_from_ctx( + ctx: Context, + allow_overruns: bool|None = False, + msg_buffer_size: int|None = None, + +) -> AsyncGenerator[MsgStream, None]: + ''' + Open a `MsgStream`, a bi-directional msg transport dialog + connected to the cross-actor peer task for an IPC `Context`. + + This context manager must be entered in both the "parent" (task + which entered `Portal.open_context()`) and "child" (RPC task + which is decorated by `@context`) tasks for the stream to + logically be considered "open"; if one side begins sending to an + un-opened peer, depending on policy config, msgs will either be + queued until the other side opens and/or a `StreamOverrun` will + (eventually) be raised. + + ------ - ------ + + Runtime semantics design: + + A `MsgStream` session adheres to "one-shot use" semantics, + meaning if you close the scope it **can not** be "re-opened". + + Instead you must re-establish a new surrounding RPC `Context` + (RTC: remote task context?) using `Portal.open_context()`. + + In the future this *design choice* may need to be changed but + currently there seems to be no obvious reason to support such + semantics.. + + - "pausing a stream" can be supported with a message implemented + by the `tractor` application dev. + + - any remote error will normally require a restart of the entire + `trio.Task`'s scope due to the nature of `trio`'s cancellation + (`CancelScope`) system and semantics (level triggered). + + ''' + actor: Actor = ctx._actor + + # If the surrounding context has been cancelled by some + # task with a handle to THIS, we error here immediately + # since it likely means the surrounding lexical-scope has + # errored, been `trio.Cancelled` or at the least + # `Context.cancel()` was called by some task. + if ctx._cancel_called: + + # XXX NOTE: ALWAYS RAISE any remote error here even if + # it's an expected `ContextCancelled` due to a local + # task having called `.cancel()`! + # + # WHY: we expect the error to always bubble up to the + # surrounding `Portal.open_context()` call and be + # absorbed there (silently) and we DO NOT want to + # actually try to stream - a cancel msg was already + # sent to the other side! + ctx.maybe_raise( + raise_ctxc_from_self_call=True, + ) + # NOTE: this is diff then calling + # `._maybe_raise_remote_err()` specifically + # because we want to raise a ctxc on any task entering this `.open_stream()` + # AFTER cancellation was already been requested, + # we DO NOT want to absorb any ctxc ACK silently! + # if ctx._remote_error: + # raise ctx._remote_error + + # XXX NOTE: if no `ContextCancelled` has been responded + # back from the other side (yet), we raise a different + # runtime error indicating that this task's usage of + # `Context.cancel()` and then `.open_stream()` is WRONG! 
+ task: str = trio.lowlevel.current_task().name + raise RuntimeError( + 'Stream opened after `Context.cancel()` called..?\n' + f'task: {actor.uid[0]}:{task}\n' + f'{ctx}' + ) + + if ( + not ctx._portal + and not ctx._started_called + ): + raise RuntimeError( + 'Context.started()` must be called before opening a stream' + ) + + # NOTE: in one way streaming this only happens on the + # parent-ctx-task side (on the side that calls + # `Actor.start_remote_task()`) so if you try to send + # a stop from the caller to the callee in the + # single-direction-stream case you'll get a lookup error + # currently. + ctx: Context = actor.get_context( + chan=ctx.chan, + cid=ctx.cid, + nsf=ctx._nsf, + # side=ctx.side, + + msg_buffer_size=msg_buffer_size, + allow_overruns=allow_overruns, + ) + ctx._allow_overruns: bool = allow_overruns + assert ctx is ctx + + # XXX: If the underlying channel feeder receive mem chan has + # been closed then likely client code has already exited + # a ``.open_stream()`` block prior or there was some other + # unanticipated error or cancellation from ``trio``. + + if ctx._rx_chan._closed: + raise trio.ClosedResourceError( + 'The underlying channel for this stream was already closed!\n' + ) + + # NOTE: implicitly this will call `MsgStream.aclose()` on + # `.__aexit__()` due to stream's parent `Channel` type! + # + # XXX NOTE XXX: ensures the stream is "one-shot use", + # which specifically means that on exit, + # - signal ``trio.EndOfChannel``/``StopAsyncIteration`` to + # the far end indicating that the caller exited + # the streaming context purposefully by letting + # the exit block exec. + # - this is diff from the cancel/error case where + # a cancel request from this side or an error + # should be sent to the far end indicating the + # stream WAS NOT just closed normally/gracefully. + async with MsgStream( + ctx=ctx, + rx_chan=ctx._rx_chan, + ) as stream: + + # NOTE: we track all existing streams per portal for + # the purposes of attempting graceful closes on runtime + # cancel requests. + if ctx._portal: + ctx._portal._streams.add(stream) + + try: + ctx._stream_opened: bool = True + ctx._stream = stream + + # XXX: do we need this? + # ensure we aren't cancelled before yielding the stream + # await trio.lowlevel.checkpoint() + yield stream + + # XXX: (MEGA IMPORTANT) if this is a root opened process we + # wait for any immediate child in debug before popping the + # context from the runtime msg loop otherwise inside + # ``Actor._deliver_ctx_payload()`` the msg will be discarded and in + # the case where that msg is global debugger unlock (via + # a "stop" msg for a stream), this can result in a deadlock + # where the root is waiting on the lock to clear but the + # child has already cleared it and clobbered IPC. + # + # await maybe_wait_for_debugger() + + # XXX TODO: pretty sure this isn't needed (see + # note above this block) AND will result in + # a double `.send_stop()` call. The only reason to + # put it here would be to due with "order" in + # terms of raising any remote error (as per + # directly below) or bc the stream's + # `.__aexit__()` block might not get run + # (doubtful)? Either way if we did put this back + # in we also need a state var to avoid the double + # stop-msg send.. + # + # await stream.aclose() + + # NOTE: absorb and do not raise any + # EoC received from the other side such that + # it is not raised inside the surrounding + # context block's scope! 
+ except trio.EndOfChannel as eoc: + if ( + eoc + and + stream.closed + ): + # sanity, can remove? + assert eoc is stream._eoc + + log.warning( + 'Stream was terminated by EoC\n\n' + # NOTE: won't show the error but + # does show txt followed by IPC msg. + f'{str(eoc)}\n' + ) + + finally: + if ctx._portal: + try: + ctx._portal._streams.remove(stream) + except KeyError: + log.warning( + f'Stream was already destroyed?\n' + f'actor: {ctx.chan.uid}\n' + f'ctx id: {ctx.cid}' + ) + + + def stream(func: Callable) -> Callable: ''' Mark an async function as a streaming routine with ``@stream``. -- 2.34.1 From 408a74784e7c549db0d1f177137819867e5b9ad6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Thu, 6 Jun 2024 16:14:58 -0400 Subject: [PATCH 356/378] Catch `.pause_from_sync()` in root bg thread bugs! Originally discovered as while using `tractor.pause_from_sync()` from the `i3ipc` client running in a bg-thread that uses `asyncio` inside `modden`. Turns out we definitely aren't correctly handling `.pause_from_sync()` from the root actor when called from a `trio.to_thread.run_sync()` bg thread: - root-actor bg threads which can't `Lock._debug_lock.acquire()` since they aren't in `trio.Task`s. - even if scheduled via `.to_thread.run_sync(_debug._pause)` the acquirer won't be the task/thread which calls `Lock.release()` from `PdbREPL` hooks; this results in a RTE raised by `trio`.. - multiple threads will step on each other's stdio since cpython's GIL seems to ctx switch threads on every input from the user to the REPL loop.. Reproduce via reworking our example and test so that they catch and fail for all edge cases: - rework the `/examples/debugging/sync_bp.py` example to demonstrate the above issues, namely the stdio clobbering in the REPL when multiple threads and/or a subactor try to debug simultaneously. |_ run one thread using a task nursery to ensure it runs conc with the nursery's parent task. |_ ensure the bg threads run conc a subactor usage of `.pause_from_sync()`. |_ gravely detail all the special cases inside a TODO comment. |_ add some control flags to `sync_pause()` helper and don't use `breakpoint()` by default. - extend and adjust `test_debugger.test_pause_from_sync` to match (and thus currently fail) by ensuring exclusive `PdbREPL` attachment when the 2 bg root-actor threads are concurrently interacting alongside the subactor: |_ should only see one of the `_pause_msg` logs at a time for either one of the threads or the subactor. |_ ensure each attaches (in no particular order) before expecting the script to exit. Impl adjustments to `.devx._debug`: - drop `Lock.repl`, no longer used. - add `Lock._owned_by_root: bool` for the `.ctx_in_debug == None` root-actor-task active case. - always `log.exception()` for any `._debug_lock.release()` ownership RTE emitted by `trio`, like we used to.. - add special `Lock.release()` log message for the stale lock but `._owned_by_root == True` case; oh yeah and actually `log.devx(message)`.. - rename `Lock.acquire()` -> `.acquire_for_ctx()` since it's only ever used from subactor IPC usage; well that and for local root-task usage we should prolly add a `.acquire_from_root_task()`? - buncha `._pause()` impl improvements: |_ type `._pause()`'s `debug_func` as a `partial` as well. |_ offer `called_from_sync: bool` and `called_from_bg_thread: bool` for the special case handling when called from `.pause_from_sync()` |_ only set `DebugStatus.repl/repl_task` when `debug_func != None` (OW ensure the `.repl_task` is not the current one). 
|_ handle error logging even when `debug_func is None`.. |_ lotsa detailed commentary around root-actor-bg-thread special cases. - when `._set_trace(hide_tb=False)` do `pdbp.set_trace(frame=currentframe())` so the `._debug` internal frames are always included. - by default always hide tracebacks for `.pause[_from_sync]()` internals. - improve `.pause_from_sync()` to avoid root-bg-thread crashes: |_ pass new `called_from_xxx_` flags and ensure `DebugStatus.repl_task` is actually set to the `threading.current_thread()` when needed. |_ manually call `Lock._debug_lock.acquire_nowait()` for the non-bg thread case. |_ TODO: still need to implement the bg-thread case using a bg `trio.Task`-in-thread with an `trio.Event` set by thread REPL exit. --- examples/debugging/sync_bp.py | 125 +++++++++++++++--- tests/test_debugger.py | 71 +++++++--- tractor/devx/_debug.py | 241 +++++++++++++++++++++++----------- 3 files changed, 323 insertions(+), 114 deletions(-) diff --git a/examples/debugging/sync_bp.py b/examples/debugging/sync_bp.py index efa4e405..e265df44 100644 --- a/examples/debugging/sync_bp.py +++ b/examples/debugging/sync_bp.py @@ -1,15 +1,32 @@ +from functools import partial +import time +from threading import current_thread + import trio import tractor def sync_pause( - use_builtin: bool = True, + use_builtin: bool = False, error: bool = False, + hide_tb: bool = True, + pre_sleep: float|None = None, ): + if pre_sleep: + time.sleep(pre_sleep) + if use_builtin: - breakpoint(hide_tb=False) + print( + f'Entering `breakpoint()` from\n' + f'{current_thread()}\n' + ) + breakpoint(hide_tb=hide_tb) else: + print( + f'Entering `tractor.pause_from_sync()` from\n' + f'{current_thread()}@{tractor.current_actor().uid}\n' + ) tractor.pause_from_sync() if error: @@ -25,44 +42,114 @@ async def start_n_sync_pause( # sync to parent-side task await ctx.started() - print(f'entering SYNC PAUSE in {actor.uid}') + print(f'Entering `sync_pause()` in subactor: {actor.uid}\n') sync_pause() - print(f'back from SYNC PAUSE in {actor.uid}') + print(f'Exited `sync_pause()` in subactor: {actor.uid}\n') async def main() -> None: - async with tractor.open_nursery( - # NOTE: required for pausing from sync funcs - maybe_enable_greenback=True, - debug_mode=True, - ) as an: + async with ( + tractor.open_nursery( + # NOTE: required for pausing from sync funcs + maybe_enable_greenback=True, + debug_mode=True, + # loglevel='cancel', + ) as an, + trio.open_nursery() as tn, + ): + # just from root task + sync_pause() p: tractor.Portal = await an.start_actor( 'subactor', enable_modules=[__name__], # infect_asyncio=True, debug_mode=True, - loglevel='cancel', ) # TODO: 3 sub-actor usage cases: + # -[x] via a `.open_context()` # -[ ] via a `.run_in_actor()` call # -[ ] via a `.run()` - # -[ ] via a `.open_context()` - # + # -[ ] via a `.to_thread.run_sync()` in subactor async with p.open_context( start_n_sync_pause, ) as (ctx, first): assert first is None - await tractor.pause() - sync_pause() + # TODO: handle bg-thread-in-root-actor special cases! + # + # there are a couple very subtle situations possible here + # and they are likely to become more important as cpython + # moves to support no-GIL. + # + # Cases: + # 1. root-actor bg-threads that call `.pause_from_sync()` + # whilst an in-tree subactor also is using ` .pause()`. 
+ # |_ since the root-actor bg thread can not + # `Lock._debug_lock.acquire_nowait()` without running + # a `trio.Task`, AND because the + # `PdbREPL.set_continue()` is called from that + # bg-thread, we can not `._debug_lock.release()` + # either! + # |_ this results in no actor-tree `Lock` being used + # on behalf of the bg-thread and thus the subactor's + # task and the thread trying to to use stdio + # simultaneously which results in the classic TTY + # clobbering! + # + # 2. mutiple sync-bg-threads that call + # `.pause_from_sync()` where one is scheduled via + # `Nursery.start_soon(to_thread.run_sync)` in a bg + # task. + # + # Due to the GIL, the threads never truly try to step + # through the REPL simultaneously, BUT their `logging` + # and traceback outputs are interleaved since the GIL + # (seemingly) on every REPL-input from the user + # switches threads.. + # + # Soo, the context switching semantics of the GIL + # result in a very confusing and messy interaction UX + # since eval and (tb) print output is NOT synced to + # each REPL-cycle (like we normally make it via + # a `.set_continue()` callback triggering the + # `Lock.release()`). Ideally we can solve this + # usability issue NOW because this will of course be + # that much more important when eventually there is no + # GIL! - # TODO: make this work!! - await trio.to_thread.run_sync( - sync_pause, - abandon_on_cancel=False, - ) + # XXX should cause double REPL entry and thus TTY + # clobbering due to case 1. above! + tn.start_soon( + partial( + trio.to_thread.run_sync, + partial( + sync_pause, + use_builtin=False, + # pre_sleep=0.5, + ), + abandon_on_cancel=True, + thread_name='start_soon_root_bg_thread', + ) + ) + + await tractor.pause() + + # XXX should cause double REPL entry and thus TTY + # clobbering due to case 2. above! + await trio.to_thread.run_sync( + partial( + sync_pause, + # NOTE this already works fine since in the new + # thread the `breakpoint()` built-in is never + # overloaded, thus NO locking is used, HOWEVER + # the case 2. from above still exists! + use_builtin=True, + ), + abandon_on_cancel=False, + thread_name='inline_root_bg_thread', + ) await ctx.cancel() diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 72778bda..5f818a60 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -1073,6 +1073,8 @@ def test_pause_from_sync( ''' child = spawn('sync_bp') + + # first `sync_pause()` after nurseries open child.expect(PROMPT) assert_before( child, @@ -1087,43 +1089,70 @@ def test_pause_from_sync( do_ctlc(child) child.sendline('c') + + + # first `await tractor.pause()` inside `p.open_context()` body child.expect(PROMPT) - # XXX shouldn't see gb loaded again + # XXX shouldn't see gb loaded message with PDB loglevel! 
before = str(child.before.decode()) assert not in_prompt_msg( before, ['`greenback` portal opened!'], ) + # should be same root task assert_before( child, - [_pause_msg, "('root'",], + [ + _pause_msg, + " CancelScope|None: if not is_root_process(): @@ -223,6 +219,7 @@ class Lock: ctx_in_debug: Context|None = None req_handler_finished: trio.Event|None = None + _owned_by_root: bool = False _debug_lock: trio.StrictFIFOLock = trio.StrictFIFOLock() _blocked: set[ tuple[str, str] # `Actor.uid` for per actor @@ -231,23 +228,16 @@ class Lock: @classmethod def repr(cls) -> str: - - # both root and subs + lock_stats: trio.LockStatistics = cls._debug_lock.statistics() fields: str = ( - f'repl: {cls.repl}\n' + f'req_handler_finished: {cls.req_handler_finished}\n' + f'_blocked: {cls._blocked}\n\n' + f'_debug_lock: {cls._debug_lock}\n' + f'lock_stats: {lock_stats}\n' + f'ctx_in_debug: {cls.ctx_in_debug}\n' + ) - if is_root_process(): - lock_stats: trio.LockStatistics = cls._debug_lock.statistics() - fields += ( - f'req_handler_finished: {cls.req_handler_finished}\n' - - f'_blocked: {cls._blocked}\n\n' - f'_debug_lock: {cls._debug_lock}\n' - f'lock_stats: {lock_stats}\n' - - ) - body: str = textwrap.indent( fields, prefix=' |_', @@ -256,8 +246,6 @@ class Lock: f'<{cls.__name__}(\n' f'{body}' ')>\n\n' - - f'{cls.ctx_in_debug}\n' ) @classmethod @@ -266,7 +254,10 @@ class Lock: cls, force: bool = False, ): - message: str = 'TTY lock not held by any child\n' + if not cls._owned_by_root: + message: str = 'TTY lock not held by any child\n' + else: + message: str = 'TTY lock held in root-actor task\n' if not (is_trio_main := DebugStatus.is_main_trio_thread()): task: threading.Thread = threading.current_thread() @@ -279,8 +270,20 @@ class Lock: if ( lock.locked() and - owner is task - # ^-NOTE-^ if not will raise a RTE.. + ( + owner is task + # or + # cls._owned_by_root + ) + # ^-NOTE-^ if we do NOT ensure this, `trio` will + # raise a RTE when a non-owner tries to releasee the + # lock. + # + # Further we need to be extra pedantic about the + # correct task, greenback-spawned-task and/or thread + # being set to the `.repl_task` such that the above + # condition matches and we actually release the lock. + # This is particular of note from `.pause_from_sync()`! ): if not is_trio_main: trio.from_thread.run_sync( @@ -290,6 +293,10 @@ class Lock: cls._debug_lock.release() message: str = 'TTY lock released for child\n' + except RuntimeError as rte: + log.exception('Failed to release `Lock`?') + raise rte + finally: # IFF there are no more requesting tasks queued up fire, the # "tty-unlocked" event thereby alerting any monitors of the lock that @@ -305,7 +312,11 @@ class Lock: ): message += '-> No more child ctx tasks hold the TTY lock!\n' - elif req_handler_finished: + elif ( + req_handler_finished + and + lock.locked() + ): req_stats = req_handler_finished.statistics() message += ( f'-> A child ctx task still owns the `Lock` ??\n' @@ -315,9 +326,20 @@ class Lock: cls.ctx_in_debug = None + if ( + cls._owned_by_root + ): + if not lock.locked(): + cls._owned_by_root = False + else: + message += 'Lock still held by root actor task?!?\n' + lock.release() + + log.devx(message) + @classmethod @acm - async def acquire( + async def acquire_for_ctx( cls, ctx: Context, @@ -372,7 +394,7 @@ class Lock: ) # NOTE: critical section: this yield is unshielded! 
- + # # IF we received a cancel during the shielded lock entry of some # next-in-queue requesting task, then the resumption here will # result in that ``trio.Cancelled`` being raised to our caller @@ -384,7 +406,7 @@ class Lock: yield cls._debug_lock finally: - message :str = 'Exiting `Lock.acquire()` on behalf of sub-actor\n' + message :str = 'Exiting `Lock.acquire_for_ctx()` on behalf of sub-actor\n' if we_acquired: message += '-> TTY lock released by child\n' cls.release() @@ -468,11 +490,11 @@ async def lock_tty_for_child( # TODO: use `.msg._ops.maybe_limit_plds()` here instead so we # can merge into a single async with, with the - # `Lock.acquire()` enter below? + # `Lock.acquire_for_ctx()` enter below? # # enable the locking msgspec with apply_debug_pldec(): - async with Lock.acquire(ctx=ctx): + async with Lock.acquire_for_ctx(ctx=ctx): debug_lock_cs.shield = True log.devx( @@ -567,6 +589,11 @@ class DebugStatus: whenever a local task is an active REPL. ''' + # XXX local ref to the `pdbp.Pbp` instance, ONLY set in the + # actor-process that currently has activated a REPL i.e. it + # should be `None` (unset) in any other actor-process that does + # not yet have the `Lock` acquired via a root-actor debugger + # request. repl: PdbREPL|None = None # TODO: yet again this looks like a task outcome where we need @@ -1443,7 +1470,7 @@ class DebugRequestError(RuntimeError): async def _pause( - debug_func: Callable|None, + debug_func: Callable|partial|None, # NOTE: must be passed in the `.pause_from_sync()` case! repl: PdbREPL|None = None, @@ -1457,7 +1484,9 @@ async def _pause( # be no way to override it?.. # shield: bool = False, - hide_tb: bool = False, + hide_tb: bool = True, + called_from_sync: bool = False, + called_from_bg_thread: bool = False, task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED, **debug_func_kwargs, @@ -1502,27 +1531,15 @@ async def _pause( # -[ ] factor out better, main reason for it is common logic for # both root and sub repl entry def _enter_repl_sync( - debug_func: Callable, + debug_func: partial[None], ) -> None: __tracebackhide__: bool = hide_tb + debug_func_name: str = ( + debug_func.func.__name__ if debug_func else 'None' + ) try: - # set local actor task to avoid recurrent - # entries/requests from the same local task (to the root - # process). - DebugStatus.repl_task = task - DebugStatus.repl = repl - - # TODO: do we want to support using this **just** for the - # locking / common code (prolly to help address #320)? - if debug_func is None: - task_status.started(DebugStatus) - - else: - log.warning( - 'Entering REPL for task fuck you!\n' - f'{task}\n' - ) + if debug_func: # block here one (at the appropriate frame *up*) where # ``breakpoint()`` was awaited and begin handling stdio. log.devx( @@ -1531,6 +1548,12 @@ async def _pause( f' |_{task}\n' ) + # set local actor task to avoid recurrent + # entries/requests from the same local task (to the root + # process). + DebugStatus.repl = repl + DebugStatus.repl_task = task + # invoke the low-level REPL activation routine which itself # should call into a `Pdb.set_trace()` of some sort. debug_func( @@ -1539,10 +1562,27 @@ async def _pause( **debug_func_kwargs, ) + # TODO: maybe invert this logic and instead + # do `assert debug_func is None` when + # `called_from_sync`? 
+ else: + if ( + called_from_sync + # and + # is_root_process() + and + not DebugStatus.is_main_trio_thread() + ): + assert DebugStatus.repl_task is not task + + # TODO: do we want to support using this **just** for the + # locking / common code (prolly to help address #320)? + task_status.started(DebugStatus) + except trio.Cancelled: log.exception( - 'Cancelled during invoke of internal `debug_func = ' - f'{debug_func.func.__name__}`\n' + 'Cancelled during invoke of internal\n\n' + f'`debug_func = {debug_func_name}`\n' ) # XXX NOTE: DON'T release lock yet raise @@ -1550,8 +1590,8 @@ async def _pause( except BaseException: __tracebackhide__: bool = False log.exception( - 'Failed to invoke internal `debug_func = ' - f'{debug_func.func.__name__}`\n' + 'Failed to invoke internal\n\n' + f'`debug_func = {debug_func_name}`\n' ) # NOTE: OW this is ONLY called from the # `.set_continue/next` hooks! @@ -1597,34 +1637,56 @@ async def _pause( f'This root actor task is already within an active REPL session\n' f'Ignoring this re-entered `tractor.pause()`\n' f'task: {task.name}\n' - f'REPL: {Lock.repl}\n' # TODO: use `._frame_stack` scanner to find the @api_frame ) with trio.CancelScope(shield=shield): await trio.lowlevel.checkpoint() return - # XXX: since we need to enter pdb synchronously below, - # we have to release the lock manually from pdb completion - # callbacks. Can't think of a nicer way then this atm. + # must shield here to avoid hitting a `Cancelled` and + # a child getting stuck bc we clobbered the tty with trio.CancelScope(shield=shield): if Lock._debug_lock.locked(): - log.warning( - 'attempting to shield-acquire active TTY lock owned by\n' + + acq_prefix: str = 'shield-' if shield else '' + ctx_line: str = ( + 'lock owned by ctx\n\n' f'{ctx}' + ) if ctx else 'stale lock with no request ctx!?' + log.devx( + f'attempting to {acq_prefix}acquire active TTY ' + f'{ctx_line}' ) - # must shield here to avoid hitting a ``Cancelled`` and - # a child getting stuck bc we clobbered the tty - # with trio.CancelScope(shield=True): - await Lock._debug_lock.acquire() - else: - # may be cancelled + # XXX: since we need to enter pdb synchronously below, + # and we don't want to block the thread that starts + # stepping through the application thread, we later + # must `Lock._debug_lock.release()` manually from + # some `PdbREPL` completion callback(`.set_[continue/exit]()`). + # + # So, when `._pause()` is called from a (bg/non-trio) + # thread, special provisions are needed and we need + # to do the `.acquire()`/`.release()` calls from + # a common `trio.task` (due to internal impl of + # `FIFOLock`). Thus we do not acquire here and + # instead expect `.pause_from_sync()` to take care of + # this detail depending on the caller's (threading) + # usage. + # + # NOTE that this special case is ONLY required when + # using `.pause_from_sync()` from the root actor + # since OW a subactor will instead make an IPC + # request (in the branch below) to acquire the + # `Lock`-mutex and a common root-actor RPC task will + # take care of `._debug_lock` mgmt! + if not called_from_sync: await Lock._debug_lock.acquire() + Lock._owned_by_root = True # enter REPL from root, no TTY locking IPC ctx necessary + # since we can acquire the `Lock._debug_lock` directly in + # thread. _enter_repl_sync(debug_func) - return # next branch is mutex and for subactors # TODO: need a more robust check for the "root" actor elif ( @@ -1843,6 +1905,11 @@ def _set_trace( # called our API. 
caller_frame: FrameType = api_frame.f_back # type: ignore + # pretend this frame is the caller frame to show + # the entire call-stack all the way down to here. + if not hide_tb: + caller_frame: FrameType = inspect.currentframe() + # engage ze REPL # B~() repl.set_trace(frame=caller_frame) @@ -1850,7 +1917,7 @@ def _set_trace( async def pause( *, - hide_tb: bool = False, + hide_tb: bool = True, api_frame: FrameType|None = None, # TODO: figure out how to still make this work: @@ -1970,13 +2037,12 @@ async def maybe_init_greenback( # runtime aware version which takes care of all . def pause_from_sync( - hide_tb: bool = False, - # proxied to `_pause()` + hide_tb: bool = True, - **_pause_kwargs, - # for eg. + # proxy to `._pause()`, for ex: # shield: bool = False, # api_frame: FrameType|None = None, + **_pause_kwargs, ) -> None: @@ -2020,26 +2086,53 @@ def pause_from_sync( # noop: non-cancelled `.to_thread` # `trio.Cancelled`: cancelled `.to_thread` # + log.warning( + 'Engaging `.pause_from_sync()` from ANOTHER THREAD!' + ) + task: threading.Thread = threading.current_thread() + DebugStatus.repl_task: str = task + + # TODO: make root-actor bg thread usage work! + # if is_root_process(): + # async def _pause_from_sync_thread(): + # ... + # else: + # .. the below .. + trio.from_thread.run( partial( _pause, debug_func=None, repl=mdb, + hide_tb=hide_tb, + + # XXX to prevent `._pause()` for setting + # `DebugStatus.repl_task` to the gb task! + called_from_sync=True, + called_from_bg_thread=True, + **_pause_kwargs ), ) - task: threading.Thread = threading.current_thread() else: # we are presumably the `trio.run()` + main thread task: trio.Task = current_task() + DebugStatus.repl_task: str = task greenback.await_( _pause( debug_func=None, repl=mdb, + hide_tb=hide_tb, + called_from_sync=True, **_pause_kwargs, ) ) - DebugStatus.repl_task: str = current_task() + + if is_root_process(): + # Manually acquire since otherwise on release we'll + # get a RTE raised by `trio` due to ownership.. 
+ Lock._debug_lock.acquire_nowait() + Lock._owned_by_root = True # TODO: ensure we aggressively make the user aware about # entering the global ``breakpoint()`` built-in from sync -- 2.34.1 From 30d60379c154831e796d80aa81e3f083c482bb9b Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 7 Jun 2024 22:35:59 -0400 Subject: [PATCH 357/378] Drop thread logging to make `log.pdb()` patts match in test --- examples/debugging/sync_bp.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/debugging/sync_bp.py b/examples/debugging/sync_bp.py index e265df44..137710fc 100644 --- a/examples/debugging/sync_bp.py +++ b/examples/debugging/sync_bp.py @@ -1,6 +1,5 @@ from functools import partial import time -from threading import current_thread import trio import tractor @@ -16,17 +15,9 @@ def sync_pause( time.sleep(pre_sleep) if use_builtin: - print( - f'Entering `breakpoint()` from\n' - f'{current_thread()}\n' - ) breakpoint(hide_tb=hide_tb) else: - print( - f'Entering `tractor.pause_from_sync()` from\n' - f'{current_thread()}@{tractor.current_actor().uid}\n' - ) tractor.pause_from_sync() if error: -- 2.34.1 From 6534a363a591166658810b852db71f17e5228ca6 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 10 Jun 2024 08:54:03 -0400 Subject: [PATCH 358/378] First proto: multi-threaded synced `pdb`-REPLs Functionally working for multi-threaded (via cpython threads spawned from `to_trio.to_thread.run_sync()`) alongside subactors, tested (for now) only with threads started inside the root actor (which seemed to have the most issues in terms of the impl and special cases..) using the new `tractor.pause_from_sync()` API! Main implementation changes to `.pause_from_sync()` ------ - ------ - from the root actor, we need to ensure bg thread case is handled *specially* since no IPC is used to request the TTY stdio mutex and `Lock` (API) usage is conducted entirely from a local task or thread; dedicated `Lock` usage for the root-actor already is branched inside `._pause()` and needs similar handling from a root bg-thread: |_for the special case of a root bg thread we need to `trio`-main-thread schedule a bg task inside a new `_pause_from_bg_root_thread()`. The new task needs to implement most of what was is handled inside `._pause()` manually, mostly because in this root-actor-bg-thread case we have 2 constraints: 1. to enter `PdbREPL.interaction()` **from the bg thread** directly, 2. the task that `Lock._debug_lock.acquire()`s has to be the same that calls `.release() (a `trio.FIFOLock` constraint) |_impl deats of this `_pause_from_bg_root_thread()` include: - (for now) calling `._pause()` to acquire the `Lock._debug_lock`. - setting its own `DebugStatus.repl_release`. - calling `.DebugStatus.shield_sigint()` to ensure the root's main thread uses the right handler when the bg one is REPL-ing. - wait manually on the `.repl_release()` to be set by the thread's dedicated `PdbREPL` exit. - manually calling `Lock.release()` from the **same task** that acquired it. - expect calls to `._pause()` to deliver a `tuple[Task, PdbREPL]` such that we always get the handle both to any newly created REPl instance and the (maybe) the scheduled bg task within which is runs. - add a single `message: str` style to `log.devx()` based on branching style for logging. - ensure both `DebugStatus.repl` and `.repl_task` are set **just before** calling `._set_trace()` to ensure the correct `Task|Thread` is set when the REPL is finally entered from sync code. 
- add a wrapping caller `_sync_pause_from_builtin()` which passes in the new `called_from_builtin=True` to indicate `breakpoint()` caller usage, obvi pass in `api_frame`. Changes to `._pause()` in support of ^ ------ - ------ - `TaskStatus.started()` and return the `tuple[Task, PdbREPL]` to callers / starters. - only call `DebugStatus.shield_sigint()` when no `repl` passed bc some callers (like bg threads) may need to apply it at some specific point themselves. - tweak some asserts for the `debug_func == None` / non-`trio`-thread case. - add a mod-level `_repl_fail_msg: str` to be used when there's an internal `._pause()` failure for testing, easier to pexpect match. - more comprehensive logging for the root-actor branched case to (attempt to) indicate any of the 3 cases: - remote ctx from subactor has the `Lock`, - already existing root task or thread has it or, - some kinda stale `.locked()` situation where the root has the lock but we don't know why. - for root usage, revert to always `await Lock._debug_lock.acquire()`-ing despite `called_from_sync` since `.pause_from_sync()` was reworked to instead handle the special bg thread case in the new `_pause_from_bg_root_thread()` task. - always do `return _enter_repl_sync(debug_func)`. - try to report any `repl_task: Task|Thread` set by the caller (particularly for the bg thread cases) as being the thread or task `._pause()` was called "on behalf of" Changes to `DebugStatus`/`Lock` in support of ^ ------ - ------ - only call `Lock.release()` from `DebugStatus.set_[quit/continue]()` when called from the main `trio` thread and always call `DebugStatus.release()` **after** to ensure `.repl_released()` is set **after** `._debug_lock.release()`. - only call `.repl_release.set()` from `trio` thread otherwise use `.from_thread.run()`. - much more refinements in `Lock.release()` for threading cases: - return `bool` to indicate whether lock was released by caller. - mask (in prep to drop) `_pause()` usage of `Lock.release.force=True)` since forcing a release can't ever avoid the RTE from `trio`.. same task **must** acquire/release. - don't allow usage from non-`trio`-main-threads, ever; there's no point since the same-task-needs-to-manage-`FIFOLock` constraint. - much more detailed logging using `message`-building-style for all caller (edge) cases. |_ use a `we_released: bool` to determine failed-to-release edge cases which can happen if called from bg threads, ensure we `log.exception()` on any incorrect usage resulting in release failure. |_ complain loudly if the release fails and some other task/thread still holds the lock. |_ be explicit about "who" (which task or thread) the release is "on behalf of" by reading `DebugStatus.repl_task` since the caller isn't the REPL operator in many sync cases. - more or less drop `force` support, as mentioned above. - ensure we unset `._owned_by_root` if the caller is a root task. Other misc ------ - ------ - rename `lock_tty_for_child()` -> `lock_stdio_for_peer()`. - rejig `Lock.repr()` to show lock and event stats. - stage `Lock.stats` and `.owner` methods in prep for doing a singleton instance and `@property`s. 
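
The bg-thread handling described above ultimately falls out of one `trio`
rule: a `trio.StrictFIFOLock` must be released by the very task that
acquired it, so a non-`trio` thread can never hold `Lock._debug_lock`
directly. Below is a minimal, self-contained sketch (toy names, not the
real `_pause_from_bg_root_thread()`) of the proxy pattern: a dedicated
`trio` task acquires the lock, hands the thread an event to set when its
REPL session ends, and releases the lock from that same task.

    import trio

    async def main():
        lock = trio.StrictFIFOLock()

        async def hold_lock_for_thread(
            task_status=trio.TASK_STATUS_IGNORED,
        ):
            # the SAME task acquires and (in the finally) releases,
            # which is what `trio` demands; compare the owner check in
            # `Lock.release()`.
            await lock.acquire()
            repl_release = trio.Event()  # the `DebugStatus.repl_release` role
            task_status.started(repl_release)
            try:
                await repl_release.wait()
            finally:
                lock.release()

        def bg_thread():
            # runs in a non-`trio` thread; since it was spawned via
            # `to_thread.run_sync()`, `from_thread.run()` finds the right
            # trio token automatically.
            repl_release = trio.from_thread.run(
                nursery.start,
                hold_lock_for_thread,
            )
            print('thread "holds" the tty lock, REPL-ing..')
            trio.from_thread.run_sync(repl_release.set)

        async with trio.open_nursery() as nursery:
            await trio.to_thread.run_sync(bg_thread)

        # nursery exit waits for `hold_lock_for_thread` -> lock is free
        assert not lock.locked()

    trio.run(main)

In the real patch the scheduled task additionally calls
`DebugStatus.shield_sigint()` and reports a `(Task, PdbREPL)` pair back to
the waiting thread, but the lock-ownership hand-off is the essential part.
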
--- tractor/devx/_debug.py | 732 +++++++++++++++++++++++++++++------------ 1 file changed, 525 insertions(+), 207 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 858133fd..3218cffa 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -62,7 +62,6 @@ import trio from trio import CancelScope from trio.lowlevel import ( current_task, - Task, ) from trio import ( TaskStatus, @@ -81,6 +80,8 @@ from tractor._state import ( # ) if TYPE_CHECKING: + from trio.lowlevel import Task + from threading import Thread from tractor._ipc import Channel from tractor._context import Context from tractor._runtime import ( @@ -92,6 +93,11 @@ if TYPE_CHECKING: log = get_logger(__name__) +# TODO: refine the internal impl and APIs in this module! +# +# -[ ] separate `._pause()` branch-cases for calling from a root task +# vs. from subactors + def hide_runtime_frames() -> dict[FunctionType, CodeType]: ''' @@ -203,6 +209,15 @@ class Lock: return None + # TODO: once we convert to singleton-per-actor-style + # @property + # def stats(cls) -> trio.LockStatistics: + # return cls._debug_lock.statistics() + + # @property + # def owner(cls) -> Task: + # return cls._debug_lock.statistics().owner + # ROOT ONLY # ------ - ------- # the root-actor-ONLY singletons for, @@ -229,18 +244,22 @@ class Lock: @classmethod def repr(cls) -> str: lock_stats: trio.LockStatistics = cls._debug_lock.statistics() + req: trio.Event|None = cls.req_handler_finished fields: str = ( - f'req_handler_finished: {cls.req_handler_finished}\n' - f'_blocked: {cls._blocked}\n\n' - f'_debug_lock: {cls._debug_lock}\n' - f'lock_stats: {lock_stats}\n' - f'ctx_in_debug: {cls.ctx_in_debug}\n' + f'|_ ._blocked: {cls._blocked}\n' + f'|_ ._debug_lock: {cls._debug_lock}\n' + f' {lock_stats}\n\n' + f'|_ .ctx_in_debug: {cls.ctx_in_debug}\n' + f'|_ .req_handler_finished: {req}\n' ) + if req: + req_stats: trio.EventStatistics = req.statistics() + fields += f' {req_stats}\n' body: str = textwrap.indent( fields, - prefix=' |_', + prefix=' ', ) return ( f'<{cls.__name__}(\n' @@ -253,28 +272,59 @@ class Lock: def release( cls, force: bool = False, - ): - if not cls._owned_by_root: - message: str = 'TTY lock not held by any child\n' - else: - message: str = 'TTY lock held in root-actor task\n' + raise_on_thread: bool = True, - if not (is_trio_main := DebugStatus.is_main_trio_thread()): - task: threading.Thread = threading.current_thread() + ) -> bool: + ''' + Release the actor-tree global TTY stdio lock (only) from the + `trio.run()`-main-thread. + + ''' + we_released: bool = False + ctx_in_debug: Context|None = cls.ctx_in_debug + repl_task: Task|Thread|None = DebugStatus.repl_task + if not DebugStatus.is_main_trio_thread(): + thread: threading.Thread = threading.current_thread() + message: str = ( + '`Lock.release()` can not be called from a non-main-`trio` thread!\n' + f'{thread}\n' + ) + if raise_on_thread: + raise RuntimeError(message) + + log.devx(message) + return False + + task: Task = current_task() + + # sanity check that if we're the root actor + # the lock is marked as such. + # note the pre-release value may be diff the the + # post-release task. 
+ if repl_task is task: + assert cls._owned_by_root + message: str = ( + 'TTY lock held by root-actor on behalf of local task\n' + f'|_{repl_task}\n' + ) else: - task: trio.Task = current_task() + assert DebugStatus.repl_task is not task + + message: str = ( + 'TTY lock was NOT released on behalf of caller\n' + f'|_{task}\n' + ) try: lock: trio.StrictFIFOLock = cls._debug_lock owner: Task = lock.statistics().owner if ( - lock.locked() - and - ( - owner is task - # or - # cls._owned_by_root - ) + (lock.locked() or force) + # ^-TODO-NOTE-^ should we just remove this, since the + # RTE case above will always happen when you force + # from the wrong task? + + and (owner is task) # ^-NOTE-^ if we do NOT ensure this, `trio` will # raise a RTE when a non-owner tries to releasee the # lock. @@ -284,17 +334,27 @@ class Lock: # being set to the `.repl_task` such that the above # condition matches and we actually release the lock. # This is particular of note from `.pause_from_sync()`! + ): - if not is_trio_main: - trio.from_thread.run_sync( - cls._debug_lock.release + cls._debug_lock.release() + we_released: bool = True + if repl_task: + message: str = ( + 'Lock released on behalf of root-actor-local REPL owner\n' + f'|_{repl_task}\n' ) else: - cls._debug_lock.release() - message: str = 'TTY lock released for child\n' + message: str = ( + 'TTY lock released by us on behalf of remote peer?\n' + f'|_ctx_in_debug: {ctx_in_debug}\n\n' + ) + # mk_pdb().set_trace() + # elif owner: except RuntimeError as rte: - log.exception('Failed to release `Lock`?') + log.exception( + 'Failed to release `Lock._debug_lock: trio.FIFOLock`?\n' + ) raise rte finally: @@ -303,40 +363,59 @@ class Lock: # we are now back in the "tty unlocked" state. This is basically # and edge triggered signal around an empty queue of sub-actor # tasks that may have tried to acquire the lock. - lock_stats = cls._debug_lock.statistics() + lock_stats: trio.LockStatistics = cls._debug_lock.statistics() req_handler_finished: trio.Event|None = Lock.req_handler_finished if ( not lock_stats.owner - or force and req_handler_finished is None ): - message += '-> No more child ctx tasks hold the TTY lock!\n' - - elif ( - req_handler_finished - and - lock.locked() - ): - req_stats = req_handler_finished.statistics() message += ( - f'-> A child ctx task still owns the `Lock` ??\n' - f' |_lock_stats: {lock_stats}\n' - f' |_req_stats: {req_stats}\n' + '-> No new task holds the TTY lock!\n\n' + f'{Lock.repr()}\n' ) - cls.ctx_in_debug = None + elif ( + req_handler_finished # new IPC ctx debug request active + and + lock.locked() # someone has the lock + ): + behalf_of_task = ( + ctx_in_debug + or + repl_task + ) + message += ( + f'\nA non-caller task still owns this lock on behalf of ' + f'{behalf_of_task}\n' + f'|_{lock_stats.owner}\n' + ) if ( - cls._owned_by_root + we_released + and + ctx_in_debug ): - if not lock.locked(): - cls._owned_by_root = False - else: - message += 'Lock still held by root actor task?!?\n' - lock.release() + cls.ctx_in_debug = None # unset + + # post-release value (should be diff then value above!) 
+ repl_task: Task|Thread|None = DebugStatus.repl_task + if ( + cls._owned_by_root + and + we_released + ): + cls._owned_by_root = False + + if task is not repl_task: + message += ( + 'Lock released by root actor on behalf of bg thread\n' + f'|_{repl_task}\n' + ) log.devx(message) + return we_released + @classmethod @acm async def acquire_for_ctx( @@ -380,7 +459,7 @@ class Lock: log.runtime(pre_msg) # NOTE: if the surrounding cancel scope from the - # `lock_tty_for_child()` caller is cancelled, this line should + # `lock_stdio_for_peer()` caller is cancelled, this line should # unblock and NOT leave us in some kind of # a "child-locked-TTY-but-child-is-uncontactable-over-IPC" # condition. @@ -398,7 +477,7 @@ class Lock: # IF we received a cancel during the shielded lock entry of some # next-in-queue requesting task, then the resumption here will # result in that ``trio.Cancelled`` being raised to our caller - # (likely from ``lock_tty_for_child()`` below)! In + # (likely from `lock_stdio_for_peer()` below)! In # this case the ``finally:`` below should trigger and the # surrounding caller side context should cancel normally # relaying back to the caller. @@ -408,8 +487,8 @@ class Lock: finally: message :str = 'Exiting `Lock.acquire_for_ctx()` on behalf of sub-actor\n' if we_acquired: - message += '-> TTY lock released by child\n' cls.release() + message += '-> TTY lock released by child\n' else: message += '-> TTY lock never acquired by child??\n' @@ -421,7 +500,7 @@ class Lock: @tractor.context -async def lock_tty_for_child( +async def lock_stdio_for_peer( ctx: Context, subactor_task_uid: tuple[str, int], @@ -545,25 +624,26 @@ async def lock_tty_for_child( except BaseException as req_err: message: str = ( + f'On behalf of remote peer {subactor_task_uid!r}@{ctx.chan.uid!r}\n\n' 'Forcing `Lock.release()` for req-ctx since likely an ' 'internal error!\n\n' f'{ctx}' ) if isinstance(req_err, trio.Cancelled): message = ( - 'Cancelled during root TTY-lock dialog?\n' + 'Cancelled during root TTY-lock dialog\n' + message ) else: message = ( - 'Errored during root TTY-lock dialog?\n' + 'Errored during root TTY-lock dialog\n' + message ) log.exception(message) - Lock.release(force=True) + Lock.release() #force=True) raise finally: @@ -645,7 +725,7 @@ class DebugStatus: def shield_sigint(cls): ''' Shield out SIGINT handling (which by default triggers - `trio.Task` cancellation) in subactors when a `pdb` REPL + `Task` cancellation) in subactors when a `pdb` REPL is active. Avoids cancellation of the current actor (task) when the user @@ -767,9 +847,17 @@ class DebugStatus: try: # sometimes the task might already be terminated in # which case this call will raise an RTE? - if repl_release is not None: - repl_release.set() - + if ( + repl_release is not None + ): + if cls.is_main_trio_thread(): + repl_release.set() + else: + # XXX NOTE ONLY used for bg root-actor sync + # threads, see `.pause_from_sync()`. + trio.from_thread.run_sync( + repl_release.set + ) finally: # if req_ctx := cls.req_ctx: # req_ctx._scope.cancel() @@ -856,8 +944,6 @@ class PdbREPL(pdbp.Pdb): try: super().set_continue() finally: - DebugStatus.release() - # NOTE: for subactors the stdio lock is released via the # allocated RPC locker task, so for root we have to do it # manually. 
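
Zooming out, the `set_continue()`/`set_quit()` overrides being adjusted in
this hunk follow a small general pattern: subclass `pdb.Pdb` and wrap the
REPL-exit hooks in `try/finally` so release callbacks always run when the
operator leaves the prompt. A rough sketch with a placeholder teardown
(not the real `Lock`/`DebugStatus` ordering):

    import pdb

    def _release_debug_lock():
        # stand-in for the root-only `Lock.release()` followed by
        # `DebugStatus.release()` sequencing used above
        print('REPL exited, releasing stdio lock..')

    class ReleasingPdb(pdb.Pdb):
        # `set_continue()`/`set_quit()` come from `bdb.Bdb` and are hit
        # by the `continue` and `quit` REPL commands respectively.
        def set_continue(self):
            try:
                super().set_continue()
            finally:
                _release_debug_lock()

        def set_quit(self):
            try:
                super().set_quit()
            finally:
                _release_debug_lock()

Running `ReleasingPdb().set_trace()` and then typing `c` or `q` fires the
teardown once per REPL exit, mirroring how the real overrides gate
`Lock.release()` on `is_root_process()` and the main-`trio`-thread check.
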
@@ -865,21 +951,32 @@ class PdbREPL(pdbp.Pdb): is_root_process() and Lock._debug_lock.locked() + and + DebugStatus.is_main_trio_thread() ): + # Lock.release(raise_on_thread=False) Lock.release() + # XXX after `Lock.release()` for root local repl usage + DebugStatus.release() + def set_quit(self): try: super().set_quit() finally: - DebugStatus.release() if ( is_root_process() and Lock._debug_lock.locked() + and + DebugStatus.is_main_trio_thread() ): + # Lock.release(raise_on_thread=False) Lock.release() + # XXX after `Lock.release()` for root local repl usage + DebugStatus.release() + # XXX NOTE: we only override this because apparently the stdlib pdb # bois likes to touch the SIGINT handler as much as i like to touch # my d$%&. @@ -960,20 +1057,24 @@ async def request_root_stdio_lock( task_status: TaskStatus[CancelScope] = trio.TASK_STATUS_IGNORED, ): ''' - Connect to the root actor of this process tree and RPC-invoke - a task which acquires a std-streams global `Lock`: a actor tree - global mutex which prevents other subactors from entering - a `PdbREPL` at the same time as any other. + Connect to the root actor for this actor's process tree and + RPC-invoke a task which acquires the std-streams global `Lock`: + a process-tree-global mutex which prevents multiple actors from + entering `PdbREPL.interaction()` at the same time such that the + parent TTY's stdio is never "clobbered" by simultaneous + reads/writes. - The actual `Lock` singleton exists ONLY in the root actor's - memory and does nothing more then set process-tree global state. - The actual `PdbREPL` interaction is completely isolated to each - sub-actor and with the `Lock` merely providing the multi-process - syncing mechanism to avoid any subactor (or the root itself) from - entering the REPL at the same time. + The actual `Lock` singleton instance exists ONLY in the root + actor's memory space and does nothing more then manage + process-tree global state, + namely a `._debug_lock: trio.FIFOLock`. + + The actual `PdbREPL` interaction/operation is completely isolated + to each sub-actor (process) with the root's `Lock` providing the + multi-process mutex-syncing mechanism to avoid parallel REPL + usage within an actor tree. ''' - log.devx( 'Initing stdio-lock request task with root actor' ) @@ -1004,7 +1105,7 @@ async def request_root_stdio_lock( # `.repl_release: # trio.Event`. with trio.CancelScope(shield=shield) as req_cs: # XXX: was orig for debugging cs stack corruption.. - # log.info( + # log.devx( # 'Request cancel-scope is:\n\n' # f'{pformat_cs(req_cs, var_name="req_cs")}\n\n' # ) @@ -1014,7 +1115,7 @@ async def request_root_stdio_lock( # TODO: merge into single async with ? 
async with get_root() as portal: async with portal.open_context( - lock_tty_for_child, + lock_stdio_for_peer, subactor_task_uid=task_uid, # NOTE: set it here in the locker request task bc it's # possible for multiple such requests for the lock in any @@ -1468,6 +1569,11 @@ class DebugRequestError(RuntimeError): ''' +_repl_fail_msg: str = ( + 'Failed to REPl via `_pause()` ' +) + + async def _pause( debug_func: Callable|partial|None, @@ -1487,10 +1593,13 @@ async def _pause( hide_tb: bool = True, called_from_sync: bool = False, called_from_bg_thread: bool = False, - task_status: TaskStatus[trio.Event] = trio.TASK_STATUS_IGNORED, + task_status: TaskStatus[ + tuple[Task, PdbREPL], + trio.Event + ] = trio.TASK_STATUS_IGNORED, **debug_func_kwargs, -) -> None: +) -> tuple[PdbREPL, Task]|None: ''' Inner impl for `pause()` to avoid the `trio.CancelScope.__exit__()` stack frame when not shielded (since apparently i can't figure out @@ -1502,25 +1611,26 @@ async def _pause( __tracebackhide__: bool = hide_tb actor: Actor = current_actor() try: - # TODO: use the `Task` instance instead for `is` checks - # below! - task: Task = trio.lowlevel.current_task() + task: Task = current_task() except RuntimeError as rte: + log.exception('Failed to get current task?') if actor.is_infected_aio(): raise RuntimeError( '`tractor.pause[_from_sync]()` not yet supported ' 'for infected `asyncio` mode!' ) from rte + raise + if debug_func is not None: debug_func = partial(debug_func) - repl: PdbREPL = repl or mk_pdb() - # XXX NOTE XXX set it here to avoid ctl-c from cancelling a debug # request from a subactor BEFORE the REPL is entered by that # process. - DebugStatus.shield_sigint() + if not repl: + DebugStatus.shield_sigint() + repl: PdbREPL = repl or mk_pdb() # TODO: move this into a `open_debug_request()` @acm? # -[ ] prolly makes the most sense to do the request @@ -1538,6 +1648,9 @@ async def _pause( debug_func.func.__name__ if debug_func else 'None' ) + # TODO: do we want to support using this **just** for the + # locking / common code (prolly to help address #320)? + task_status.started((task, repl)) try: if debug_func: # block here one (at the appropriate frame *up*) where @@ -1548,11 +1661,11 @@ async def _pause( f' |_{task}\n' ) - # set local actor task to avoid recurrent - # entries/requests from the same local task (to the root - # process). - DebugStatus.repl = repl + # set local task on process-global state to avoid + # recurrent entries/requests from the same + # actor-local task. DebugStatus.repl_task = task + DebugStatus.repl = repl # invoke the low-level REPL activation routine which itself # should call into a `Pdb.set_trace()` of some sort. @@ -1568,16 +1681,13 @@ async def _pause( else: if ( called_from_sync - # and - # is_root_process() and not DebugStatus.is_main_trio_thread() ): + assert called_from_bg_thread assert DebugStatus.repl_task is not task - # TODO: do we want to support using this **just** for the - # locking / common code (prolly to help address #320)? - task_status.started(DebugStatus) + return (task, repl) except trio.Cancelled: log.exception( @@ -1607,12 +1717,23 @@ async def _pause( # TODO: this should be created as part of `DebugRequest()` init # which should instead be a one-shot-use singleton much like # the `PdbREPL`. 
+ repl_task: Thread|Task|None = DebugStatus.repl_task if ( not DebugStatus.repl_release or DebugStatus.repl_release.is_set() ): + log.devx( + 'Setting new `DebugStatus.repl_release: trio.Event` for requesting task\n' + f'|_{task}\n' + ) DebugStatus.repl_release = trio.Event() + else: + log.devx( + 'Already an existing actor-local REPL user task\n' + f'|_{repl_task}\n' + ) + # ^-NOTE-^ this must be created BEFORE scheduling any subactor # debug-req task since it needs to wait on it just after # `.started()`-ing back its wrapping `.req_cs: CancelScope`. @@ -1620,73 +1741,110 @@ async def _pause( repl_err: BaseException|None = None try: if is_root_process(): - # we also wait in the root-parent for any child that # may have the tty locked prior - # TODO: wait, what about multiple root tasks acquiring it though? + # TODO: wait, what about multiple root tasks (with bg + # threads) acquiring it though? ctx: Context|None = Lock.ctx_in_debug + repl_task: Task|None = DebugStatus.repl_task if ( ctx is None and - DebugStatus.repl - and - DebugStatus.repl_task is task + repl_task is task + # and + # DebugStatus.repl + # ^-NOTE-^ matches for multi-threaded case as well? ): # re-entrant root process already has it: noop. log.warning( f'This root actor task is already within an active REPL session\n' - f'Ignoring this re-entered `tractor.pause()`\n' - f'task: {task.name}\n' + f'Ignoring this recurrent`tractor.pause()` entry\n\n' + f'|_{task}\n' # TODO: use `._frame_stack` scanner to find the @api_frame ) with trio.CancelScope(shield=shield): await trio.lowlevel.checkpoint() - return + return repl, task + + # elif repl_task: + # log.warning( + # f'This root actor has another task already in REPL\n' + # f'Waitin for the other task to complete..\n\n' + # f'|_{task}\n' + # # TODO: use `._frame_stack` scanner to find the @api_frame + # ) + # with trio.CancelScope(shield=shield): + # await DebugStatus.repl_release.wait() + # await trio.sleep(0.1) # must shield here to avoid hitting a `Cancelled` and # a child getting stuck bc we clobbered the tty with trio.CancelScope(shield=shield): - if Lock._debug_lock.locked(): + ctx_line = '`Lock` in this root actor task' + acq_prefix: str = 'shield-' if shield else '' + if ( + Lock._debug_lock.locked() + ): + if ctx: + ctx_line: str = ( + 'active `Lock` owned by ctx\n\n' + f'{ctx}' + ) + elif Lock._owned_by_root: + ctx_line: str = ( + 'Already owned by root-task `Lock`\n\n' + f'repl_task: {DebugStatus.repl_task}\n' + f'repl: {DebugStatus.repl}\n' + ) + else: + ctx_line: str = ( + '**STALE `Lock`** held by unknown root/remote task ' + 'with no request ctx !?!?' + ) - acq_prefix: str = 'shield-' if shield else '' - ctx_line: str = ( - 'lock owned by ctx\n\n' - f'{ctx}' - ) if ctx else 'stale lock with no request ctx!?' - log.devx( - f'attempting to {acq_prefix}acquire active TTY ' - f'{ctx_line}' - ) + log.devx( + f'attempting to {acq_prefix}acquire ' + f'{ctx_line}' + ) + await Lock._debug_lock.acquire() + Lock._owned_by_root = True + # else: - # XXX: since we need to enter pdb synchronously below, - # and we don't want to block the thread that starts - # stepping through the application thread, we later - # must `Lock._debug_lock.release()` manually from - # some `PdbREPL` completion callback(`.set_[continue/exit]()`). - # - # So, when `._pause()` is called from a (bg/non-trio) - # thread, special provisions are needed and we need - # to do the `.acquire()`/`.release()` calls from - # a common `trio.task` (due to internal impl of - # `FIFOLock`). 
Thus we do not acquire here and - # instead expect `.pause_from_sync()` to take care of - # this detail depending on the caller's (threading) - # usage. - # - # NOTE that this special case is ONLY required when - # using `.pause_from_sync()` from the root actor - # since OW a subactor will instead make an IPC - # request (in the branch below) to acquire the - # `Lock`-mutex and a common root-actor RPC task will - # take care of `._debug_lock` mgmt! - if not called_from_sync: - await Lock._debug_lock.acquire() - Lock._owned_by_root = True + # if ( + # not called_from_bg_thread + # and not called_from_sync + # ): + # log.devx( + # f'attempting to {acq_prefix}acquire ' + # f'{ctx_line}' + # ) + + # XXX: since we need to enter pdb synchronously below, + # and we don't want to block the thread that starts + # stepping through the application thread, we later + # must `Lock._debug_lock.release()` manually from + # some `PdbREPL` completion callback(`.set_[continue/exit]()`). + # + # So, when `._pause()` is called from a (bg/non-trio) + # thread, special provisions are needed and we need + # to do the `.acquire()`/`.release()` calls from + # a common `trio.task` (due to internal impl of + # `FIFOLock`). Thus we do not acquire here and + # instead expect `.pause_from_sync()` to take care of + # this detail depending on the caller's (threading) + # usage. + # + # NOTE that this special case is ONLY required when + # using `.pause_from_sync()` from the root actor + # since OW a subactor will instead make an IPC + # request (in the branch below) to acquire the + # `Lock`-mutex and a common root-actor RPC task will + # take care of `._debug_lock` mgmt! # enter REPL from root, no TTY locking IPC ctx necessary # since we can acquire the `Lock._debug_lock` directly in # thread. - _enter_repl_sync(debug_func) + return _enter_repl_sync(debug_func) # TODO: need a more robust check for the "root" actor elif ( @@ -1809,7 +1967,7 @@ async def _pause( ) # enter REPL - _enter_repl_sync(debug_func) + return _enter_repl_sync(debug_func) # TODO: prolly factor this plus the similar block from # `_enter_repl_sync()` into a common @cm? @@ -1838,7 +1996,9 @@ async def _pause( else: log.exception( - 'Failed to engage debugger via `_pause()` ??\n' + _repl_fail_msg + + + f'on behalf of {repl_task} ??\n' ) DebugStatus.release(cancel_req_task=True) @@ -1882,11 +2042,11 @@ def _set_trace( # optionally passed in to provide support for # `pause_from_sync()` where actor: tractor.Actor|None = None, - task: trio.Task|None = None, + task: Task|Thread|None = None, ): __tracebackhide__: bool = hide_tb actor: tractor.Actor = actor or current_actor() - task: trio.Task = task or current_task() + task: Task|Thread = task or current_task() # else: # TODO: maybe print the actor supervion tree up to the @@ -2023,7 +2183,7 @@ async def maybe_init_greenback( if mod := maybe_import_greenback(**kwargs): await mod.ensure_portal() - log.info( + log.devx( '`greenback` portal opened!\n' 'Sync debug support activated!\n' ) @@ -2032,12 +2192,116 @@ async def maybe_init_greenback( return None -# TODO: allow pausing from sync code. -# normally by remapping python's builtin breakpoint() hook to this -# runtime aware version which takes care of all . 
-def pause_from_sync( +async def _pause_from_bg_root_thread( + behalf_of_thread: Thread, + repl: PdbREPL, + hide_tb: bool, + task_status: TaskStatus[Task] = trio.TASK_STATUS_IGNORED, + **_pause_kwargs, +): + ''' + Acquire the `Lock._debug_lock` from a bg (only need for + root-actor) non-`trio` thread (started via a call to + `.to_thread.run_sync()` in some actor) by scheduling this func in + the actor's service (TODO eventually a special debug_mode) + nursery. This task acquires the lock then `.started()`s the + `DebugStatus.repl_release: trio.Event` waits for the `PdbREPL` to + set it, then terminates very much the same way as + `request_root_stdio_lock()` uses an IPC `Context` from a subactor + to do the same from a remote process. + + This task is normally only required to be scheduled for the + special cases of a bg sync thread running in the root actor; see + the only usage inside `.pause_from_sync()`. + + ''' + global Lock + # TODO: unify this copied code with where it was + # from in `maybe_wait_for_debugger()` + # if ( + # Lock.req_handler_finished is not None + # and not Lock.req_handler_finished.is_set() + # and (in_debug := Lock.ctx_in_debug) + # ): + # log.devx( + # '\nRoot is waiting on tty lock to release from\n\n' + # # f'{caller_frame_info}\n' + # ) + # with trio.CancelScope(shield=True): + # await Lock.req_handler_finished.wait() + + # log.pdb( + # f'Subactor released debug lock\n' + # f'|_{in_debug}\n' + # ) + task: Task = current_task() + + # Manually acquire since otherwise on release we'll + # get a RTE raised by `trio` due to ownership.. + log.devx( + 'Trying to acquire `Lock` on behalf of bg thread\n' + f'|_{behalf_of_thread}\n' + ) + # DebugStatus.repl_task = behalf_of_thread + out = await _pause( + debug_func=None, + repl=repl, + hide_tb=hide_tb, + called_from_sync=True, + called_from_bg_thread=True, + **_pause_kwargs + ) + lock: trio.FIFOLock = Lock._debug_lock + stats: trio.LockStatistics= lock.statistics() + assert stats.owner is task + assert Lock._owned_by_root + assert DebugStatus.repl_release + + # TODO: do we actually need this? + # originally i was trying to solve wy this was + # unblocking too soon in a thread but it was actually + # that we weren't setting our own `repl_release` below.. + while stats.owner is not task: + log.devx( + 'Trying to acquire `._debug_lock` from {stats.owner} for\n' + f'|_{behalf_of_thread}\n' + ) + await lock.acquire() + break + + # XXX NOTE XXX super important dawg.. + # set our own event since the current one might + # have already been overriden and then set when the + # last REPL mutex holder exits their sesh! + # => we do NOT want to override any existing one + # and we want to ensure we set our own ONLY AFTER we have + # acquired the `._debug_lock` + repl_release = DebugStatus.repl_release = trio.Event() + + # unblock caller thread delivering this bg task + log.devx( + 'Unblocking root-bg-thread since we acquired lock via `._pause()`\n' + f'|_{behalf_of_thread}\n' + ) + task_status.started(out) + DebugStatus.shield_sigint() + + # wait for bg thread to exit REPL sesh. 
+ try: + await repl_release.wait() + finally: + log.devx( + 'releasing lock from bg root thread task!\n' + f'|_ {behalf_of_thread}\n' + ) + Lock.release() + + +def pause_from_sync( hide_tb: bool = True, + called_from_builtin: bool = False, + api_frame: FrameType|None = None, # proxy to `._pause()`, for ex: # shield: bool = False, @@ -2045,15 +2309,24 @@ def pause_from_sync( **_pause_kwargs, ) -> None: + ''' + Pause a `tractor` scheduled task or thread from sync (non-async + function) code. + When `greenback` is installed we remap python's builtin + `breakpoint()` hook to this runtime-aware version which takes + care of all bg-thread detection and appropriate synchronization + with the root actor's `Lock` to avoid mult-thread/process REPL + clobbering Bo + + ''' __tracebackhide__: bool = hide_tb try: actor: tractor.Actor = current_actor( err_on_no_runtime=False, ) - log.debug( - f'{actor.uid}: JUST ENTERED `tractor.pause_from_sync()`' - f'|_{actor}\n' + message: str = ( + f'{actor.uid} task called `tractor.pause_from_sync()`\n\n' ) if not actor: raise RuntimeError( @@ -2063,7 +2336,7 @@ def pause_from_sync( '- `async with tractor.open_root_actor()`\n' ) - # NOTE: once supported, remove this AND the one + # TODO: once supported, remove this AND the one # inside `._pause()`! if actor.is_infected_aio(): raise RuntimeError( @@ -2071,78 +2344,111 @@ def pause_from_sync( 'for infected `asyncio` mode!' ) - # raises on not-found by default - greenback: ModuleType = maybe_import_greenback() - mdb: PdbREPL = mk_pdb() + DebugStatus.shield_sigint() + repl: PdbREPL = mk_pdb() - # run async task which will lock out the root proc's TTY. + # message += f'-> created local REPL {repl}\n' + is_root: bool = is_root_process() + + # TODO: we could also check for a non-`.to_thread` context + # using `trio.from_thread.check_cancelled()` (says + # oremanj) wherein we get the following outputs: + # + # `RuntimeError`: non-`.to_thread` spawned thread + # noop: non-cancelled `.to_thread` + # `trio.Cancelled`: cancelled `.to_thread` + + # when called from a (bg) thread, run an async task in a new + # thread which will call `._pause()` manually with special + # handling for root-actor caller usage. if not DebugStatus.is_main_trio_thread(): - - # TODO: we could also check for a non-`.to_thread` context - # using `trio.from_thread.check_cancelled()` (says - # oremanj) wherein we get the following outputs: - # - # `RuntimeError`: non-`.to_thread` spawned thread - # noop: non-cancelled `.to_thread` - # `trio.Cancelled`: cancelled `.to_thread` - # - log.warning( - 'Engaging `.pause_from_sync()` from ANOTHER THREAD!' - ) - task: threading.Thread = threading.current_thread() - DebugStatus.repl_task: str = task + thread: threading.Thread = threading.current_thread() + repl_owner = thread # TODO: make root-actor bg thread usage work! - # if is_root_process(): - # async def _pause_from_sync_thread(): - # ... - # else: - # .. the below .. 
+ if is_root: + message += ( + f'-> called from a root-actor bg {thread}\n' + f'-> scheduling `._pause_from_sync_thread()`..\n' + ) + bg_task, repl = trio.from_thread.run( + afn=partial( + actor._service_n.start, + partial( + _pause_from_bg_root_thread, + behalf_of_thread=thread, + repl=repl, + hide_tb=hide_tb, + **_pause_kwargs, + ), + ) + ) + message += ( + f'-> `._pause_from_sync_thread()` started bg task {bg_task}\n' + ) + else: + message += f'-> called from a bg {thread}\n' + # NOTE: since this is a subactor, `._pause()` will + # internally issue a debug request via + # `request_root_stdio_lock()` and we don't need to + # worry about all the special considerations as with + # the root-actor per above. + bg_task, repl = trio.from_thread.run( + afn=partial( + _pause, + debug_func=None, + repl=repl, + hide_tb=hide_tb, - trio.from_thread.run( - partial( - _pause, - debug_func=None, - repl=mdb, - hide_tb=hide_tb, + # XXX to prevent `._pause()` for setting + # `DebugStatus.repl_task` to the gb task! + called_from_sync=True, + called_from_bg_thread=True, - # XXX to prevent `._pause()` for setting - # `DebugStatus.repl_task` to the gb task! - called_from_sync=True, - called_from_bg_thread=True, - - **_pause_kwargs - ), - ) + **_pause_kwargs + ), + ) + assert bg_task is not DebugStatus.repl_task else: # we are presumably the `trio.run()` + main thread - task: trio.Task = current_task() - DebugStatus.repl_task: str = task - greenback.await_( + # raises on not-found by default + greenback: ModuleType = maybe_import_greenback() + message += f'-> imported {greenback}\n' + repl_owner: Task = current_task() + message += '-> calling `greenback.await_(_pause(debug_func=None))` from sync caller..\n' + out = greenback.await_( _pause( debug_func=None, - repl=mdb, + repl=repl, hide_tb=hide_tb, called_from_sync=True, **_pause_kwargs, ) ) + if out: + bg_task, repl = out + assert repl is repl + assert bg_task is repl_owner - if is_root_process(): - # Manually acquire since otherwise on release we'll - # get a RTE raised by `trio` due to ownership.. - Lock._debug_lock.acquire_nowait() - Lock._owned_by_root = True + # NOTE: normally set inside `_enter_repl_sync()` + DebugStatus.repl_task: str = repl_owner # TODO: ensure we aggressively make the user aware about - # entering the global ``breakpoint()`` built-in from sync + # entering the global `breakpoint()` built-in from sync # code? + message += ( + f'-> successfully scheduled `._pause()` in `trio` thread on behalf of {bg_task}\n' + f'-> Entering REPL via `tractor._set_trace()` from caller {repl_owner}\n' + ) + log.devx(message) + + DebugStatus.repl = repl _set_trace( - api_frame=inspect.currentframe(), - repl=mdb, + api_frame=api_frame or inspect.currentframe(), + repl=repl, hide_tb=hide_tb, actor=actor, - task=task, + task=repl_owner, ) # LEGACY NOTE on next LOC's frame showing weirdness.. # @@ -2155,6 +2461,26 @@ def pause_from_sync( raise err +def _sync_pause_from_builtin( + *args, + called_from_builtin=True, + **kwargs, +) -> None: + ''' + Proxy call `.pause_from_sync()` but indicate the caller is the + `breakpoint()` built-in. + + Note: this assigned to `os.environ['PYTHONBREAKPOINT']` inside `._root` + + ''' + pause_from_sync( + *args, + called_from_builtin=True, + api_frame=inspect.currentframe(), + **kwargs, + ) + + # NOTE prefer a new "pause" semantic since it better describes # "pausing the actor's runtime" for this particular # paralell task to do debugging in a REPL. 
@@ -2406,7 +2732,6 @@ async def maybe_wait_for_debugger( and not Lock.req_handler_finished.is_set() and in_debug is not None ): - # caller_frame_info: str = pformat_caller_frame() logmeth( msg @@ -2421,7 +2746,7 @@ async def maybe_wait_for_debugger( with trio.CancelScope(shield=True): await Lock.req_handler_finished.wait() - log.pdb( + log.devx( f'Subactor released debug lock\n' f'|_{in_debug}\n' ) @@ -2453,13 +2778,6 @@ async def maybe_wait_for_debugger( await trio.sleep(poll_delay) continue - # fallthrough on failure to acquire.. - # else: - # raise RuntimeError( - # msg - # + - # 'Root actor failed to acquire debug lock?' - # ) return True # else: -- 2.34.1 From 43a8cf4be1e20e439868bce1d8945e22a14a59c2 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 10 Jun 2024 17:46:10 -0400 Subject: [PATCH 359/378] Make big TODO: for `devx._debug` refinements Hopefully would make grok-ing this fairly sophisticated sub-sys possible for any up-and-coming `tractor` hacker XD A lot of internal API and re-org ideas I discovered/realized as part of finishing the `__pld_spec__` and multi-threaded support. Particularly better isolation between root-actor vs subactor task APIs and generally less globally-state-ful stuff like `DebugStatus` and `Lock` method APIs would likely make a lot of the hard to follow edge cases more clear? --- tractor/devx/_debug.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 3218cffa..5578e8a6 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -95,8 +95,38 @@ log = get_logger(__name__) # TODO: refine the internal impl and APIs in this module! # -# -[ ] separate `._pause()` branch-cases for calling from a root task -# vs. from subactors +# -[ ] rework `._pause()` and it's branch-cases for root vs. +# subactor: +# -[ ] `._pause_from_root()` + `_pause_from_subactor()`? +# -[ ] do the de-factor based on bg-thread usage in +# `.pause_from_sync()` & `_pause_from_bg_root_thread()`. +# -[ ] drop `debug_func == None` case which is confusing af.. +# -[ ] factor out `_enter_repl_sync()` into a util func for calling +# the `_set_trace()` / `_post_mortem()` APIs? +# +# -[ ] figure out if we need `acquire_debug_lock()` and/or re-implement +# it as part of the `.pause_from_sync()` rework per above? +# +# -[ ] pair the `._pause_from_subactor()` impl with a "debug nursery" +# that's dynamically allocated inside the `._rpc` task thus +# avoiding the `._service_n.start()` usage for the IPC request? +# -[ ] see the TODO inside `._rpc._errors_relayed_via_ipc()` +# +# -[ ] impl a `open_debug_request()` which encaps all +# `request_root_stdio_lock()` task scheduling deats +# + `DebugStatus` state mgmt; which should prolly be re-branded as +# a `DebugRequest` type anyway AND with suppoort for bg-thread +# (from root actor) usage? +# +# -[ ] handle the `xonsh` case for bg-root-threads in the SIGINT +# handler! +# -[ ] do we need to do the same for subactors? +# -[ ] make the failing tests finally pass XD +# +# -[ ] simplify `maybe_wait_for_debugger()` to be a root-task only +# API? +# -[ ] currently it's implemented as that so might as well make it +# formal? 
def hide_runtime_frames() -> dict[FunctionType, CodeType]: -- 2.34.1 From a6058d14ae8be5d9de718be5dfcfac4fc399c837 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 10 Jun 2024 17:57:43 -0400 Subject: [PATCH 360/378] Use new `._debug._repl_fail_msg` inside `test_pause_from_sync` --- tests/test_debugger.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_debugger.py b/tests/test_debugger.py index 5f818a60..43dadbb0 100644 --- a/tests/test_debugger.py +++ b/tests/test_debugger.py @@ -12,11 +12,8 @@ TODO: """ from functools import partial import itertools -# from os import path -from typing import Optional import platform import pathlib -# import sys import time import pytest @@ -29,6 +26,7 @@ from pexpect.exceptions import ( from tractor.devx._debug import ( _pause_msg, _crash_msg, + _repl_fail_msg, ) from tractor._testing import ( examples_dir, @@ -293,7 +291,7 @@ def do_ctlc( child, count: int = 3, delay: float = 0.1, - patt: Optional[str] = None, + patt: str|None = None, # expect repl UX to reprint the prompt after every # ctrl-c send. @@ -1306,7 +1304,7 @@ def test_shield_pause( [ _crash_msg, "('cancelled_before_pause'", # actor name - "Failed to engage debugger via `_pause()`", + _repl_fail_msg, "trio.Cancelled", "raise Cancelled._create()", @@ -1324,7 +1322,7 @@ def test_shield_pause( [ _crash_msg, "('root'", # actor name - "Failed to engage debugger via `_pause()`", + _repl_fail_msg, "trio.Cancelled", "raise Cancelled._create()", -- 2.34.1 From 7d4cd8944cd7311277d465c2797ba30d7e720fdf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 10 Jun 2024 19:16:21 -0400 Subject: [PATCH 361/378] Use `_debug._sync_pause_from_builtin()` as `breakpoint()` override --- tractor/_root.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tractor/_root.py b/tractor/_root.py index a01ecd01..203627da 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -119,7 +119,7 @@ async def open_root_actor( ) ): os.environ['PYTHONBREAKPOINT'] = ( - 'tractor.devx._debug.pause_from_sync' + 'tractor.devx._debug._sync_pause_from_builtin' ) _state._runtime_vars['use_greenback'] = True -- 2.34.1 From 7a89b59a3f62dc9d521e70f754169d1f6299ff1c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 11 Jun 2024 20:45:41 -0400 Subject: [PATCH 362/378] Bleh, make `log.devx()` level less then cancel but > `.runtime()` --- tractor/log.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tractor/log.py b/tractor/log.py index 41a910e8..edb058e3 100644 --- a/tractor/log.py +++ b/tractor/log.py @@ -57,8 +57,8 @@ DATE_FORMAT = '%b %d %H:%M:%S' CUSTOM_LEVELS: dict[str, int] = { 'TRANSPORT': 5, 'RUNTIME': 15, - 'CANCEL': 16, 'DEVX': 17, + 'CANCEL': 18, 'PDB': 500, } STD_PALETTE = { @@ -111,7 +111,7 @@ class StackLevelAdapter(LoggerAdapter): ''' return self.log( - level=16, + level=22, msg=msg, # stacklevel=4, ) -- 2.34.1 From d528e7ab4d217b59b96a017e5d05c09287d118a3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 14 Jun 2024 15:27:35 -0400 Subject: [PATCH 363/378] Add `@context(pld_spec=)` TODO list Longer run we don't want `tractor` app devs having to call `msg._ops.limit_plds()` from every child endpoint.. so this starts a list of decorator API ideas and obviously ties in with an ideal final API design that will come with py3.13 and typed funcs. Obviously this is directly fueled by, - https://github.com/goodboy/tractor/issues/365 Other, - type with direct `trio.lowlevel.Task` import. 
- use `log.exception()` to show tbs for all error-terminations in `.open_context()` (for now) and always explicitly mention the `.side`. --- tractor/_context.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index ec64b157..dd14361b 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -58,6 +58,7 @@ from typing import ( import warnings # ------ - ------ import trio +from trio.lowlevel import Task # ------ - ------ from ._exceptions import ( ContextCancelled, @@ -121,7 +122,7 @@ class Unresolved: @dataclass class Context: ''' - An inter-actor, SC transitive, `trio.Task` communication context. + An inter-actor, SC transitive, `Task` communication context. NB: This class should **never be instatiated directly**, it is allocated by the runtime in 2 ways: @@ -134,7 +135,7 @@ class Context: Allows maintaining task or protocol specific state between 2 cancel-scope-linked, communicating and parallel executing - `trio.Task`s. Contexts are allocated on each side of any task + `Task`s. Contexts are allocated on each side of any task RPC-linked msg dialog, i.e. for every request to a remote actor from a `Portal`. On the "callee" side a context is always allocated inside `._rpc._invoke()`. @@ -214,7 +215,7 @@ class Context: # which is exactly the primitive that allows for # cross-actor-task-supervision and thus SC. _scope: trio.CancelScope|None = None - _task: trio.lowlevel.Task|None = None + _task: Task|None = None # TODO: cs around result waiting so we can cancel any # permanently blocking `._rx_chan.receive()` call in @@ -258,14 +259,14 @@ class Context: # a call to `.cancel()` which triggers `ContextCancelled`. _cancel_msg: str|dict|None = None - # NOTE: this state var used by the runtime to determine if the + # NOTE: this state-var is used by the runtime to determine if the # `pdbp` REPL is allowed to engage on contexts terminated via # a `ContextCancelled` due to a call to `.cancel()` triggering # "graceful closure" on either side: # - `._runtime._invoke()` will check this flag before engaging # the crash handler REPL in such cases where the "callee" # raises the cancellation, - # - `.devx._debug.lock_tty_for_child()` will set it to `False` if + # - `.devx._debug.lock_stdio_for_peer()` will set it to `False` if # the global tty-lock has been configured to filter out some # actors from being able to acquire the debugger lock. _enter_debugger_on_cancel: bool = True @@ -861,7 +862,7 @@ class Context: ) -> None: ''' Cancel this inter-actor IPC context by requestng the - remote side's cancel-scope-linked `trio.Task` by calling + remote side's cancel-scope-linked `Task` by calling `._scope.cancel()` and delivering an `ContextCancelled` ack msg in reponse. @@ -1030,7 +1031,7 @@ class Context: # XXX NOTE XXX: `ContextCancelled`/`StreamOverrun` absorption # for "graceful cancellation" case: # - # Whenever a "side" of a context (a `trio.Task` running in + # Whenever a "side" of a context (a `Task` running in # an actor) **is** the side which requested ctx # cancellation (likekly via ``Context.cancel()``), we # **don't** want to re-raise any eventually received @@ -1089,7 +1090,8 @@ class Context: else: log.warning( 'Local error already set for ctx?\n' - f'{self._local_error}\n' + f'{self._local_error}\n\n' + f'{self}' ) return remote_error @@ -2117,8 +2119,9 @@ async def open_context_from_portal( # the `ContextCancelled` "self cancellation absorbed" case # handled in the block above ^^^ !! 
# await _debug.pause() - log.cancel( - 'Context terminated due to\n\n' + # log.cancel( + log.exception( + f'{ctx.side}-side of `Context` terminated with ' f'.outcome => {ctx.repr_outcome()}\n' ) @@ -2319,7 +2322,7 @@ async def open_context_from_portal( # type_only=True, ) log.cancel( - f'Context terminated due to local scope error:\n\n' + f'Context terminated due to local {ctx.side!r}-side error:\n\n' f'{ctx.chan.uid} => {outcome_str}\n' ) @@ -2385,15 +2388,25 @@ def mk_context( # TODO: use the new type-parameters to annotate this in 3.13? # -[ ] https://peps.python.org/pep-0718/#unknown-types +# -[ ] allow for `pld_spec` input(s) ideally breaking down, +# |_ `start: ParameterSpec`, +# |_ `started: TypeAlias`, +# |_ `yields: TypeAlias`, +# |_ `return: TypeAlias`, +# |_ `invalid_policy: str|Callable` ? +# -[ ] prolly implement the `@acm` wrapper using +# a `contextlib.ContextDecorator`? +# def context( func: Callable, ) -> Callable: ''' - Mark an (async) function as an SC-supervised, inter-`Actor`, - child-`trio.Task`, IPC endpoint otherwise known more - colloquially as a (RPC) "context". + Mark an async function as an SC-supervised, inter-`Actor`, RPC + scheduled child-side `Task`, IPC endpoint otherwise + known more colloquially as a (RPC) "context". - Functions annotated the fundamental IPC endpoint type offered by `tractor`. + Functions annotated the fundamental IPC endpoint type offered by + `tractor`. ''' # TODO: apply whatever solution ``mypy`` ends up picking for this: -- 2.34.1 From 418c6907fde313ebd1801af884f10e5fbb1178fd Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 14 Jun 2024 15:37:57 -0400 Subject: [PATCH 364/378] Add `enable_stack_on_sig: bool` for `stackscope` toggle --- tractor/_root.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tractor/_root.py b/tractor/_root.py index 203627da..7cdef601 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -80,6 +80,7 @@ async def open_root_actor( # enables the multi-process debugger support debug_mode: bool = False, maybe_enable_greenback: bool = False, # `.pause_from_sync()/breakpoint()` support + enable_stack_on_sig: bool = False, # internal logging loglevel: str|None = None, @@ -220,7 +221,11 @@ async def open_root_actor( assert _log # TODO: factor this into `.devx._stackscope`!! - if debug_mode: + if ( + debug_mode + and + enable_stack_on_sig + ): try: logger.info('Enabling `stackscope` traces on SIGUSR1') from .devx import enable_stack_on_sig -- 2.34.1 From e6d4ec43b99461c58ff97b50fc5ddf37516dedf4 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Fri, 14 Jun 2024 15:49:30 -0400 Subject: [PATCH 365/378] Log tbs from non-RAE `._invoke()`-RPC-task errors `RemoteActorError`s show this by default in their `.__repr__()`, and we obvi capture and embed the src traceback in an `Error` msg prior to transit, but for logging it's also handy to see the tb of any set `Context._remote_error` on console especially when trying to decipher remote error details at their origin actor. Also improve the log message description using `ctx.repr_state` and show any `ctx.outcome`. 
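For reference, the gist of the new log-message assembly reads roughly
like the standalone sketch below; the helper name and its inputs are
only illustrative (the real logic lives inline in `._invoke()`):

    import traceback
    from tractor._exceptions import RemoteActorError

    def render_ipc_term_report(
        merr: BaseException|None,
        repr_state: str,
        outcome: object,
    ) -> str:
        # always lead with the ctx's state-repr..
        descr: str = f'after having {repr_state!r}\n'
        if merr is None:
            # no error at all -> just report the final result
            return f'IPC context terminated {descr}\nand final result {outcome!r}\n'
        if isinstance(merr, RemoteActorError):
            # RAEs already embed the (boxed) remote tb in their repr
            return f'IPC context terminated {descr}\n{merr!r}\n'
        # any other (non-RAE) error gets its tb rendered explicitly
        # so the origin frames show up on console.
        tb_str: str = ''.join(traceback.format_exception(merr))
        return f'IPC context terminated {descr}\n{merr!r}\n{tb_str}\n'
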
--- tractor/_rpc.py | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/tractor/_rpc.py b/tractor/_rpc.py index fa615772..c9eb8454 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -26,6 +26,7 @@ from contextlib import ( from functools import partial import inspect from pprint import pformat +import traceback from typing import ( Any, Callable, @@ -47,6 +48,7 @@ from ._context import ( ) from ._exceptions import ( ContextCancelled, + RemoteActorError, ModuleNotExposed, MsgTypeError, TransportClosed, @@ -198,7 +200,8 @@ async def _invoke_non_context( raise ipc_err else: log.exception( - f'Failed to respond to runtime RPC request for\n\n' + f'Failed to ack runtime RPC request\n\n' + f'{func} x=> {ctx.chan}\n\n' f'{ack}\n' ) @@ -415,7 +418,6 @@ async def _errors_relayed_via_ipc( async def _invoke( - actor: Actor, cid: str, chan: Channel, @@ -691,10 +693,6 @@ async def _invoke( boxed_type=trio.Cancelled, canceller=canceller, ) - # does this matter other then for - # consistentcy/testing? |_ no user code should be - # in this scope at this point.. - # ctx._local_error = ctxc raise ctxc # XXX: do we ever trigger this block any more? @@ -715,6 +713,11 @@ async def _invoke( # always set this (child) side's exception as the # local error on the context ctx._local_error: BaseException = scope_error + # ^-TODO-^ question, + # does this matter other then for + # consistentcy/testing? + # |_ no user code should be in this scope at this point + # AND we already set this in the block below? # if a remote error was set then likely the # exception group was raised due to that, so @@ -741,22 +744,32 @@ async def _invoke( logmeth: Callable = log.runtime merr: Exception|None = ctx.maybe_error - descr_str: str = 'with final result `{repr(ctx.outcome)}`' - message: str = ( - f'IPC context terminated {descr_str}\n\n' + message: str = 'IPC context terminated ' + descr_str: str = ( + f'after having {ctx.repr_state!r}\n' ) if merr: - descr_str: str = ( - f'with ctx having {ctx.repr_state!r}\n' - f'{ctx.repr_outcome()}\n' - ) + + logmeth: Callable = log.error if isinstance(merr, ContextCancelled): logmeth: Callable = log.runtime - else: - logmeth: Callable = log.error - message += f'\n{merr!r}\n' - logmeth(message) + if not isinstance(merr, RemoteActorError): + tb_str: str = ''.join(traceback.format_exception(merr)) + descr_str += ( + f'\n{merr!r}\n' # needed? + f'{tb_str}\n' + ) + else: + descr_str += f'\n{merr!r}\n' + else: + descr_str += f'\nand final result {ctx.outcome!r}\n' + + logmeth( + message + + + descr_str + ) async def try_ship_error_to_remote( -- 2.34.1 From 5449bd567309b943a0f3317aabf4a9cb95cb0fd3 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Sun, 16 Jun 2024 22:50:43 -0400 Subject: [PATCH 366/378] Offer a `@context(pld_spec=)` API Instead of the WIP/prototyped `Portal.open_context()` offering a `pld_spec` input arg, this changes to a proper decorator API for specifying the "payload spec" on `@context` endpoints. The impl change details actually cover 2-birds: - monkey patch decorated functions with a new `._tractor_context_meta: dict[str, Any]` and insert any provided input `@context` kwargs: `_pld_spec`, `enc_hook`, `enc_hook`. - use `inspect.get_annotations()` to scan for a `func` arg type-annotated with `tractor.Context` and use the name of that arg as the RPC task-side injected `Context`, thus injecting the needed arg by type instead of by name (a longstanding TODO); raise a type-error when not found. 
- pull the `pld_spec` from the `._tractor_context_meta` attr both in the `.open_context()` parent-side and child-side `._invoke()`-cation of the RPC task and use the `msg._ops.maybe_limit_plds()` API to apply it internally in the runtime for each case. --- tractor/_context.py | 89 +++++++++++++++++++++++++++++++++++---------- tractor/_rpc.py | 25 +++++++++++-- 2 files changed, 92 insertions(+), 22 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index dd14361b..f5d9d69e 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -1792,7 +1792,6 @@ async def open_context_from_portal( portal: Portal, func: Callable, - pld_spec: TypeAlias|None = None, allow_overruns: bool = False, hide_tb: bool = True, @@ -1838,12 +1837,20 @@ async def open_context_from_portal( # NOTE: 2 bc of the wrapping `@acm` __runtimeframe__: int = 2 # noqa - # conduct target func method structural checks - if not inspect.iscoroutinefunction(func) and ( - getattr(func, '_tractor_contex_function', False) + # if NOT an async func but decorated with `@context`, error. + if ( + not inspect.iscoroutinefunction(func) + and getattr(func, '_tractor_context_meta', False) ): raise TypeError( - f'{func} must be an async generator function!') + f'{func!r} must be an async function!' + ) + + ctx_meta: dict[str, Any]|None = getattr( + func, + '_tractor_context_meta', + None, + ) # TODO: i think from here onward should probably # just be factored into an `@acm` inside a new @@ -1890,12 +1897,9 @@ async def open_context_from_portal( trio.open_nursery() as tn, msgops.maybe_limit_plds( ctx=ctx, - spec=pld_spec, - ) as maybe_msgdec, + spec=ctx_meta.get('pld_spec'), + ), ): - if maybe_msgdec: - assert maybe_msgdec.pld_spec == pld_spec - # NOTE: this in an implicit runtime nursery used to, # - start overrun queuing tasks when as well as # for cancellation of the scope opened by the user. @@ -2398,7 +2402,15 @@ def mk_context( # a `contextlib.ContextDecorator`? # def context( - func: Callable, + func: Callable|None = None, + + *, + + # must be named! + pld_spec: Union[Type]|TypeAlias = Any, + dec_hook: Callable|None = None, + enc_hook: Callable|None = None, + ) -> Callable: ''' Mark an async function as an SC-supervised, inter-`Actor`, RPC @@ -2409,15 +2421,54 @@ def context( `tractor`. ''' + # XXX for the `@context(pld_spec=MyMsg|None)` case + if func is None: + return partial( + context, + pld_spec=pld_spec, + dec_hook=dec_hook, + enc_hook=enc_hook, + ) + + # TODO: from this, enforcing a `Start.sig` type + # check when invoking RPC tasks by ensuring the input + # args validate against the endpoint def. 
+ sig: inspect.Signature = inspect.signature(func) + # params: inspect.Parameters = sig.parameters + + # https://docs.python.org/3/library/inspect.html#inspect.get_annotations + annots: dict[str, Type] = inspect.get_annotations( + func, + eval_str=True, + ) + name: str + param: Type + for name, param in annots.items(): + if param is Context: + ctx_var_name: str = name + break + else: + raise TypeError( + 'At least one (normally the first) argument to the `@context` function ' + f'{func.__name__!r} must be typed as `tractor.Context`, for ex,\n\n' + f'`ctx: tractor.Context`\n' + ) + # TODO: apply whatever solution ``mypy`` ends up picking for this: # https://github.com/python/mypy/issues/2087#issuecomment-769266912 - func._tractor_context_function = True # type: ignore + # func._tractor_context_function = True # type: ignore + func._tractor_context_meta: dict[str, Any] = { + 'ctx_var_name': ctx_var_name, + # `msgspec` related settings + 'pld_spec': pld_spec, + 'enc_hook': enc_hook, + 'dec_hook': dec_hook, - sig: inspect.Signature = inspect.signature(func) - params: Mapping = sig.parameters - if 'ctx' not in params: - raise TypeError( - "The first argument to the context function " - f"{func.__name__} must be `ctx: tractor.Context`" - ) + # TODO: eventually we need to "signature-check" with these + # vs. the `Start` msg fields! + # => this would allow for TPC endpoint argument-type-spec + # limiting and we could then error on + # invalid inputs passed to `.open_context(rpc_ep, arg0='blah')` + 'sig': sig, + } return func diff --git a/tractor/_rpc.py b/tractor/_rpc.py index c9eb8454..166ee96a 100644 --- a/tractor/_rpc.py +++ b/tractor/_rpc.py @@ -69,6 +69,7 @@ from .msg import ( PayloadT, NamespacePath, pretty_struct, + _ops as msgops, ) from tractor.msg.types import ( CancelAck, @@ -500,8 +501,19 @@ async def _invoke( # handle decorated ``@tractor.context`` async function - elif getattr(func, '_tractor_context_function', False): - kwargs['ctx'] = ctx + # - pull out any typed-pld-spec info and apply (below) + # - (TODO) store func-ref meta data for API-frame-info logging + elif ( + ctx_meta := getattr( + func, + '_tractor_context_meta', + False, + ) + ): + # kwargs['ctx'] = ctx + # set the required `tractor.Context` typed input argument to + # the allocated RPC task context. + kwargs[ctx_meta['ctx_var_name']] = ctx context_ep_func = True # errors raised inside this block are propgated back to caller @@ -595,7 +607,14 @@ async def _invoke( # `@context` marked RPC function. # - `._portal` is never set. try: - async with trio.open_nursery() as tn: + async with ( + trio.open_nursery() as tn, + msgops.maybe_limit_plds( + ctx=ctx, + spec=ctx_meta.get('pld_spec'), + dec_hook=ctx_meta.get('dec_hook'), + ), + ): ctx._scope_nursery = tn ctx._scope = tn.cancel_scope task_status.started(ctx) -- 2.34.1 From a0ee0cc7135cc5f8934603b6d5ad70e6836b69df Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Jun 2024 09:01:13 -0400 Subject: [PATCH 367/378] Port debug request ep to use `@context(pld_spec)` Namely passing the `.__pld_spec__` directly to the `lock_stdio_for_peer()` decorator B) Also, allows dropping `apply_debug_pldec()` (which was a todo) and removing a `lock_stdio_for_peer()` indent level. 
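For the general shape this decorator-level spec-ing takes from an
app-dev's PoV, see the sketch below; the endpoint and payload type are
made up purely for illustration, the real in-tree case being
`lock_stdio_for_peer()` taking `pld_spec=__pld_spec__`:

    from msgspec import Struct
    import tractor
    from tractor import Context


    class Ping(Struct):
        field: str


    @tractor.context(
        # applied by the runtime on BOTH sides of the IPC ctx so no
        # manual `msg._ops.limit_plds()` block is needed in the body.
        pld_spec=Ping|None,
    )
    async def echo_ping(
        ctx: Context,  # injected by-type (not by-name) per the new deco
        msg: str,
    ) -> None:
        await ctx.started(Ping(field=msg))

The parent side's `.open_context()` then picks up the same spec from
the func's `._tractor_context_meta` automatically.
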
--- tractor/devx/_debug.py | 150 +++++++++++++++-------------------------- 1 file changed, 54 insertions(+), 96 deletions(-) diff --git a/tractor/devx/_debug.py b/tractor/devx/_debug.py index 5578e8a6..ccf57d62 100644 --- a/tractor/devx/_debug.py +++ b/tractor/devx/_debug.py @@ -68,6 +68,7 @@ from trio import ( ) import tractor from tractor.log import get_logger +from tractor._context import Context from tractor._state import ( current_actor, is_root_process, @@ -83,7 +84,6 @@ if TYPE_CHECKING: from trio.lowlevel import Task from threading import Thread from tractor._ipc import Channel - from tractor._context import Context from tractor._runtime import ( Actor, ) @@ -529,7 +529,10 @@ class Lock: ) -@tractor.context +@tractor.context( + # enable the locking msgspec + pld_spec=__pld_spec__, +) async def lock_stdio_for_peer( ctx: Context, subactor_task_uid: tuple[str, int], @@ -597,61 +600,55 @@ async def lock_stdio_for_peer( # scope despite the shielding we apply below. debug_lock_cs: CancelScope = ctx._scope - # TODO: use `.msg._ops.maybe_limit_plds()` here instead so we - # can merge into a single async with, with the - # `Lock.acquire_for_ctx()` enter below? - # - # enable the locking msgspec - with apply_debug_pldec(): - async with Lock.acquire_for_ctx(ctx=ctx): - debug_lock_cs.shield = True + async with Lock.acquire_for_ctx(ctx=ctx): + debug_lock_cs.shield = True - log.devx( - 'Subactor acquired debugger request lock!\n' - f'root task: {root_task_name}\n' - f'subactor_uid: {subactor_uid}\n' - f'remote task: {subactor_task_uid}\n\n' + log.devx( + 'Subactor acquired debugger request lock!\n' + f'root task: {root_task_name}\n' + f'subactor_uid: {subactor_uid}\n' + f'remote task: {subactor_task_uid}\n\n' - 'Sending `ctx.started(LockStatus)`..\n' + 'Sending `ctx.started(LockStatus)`..\n' - ) - - # indicate to child that we've locked stdio - await ctx.started( - LockStatus( - subactor_uid=subactor_uid, - cid=ctx.cid, - locked=True, - ) - ) - - log.devx( - f'Actor {subactor_uid} acquired `Lock` via debugger request' - ) - - # wait for unlock pdb by child - async with ctx.open_stream() as stream: - release_msg: LockRelease = await stream.receive() - - # TODO: security around only releasing if - # these match? - log.devx( - f'TTY lock released requested\n\n' - f'{release_msg}\n' - ) - assert release_msg.cid == ctx.cid - assert release_msg.subactor_uid == tuple(subactor_uid) - - log.devx( - f'Actor {subactor_uid} released TTY lock' - ) - - return LockStatus( - subactor_uid=subactor_uid, - cid=ctx.cid, - locked=False, ) + # indicate to child that we've locked stdio + await ctx.started( + LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=True, + ) + ) + + log.devx( + f'Actor {subactor_uid} acquired `Lock` via debugger request' + ) + + # wait for unlock pdb by child + async with ctx.open_stream() as stream: + release_msg: LockRelease = await stream.receive() + + # TODO: security around only releasing if + # these match? 
+ log.devx( + f'TTY lock released requested\n\n' + f'{release_msg}\n' + ) + assert release_msg.cid == ctx.cid + assert release_msg.subactor_uid == tuple(subactor_uid) + + log.devx( + f'Actor {subactor_uid} released TTY lock' + ) + + return LockStatus( + subactor_uid=subactor_uid, + cid=ctx.cid, + locked=False, + ) + except BaseException as req_err: message: str = ( f'On behalf of remote peer {subactor_task_uid!r}@{ctx.chan.uid!r}\n\n' @@ -1037,48 +1034,6 @@ class PdbREPL(pdbp.Pdb): return None -# TODO: prolly remove this and instead finally get our @context API -# supporting a msg/pld-spec via type annots as per, -# https://github.com/goodboy/tractor/issues/365 -@cm -def apply_debug_pldec() -> _codec.MsgCodec: - ''' - Apply the subactor TTY `Lock`-ing protocol's msgspec temporarily - (only in the current task). - - ''' - from tractor.msg import ( - _ops as msgops, - ) - cctx: Context = current_ipc_ctx() - rx: msgops.PldRx = cctx.pld_rx - orig_pldec: msgops.MsgDec = rx.pld_dec - - try: - with msgops.limit_plds( - spec=__pld_spec__, - ) as debug_dec: - assert ( - debug_dec - is - rx.pld_dec - ) - log.runtime( - 'Applied `.devx._debug` pld-spec\n\n' - f'{debug_dec}\n' - ) - yield debug_dec - - finally: - assert ( - rx.pld_dec is orig_pldec - ) - log.runtime( - 'Reverted to previous pld-spec\n\n' - f'{orig_pldec}\n' - ) - - async def request_root_stdio_lock( actor_uid: tuple[str, str], task_uid: tuple[str, int], @@ -1147,6 +1102,7 @@ async def request_root_stdio_lock( async with portal.open_context( lock_stdio_for_peer, subactor_task_uid=task_uid, + # NOTE: set it here in the locker request task bc it's # possible for multiple such requests for the lock in any # single sub-actor AND there will be a race between when the @@ -1159,7 +1115,7 @@ async def request_root_stdio_lock( # this IPC-ctx request task, NOT any other task(s) # including the one that actually enters the REPL. This # is oc desired bc ow the debugged task will msg-type-error. - pld_spec=__pld_spec__, + # pld_spec=__pld_spec__, ) as (req_ctx, status): @@ -2856,7 +2812,9 @@ def open_crash_handler( @cm -def maybe_open_crash_handler(pdb: bool = False): +def maybe_open_crash_handler( + pdb: bool = False, +): ''' Same as `open_crash_handler()` but with bool input flag to allow conditional handling. 
-- 2.34.1 From 04bd111037bcbe02629b05f63bc98fd9b29c629e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Jun 2024 09:23:31 -0400 Subject: [PATCH 368/378] Proxy through `dec_hook` in `.limit_plds()` APIs --- tractor/msg/_ops.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 86f80395..80633e7e 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -27,6 +27,7 @@ from contextlib import ( ) from typing import ( Any, + Callable, Type, TYPE_CHECKING, Union, @@ -138,6 +139,7 @@ class PldRx(Struct): def limit_plds( self, spec: Union[Type[Struct]], + **dec_kwargs, ) -> MsgDec: ''' @@ -147,7 +149,10 @@ class PldRx(Struct): ''' orig_dec: MsgDec = self._pld_dec - limit_dec: MsgDec = mk_dec(spec=spec) + limit_dec: MsgDec = mk_dec( + spec=spec, + **dec_kwargs, + ) try: self._pld_dec = limit_dec yield limit_dec @@ -449,7 +454,7 @@ class PldRx(Struct): @cm def limit_plds( spec: Union[Type[Struct]], - **kwargs, + **dec_kwargs, ) -> MsgDec: ''' @@ -467,7 +472,7 @@ def limit_plds( with rx.limit_plds( spec=spec, - **kwargs, + **dec_kwargs, ) as pldec: log.runtime( 'Applying payload-decoder\n\n' @@ -487,7 +492,9 @@ def limit_plds( async def maybe_limit_plds( ctx: Context, spec: Union[Type[Struct]]|None = None, + dec_hook: Callable|None = None, **kwargs, + ) -> MsgDec|None: ''' Async compat maybe-payload type limiter. @@ -497,7 +504,11 @@ async def maybe_limit_plds( used. ''' - if spec is None: + if ( + spec is None + and + dec_hook is None + ): yield None return @@ -505,7 +516,11 @@ async def maybe_limit_plds( curr_ctx: Context = current_ipc_ctx() assert ctx is curr_ctx - with ctx._pld_rx.limit_plds(spec=spec) as msgdec: + with ctx._pld_rx.limit_plds( + spec=spec, + dec_hook=dec_hook, + **kwargs, + ) as msgdec: yield msgdec curr_ctx: Context = current_ipc_ctx() -- 2.34.1 From affc210033248f13c2d17b12c2a02db7c51d965e Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Jun 2024 09:24:03 -0400 Subject: [PATCH 369/378] Update pld-rx limiting test(s) to use deco input The tests only use one input spec (conveniently) so there's not much to change in the logic, - only pass the `maybe_msg_spec` to the child-side decorator and obvi drop the surrounding `msgops.limit_plds()` block in the child. - tweak a few `MsgDec` asserts, mostly dropping the `msg._ops._def_any_spec` state checks since the child-side won't have any pre pld-spec state given the runtime now applies the `pld_spec` before running the task's func body. - also allowed dropping the `finally:` which did a similar check outside the `.limit_plds()` block. --- tests/test_pldrx_limiting.py | 203 +++++++++++++++++------------------ 1 file changed, 99 insertions(+), 104 deletions(-) diff --git a/tests/test_pldrx_limiting.py b/tests/test_pldrx_limiting.py index ddf2a234..e5ce691a 100644 --- a/tests/test_pldrx_limiting.py +++ b/tests/test_pldrx_limiting.py @@ -7,9 +7,6 @@ related settings around IPC contexts. from contextlib import ( asynccontextmanager as acm, ) -from contextvars import ( - Context, -) from msgspec import ( Struct, @@ -19,6 +16,7 @@ import trio import tractor from tractor import ( + Context, MsgTypeError, current_ipc_ctx, Portal, @@ -35,7 +33,17 @@ from tractor.msg.types import ( ) -class PldMsg(Struct): +class PldMsg( + Struct, + + # TODO: with multiple structs in-spec we need to tag them! + # -[ ] offer a built-in `PldMsg` type to inherit from which takes + # case of these details? 
+ # + # https://jcristharif.com/msgspec/structs.html#tagged-unions + # tag=True, + # tag_field='msg_type', +): field: str @@ -96,12 +104,14 @@ async def maybe_expect_raises( ) -@tractor.context +@tractor.context( + pld_spec=maybe_msg_spec, +) async def child( ctx: Context, started_value: int|PldMsg|None, return_value: str|None, - validate_pld_spec: bool, + validate_pld_spec: bool, raise_on_started_mte: bool = True, ) -> None: @@ -116,113 +126,99 @@ async def child( assert ctx is curr_ctx rx: msgops.PldRx = ctx._pld_rx - orig_pldec: _codec.MsgDec = rx.pld_dec - # senity that default pld-spec should be set - assert ( - rx.pld_dec - is - msgops._def_any_pldec + curr_pldec: _codec.MsgDec = rx.pld_dec + + ctx_meta: dict = getattr( + child, + '_tractor_context_meta', + None, ) + if ctx_meta: + assert ( + ctx_meta['pld_spec'] + is curr_pldec.spec + is curr_pldec.pld_spec + ) + # 2 cases: hdndle send-side and recv-only validation + # - when `raise_on_started_mte == True`, send validate + # - else, parent-recv-side only validation + mte: MsgTypeError|None = None try: - with msgops.limit_plds( - spec=maybe_msg_spec, - ) as pldec: - # sanity on `MsgDec` state - assert rx.pld_dec is pldec - assert pldec.spec is maybe_msg_spec + await ctx.started( + value=started_value, + validate_pld_spec=validate_pld_spec, + ) - # 2 cases: hdndle send-side and recv-only validation - # - when `raise_on_started_mte == True`, send validate - # - else, parent-recv-side only validation - mte: MsgTypeError|None = None - try: - await ctx.started( - value=started_value, - validate_pld_spec=validate_pld_spec, - ) - - except MsgTypeError as _mte: - mte = _mte - log.exception('started()` raised an MTE!\n') - if not expect_started_mte: - raise RuntimeError( - 'Child-ctx-task SHOULD NOT HAVE raised an MTE for\n\n' - f'{started_value!r}\n' - ) - - boxed_div: str = '------ - ------' - assert boxed_div not in mte._message - assert boxed_div not in mte.tb_str - assert boxed_div not in repr(mte) - assert boxed_div not in str(mte) - mte_repr: str = repr(mte) - for line in mte.message.splitlines(): - assert line in mte_repr - - # since this is a *local error* there should be no - # boxed traceback content! - assert not mte.tb_str - - # propagate to parent? - if raise_on_started_mte: - raise - - # no-send-side-error fallthrough - if ( - validate_pld_spec - and - expect_started_mte - ): - raise RuntimeError( - 'Child-ctx-task SHOULD HAVE raised an MTE for\n\n' - f'{started_value!r}\n' - ) - - assert ( - not expect_started_mte - or - not validate_pld_spec + except MsgTypeError as _mte: + mte = _mte + log.exception('started()` raised an MTE!\n') + if not expect_started_mte: + raise RuntimeError( + 'Child-ctx-task SHOULD NOT HAVE raised an MTE for\n\n' + f'{started_value!r}\n' ) - # if wait_for_parent_to_cancel: - # ... - # - # ^-TODO-^ logic for diff validation policies on each side: - # - # -[ ] ensure that if we don't validate on the send - # side, that we are eventually error-cancelled by our - # parent due to the bad `Started` payload! - # -[ ] the boxed error should be srced from the parent's - # runtime NOT ours! - # -[ ] we should still error on bad `return_value`s - # despite the parent not yet error-cancelling us? - # |_ how do we want the parent side to look in that - # case? - # -[ ] maybe the equiv of "during handling of the - # above error another occurred" for the case where - # the parent sends a MTE to this child and while - # waiting for the child to terminate it gets back - # the MTE for this case? 
- # + boxed_div: str = '------ - ------' + assert boxed_div not in mte._message + assert boxed_div not in mte.tb_str + assert boxed_div not in repr(mte) + assert boxed_div not in str(mte) + mte_repr: str = repr(mte) + for line in mte.message.splitlines(): + assert line in mte_repr - # XXX should always fail on recv side since we can't - # really do much else beside terminate and relay the - # msg-type-error from this RPC task ;) - return return_value + # since this is a *local error* there should be no + # boxed traceback content! + assert not mte.tb_str - finally: - # sanity on `limit_plds()` reversion - assert ( - rx.pld_dec - is - msgops._def_any_pldec - ) - log.runtime( - 'Reverted to previous pld-spec\n\n' - f'{orig_pldec}\n' + # propagate to parent? + if raise_on_started_mte: + raise + + # no-send-side-error fallthrough + if ( + validate_pld_spec + and + expect_started_mte + ): + raise RuntimeError( + 'Child-ctx-task SHOULD HAVE raised an MTE for\n\n' + f'{started_value!r}\n' ) + assert ( + not expect_started_mte + or + not validate_pld_spec + ) + + # if wait_for_parent_to_cancel: + # ... + # + # ^-TODO-^ logic for diff validation policies on each side: + # + # -[ ] ensure that if we don't validate on the send + # side, that we are eventually error-cancelled by our + # parent due to the bad `Started` payload! + # -[ ] the boxed error should be srced from the parent's + # runtime NOT ours! + # -[ ] we should still error on bad `return_value`s + # despite the parent not yet error-cancelling us? + # |_ how do we want the parent side to look in that + # case? + # -[ ] maybe the equiv of "during handling of the + # above error another occurred" for the case where + # the parent sends a MTE to this child and while + # waiting for the child to terminate it gets back + # the MTE for this case? + # + + # XXX should always fail on recv side since we can't + # really do much else beside terminate and relay the + # msg-type-error from this RPC task ;) + return return_value + @pytest.mark.parametrize( 'return_value', @@ -321,7 +317,6 @@ def test_basic_payload_spec( child, return_value=return_value, started_value=started_value, - pld_spec=maybe_msg_spec, validate_pld_spec=pld_check_started_value, ) as (ctx, first), ): -- 2.34.1 From 872feef24b50eccf6206f928e4ff564f9ac2bd61 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Jun 2024 10:32:38 -0400 Subject: [PATCH 370/378] Add note about using `@acm` as decorator in 3.10 --- tractor/_context.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tractor/_context.py b/tractor/_context.py index f5d9d69e..3b297616 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -2399,7 +2399,11 @@ def mk_context( # |_ `return: TypeAlias`, # |_ `invalid_policy: str|Callable` ? # -[ ] prolly implement the `@acm` wrapper using -# a `contextlib.ContextDecorator`? +# a `contextlib.ContextDecorator`, i guess not if +# we don't need an `__aexit__` block right? 
+# |_ de hecho, @acm can already be used as a decorator as of 3.10 +# but i dunno how that's gonna play with `trio.Nursery.start[_soon]()` +# |_ https://docs.python.org/3/library/contextlib.html#using-a-context-manager-as-a-function-decorator # def context( func: Callable|None = None, -- 2.34.1 From 8477919fc92b9811ec10bb9931da184319fa4d4c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Jun 2024 10:32:50 -0400 Subject: [PATCH 371/378] Don't pass `ipc_msg` for send side MTEs Just pass `_bad_msg` such that it get's injected to `.msgdata` since with a send-side `MsgTypeError` we don't have a remote `._ipc_msg: Error` per say to include. --- tractor/_exceptions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 92c3fafb..8ed46ebc 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -441,7 +441,8 @@ class RemoteActorError(Exception): for key in fields: if ( - key == 'relay_uid' and not self.is_inception() + key == 'relay_uid' + and not self.is_inception() ): continue @@ -1291,8 +1292,7 @@ def _mk_msg_type_err( msgtyperr = MsgTypeError( message=message, - ipc_msg=msg, - bad_msg=msg, + _bad_msg=msg, ) # ya, might be `None` msgtyperr.__cause__ = src_type_error -- 2.34.1 From 711f639fc5d7dbafc72fd6b04a34a23824591a84 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Mon, 17 Jun 2024 13:12:16 -0400 Subject: [PATCH 372/378] Break `_mk_msg_type_err()` into recv/send side funcs Name them `_mk_send_mte()`/`_mk_recv_mte()` and change the runtime to call each appropriately depending on location/usage. Also add some dynamic call-frame "unhide" blocks such that when we expect raised MTE from the aboves calls but we get a different unexpected error from the runtime, we ensure the call stack downward is shown in tbs/pdb. |_ ideally in the longer run we come up with a fancier dynamic sys for this, prolly something in `.devx._frame_stack`? --- tractor/_context.py | 30 ++--- tractor/_exceptions.py | 258 ++++++++++++++++++++++------------------- tractor/_ipc.py | 12 +- tractor/msg/_ops.py | 36 +++--- 4 files changed, 174 insertions(+), 162 deletions(-) diff --git a/tractor/_context.py b/tractor/_context.py index 3b297616..32acf831 100644 --- a/tractor/_context.py +++ b/tractor/_context.py @@ -49,7 +49,6 @@ from typing import ( Any, AsyncGenerator, Callable, - Mapping, Type, TypeAlias, TYPE_CHECKING, @@ -1484,13 +1483,21 @@ class Context: # __tracebackhide__: bool = hide_tb if validate_pld_spec: - msgops.validate_payload_msg( - pld_msg=started_msg, - pld_value=value, - ipc=self, - strict_pld_parity=strict_pld_parity, - hide_tb=hide_tb, - ) + # TODO: prolly wrap this as a `show_frame_when_not()` + try: + msgops.validate_payload_msg( + pld_msg=started_msg, + pld_value=value, + ipc=self, + strict_pld_parity=strict_pld_parity, + hide_tb=hide_tb, + ) + except BaseException as err: + if not isinstance(err, MsgTypeError): + __tracebackhide__: bool = False + + raise + # TODO: maybe a flag to by-pass encode op if already done # here in caller? @@ -2185,11 +2192,6 @@ async def open_context_from_portal( try: result_or_err: Exception|Any = await ctx.result() except BaseException as berr: - # cancelled before (or maybe during?) 
final result capture - # if isinstance(trio.Cancelled, berr): - # from .devx import mk_pdb - # mk_pdb.set_trace() - # on normal teardown, if we get some error # raised in `Context.result()` we still want to # save that error on the ctx's state to @@ -2201,7 +2203,7 @@ async def open_context_from_portal( ctx._local_error: BaseException = scope_err raise - # yes! this worx Bp + # yes this worx! # from .devx import _debug # await _debug.pause() diff --git a/tractor/_exceptions.py b/tractor/_exceptions.py index 8ed46ebc..7164d6ab 100644 --- a/tractor/_exceptions.py +++ b/tractor/_exceptions.py @@ -1232,14 +1232,13 @@ def _raise_from_unexpected_msg( _raise_from_no_key_in_msg = _raise_from_unexpected_msg -def _mk_msg_type_err( +def _mk_send_mte( msg: Any|bytes|MsgType, codec: MsgCodec|MsgDec, message: str|None = None, verb_header: str = '', - src_validation_error: ValidationError|None = None, src_type_error: TypeError|None = None, is_invalid_payload: bool = False, @@ -1247,131 +1246,148 @@ def _mk_msg_type_err( ) -> MsgTypeError: ''' - Compose a `MsgTypeError` from an input runtime context. + Compose a `MsgTypeError` from a `Channel.send()`-side error, + normally raised witih a runtime IPC `Context`. ''' - # `Channel.send()` case - if src_validation_error is None: + if isinstance(codec, MsgDec): + raise RuntimeError( + '`codec` must be a `MsgCodec` for send-side errors?' + ) - if isinstance(codec, MsgDec): - raise RuntimeError( - '`codec` must be a `MsgCodec` for send-side errors?' + from tractor.devx import ( + pformat_caller_frame, + ) + # no src error from `msgspec.msgpack.Decoder.decode()` so + # prolly a manual type-check on our part. + if message is None: + tb_fmt: str = pformat_caller_frame(stack_limit=3) + message: str = ( + f'invalid msg -> {msg}: {type(msg)}\n\n' + f'{tb_fmt}\n' + f'Valid IPC msgs are:\n\n' + f'{codec.msg_spec_str}\n', + ) + elif src_type_error: + src_message: str = str(src_type_error) + patt: str = 'type ' + type_idx: int = src_message.find('type ') + invalid_type: str = src_message[type_idx + len(patt):].split()[0] + + enc_hook: Callable|None = codec.enc.enc_hook + if enc_hook is None: + message += ( + '\n\n' + + f"The current IPC-msg codec can't encode type `{invalid_type}` !\n" + f'Maybe a `msgpack.Encoder.enc_hook()` extension is needed?\n\n' + + f'Check the `msgspec` docs for ad-hoc type extending:\n' + '|_ https://jcristharif.com/msgspec/extending.html\n' + '|_ https://jcristharif.com/msgspec/extending.html#defining-a-custom-extension-messagepack-only\n' ) - from tractor.devx import ( - pformat_caller_frame, + msgtyperr = MsgTypeError( + message=message, + _bad_msg=msg, + ) + # ya, might be `None` + msgtyperr.__cause__ = src_type_error + return msgtyperr + + +def _mk_recv_mte( + msg: Any|bytes|MsgType, + codec: MsgCodec|MsgDec, + + message: str|None = None, + verb_header: str = '', + + src_validation_error: ValidationError|None = None, + is_invalid_payload: bool = False, + + **mte_kwargs, + +) -> MsgTypeError: + ''' + Compose a `MsgTypeError` from a + `Channel|Context|MsgStream.receive()`-side error, + normally raised witih a runtime IPC ctx or streaming + block. 
+ + ''' + msg_dict: dict|None = None + bad_msg: PayloadMsg|None = None + + if is_invalid_payload: + msg_type: str = type(msg) + any_pld: Any = msgpack.decode(msg.pld) + message: str = ( + f'invalid `{msg_type.__qualname__}` msg payload\n\n' + f'value: `{any_pld!r}` does not match type-spec: ' + f'`{type(msg).__qualname__}.pld: {codec.pld_spec_str}`' ) - # no src error from `msgspec.msgpack.Decoder.decode()` so - # prolly a manual type-check on our part. - if message is None: - tb_fmt: str = pformat_caller_frame(stack_limit=3) - message: str = ( - f'invalid msg -> {msg}: {type(msg)}\n\n' - f'{tb_fmt}\n' - f'Valid IPC msgs are:\n\n' - f'{codec.msg_spec_str}\n', - ) - elif src_type_error: - src_message: str = str(src_type_error) - patt: str = 'type ' - type_idx: int = src_message.find('type ') - invalid_type: str = src_message[type_idx + len(patt):].split()[0] + bad_msg = msg - enc_hook: Callable|None = codec.enc.enc_hook - if enc_hook is None: - message += ( - '\n\n' - - f"The current IPC-msg codec can't encode type `{invalid_type}` !\n" - f'Maybe a `msgpack.Encoder.enc_hook()` extension is needed?\n\n' - - f'Check the `msgspec` docs for ad-hoc type extending:\n' - '|_ https://jcristharif.com/msgspec/extending.html\n' - '|_ https://jcristharif.com/msgspec/extending.html#defining-a-custom-extension-messagepack-only\n' - ) - - msgtyperr = MsgTypeError( - message=message, - _bad_msg=msg, - ) - # ya, might be `None` - msgtyperr.__cause__ = src_type_error - return msgtyperr - - # `Channel.recv()` case else: - msg_dict: dict|None = None - bad_msg: PayloadMsg|None = None - - if is_invalid_payload: - msg_type: str = type(msg) - any_pld: Any = msgpack.decode(msg.pld) - message: str = ( - f'invalid `{msg_type.__qualname__}` msg payload\n\n' - f'value: `{any_pld!r}` does not match type-spec: ' - f'`{type(msg).__qualname__}.pld: {codec.pld_spec_str}`' - ) - bad_msg = msg - - else: - # decode the msg-bytes using the std msgpack - # interchange-prot (i.e. without any `msgspec.Struct` - # handling) so that we can determine what - # `.msg.types.PayloadMsg` is the culprit by reporting the - # received value. - msg: bytes - msg_dict: dict = msgpack.decode(msg) - msg_type_name: str = msg_dict['msg_type'] - msg_type = getattr(msgtypes, msg_type_name) - message: str = ( - f'invalid `{msg_type_name}` IPC msg\n\n' - ) - # XXX be "fancy" and see if we can determine the exact - # invalid field such that we can comprehensively report - # the specific field's type problem. - msgspec_msg: str = src_validation_error.args[0].rstrip('`') - msg, _, maybe_field = msgspec_msg.rpartition('$.') - obj = object() - if (field_val := msg_dict.get(maybe_field, obj)) is not obj: - field_name_expr: str = ( - f' |_{maybe_field}: {codec.pld_spec_str} = ' - ) - fmt_val_lines: list[str] = pformat(field_val).splitlines() - fmt_val: str = ( - f'{fmt_val_lines[0]}\n' - + - textwrap.indent( - '\n'.join(fmt_val_lines[1:]), - prefix=' '*len(field_name_expr), - ) - ) - message += ( - f'{msg.rstrip("`")}\n\n' - f'<{msg_type.__qualname__}(\n' - # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' - f'{field_name_expr}{fmt_val}\n' - f')>' - ) - - if verb_header: - message = f'{verb_header} ' + message - - msgtyperr = MsgTypeError.from_decode( - message=message, - bad_msg=bad_msg, - bad_msg_as_dict=msg_dict, - boxed_type=type(src_validation_error), - - # NOTE: for pld-spec MTEs we set the `._ipc_msg` manually: - # - for the send-side `.started()` pld-validate - # case we actually raise inline so we don't need to - # set the it at all. 
- # - for recv side we set it inside `PldRx.decode_pld()` - # after a manual call to `pack_error()` since we - # actually want to emulate the `Error` from the mte we - # build here. So by default in that case, this is left - # as `None` here. - # ipc_msg=src_err_msg, + # decode the msg-bytes using the std msgpack + # interchange-prot (i.e. without any `msgspec.Struct` + # handling) so that we can determine what + # `.msg.types.PayloadMsg` is the culprit by reporting the + # received value. + msg: bytes + msg_dict: dict = msgpack.decode(msg) + msg_type_name: str = msg_dict['msg_type'] + msg_type = getattr(msgtypes, msg_type_name) + message: str = ( + f'invalid `{msg_type_name}` IPC msg\n\n' ) - msgtyperr.__cause__ = src_validation_error - return msgtyperr + # XXX be "fancy" and see if we can determine the exact + # invalid field such that we can comprehensively report + # the specific field's type problem. + msgspec_msg: str = src_validation_error.args[0].rstrip('`') + msg, _, maybe_field = msgspec_msg.rpartition('$.') + obj = object() + if (field_val := msg_dict.get(maybe_field, obj)) is not obj: + field_name_expr: str = ( + f' |_{maybe_field}: {codec.pld_spec_str} = ' + ) + fmt_val_lines: list[str] = pformat(field_val).splitlines() + fmt_val: str = ( + f'{fmt_val_lines[0]}\n' + + + textwrap.indent( + '\n'.join(fmt_val_lines[1:]), + prefix=' '*len(field_name_expr), + ) + ) + message += ( + f'{msg.rstrip("`")}\n\n' + f'<{msg_type.__qualname__}(\n' + # f'{".".join([msg_type.__module__, msg_type.__qualname__])}\n' + f'{field_name_expr}{fmt_val}\n' + f')>' + ) + + if verb_header: + message = f'{verb_header} ' + message + + msgtyperr = MsgTypeError.from_decode( + message=message, + bad_msg=bad_msg, + bad_msg_as_dict=msg_dict, + boxed_type=type(src_validation_error), + + # NOTE: for pld-spec MTEs we set the `._ipc_msg` manually: + # - for the send-side `.started()` pld-validate + # case we actually raise inline so we don't need to + # set the it at all. + # - for recv side we set it inside `PldRx.decode_pld()` + # after a manual call to `pack_error()` since we + # actually want to emulate the `Error` from the mte we + # build here. So by default in that case, this is left + # as `None` here. + # ipc_msg=src_err_msg, + ) + msgtyperr.__cause__ = src_validation_error + return msgtyperr diff --git a/tractor/_ipc.py b/tractor/_ipc.py index ec7d348a..e5e3d10f 100644 --- a/tractor/_ipc.py +++ b/tractor/_ipc.py @@ -49,7 +49,8 @@ from tractor._exceptions import ( MsgTypeError, pack_from_raise, TransportClosed, - _mk_msg_type_err, + _mk_send_mte, + _mk_recv_mte, ) from tractor.msg import ( _ctxvar_MsgCodec, @@ -256,7 +257,7 @@ class MsgpackTCPStream(MsgTransport): # and always raise such that spec violations # are never allowed to be caught silently! 
except msgspec.ValidationError as verr: - msgtyperr: MsgTypeError = _mk_msg_type_err( + msgtyperr: MsgTypeError = _mk_recv_mte( msg=msg_bytes, codec=codec, src_validation_error=verr, @@ -321,7 +322,7 @@ class MsgpackTCPStream(MsgTransport): if type(msg) not in msgtypes.__msg_types__: if strict_types: - raise _mk_msg_type_err( + raise _mk_send_mte( msg, codec=codec, ) @@ -333,8 +334,9 @@ class MsgpackTCPStream(MsgTransport): try: bytes_data: bytes = codec.encode(msg) - except TypeError as typerr: - msgtyperr: MsgTypeError = _mk_msg_type_err( + except TypeError as _err: + typerr = _err + msgtyperr: MsgTypeError = _mk_send_mte( msg, codec=codec, message=( diff --git a/tractor/msg/_ops.py b/tractor/msg/_ops.py index 80633e7e..91c0ddea 100644 --- a/tractor/msg/_ops.py +++ b/tractor/msg/_ops.py @@ -47,7 +47,7 @@ from tractor._exceptions import ( InternalError, _raise_from_unexpected_msg, MsgTypeError, - _mk_msg_type_err, + _mk_recv_mte, pack_error, ) from tractor._state import current_ipc_ctx @@ -264,7 +264,7 @@ class PldRx(Struct): # pack mgterr into error-msg for # reraise below; ensure remote-actor-err # info is displayed nicely? - mte: MsgTypeError = _mk_msg_type_err( + mte: MsgTypeError = _mk_recv_mte( msg=msg, codec=self.pld_dec, src_validation_error=valerr, @@ -277,19 +277,6 @@ class PldRx(Struct): if is_started_send_side: raise mte - # XXX TODO: remove this right? - # => any bad stated/return values should - # always be treated a remote errors right? - # - # if ( - # expect_msg is Return - # or expect_msg is Started - # ): - # # set emulated remote error more-or-less as the - # # runtime would - # ctx: Context = getattr(ipc, 'ctx', ipc) - # ctx._maybe_cancel_and_set_remote_error(mte) - # NOTE: the `.message` is automatically # transferred into the message as long as we # define it as a `Error.message` field. @@ -799,13 +786,18 @@ def validate_payload_msg( # raise any msg type error NO MATTER WHAT! except ValidationError as verr: - mte: MsgTypeError = _mk_msg_type_err( - msg=roundtripped, - codec=codec, - src_validation_error=verr, - verb_header='Trying to send ', - is_invalid_payload=True, - ) + try: + mte: MsgTypeError = _mk_recv_mte( + msg=roundtripped, + codec=codec, + src_validation_error=verr, + verb_header='Trying to send ', + is_invalid_payload=True, + ) + except BaseException: + __tracebackhide__: bool = False + raise + if not raise_mte: return mte -- 2.34.1 From 72df312e71dd3d29b16118b6233e40e7512347bf Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Jun 2024 09:57:10 -0400 Subject: [PATCH 373/378] Expand `PayloadMsg` doc-str --- tractor/msg/types.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tractor/msg/types.py b/tractor/msg/types.py index 0fc0ee96..0904411f 100644 --- a/tractor/msg/types.py +++ b/tractor/msg/types.py @@ -73,11 +73,22 @@ class PayloadMsg( # as_array=True, ): ''' - The "god" boxing msg type. + An abstract payload boxing/shuttling IPC msg type. - Boxes user data-msgs in a `.pld` and uses `msgspec`'s tagged - unions support to enable a spec from a common msg inheritance - tree. + Boxes data-values passed to/from user code + + (i.e. 
any values passed by `tractor` application code using any of + + |_ `._streaming.MsgStream.send/receive()` + |_ `._context.Context.started/result()` + |_ `._ipc.Channel.send/recv()` + + aka our "IPC primitive APIs") + + as message "payloads" set to the `.pld` field and uses + `msgspec`'s "tagged unions" feature to support a subset of our + "SC-transitive shuttle protocol" specification with + a `msgspec.Struct` inheritance tree. ''' cid: str # call/context-id -- 2.34.1 From 83d69fe395d324357e07460f533022126239d849 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Jun 2024 14:40:26 -0400 Subject: [PATCH 374/378] Change `_Cache` reuse emit to `.runtime()` --- tractor/trionics/_mngrs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tractor/trionics/_mngrs.py b/tractor/trionics/_mngrs.py index f57be0a7..08e70ad2 100644 --- a/tractor/trionics/_mngrs.py +++ b/tractor/trionics/_mngrs.py @@ -271,8 +271,11 @@ async def maybe_open_context( yield False, yielded else: - log.info(f'Reusing _Cached resource for {ctx_key}') _Cache.users += 1 + log.runtime( + f'Reusing resource for `_Cache` user {_Cache.users}\n\n' + f'{ctx_key!r} -> {yielded!r}\n' + ) lock.release() yield True, yielded -- 2.34.1 From 9292d73b40fb46cc2d93e50e68633e927bb02c6c Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Jun 2024 14:42:25 -0400 Subject: [PATCH 375/378] Avoid actor-nursery-exit warns on registrees Since a local-actor-nursery-parented subactor might also use the root as its registry, we need to avoid warning when short lived IPC `Channel` connections establish and then disconnect (quickly, bc the apparently the subactor isn't re-using an already cached parente-peer<->child conn as you'd expect efficiency..) since such cases currently considered normal operation of our super shoddy/naive "discovery sys" XD As such, (un)guard the whole local-actor-nursery OR channel-draining waiting blocks with the additional `or Actor._cancel_called` branch since really we should also be waiting on the parent nurse to exit (at least, for sure and always) when the local `Actor` indeed has been "globally" cancelled-called. Further add separate timeout warnings for channel-draining vs. local-actor-nursery-exit waiting since they are technically orthogonal cases (at least, afaik). Also, - adjust the `Actor._stream_handler()` connection status log-emit to `.runtime()`, especially to reduce noise around the aforementioned ephemeral registree connection-requests. - if we do wait on a local actor-nurse to exit, report its `._children` table (which should help figure out going forward how useful the warning is, if at all). --- tractor/_runtime.py | 136 +++++++++++++++++++++++++++++++------------- 1 file changed, 95 insertions(+), 41 deletions(-) diff --git a/tractor/_runtime.py b/tractor/_runtime.py index 99a969b5..3cf35ff9 100644 --- a/tractor/_runtime.py +++ b/tractor/_runtime.py @@ -444,7 +444,7 @@ class Actor: # inside ``open_root_actor()`` where there is a check for # a bound listener on the "arbiter" addr. the reset will be # because the handshake was never meant took place. - log.warning( + log.runtime( con_status + ' -> But failed to handshake? Ignoring..\n' @@ -520,24 +520,50 @@ class Actor: # the peer was cancelled we try to wait for them # to tear down their side of the connection before # moving on with closing our own side. 
- if local_nursery: - if chan._cancel_called: - log.cancel( - 'Waiting on cancel request to peer\n' - f'`Portal.cancel_actor()` => {chan.uid}\n' - ) + if ( + local_nursery + and ( + self._cancel_called + or + chan._cancel_called + ) + # + # ^-TODO-^ along with this is there another condition + # that we should filter with to avoid entering this + # waiting block needlessly? + # -[ ] maybe `and local_nursery.cancelled` and/or + # only if the `._children` table is empty or has + # only `Portal`s with .chan._cancel_called == + # True` as per what we had below; the MAIN DIFF + # BEING that just bc one `Portal.cancel_actor()` + # was called, doesn't mean the whole actor-nurse + # is gonna exit any time soon right!? + # + # or + # all(chan._cancel_called for chan in chans) + + ): + log.cancel( + 'Waiting on cancel request to peer\n' + f'`Portal.cancel_actor()` => {chan.uid}\n' + ) # XXX: this is a soft wait on the channel (and its # underlying transport protocol) to close from the # remote peer side since we presume that any channel - # which is mapped to a sub-actor (i.e. it's managed by - # one of our local nurseries) has a message is sent to - # the peer likely by this actor (which is now in - # a cancelled condition) when the local runtime here is - # now cancelled while (presumably) in the middle of msg - # loop processing. - with trio.move_on_after(0.5) as cs: - cs.shield = True + # which is mapped to a sub-actor (i.e. it's managed + # by local actor-nursery) has a message that is sent + # to the peer likely by this actor (which may be in + # a shutdown sequence due to cancellation) when the + # local runtime here is now cancelled while + # (presumably) in the middle of msg loop processing. + chan_info: str = ( + f'{chan.uid}\n' + f'|_{chan}\n' + f' |_{chan.transport}\n\n' + ) + with trio.move_on_after(0.5) as drain_cs: + drain_cs.shield = True # attempt to wait for the far end to close the # channel and bail after timeout (a 2-generals @@ -554,10 +580,7 @@ class Actor: # TODO: factor this into a helper? log.warning( 'Draining msg from disconnected peer\n' - f'{chan.uid}\n' - f'|_{chan}\n' - f' |_{chan.transport}\n\n' - + f'{chan_info}' f'{pformat(msg)}\n' ) # cid: str|None = msg.get('cid') @@ -569,31 +592,62 @@ class Actor: cid, msg, ) - - # NOTE: when no call to `open_root_actor()` was - # made, we implicitly make that call inside - # the first `.open_nursery()`, in this case we - # can assume that we are the root actor and do - # not have to wait for the nursery-enterer to - # exit before shutting down the actor runtime. - # - # see matching note inside `._supervise.open_nursery()` - if not local_nursery._implicit_runtime_started: - log.runtime( - 'Waiting on local actor nursery to exit..\n' - f'|_{local_nursery}\n' - ) - await local_nursery.exited.wait() - - if ( - cs.cancelled_caught - and not local_nursery._implicit_runtime_started - ): + if drain_cs.cancelled_caught: log.warning( - 'Failed to exit local actor nursery?\n' + 'Timed out waiting on IPC transport channel to drain?\n' + f'{chan_info}' + ) + + # XXX NOTE XXX when no explicit call to + # `open_root_actor()` was made by the application + # (normally we implicitly make that call inside + # the first `.open_nursery()` in root-actor + # user/app code), we can assume that either we + # are NOT the root actor or are root but the + # runtime was started manually. 
and thus DO have + # to wait for the nursery-enterer to exit before + # shutting down the local runtime to avoid + # clobbering any ongoing subactor + # teardown/debugging/graceful-cancel. + # + # see matching note inside `._supervise.open_nursery()` + # + # TODO: should we have a separate cs + timeout + # block here? + if ( + # XXX SO either, + # - not root OR, + # - is root but `open_root_actor()` was + # entered manually (in which case we do + # the equiv wait there using the + # `devx._debug` sub-sys APIs). + not local_nursery._implicit_runtime_started + ): + log.runtime( + 'Waiting on local actor nursery to exit..\n' f'|_{local_nursery}\n' ) - # await _debug.pause() + with trio.move_on_after(0.5) as an_exit_cs: + an_exit_cs.shield = True + await local_nursery.exited.wait() + + # TODO: currently this is always triggering for every + # sub-daemon spawned from the `piker.services._mngr`? + # -[ ] how do we ensure that the IPC is supposed to + # be long lived and isn't just a register? + # |_ in the register case how can we signal that the + # ephemeral msg loop was intentional? + if ( + # not local_nursery._implicit_runtime_started + # and + an_exit_cs.cancelled_caught + ): + log.warning( + 'Timed out waiting on local actor-nursery to exit?\n' + f'{local_nursery}\n' + f' |_{pformat(local_nursery._children)}\n' + ) + # await _debug.pause() if disconnected: # if the transport died and this actor is still -- 2.34.1 From 3d12a7e005963371ff777325b5b078dcc8643ed0 Mon Sep 17 00:00:00 2001 From: Tyler Goodlet Date: Tue, 18 Jun 2024 18:14:58 -0400 Subject: [PATCH 376/378] Flip `infected_asyncio` status msg to `.runtime()` --- tractor/to_asyncio.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tractor/to_asyncio.py b/tractor/to_asyncio.py index 585b0b00..d1451b4c 100644 --- a/tractor/to_asyncio.py +++ b/tractor/to_asyncio.py @@ -577,14 +577,18 @@ def run_as_asyncio_guest( log.runtime(f"trio_main finished: {main_outcome!r}") # start the infection: run trio on the asyncio loop in "guest mode" - log.info(f"Infecting asyncio process with {trio_main}") + log.runtime( + 'Infecting `asyncio`-process with a `trio` guest-run of\n\n' + f'{trio_main!r}\n\n' + f'{trio_done_callback}\n' + ) trio.lowlevel.start_guest_run( trio_main, run_sync_soon_threadsafe=loop.call_soon_threadsafe, done_callback=trio_done_callback, ) - # ``.unwrap()`` will raise here on error + # NOTE `.unwrap()` will raise on error return (await trio_done_fut).unwrap() # might as well if it's installed. 
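
For reference outside the patch itself: the `start_guest_run()` call visible in
the hunk above is trio's public "guest mode" entrypoint, which is what
`run_as_asyncio_guest()` wraps for the actor runtime. A minimal, self-contained
sketch of that API (the toy `trio_main()` and done-callback below are purely
illustrative, not code from `tractor.to_asyncio`) might look like:

    import asyncio
    import trio

    def main() -> None:
        # host loop which trio will "infect" as a guest run
        loop = asyncio.new_event_loop()
        done: asyncio.Future = loop.create_future()

        def trio_done_callback(outcome) -> None:
            # invoked (from inside the host loop) once the guest run ends;
            # `outcome.unwrap()` returns the result or re-raises the error.
            done.set_result(outcome)

        async def trio_main() -> str:
            await trio.sleep(0.1)
            return 'trio-done'

        # start the guest run; trio funnels its wakeups onto the asyncio
        # loop via the threadsafe scheduling hook.
        trio.lowlevel.start_guest_run(
            trio_main,
            run_sync_soon_threadsafe=loop.call_soon_threadsafe,
            done_callback=trio_done_callback,
        )

        result = loop.run_until_complete(done)
        loop.close()
        print(result.unwrap())  # -> 'trio-done'

    if __name__ == '__main__':
        main()
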
-- 2.34.1 From 1858fb6efc09070b2597daadc2f8c59bb38c5e37 Mon Sep 17 00:00:00 2001 From: Jad Abou-Chakra <29726242+jc211@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:24:38 +1000 Subject: [PATCH 377/378] Add project name to pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0e80e14d..45847b14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,6 +120,7 @@ log_cli = false # ------ - ------ [project] +name = "tractor" keywords = [ 'trio', 'async', -- 2.34.1 From aba46b723b0a23cc1c75c777d63ccd9db2a8b346 Mon Sep 17 00:00:00 2001 From: Jad Abou-Chakra <29726242+jc211@users.noreply.github.com> Date: Thu, 19 Sep 2024 18:17:20 +1000 Subject: [PATCH 378/378] Decouple registery addresses from binding addresses --- tractor/_root.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tractor/_root.py b/tractor/_root.py index 7cdef601..3b8bd158 100644 --- a/tractor/_root.py +++ b/tractor/_root.py @@ -69,7 +69,10 @@ async def open_root_actor( # defaults are above arbiter_addr: tuple[str, int]|None = None, - + + # binding addrs for the transport layer server + trans_bind_addrs: list[tuple[str, int]] = [(_default_host, _default_port)], + name: str|None = 'root', # either the `multiprocessing` start method: @@ -183,6 +186,8 @@ async def open_root_actor( _default_lo_addrs ) assert registry_addrs + + assert trans_bind_addrs loglevel = ( loglevel @@ -273,8 +278,6 @@ async def open_root_actor( tuple(addr), # TODO: just drop this requirement? ) - trans_bind_addrs: list[tuple[str, int]] = [] - # Create a new local root-actor instance which IS NOT THE # REGISTRAR if ponged_addrs: @@ -296,11 +299,6 @@ async def open_root_actor( loglevel=loglevel, enable_modules=enable_modules, ) - # DO NOT use the registry_addrs as the transport server - # addrs for this new non-registar, root-actor. - for host, port in ponged_addrs: - # NOTE: zero triggers dynamic OS port allocation - trans_bind_addrs.append((host, 0)) # Start this local actor as the "registrar", aka a regular # actor who manages the local registry of "mailboxes" of -- 2.34.1
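
The net effect of the last patch above is that `open_root_actor()` no longer
derives its transport-server bind addresses from the registry addresses it
pings; they are now a separate, explicitly defaulted input
(`trans_bind_addrs`). A rough usage sketch of that intent (not from the patch;
it assumes `open_root_actor()` is still entered as an async context manager,
that `registry_addrs` remains the kwarg for the registrar contact addresses,
and the host/port values below are picked purely for illustration):

    import trio
    import tractor

    async def main() -> None:
        async with tractor.open_root_actor(
            # where an existing registrar is expected to live (or where
            # this actor will register itself)
            registry_addrs=[('127.0.0.1', 1616)],

            # where THIS actor's transport server binds, now chosen
            # independently; port 0 requests a dynamic OS-allocated port
            trans_bind_addrs=[('127.0.0.1', 0)],
        ):
            # runtime is up: open actor nurseries, spawn subactors, etc.
            await trio.sleep(0.1)

    if __name__ == '__main__':
        trio.run(main)
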