Code-style, couple newline/ws tweaks

(cherry picked from commit 8526985c97)
Pin to latest `xonsh` release
2026-06-09 20:28:42 -04:00 · 2026-06-09 20:28:42 -04:00 · 2026-06-09 20:28:42 -04:00 · 2026-06-09 20:28:42 -04:00 · 2026-06-09 20:28:42 -04:00 · 2026-06-09 20:28:42 -04:00
177 changed files with 21476 additions and 3756 deletions
--- a/.claude/ai_notes/docs_todos.md
+++ b/.claude/ai_notes/docs_todos.md
@ -0,0 +1,38 @@
+# Docs TODOs
+
+## Auto-sync README code examples with source
+
+The `docs/README.rst` has inline code blocks that
+duplicate actual example files (e.g.
+`examples/infected_asyncio_echo_server.py`). Every time
+the public API changes we have to manually sync both.
+
+Sphinx's `literalinclude` directive can pull code directly
+from source files:
+
+```rst
+.. literalinclude:: ../examples/infected_asyncio_echo_server.py
+   :language: python
+   :caption: examples/infected_asyncio_echo_server.py
+```
+
+Or to include only a specific function/section:
+
+```rst
+.. literalinclude:: ../examples/infected_asyncio_echo_server.py
+   :language: python
+   :pyobject: aio_echo_server
+```
+
+This way the docs always reflect the actual code without
+manual syncing.
+
+### Considerations
+- `README.rst` is also rendered on GitHub/PyPI which do
+  NOT support `literalinclude` - so we'd need a build
+  step or a separate `_sphinx_readme.rst` (which already
+  exists at `docs/github_readme/_sphinx_readme.rst`).
+- Could use a pre-commit hook or CI step to extract code
+  from examples into the README for GitHub rendering.
+- Another option: `sphinx-autodoc` style approach where
+  docstrings from the actual module are pulled in.
--- a/.claude/notes/rt_vars_lift_plan.md
+++ b/.claude/notes/rt_vars_lift_plan.md
@ -0,0 +1,125 @@
+# `RuntimeVars` env-var lift — design plan
+
+Status: **draft, awaiting user edits**
+
+## Goal
+
+Consolidate the sprawl of pytest CLI flags + ad-hoc env vars +
+hardcoded fixture defaults into a *single* env-var-encoded
+runtime-vars envelope, with a typed in-memory representation
+(`tractor.runtime._state.RuntimeVars`) as the sole source of
+truth.
+
+## Why now
+
+- `--tpt-proto`, `--spawn-backend`, `--diag-on-hang`,
+  `--diag-capture-delay` and (soon) `TRACTOR_REG_ADDR` etc. are
+  proliferating. Each adds a parsing seam.
+- `tests/devx/test_debugger.py` invokes example scripts as
+  separate subprocesses; they currently can't see the
+  fixture-allocated `reg_addr` at all (root cause of why
+  parametrizing devx scripts on `reg_addr` is on your TODO).
+- Concurrent pytest sessions on the same host collide on
+  shared defaults (the `registry@1616` race we just fixed is
+  one symptom; per-session unique addr is the structural
+  fix).
+- `tractor.runtime._state.RuntimeVars: Struct` is already
+  defined and **unused** — its docstring even says it
+  "should be utilized as possible for future calls."
+
+## Design
+
+### Module: `tractor/_testing/_rtvars.py`
+
+Lifted from `modden.runtime.env`, ~50 LOC, no new deps.
+
+```python
+_TRACTOR_RT_VARS_OSENV: str = '_TRACTOR_RT_VARS'
+
+def dump_rtvars(rtvars: RuntimeVars|dict) -> tuple[str, str]:
+    '''str-serialize via `str(dict)` — ast.literal_eval-able'''
+
+def load_rtvars(env: dict) -> RuntimeVars:
+    '''ast.literal_eval the env-var value, hydrate to struct'''
+
+def get_rtvars(proc: psutil.Process|None = None) -> RuntimeVars:
+    '''read the var from a target proc's env (or current)'''
+
+def update_rtvars(
+    rtvars: RuntimeVars|dict|None = None,
+    update_osenv: bool|dict = True,
+) -> tuple[str, str]:
+    '''mutate + re-encode + (optionally) write to os.environ'''
+```
+
+### Encoding choice: `str(dict)` + `ast.literal_eval`
+
+Pros:
+- stdlib only
+- handles all the types tractor's tests need: `str`, `int`,
+  `float`, `bool`, `None`, `list`, `tuple`, `dict`
+- human-readable in the env (greppable, inspectable via
+  `cat /proc/<pid>/environ | tr '\0' '\n'`)
+
+Cons:
+- non-stdlib types (msgspec Structs, `Path`, custom classes)
+  must be lowered first — fine for the test fixture set
+- not stable across Python versions for esoteric repr cases
+  (we don't hit any)
+
+Alternatives considered:
+- **msgpack**: adds a dep + binary form is ungreppable
+- **json**: doesn't preserve tuples (becomes lists), which is
+  a common type for `reg_addr`
+- **toml/yaml**: heavier deps, no real benefit
+
+### `RuntimeVars` becomes the single source of truth
+
+The legacy `_runtime_vars: dict[str, Any]` global in
+`runtime/_state.py` becomes a *cached view* of a
+`RuntimeVars` singleton instance:
+
+- `get_runtime_vars()` returns either the struct or a
+  `.to_dict()` view depending on caller's preference
+- `set_runtime_vars(...)` validates against the struct schema
+- spawn-time SpawnSpec sends the struct (already does
+  conceptually — just gets typed)
+- `__setattr__` `breakpoint()` debug instrumentation gets
+  removed (unrelated cleanup, mentioned in conversation)
+
+### Migration path
+
+**Phase 0** *(prep)*: strip the stray `breakpoint()` from
+`RuntimeVars.__setattr__`.
+
+**Phase 1**: land `_rtvars.py` as a leaf module, used only by
+test infra. Subprocess-spawned scripts in `tests/devx/`
+read `_TRACTOR_RT_VARS` on startup → reconstruct
+`RuntimeVars` → call `tractor.open_root_actor(**rtvars.as_kwargs())`.
+Concurrent runs become deterministic-isolated because each
+session writes a unique `_registry_addrs` into the env.
+
+**Phase 2**: migrate runtime callers (`_state.get_runtime_vars`,
+spawn `SpawnSpec`, `Actor.async_main`) to operate on the
+struct directly, with the dict as a compat view that gets
+deprecated.
+
+**Phase 3** *(structural)*: per-session bindspace subdir
+`/run/user/<uid>/tractor/<session_uuid>/` — encoded in the
+rt-vars envelope, picked up by every subactor automatically.
+Obsoletes the entire bindspace-leak warning class.
+
+## Open design questions (user input wanted)
+
+- (placeholder for your edits)
+- (placeholder)
+- (placeholder)
+
+## Out-of-scope for this lift
+
+- Anything in `modden.runtime.env` related to `Spawn`,
+  `WmCtl`, `Wks` — that's a workspace orchestration layer,
+  not an env-var helper. We only lift the four utility
+  functions + the var name constant.
+- Switching to msgpack/json — explicitly chosen against
+  above.
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@ -0,0 +1,42 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(cp .claude/*)",
+      "Read(.claude/**)",
+      "Read(.claude/skills/run-tests/**)",
+      "Write(.claude/**/*commit_msg*)",
+      "Write(.claude/git_commit_msg_LATEST.md)",
+      "Skill(run-tests)",
+      "Skill(close-wkt)",
+      "Skill(open-wkt)",
+      "Skill(prompt-io)",
+      "Bash(date *)",
+      "Bash(git diff *)",
+      "Bash(git log *)",
+      "Bash(git status)",
+      "Bash(git remote:*)",
+      "Bash(git stash:*)",
+      "Bash(git mv:*)",
+      "Bash(git rev-parse:*)",
+      "Bash(test:*)",
+      "Bash(ls:*)",
+      "Bash(grep:*)",
+      "Bash(find:*)",
+      "Bash(ln:*)",
+      "Bash(cat:*)",
+      "Bash(mkdir:*)",
+      "Bash(gh pr:*)",
+      "Bash(gh api:*)",
+      "Bash(gh issue:*)",
+      "Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)",
+      "Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)",
+      "Bash(echo EXIT:$?:*)",
+      "Bash(echo \"EXIT=$?\")",
+      "Read(//tmp/**)"
+    ],
+    "deny": [],
+    "ask": []
+  },
+  "prefersReducedMotion": false,
+  "outputStyle": "default"
+}
--- a/.claude/skills/commit-msg/style-guide-reference.md
+++ b/.claude/skills/commit-msg/style-guide-reference.md
@ -0,0 +1,225 @@
+# Commit Message Style Guide for `tractor`
+
+Analysis based on 500 recent commits from the `tractor` repository.
+
+## Core Principles
+
+Write commit messages that are technically precise yet casual in
+tone. Use abbreviations and informal language while maintaining
+clarity about what changed and why.
+
+## Subject Line Format
+
+### Length and Structure
+- Target: ~50 chars with a hard-max of 67.
+- Use backticks around code elements (72.2% of commits)
+- Rarely use colons (5.2%), except for file prefixes
+- End with '?' for uncertain changes (rare: 0.8%)
+- End with '!' for important changes (rare: 2.0%)
+
+### Opening Verbs (Present Tense)
+
+Most common verbs from analysis:
+- `Add` (14.4%) - wholly new features/functionality
+- `Use` (4.4%) - adopt new approach/tool
+- `Drop` (3.6%) - remove code/feature
+- `Fix` (2.4%) - bug fixes
+- `Move`/`Mv` (3.6%) - relocate code
+- `Adjust` (2.0%) - minor tweaks
+- `Update` (1.6%) - enhance existing feature
+- `Bump` (1.2%) - dependency updates
+- `Rename` (1.2%) - identifier changes
+- `Set` (1.2%) - configuration changes
+- `Handle` (1.0%) - add handling logic
+- `Raise` (1.0%) - add error raising
+- `Pass` (0.8%) - pass parameters/values
+- `Support` (0.8%) - add support for something
+- `Hide` (1.4%) - make private/internal
+- `Always` (1.4%) - enforce consistent behavior
+- `Mk` (1.4%) - make/create (abbreviated)
+- `Start` (1.0%) - begin implementation
+
+Other frequent verbs: `More`, `Change`, `Extend`, `Disable`, `Log`,
+`Enable`, `Ensure`, `Expose`, `Allow`
+
+### Backtick Usage
+
+Always use backticks for:
+- Module names: `trio`, `asyncio`, `msgspec`, `greenback`, `stackscope`
+- Class names: `Context`, `Actor`, `Address`, `PldRx`, `SpawnSpec`
+- Method names: `.pause_from_sync()`, `._pause()`, `.cancel()`
+- Function names: `breakpoint()`, `collapse_eg()`, `open_root_actor()`
+- Decorators: `@acm`, `@context`
+- Exceptions: `Cancelled`, `TransportClosed`, `MsgTypeError`
+- Keywords: `finally`, `None`, `False`
+- Variable names: `tn`, `debug_mode`
+- Complex expressions: `trio.Cancelled`, `asyncio.Task`
+
+Most backticked terms in tractor:
+`trio`, `asyncio`, `Context`, `.pause_from_sync()`, `tn`,
+`._pause()`, `breakpoint()`, `collapse_eg()`, `Actor`, `@acm`,
+`.cancel()`, `Cancelled`, `open_root_actor()`, `greenback`
+
+### Examples
+
+Good subject lines:
+```
+Add `uds` to `._multiaddr`, tweak typing
+Drop `DebugStatus.shield` attr, add `.req_finished`
+Use `stackscope` for all actor-tree rendered "views"
+Fix `.to_asyncio` inter-task-cancellation!
+Bump `ruff.toml` to target py313
+Mv `load_module_from_path()` to new `._code_load` submod
+Always use `tuple`-cast for singleton parent addrs
+```
+
+## Body Format
+
+### General Structure
+- 43.2% of commits have no body (simple changes)
+- Use blank line after subject
+- Max line length: 67 chars
+- Use `-` bullets for lists (28.0% of commits)
+- Rarely use `*` bullets (2.4%)
+
+### Section Markers
+
+Use these markers to organize longer commit bodies:
+- `Also,` (most common: 26 occurrences)
+- `Other,` (13 occurrences)
+- `Deats,` (11 occurrences) - for implementation details
+- `Further,` (7 occurrences)
+- `TODO,` (3 occurrences)
+- `Impl details,` (2 occurrences)
+- `Notes,` (1 occurrence)
+
+### Common Abbreviations
+
+Use these freely (sorted by frequency):
+- `msg` (63) - message
+- `bg` (37) - background
+- `ctx` (30) - context
+- `impl` (27) - implementation
+- `mod` (26) - module
+- `obvi` (17) - obviously
+- `tn` (16) - task name
+- `fn` (15) - function
+- `vs` (15) - versus
+- `bc` (14) - because
+- `var` (14) - variable
+- `prolly` (9) - probably
+- `ep` (6) - entry point
+- `OW` (5) - otherwise
+- `rn` (4) - right now
+- `sig` (4) - signal/signature
+- `deps` (3) - dependencies
+- `iface` (2) - interface
+- `subproc` (2) - subprocess
+- `tho` (2) - though
+- `ofc` (2) - of course
+
+### Tone and Style
+
+- Casual but technical (use `XD` for humor: 23 times)
+- Use `..` for trailing thoughts (108 occurrences)
+- Use `Woops,` to acknowledge mistakes (4 subject lines)
+- Don't be afraid to show personality while being precise
+
+### Example Bodies
+
+Simple with bullets:
+```
+Add `multiaddr` and bump up some deps
+
+Since we're planning to use it for (discovery)
+addressing, allowing replacement of the hacky (pretend)
+attempt in `tractor._multiaddr` Bp
+
+Also pin some deps,
+- make us py312+
+- use `pdbp` with my frame indexing fix.
+- mv to latest `xonsh` for fancy cmd/suggestion injections.
+
+Bump lock file to match obvi!
+```
+
+With section markers:
+```
+Use `stackscope` for all actor-tree rendered "views"
+
+Instead of the (much more) limited and hacky `.devx._code`
+impls, move to using the new `.devx._stackscope` API which
+wraps the `stackscope` project.
+
+Deats,
+- make new `stackscope.extract_stack()` wrapper
+- port over frame-descing to `_stackscope.pformat_stack()`
+- move `PdbREPL` to use `stackscope` render approach
+- update tests for new stack output format
+
+Also,
+- tweak log formatting for consistency
+- add typing hints throughout
+```
+
+## Special Patterns
+
+### WIP Commits
+Rare (0.2%) - avoid committing WIP if possible
+
+### Merge Commits
+Auto-generated (4.4%), don't worry about style
+
+### File References
+- Use `module.py` or `.submodule` style
+- Rarely use `file.py:line` references (0 in analysis)
+
+### Links
+- GitHub links used sparingly (3 total)
+- Prefer code references over external links
+
+## Footer
+
+The default footer should credit `claude` (you) for helping generate
+the commit msg content:
+
+```
+(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
+[claude-code-gh]: https://github.com/anthropics/claude-code
+```
+
+Further, if the patch was solely or in part written
+by `claude`, instead add:
+
+```
+(this patch was generated in some part by [`claude-code`][claude-code-gh])
+[claude-code-gh]: https://github.com/anthropics/claude-code
+```
+
+## Summary Checklist
+
+Before committing, verify:
+- [ ] Subject line uses present tense verb
+- [ ] Subject line ~50 chars (hard max 67)
+- [ ] Code elements wrapped in backticks
+- [ ] Body lines ≤67 chars
+- [ ] Abbreviations used where natural
+- [ ] Casual yet precise tone
+- [ ] Section markers if body >3 paragraphs
+- [ ] Technical accuracy maintained
+
+## Analysis Metadata
+
+```
+Source: tractor repository
+Commits analyzed: 500
+Date range: 2019-2025
+Analysis date: 2026-02-08
+```
+
+---
+
+(this style guide was generated by [`claude-code`][claude-code-gh]
+analyzing commit history)
+
+[claude-code-gh]: https://github.com/anthropics/claude-code
--- a/.claude/skills/conc-anal/SKILL.md
+++ b/.claude/skills/conc-anal/SKILL.md
@ -0,0 +1,297 @@
+---
+name: conc-anal
+description: >
+  Concurrency analysis for tractor's trio-based
+  async primitives. Trace task scheduling across
+  checkpoint boundaries, identify race windows in
+  shared mutable state, and verify synchronization
+  correctness. Invoke on code segments the user
+  points at, OR proactively when reviewing/writing
+  concurrent cache, lock, or multi-task acm code.
+argument-hint: "[file:line-range or function name]"
+allowed-tools:
+  - Read
+  - Grep
+  - Glob
+  - Task
+---
+
+Perform a structured concurrency analysis on the
+target code. This skill should be invoked:
+
+- **On demand**: user points at a code segment
+  (file:lines, function name, or pastes a snippet)
+- **Proactively**: when writing or reviewing code
+  that touches shared mutable state across trio
+  tasks — especially `_Cache`, locks, events, or
+  multi-task `@acm` lifecycle management
+
+## 0. Identify the target
+
+If the user provides a file:line-range or function
+name, read that code. If not explicitly provided,
+identify the relevant concurrent code from context
+(e.g. the current diff, a failing test, or the
+function under discussion).
+
+## 1. Inventory shared mutable state
+
+List every piece of state that is accessed by
+multiple tasks. For each, note:
+
+- **What**: the variable/dict/attr (e.g.
+  `_Cache.values`, `_Cache.resources`,
+  `_Cache.users`)
+- **Scope**: class-level, module-level, or
+  closure-captured
+- **Writers**: which tasks/code-paths mutate it
+- **Readers**: which tasks/code-paths read it
+- **Guarded by**: which lock/event/ordering
+  protects it (or "UNGUARDED" if none)
+
+Format as a table:
+
+```
+| State               | Writers         | Readers         | Guard          |
+|---------------------|-----------------|-----------------|----------------|
+| _Cache.values       | run_ctx, moc¹   | moc             | ctx_key lock   |
+| _Cache.resources    | run_ctx, moc    | moc, run_ctx    | UNGUARDED      |
+```
+
+¹ `moc` = `maybe_open_context`
+
+## 2. Map checkpoint boundaries
+
+For each code path through the target, mark every
+**checkpoint** — any `await` expression where trio
+can switch to another task. Use line numbers:
+
+```
+L325: await lock.acquire()        ← CHECKPOINT
+L395: await service_tn.start(...) ← CHECKPOINT
+L411: lock.release()              ← (not a checkpoint, but changes lock state)
+L414: yield (False, yielded)      ← SUSPEND (caller runs)
+L485: no_more_users.set()         ← (wakes run_ctx, no switch yet)
+```
+
+**Key trio scheduling rules to apply:**
+- `Event.set()` makes waiters *ready* but does NOT
+  switch immediately
+- `lock.release()` is not a checkpoint
+- `await sleep(0)` IS a checkpoint
+- Code in `finally` blocks CAN have checkpoints
+  (unlike asyncio)
+- `await` inside `except` blocks can be
+  `trio.Cancelled`-masked
+
+## 3. Trace concurrent task schedules
+
+Write out the **interleaved execution trace** for
+the problematic scenario. Number each step and tag
+which task executes it:
+
+```
+[Task A]  1. acquires lock
+[Task A]  2. cache miss → allocates resources
+[Task A]  3. releases lock
+[Task A]  4. yields to caller
+[Task A]  5. caller exits → finally runs
+[Task A]  6. users-- → 0, sets no_more_users
+[Task A]  7. pops lock from _Cache.locks
+[run_ctx] 8. wakes from no_more_users.wait()
+[run_ctx] 9. values.pop(ctx_key)
+[run_ctx] 10. acm __aexit__ → CHECKPOINT
+[Task B]  11. creates NEW lock (old one popped)
+[Task B]  12. acquires immediately
+[Task B]  13. values[ctx_key] → KeyError
+[Task B]  14. resources[ctx_key] → STILL EXISTS
+[Task B]  15. 💥 RuntimeError
+```
+
+Identify the **race window**: the range of steps
+where state is inconsistent. In the example above,
+steps 9–10 are the window (values gone, resources
+still alive).
+
+## 4. Classify the bug
+
+Categorize what kind of concurrency issue this is:
+
+- **TOCTOU** (time-of-check-to-time-of-use): state
+  changes between a check and the action based on it
+- **Stale reference**: a task holds a reference to
+  state that another task has invalidated
+- **Lifetime mismatch**: a synchronization primitive
+  (lock, event) has a shorter lifetime than the
+  state it's supposed to protect
+- **Missing guard**: shared state is accessed
+  without any synchronization
+- **Atomicity gap**: two operations that should be
+  atomic have a checkpoint between them
+
+## 5. Propose fixes
+
+For each proposed fix, provide:
+
+- **Sketch**: pseudocode or diff showing the change
+- **How it closes the window**: which step(s) from
+  the trace it eliminates or reorders
+- **Tradeoffs**: complexity, perf, new edge cases,
+  impact on other code paths
+- **Risk**: what could go wrong (deadlocks, new
+  races, cancellation issues)
+
+Rate each fix: `[simple|moderate|complex]` impl
+effort.
+
+## 6. Output format
+
+Structure the full analysis as:
+
+```markdown
+## Concurrency analysis: `<target>`
+
+### Shared state
+<table from step 1>
+
+### Checkpoints
+<list from step 2>
+
+### Race trace
+<interleaved trace from step 3>
+
+### Classification
+<bug type from step 4>
+
+### Fixes
+<proposals from step 5>
+```
+
+## Tractor-specific patterns to watch
+
+These are known problem areas in tractor's
+concurrency model. Flag them when encountered:
+
+### `_Cache` lock vs `run_ctx` lifetime
+
+The `_Cache.locks` entry is managed by
+`maybe_open_context` callers, but `run_ctx` runs
+in `service_tn` — a different task tree. Lock
+pop/release in the caller's `finally` does NOT
+wait for `run_ctx` to finish tearing down. Any
+state that `run_ctx` cleans up in its `finally`
+(e.g. `resources.pop()`) is vulnerable to
+re-entry races after the lock is popped.
+
+### `values.pop()` → acm `__aexit__` → `resources.pop()` gap
+
+In `_Cache.run_ctx`, the inner `finally` pops
+`values`, then the acm's `__aexit__` runs (which
+has checkpoints), then the outer `finally` pops
+`resources`. This creates a window where `values`
+is gone but `resources` still exists — a classic
+atomicity gap.
+
+### Global vs per-key counters
+
+`_Cache.users` as a single `int` (pre-fix) meant
+that users of different `ctx_key`s inflated each
+other's counts, preventing teardown when one key's
+users hit zero. Always verify that per-key state
+(`users`, `locks`) is actually keyed on `ctx_key`
+and not on `fid` or some broader key.
+
+### `Event.set()` wakes but doesn't switch
+
+`trio.Event.set()` makes waiting tasks *ready* but
+the current task continues executing until its next
+checkpoint. Code between `.set()` and the next
+`await` runs atomically from the scheduler's
+perspective. Use this to your advantage (or watch
+for bugs where code assumes the woken task runs
+immediately).
+
+### `except` block checkpoint masking
+
+`await` expressions inside `except` handlers can
+be masked by `trio.Cancelled`. If a `finally`
+block runs from an `except` and contains
+`lock.release()`, the release happens — but any
+`await` after it in the same `except` may be
+swallowed. This is why `maybe_open_context`'s
+cache-miss path does `lock.release()` in a
+`finally` inside the `except KeyError`.
+
+### Cancellation in `finally`
+
+Unlike asyncio, trio allows checkpoints in
+`finally` blocks. This means `finally` cleanup
+that does `await` can itself be cancelled (e.g.
+by nursery shutdown). Watch for cleanup code that
+assumes it will run to completion.
+
+### Unbounded waits in cleanup paths
+
+Any `await <event>.wait()` in a teardown path is
+a latent deadlock unless the event's setter is
+GUARANTEED to fire. If the setter depends on
+external state (peer disconnects, child process
+exit, subsequent task completion) that itself
+depends on the current task's progress, you have
+a mutual wait.
+
+Rule: **bound every `await X.wait()` in cleanup
+paths with `trio.move_on_after()`** unless you
+can prove the setter is unconditionally reachable
+from the state at the await site. Concrete recent
+example: `ipc_server.wait_for_no_more_peers()` in
+`async_main`'s finally (see
+`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
+"probe iteration 3") — it was unbounded, and when
+one peer-handler was stuck the wait-for-no-more-
+peers event never fired, deadlocking the whole
+actor-tree teardown cascade.
+
+### The capture-pipe-fill hang pattern (grep this first)
+
+When investigating any hang in the test suite
+**especially under fork-based backends**, first
+check whether the hang reproduces under `pytest
+-s` (`--capture=no`). If `-s` makes it go away
+you're not looking at a trio concurrency bug —
+you're looking at a Linux pipe-buffer fill.
+
+Mechanism: pytest replaces fds 1,2 with pipe
+write-ends. Fork-child subactors inherit those
+fds. High-volume error-log tracebacks (cancel
+cascade spew) fill the 64KB pipe buffer. Child
+`write()` blocks. Child can't exit. Parent's
+`waitpid`/pidfd wait blocks. Deadlock cascades up
+the tree.
+
+Pre-existing guards in `tests/conftest.py` encode
+this knowledge — grep these BEFORE blaming
+concurrency:
+
+```python
+# tests/conftest.py:258
+if loglevel in ('trace', 'debug'):
+    # XXX: too much logging will lock up the subproc (smh)
+    loglevel: str = 'info'
+
+# tests/conftest.py:316
+# can lock up on the `_io.BufferedReader` and hang..
+stderr: str = proc.stderr.read().decode()
+```
+
+Full post-mortem +
+`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
+for the canonical reproduction. Cost several
+investigation sessions before catching it —
+because the capture-pipe symptom was masked by
+deeper cascade-deadlocks. Once the cascades were
+fixed, the tree tore down enough to generate
+pipe-filling log volume → capture-pipe finally
+surfaced. Grep-note for future-self: **if a
+multi-subproc tractor test hangs, `pytest -s`
+first, conc-anal second.**
--- a/.claude/skills/pr-msg/format-reference.md
+++ b/.claude/skills/pr-msg/format-reference.md
@ -0,0 +1,241 @@
+# PR/Patch-Request Description Format Reference
+
+Canonical structure for `tractor` patch-request
+descriptions, designed to work across GitHub,
+Gitea, SourceHut, and GitLab markdown renderers.
+
+**Line length: wrap at 72 chars** for all prose
+content (Summary bullets, Motivation paragraphs,
+Scopes bullets, etc.). Fill lines *to* 72 — don't
+stop short at 50-65. Only raw URLs in
+reference-link definitions may exceed this.
+
+## Template
+
+```markdown
+<!-- pr-msg-meta
+branch: <branch-name>
+base: <base-branch>
+submitted:
+  github: ___
+  gitea: ___
+  srht: ___
+-->
+
+## <Title: present-tense verb + backticked code>
+
+### Summary
+- [<hash>][<hash>] Description of change ending
+  with period.
+- [<hash>][<hash>] Another change description
+  ending with period.
+- [<hash>][<hash>] [<hash>][<hash>] Multi-commit
+  change description.
+
+### Motivation
+<1-2 paragraphs: problem/limitation first,
+then solution. Hard-wrap at 72 chars.>
+
+### Scopes changed
+- [<hash>][<hash>] `pkg.mod.func()` — what
+  changed.
+  * [<hash>][<hash>] Also adjusts
+    `.related_thing()` in same module.
+- [<hash>][<hash>] `tests.test_mod` — new/changed
+  test coverage.
+
+<!--
+### Cross-references
+Also submitted as
+[github-pr][] | [gitea-pr][] | [srht-patch][].
+
+### Links
+- [relevant-issue-or-discussion](url)
+- [design-doc-or-screenshot](url)
+-->
+
+(this pr content was generated in some part by
+[`claude-code`][claude-code-gh])
+
+[<hash>]: https://<service>/<owner>/<repo>/commit/<hash>
+[claude-code-gh]: https://github.com/anthropics/claude-code
+
+<!-- cross-service pr refs (fill after submit):
+[github-pr]: https://github.com/<owner>/<repo>/pull/___
+[gitea-pr]: https://<host>/<owner>/<repo>/pulls/___
+[srht-patch]: https://git.sr.ht/~<owner>/<repo>/patches/___
+-->
+```
+
+## Markdown Reference-Link Strategy
+
+Use reference-style links for ALL commit hashes
+and cross-service PR refs to ensure cross-service
+compatibility:
+
+**Inline usage** (in bullets):
+```markdown
+- [f3726cf9][f3726cf9] Add `reg_err_types()`
+  for custom exc lookup.
+```
+
+**Definition** (bottom of document):
+```markdown
+[f3726cf9]: https://github.com/goodboy/tractor/commit/f3726cf9
+```
+
+### Why reference-style?
+- Keeps prose readable without long inline URLs.
+- All URLs in one place — trivially swappable
+  per-service.
+- Most git services auto-link bare SHAs anyway,
+  but explicit refs guarantee it works in *any*
+  md renderer.
+- The `[hash][hash]` form is self-documenting —
+  display text matches the ref ID.
+- Cross-service PR refs use the same mechanism:
+  `[github-pr][]` resolves via a ref-link def
+  at the bottom, trivially fillable post-submit.
+
+## Cross-Service PR Placeholder Mechanism
+
+The generated description includes three layers
+of cross-service support, all using native md
+reference-links:
+
+### 1. Metadata comment (top of file)
+
+```markdown
+<!-- pr-msg-meta
+branch: remote_exc_type_registry
+base: main
+submitted:
+  github: ___
+  gitea: ___
+  srht: ___
+-->
+```
+
+A YAML-ish HTML comment block. The `___`
+placeholders get filled with PR/patch numbers
+after submission. Machine-parseable for tooling
+(e.g. `gish`) but invisible in rendered md.
+
+### 2. Cross-references section (in body)
+
+```markdown
+<!--
+### Cross-references
+Also submitted as
+[github-pr][] | [gitea-pr][] | [srht-patch][].
+-->
+```
+
+Commented out at generation time. After submitting
+to multiple services, uncomment and the ref-links
+resolve via the stubs at the bottom.
+
+### 3. Ref-link stubs (bottom of file)
+
+```markdown
+<!-- cross-service pr refs (fill after submit):
+[github-pr]: https://github.com/goodboy/tractor/pull/___
+[gitea-pr]: https://pikers.dev/goodboy/tractor/pulls/___
+[srht-patch]: https://git.sr.ht/~goodboy/tractor/patches/___
+-->
+```
+
+Commented out with `___` number placeholders.
+After submission: uncomment, replace `___` with
+the actual number. Each service-specific copy
+fills in all services' numbers so any copy can
+cross-reference the others.
+
+### Post-submission file layout
+
+```
+pr_msg_LATEST.md                    # latest draft (skill root)
+msgs/
+  20260325T002027Z_mybranch_pr_msg.md  # timestamped
+  github/
+    42_pr_msg.md        # github PR #42
+  gitea/
+    17_pr_msg.md        # gitea PR #17
+  srht/
+    5_pr_msg.md         # srht patch #5
+```
+
+Each `<service>/<num>_pr_msg.md` is a copy with:
+- metadata `submitted:` fields filled in
+- cross-references section uncommented
+- ref-link stubs uncommented with real numbers
+- all services cross-linked in each copy
+
+This mirrors the `gish` skill's
+`<backend>/<num>.md` pattern.
+
+## Commit-Link URL Patterns by Service
+
+| Service   | Pattern                             |
+|-----------|-------------------------------------|
+| GitHub    | `https://github.com/<o>/<r>/commit/<h>` |
+| Gitea     | `https://<host>/<o>/<r>/commit/<h>` |
+| SourceHut | `https://git.sr.ht/~<o>/<r>/commit/<h>` |
+| GitLab    | `https://gitlab.com/<o>/<r>/-/commit/<h>` |
+
+## PR/Patch URL Patterns by Service
+
+| Service   | Pattern                             |
+|-----------|-------------------------------------|
+| GitHub    | `https://github.com/<o>/<r>/pull/<n>` |
+| Gitea     | `https://<host>/<o>/<r>/pulls/<n>`  |
+| SourceHut | `https://git.sr.ht/~<o>/<r>/patches/<n>` |
+| GitLab    | `https://gitlab.com/<o>/<r>/-/merge_requests/<n>` |
+
+## Scope Naming Convention
+
+Use Python namespace-resolution syntax for
+referencing changed code scopes:
+
+| File path                 | Scope reference               |
+|---------------------------|-------------------------------|
+| `tractor/_exceptions.py`  | `tractor._exceptions`         |
+| `tractor/_state.py`       | `tractor._state`              |
+| `tests/test_foo.py`       | `tests.test_foo`              |
+| Function in module        | `tractor._exceptions.func()`  |
+| Method on class           | `.RemoteActorError.src_type`  |
+| Class                     | `tractor._exceptions.RAE`     |
+
+Prefix with the package path for top-level refs;
+use leading-dot shorthand (`.ClassName.method()`)
+for sub-bullets where the parent module is already
+established.
+
+## Title Conventions
+
+Same verb vocabulary as commit messages:
+- `Add` — wholly new feature/API
+- `Fix` — bug fix
+- `Drop` — removal
+- `Use` — adopt new approach
+- `Move`/`Mv` — relocate code
+- `Adjust` — minor tweak
+- `Update` — enhance existing feature
+- `Support` — add support for something
+
+Target 50 chars, hard max 70. Always backtick
+code elements.
+
+## Tone
+
+Casual yet technically precise — matching the
+project's commit-msg style. Terse but every bullet
+carries signal. Use project abbreviations freely
+(msg, bg, ctx, impl, mod, obvi, fn, bc, var,
+prolly, ep, etc.).
+
+---
+
+(this format reference was generated by
+[`claude-code`][claude-code-gh])
+[claude-code-gh]: https://github.com/anthropics/claude-code
--- a/.claude/skills/run-tests/SKILL.md
+++ b/.claude/skills/run-tests/SKILL.md
@ -0,0 +1,625 @@
+---
+name: run-tests
+description: >
+  Run tractor test suite (or subsets). Use when the user wants
+  to run tests, verify changes, or check for regressions.
+argument-hint: "[test-path-or-pattern] [--opts]"
+allowed-tools:
+  - Bash(python -m pytest *)
+  - Bash(python -c *)
+  - Bash(python --version *)
+  - Bash(UV_PROJECT_ENVIRONMENT=py* uv run python *)
+  - Bash(UV_PROJECT_ENVIRONMENT=py* uv run pytest *)
+  - Bash(UV_PROJECT_ENVIRONMENT=py* uv sync *)
+  - Bash(UV_PROJECT_ENVIRONMENT=py* uv pip show *)
+  - Bash(git rev-parse *)
+  - Bash(ls *)
+  - Bash(cat *)
+  - Bash(jq * .pytest_cache/*)
+  - Read
+  - Grep
+  - Glob
+  - Task
+  - AskUserQuestion
+---
+
+Run the `tractor` test suite using `pytest`. Follow this
+process:
+
+## 1. Parse user intent
+
+From the user's message and any arguments, determine:
+
+- **scope**: full suite, specific file(s), specific
+  test(s), or a keyword pattern (`-k`).
+- **transport**: which IPC transport protocol to test
+  against (default: `tcp`, also: `uds`).
+- **options**: any extra pytest flags the user wants
+  (e.g. `--ll debug`, `--tpdb`, `-x`, `-v`).
+
+If the user provides a bare path or pattern as argument,
+treat it as the test target. Examples:
+
+- `/run-tests` → full suite
+- `/run-tests test_local.py` → single file
+- `/run-tests test_registrar -v` → file + verbose
+- `/run-tests -k cancel` → keyword filter
+- `/run-tests tests/ipc/ --tpt-proto uds` → subdir + UDS
+
+## 2. Construct the pytest command
+
+Base command:
+```
+python -m pytest
+```
+
+### Default flags (always include unless user overrides):
+- `-x` (stop on first failure)
+- `--tb=short` (concise tracebacks)
+- `--no-header` (reduce noise)
+
+### Path resolution:
+- If the user gives a bare filename like `test_local.py`,
+  resolve it under `tests/`.
+- If the user gives a subdirectory like `ipc/`, resolve
+  under `tests/ipc/`.
+- Glob if needed: `tests/**/test_*<pattern>*.py`
+
+### Key pytest options for this project:
+
+| Flag | Purpose |
+|---|---|
+| `--ll <level>` | Set tractor log level (e.g. `debug`, `info`, `runtime`) |
+| `--tpdb` / `--debug-mode` | Enable tractor's multi-proc debugger |
+| `--tpt-proto <key>` | IPC transport: `tcp` (default) or `uds` |
+| `--spawn-backend <be>` | Spawn method: `trio` (default), `mp_spawn`, `mp_forkserver` |
+| `-k <expr>` | pytest keyword filter |
+| `-v` / `-vv` | Verbosity |
+| `-s` | No output capture (useful with `--tpdb`) |
+
+### Common combos:
+```sh
+# quick smoke test of core modules
+python -m pytest tests/test_local.py tests/test_rpc.py -x --tb=short --no-header
+
+# full suite, stop on first failure
+python -m pytest tests/ -x --tb=short --no-header
+
+# specific test with debug
+python -m pytest tests/discovery/test_registrar.py::test_reg_then_unreg -x -s --tpdb --ll debug
+
+# run with UDS transport
+python -m pytest tests/ -x --tb=short --no-header --tpt-proto uds
+
+# keyword filter
+python -m pytest tests/ -x --tb=short --no-header -k "cancel and not slow"
+```
+
+## 3. Pre-flight: venv detection (MANDATORY)
+
+**Always verify a `uv` venv is active before running
+`python` or `pytest`.** This project uses
+`UV_PROJECT_ENVIRONMENT=py<MINOR>` naming (e.g.
+`py313`) — never `.venv`.
+
+### Step 1: detect active venv
+
+Run this check first:
+
+```sh
+python -c "
+import sys, os
+venv = os.environ.get('VIRTUAL_ENV', '')
+prefix = sys.prefix
+print(f'VIRTUAL_ENV={venv}')
+print(f'sys.prefix={prefix}')
+print(f'executable={sys.executable}')
+"
+```
+
+### Step 2: interpret results
+
+**Case A — venv is active** (`VIRTUAL_ENV` is set
+and points to a `py<MINOR>/` dir under the project
+root or worktree):
+
+Use bare `python` / `python -m pytest` for all
+commands. This is the normal, fast path.
+
+**Case B — no venv active** (`VIRTUAL_ENV` is empty
+or `sys.prefix` points to a system Python):
+
+Use `AskUserQuestion` to ask the user:
+
+> "No uv venv is active. Should I activate one
+> via `UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync`,
+> or would you prefer to activate your shell venv
+> first?"
+
+Options:
+1. **"Create/sync venv"** — run
+   `UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync` where
+   `<MINOR>` is detected from `python --version`
+   (e.g. `313` for 3.13). Then use
+   `py<MINOR>/bin/python` for all subsequent
+   commands in this session.
+2. **"I'll activate it myself"** — stop and let the
+   user `source py<MINOR>/bin/activate` or similar.
+
+**Case C — inside a git worktree** (`git rev-parse
+--git-common-dir` differs from `--git-dir`):
+
+Verify Python resolves from the **worktree's own
+venv**, not the main repo's:
+
+```sh
+python -c "import tractor; print(tractor.__file__)"
+```
+
+If the path points outside the worktree, create a
+worktree-local venv:
+
+```sh
+UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync
+```
+
+Then use `py<MINOR>/bin/python` for all commands.
+
+**Why this matters**: without the correct venv,
+subprocesses spawned by tractor resolve modules
+from the wrong editable install, causing spurious
+`AttributeError` / `ModuleNotFoundError`.
+
+### Fallback: `uv run`
+
+If the user can't or won't activate a venv, all
+`python` and `pytest` commands can be prefixed
+with `UV_PROJECT_ENVIRONMENT=py<MINOR> uv run`:
+
+```sh
+# instead of: python -m pytest tests/ -x
+UV_PROJECT_ENVIRONMENT=py313 uv run pytest tests/ -x
+
+# instead of: python -c 'import tractor'
+UV_PROJECT_ENVIRONMENT=py313 uv run python -c 'import tractor'
+```
+
+`uv run` auto-discovers the project and venv,
+but is slower than a pre-activated venv due to
+lock-file resolution on each invocation. Prefer
+activating the venv when possible.
+
+### Step 3: import + collection checks
+
+After venv is confirmed, always run these
+(especially after refactors or module moves):
+
+```sh
+# 1. package import smoke check
+python -c 'import tractor; print(tractor)'
+
+# 2. verify all tests collect (no import errors)
+python -m pytest tests/ -x -q --co 2>&1 | tail -5
+```
+
+If either fails, fix the import error before running
+any actual tests.
+
+### Step 4: zombie-actor / stale-registry check (MANDATORY)
+
+The tractor runtime's default registry address is
+**`127.0.0.1:1616`** (TCP) / `/tmp/registry@1616.sock`
+(UDS). Whenever any prior test run — especially one
+using a fork-based backend like `subint_forkserver` —
+leaks a child actor process, that zombie keeps the
+registry port bound and **every subsequent test
+session fails to bind**, often presenting as 50+
+unrelated failures ("all tests broken"!) across
+backends.
+
+**This has to be checked before the first run AND
+after any cancelled/SIGINT'd run** — signal failures
+in the middle of a test can leave orphan children.
+
+```sh
+# 1. TCP registry — any listener on :1616? (primary signal)
+ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 free'
+
+# 2. leftover actor/forkserver procs — scoped to THIS
+#    repo's python path, so we don't false-flag legit
+#    long-running tractor-using apps (e.g. `piker`,
+#    downstream projects that embed tractor).
+pgrep -af "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" \
+  | grep -v 'grep\|pgrep' \
+  || echo 'no leaked actor procs from this repo'
+
+# 3. stale UDS registry sockets
+ls -la /tmp/registry@*.sock 2>/dev/null \
+  || echo 'no leaked UDS registry sockets'
+```
+
+**Interpretation:**
+
+- **TCP :1616 free AND no stale sockets** → clean,
+  proceed. The actor-procs probe is secondary — false
+  positives are common (piker, any other tractor-
+  embedding app); only cleanup if `:1616` is bound or
+  sockets linger.
+- **TCP :1616 bound OR stale sockets present** →
+  surface PIDs + cmdlines to the user, offer cleanup:
+
+  ```sh
+  # 1. GRACEFUL FIRST (tractor is structured concurrent — it
+  #    catches SIGINT as an OS-cancel in `_trio_main` and
+  #    cascades Portal.cancel_actor via IPC to every descendant.
+  #    So always try SIGINT first with a bounded timeout; only
+  #    escalate to SIGKILL if graceful cleanup doesn't complete).
+  pkill -INT -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
+
+  # 2. bounded wait for graceful teardown (usually sub-second).
+  #    Loop until the processes exit, or timeout. Keep the
+  #    bound tight — hung/abrupt-killed descendants usually
+  #    hang forever, so don't wait more than a few seconds.
+  for i in $(seq 1 10); do
+    pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null || break
+    sleep 0.3
+  done
+
+  # 3. ESCALATE TO SIGKILL only if graceful didn't finish.
+  if pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null; then
+    echo 'graceful teardown timed out — escalating to SIGKILL'
+    pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
+  fi
+
+  # 4. if a test zombie holds :1616 specifically and doesn't
+  #    match the above pattern, find its PID the hard way:
+  ss -tlnp 2>/dev/null | grep ':1616'   # prints `users:(("<name>",pid=NNNN,...))`
+  # then (same SIGINT-first ladder):
+  # kill -INT <NNNN>; sleep 1; kill -9 <NNNN> 2>/dev/null
+
+  # 5. remove stale UDS sockets
+  rm -f /tmp/registry@*.sock
+
+  # 6. re-verify
+  ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free'
+  ```
+
+**Never ignore stale registry state.** If you see the
+"all tests failing" pattern — especially
+`trio.TooSlowError` / connection refused / address in
+use on many unrelated tests — check registry **before**
+spelunking into test code. The failure signature will
+be identical across backends because they're all
+fighting for the same port.
+
+**False-positive warning for step 2:** a plain
+`pgrep -af '_actor_child_main'` will also match
+legit long-running tractor-embedding apps (e.g.
+`piker` at `~/repos/piker/py*/bin/python3 -m
+tractor._child ...`). Always scope to the current
+repo's python path, or only use step 1 (`:1616`) as
+the authoritative signal.
+
+## 4. Run and report
+
+- Run the constructed command.
+- Use a timeout of **600000ms** (10min) for full suite
+  runs, **120000ms** (2min) for single-file runs.
+- If the suite is large (full `tests/`), consider running
+  in the background and checking output when done.
+- Use `--lf` (last-failed) to re-run only previously
+  failing tests when iterating on a fix.
+
+### On failure:
+- Show the failing test name(s) and short traceback.
+- If the failure looks related to recent changes, point
+  out the likely cause and suggest a fix.
+- **Check the known-flaky list** (section 8) before
+  investigating — don't waste time on pre-existing
+  timeout issues.
+- **NEVER auto-commit fixes.** If you apply a code fix
+  during test iteration, leave it unstaged. Tell the
+  user what changed and suggest they review the
+  worktree state, stage files manually, and use
+  `/commit-msg` (inline or in a separate session) to
+  generate the commit message. The human drives all
+  `git add` and `git commit` operations.
+
+### On success:
+- Report the pass/fail/skip counts concisely.
+
+## 5. Test directory layout (reference)
+
+```
+tests/
+├── conftest.py          # root fixtures, daemon, signals
+├── devx/                # debugger/tooling tests
+├── ipc/                 # transport protocol tests
+├── msg/                 # messaging layer tests
+├── discovery/           # discovery subsystem tests
+│   ├── test_multiaddr.py  # multiaddr construction
+│   └── test_registrar.py  # registry/discovery protocol
+├── test_local.py        # registrar + local actor basics
+├── test_rpc.py          # RPC error handling
+├── test_spawning.py     # subprocess spawning
+├── test_multi_program.py  # multi-process tree tests
+├── test_cancellation.py # cancellation semantics
+├── test_context_stream_semantics.py  # ctx streaming
+├── test_inter_peer_cancellation.py   # peer cancel
+├── test_infected_asyncio.py  # trio-in-asyncio
+└── ...
+```
+
+## 6. Change-type → test mapping
+
+After modifying specific modules, run the corresponding
+test subset first for fast feedback:
+
+| Changed module(s) | Run these tests first |
+|---|---|
+| `runtime/_runtime.py`, `runtime/_state.py` | `test_local.py test_rpc.py test_spawning.py test_root_runtime.py` |
+| `discovery/` (`_registry`, `_discovery`, `_addr`) | `tests/discovery/ test_multi_program.py test_local.py` |
+| `_context.py`, `_streaming.py` | `test_context_stream_semantics.py test_advanced_streaming.py` |
+| `ipc/` (`_chan`, `_server`, `_transport`) | `tests/ipc/ test_2way.py` |
+| `runtime/_portal.py`, `runtime/_rpc.py` | `test_rpc.py test_cancellation.py` |
+| `spawn/` (`_spawn`, `_entry`) | `test_spawning.py test_multi_program.py` |
+| `devx/debug/` | `tests/devx/test_debugger.py` (slow!) |
+| `to_asyncio.py` | `test_infected_asyncio.py test_root_infect_asyncio.py` |
+| `msg/` | `tests/msg/` |
+| `_exceptions.py` | `test_remote_exc_relay.py test_inter_peer_cancellation.py` |
+| `runtime/_supervise.py` | `test_cancellation.py test_spawning.py` |
+
+## 7. Quick-check shortcuts
+
+### After refactors (fastest first-pass):
+```sh
+# import + collect check
+python -c 'import tractor' && python -m pytest tests/ -x -q --co 2>&1 | tail -3
+
+# core subset (~10s)
+python -m pytest tests/test_local.py tests/test_rpc.py tests/test_spawning.py tests/discovery/test_registrar.py -x --tb=short --no-header
+```
+
+### Inspect last failures (without re-running):
+
+When the user asks "what failed?", "show failures",
+or wants to check the last-failed set before
+re-running — read the pytest cache directly. This
+is instant and avoids test collection overhead.
+
+```sh
+python -c "
+import json, pathlib, sys
+p = pathlib.Path('.pytest_cache/v/cache/lastfailed')
+if not p.exists():
+    print('No lastfailed cache found.'); sys.exit()
+data = json.loads(p.read_text())
+# filter to real test node IDs (ignore junk
+# entries that can accumulate from system paths)
+tests = sorted(k for k in data if k.startswith('tests/'))
+if not tests:
+    print('No failures recorded.')
+else:
+    print(f'{len(tests)} last-failed test(s):')
+    for t in tests:
+        print(f'  {t}')
+"
+```
+
+**Why not `--cache-show` or `--co --lf`?**
+
+- `pytest --cache-show 'cache/lastfailed'` works
+  but dumps raw dict repr including junk entries
+  (stale system paths that leak into the cache).
+- `pytest --co --lf` actually *collects* tests which
+  triggers import resolution and is slow (~0.5s+).
+  Worse, when cached node IDs don't exactly match
+  current parametrize IDs (e.g. param names changed
+  between runs), pytest falls back to collecting
+  the *entire file*, giving false positives.
+- Reading the JSON directly is instant, filterable
+  to `tests/`-prefixed entries, and shows exactly
+  what pytest recorded — no interpretation.
+
+**After inspecting**, re-run the failures:
+```sh
+python -m pytest --lf -x --tb=short --no-header
+```
+
+### Full suite in background:
+When core tests pass and you want full coverage while
+continuing other work, run in background:
+```sh
+python -m pytest tests/ -x --tb=short --no-header -q
+```
+(use `run_in_background=true` on the Bash tool)
+
+## 8. Known flaky tests
+
+These tests have **pre-existing** timing/environment
+sensitivity. If they fail with `TooSlowError` or
+pexpect `TIMEOUT`, they are almost certainly NOT caused
+by your changes — note them and move on.
+
+| Test | Typical error | Notes |
+|---|---|---|
+| `devx/test_debugger.py::test_multi_nested_subactors_error_through_nurseries` | pexpect TIMEOUT | Debugger pexpect timing |
+| `test_cancellation.py::test_cancel_via_SIGINT_other_task` | TooSlowError | Signal handling race |
+| `test_inter_peer_cancellation.py::test_peer_spawns_and_cancels_service_subactor` | TooSlowError | Async timing (both param variants) |
+| `test_docs_examples.py::test_example[we_are_processes.py]` | `assert None == 0` | `__main__` missing `__file__` in subproc |
+
+**Rule of thumb**: if a test fails with `TooSlowError`,
+`trio.TooSlowError`, or `pexpect.TIMEOUT` and you didn't
+touch the relevant code path, it's flaky — skip it.
+
+## 9. The pytest-capture hang pattern (CHECK THIS FIRST)
+
+**Symptom:** a tractor test hangs indefinitely under
+default `pytest` but passes instantly when you add
+`-s` (`--capture=no`).
+
+**Cause:** tractor subactors (especially under fork-
+based backends) inherit pytest's stdout/stderr
+capture pipes via fds 1,2. Under high-volume error
+logging (e.g. multi-level cancel cascade, nested
+`run_in_actor` failures, anything triggering
+`RemoteActorError` + `ExceptionGroup` traceback
+spew), the **64KB Linux pipe buffer fills** faster
+than pytest drains it. Subactor writes block → can't
+finish exit → parent's `waitpid`/pidfd wait blocks →
+deadlock cascades up the tree.
+
+**Pre-existing guards in the tractor harness** that
+encode this same knowledge — grep these FIRST
+before spelunking:
+
+- `tests/conftest.py:258-260` (in the `daemon`
+  fixture): `# XXX: too much logging will lock up
+  the subproc (smh)` — downgrades `trace`/`debug`
+  loglevel to `info` to prevent the hang.
+- `tests/conftest.py:316`: `# can lock up on the
+  _io.BufferedReader and hang..` — noted on the
+  `proc.stderr.read()` post-SIGINT.
+
+**Debug recipe (in priority order):**
+
+1. **Try `-s` first.** If the hang disappears with
+   `pytest -s`, you've confirmed it's capture-pipe
+   fill. Skip spelunking.
+2. **Lower the loglevel.** Default `--ll=error` on
+   this project; if you've bumped it to `debug` /
+   `info`, try dropping back. Each log level
+   multiplies pipe-pressure under fault cascades.
+3. **If you MUST use default capture + high log
+   volume**, redirect subactor stdout/stderr in the
+   child prelude (e.g.
+   `tractor.spawn._subint_forkserver._child_target`
+   post-`_close_inherited_fds`) to `/dev/null` or a
+   file.
+
+**Signature tells you it's THIS bug (vs. a real
+code hang):**
+
+- Multi-actor test under fork-based backend
+  (`subint_forkserver`, eventually `trio_proc` too
+  under enough log volume).
+- Multiple `RemoteActorError` / `ExceptionGroup`
+  tracebacks in the error path.
+- Test passes with `-s` in the 5-10s range, hangs
+  past pytest-timeout (usually 30+ s) without `-s`.
+- Subactor processes visible via `pgrep -af
+  subint-forkserv` or similar after the hang —
+  they're alive but blocked on `write()` to an
+  inherited stdout fd.
+
+**Historical reference:** this deadlock cost a
+multi-session investigation (4 genuine cascade
+fixes landed along the way) that only surfaced the
+capture-pipe issue AFTER the deeper fixes let the
+tree actually tear down enough to produce pipe-
+filling log volume. Full post-mortem in
+`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`.
+Lesson codified here so future-me grep-finds the
+workaround before digging.
+
+## 10. Reaping zombie subactors (`tractor-reap`)
+
+**Symptom:** after a `pytest` run crashes, times out,
+or is `Ctrl+C`'d, subactor forks (esp. under
+`subint_forkserver`) can be reparented to `init`
+(PPid==1) and linger. They hold onto ports, inherit
+pytest's capture-pipe fds, and flakify later
+sessions.
+
+**Two layers of defense:**
+
+### a) Session-scoped auto-fixture (always on)
+
+`tractor/_testing/pytest.py::_reap_orphaned_subactors`
+runs at pytest session teardown. It walks `/proc` for
+direct descendants of the pytest pid, SIGINTs them,
+waits up to 3s, then SIGKILLs survivors. SC-polite:
+gives the subactor runtime a chance to run its trio
+cancel shield + IPC teardown before escalation.
+
+This is *autouse* and session-scoped — you don't need
+to do anything. It just runs.
+
+### b) `scripts/tractor-reap` CLI (manual reap)
+
+For the **pytest-died-mid-session** case (Ctrl+C, OOM
+kill, hung process you had to `kill -9`), the fixture
+never ran. Reach for the CLI:
+
+```sh
+# default: orphans (PPid==1, cwd==repo, cmd contains python)
+scripts/tractor-reap
+
+# descendant-mode: from a still-live supervisor
+scripts/tractor-reap --parent <pytest-pid>
+
+# see what would be reaped, don't signal
+scripts/tractor-reap -n
+
+# tune the SIGINT → SIGKILL grace window
+scripts/tractor-reap --grace 5
+```
+
+Exit code: `0` if everyone exited on SIGINT, `1` if
+SIGKILL had to escalate — so you can chain it in CI
+health-checks (`scripts/tractor-reap || <alert>`).
+
+**What it matches** (orphan-mode):
+- `PPid == 1` (reparented to init → definitely
+  orphaned, not just a currently-running child)
+- `cwd == <repo-root>` (keeps the sweep scoped; won't
+  touch unrelated init-children elsewhere)
+- `python` in cmdline
+
+**What it does not do:** kill anything whose PPid is
+still a live tractor parent. If the parent is alive
+it's not an orphan; use `--parent <pid>` if you need
+to force-reap under a still-live supervisor.
+
+**When NOT to run it:** while a pytest session is
+active in another terminal. It's safe (won't touch
+that session's live children in orphan-mode) but can
+race if the target session is mid-teardown.
+
+### c) `--shm` / `--shm-only`: orphan-segment sweep
+
+Because `tractor.ipc._mp_bs.disable_mantracker()`
+turns off `mp.resource_tracker` (see
+`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`),
+a hard-crashing actor can leave `/dev/shm/<key>`
+segments behind that nothing else GCs.
+
+```sh
+# process reap THEN shm sweep
+scripts/tractor-reap --shm
+
+# shm sweep only (skip process phase)
+scripts/tractor-reap --shm-only
+
+# dry-run: list candidates, don't unlink
+scripts/tractor-reap --shm -n
+```
+
+**Match criteria** (very conservative — this is a
+shared-system path, can't be wrong):
+- segment is a regular file under `/dev/shm`,
+- owned by the **current uid** (`stat.st_uid`),
+- AND **no live process holds it open** —
+  enumerated by walking every readable
+  `/proc/<pid>/maps` (post-mmap mappings) AND
+  `/proc/<pid>/fd/*` (pre-mmap shm-opened fds).
+
+The "nobody has it open" check is the
+kernel-canonical "is this leaked?" test — same
+answer `lsof /dev/shm/<key>` would give. No
+reliance on tractor-specific naming, so it works
+for any tractor app. Critically, it WILL NOT touch
+segments held by other apps you have running
+(e.g. `piker`, `lttng-ust-*`, `aja-shm-*` —
+verified locally with 81 in-use segments correctly
+preserved).
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -1,10 +1,18 @@
 name: CI

+# NOTE distilled from,
+# https://github.com/orgs/community/discussions/26276
 on:
-  # any time someone pushes a new branch to origin
+  # any time a new update to 'main'
  push:
+    branches:
+      - main

-  # Allows you to run this workflow manually from the Actions tab
+  # for on all (forked) PRs to repo
+  # NOTE, use a draft PR if you just want CI triggered..
+  pull_request:
+
+  # to run workflow manually from the "Actions" tab
  workflow_dispatch:

 jobs:
@ -74,24 +82,44 @@ jobs:
  #       run: mypy tractor/ --ignore-missing-imports --show-traceback


-  testing-linux:
-    name: '${{ matrix.os }} Python ${{ matrix.python }} - ${{ matrix.spawn_backend }}'
-    timeout-minutes: 10
+  testing:
+    name: '${{ matrix.os }} Python${{ matrix.python-version }} spawn_backend=${{ matrix.spawn_backend }} tpt_proto=${{ matrix.tpt_proto }}'
+    timeout-minutes: 16
    runs-on: ${{ matrix.os }}

    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-latest]
-        python-version: ['3.13']
+        os: [
+          ubuntu-latest,
+          macos-latest,
+        ]
+        python-version: [
+          '3.13',
+          # '3.14',
+        ]
        spawn_backend: [
          'trio',
          # 'mp_spawn',
          # 'mp_forkserver',
+          # ?TODO^ is it worth it to get these running again?
+          #
+          # - [ ] next-gen backends, on 3.13+
+          #   https://github.com/goodboy/tractor/issues/379
+          # 'subinterpreter',
+          # 'subint',
        ]
+        tpt_proto: [
+          'tcp',
+          'uds',
+        ]
+        # https://github.com/orgs/community/discussions/26253#discussioncomment-3250989
+        exclude:
+          # don't do UDS run on macOS (for now)
+          - os: macos-latest
+            tpt_proto: 'uds'

    steps:
-
      - uses: actions/checkout@v4

      - name: 'Install uv + py-${{ matrix.python-version }}'
@ -118,7 +146,15 @@ jobs:
        run: uv tree

      - name: Run tests
-        run: uv run pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} -rsx
+        run: >
+          uv run
+          pytest
+          tests/
+          -rsx
+          --spawn-backend=${{ matrix.spawn_backend }}
+          --tpt-proto=${{ matrix.tpt_proto }}
+          --capture=fd
+        # ^XXX^ can't work with --spawn-method=main_thread_forkserver

  # XXX legacy NOTE XXX
  #
--- a/.gitignore
+++ b/.gitignore
@ -102,3 +102,69 @@ venv.bak/

 # mypy
 .mypy_cache/
+
+# all files under
+.git/
+
+# require very explicit staging for anything we **really**
+# want put/kept in repo.
+notes_to_self/
+snippets/
+
+# ------- AI shiz -------
+# `ai.skillz` symlinks,
+# (machine-local, deploy via deploy-skill.sh)
+.claude/skills/py-codestyle
+.claude/skills/close-wkt
+.claude/skills/plan-io
+.claude/skills/prompt-io
+.claude/skills/resolve-conflicts
+.claude/skills/inter-skill-review
+
+# /open-wkt specifics
+.claude/skills/open-wkt
+.claude/wkts/
+claude_wkts
+
+# /code-review-changes specifics
+.claude/skills/code-review-changes
+# review-skill ephemeral ctx (per-PR, single-use)
+.claude/review_context.md
+.claude/review_regression.md
+
+# /pr-msg specifics
+.claude/skills/pr-msg/*
+# repo-specific
+!.claude/skills/pr-msg/format-reference.md
+# XXX, so u can nvim-telescope this file.
+# !.claude/skills/pr-msg/pr_msg_LATEST.md
+
+# /commit-msg specifics
+# - any commit-msg gen tmp files
+.claude/*_commit_*.md
+.claude/*_commit*.txt
+.claude/skills/commit-msg/*
+!.claude/skills/commit-msg/style-duie-reference.md
+
+# use prompt-io instead?
+.claude/plans
+
+# nix develop --profile .nixdev
+.nixdev*
+
+# :Obsession .
+Session.vim
+
+# `gish` local `.md`-files
+# TODO? better all around automation!
+# -[ ] it'd be handy to also commit and sync with wtv git service?
+# -[ ] everything should be put under a `.gish/` no?
+gitea/
+gh/
+
+# ------ macOS ------
+# Finder metadata
+**/.DS_Store
+
+# LLM conversations that should remain private
+docs/conversations/
--- a/ai/conc-anal/fork_thread_semantics_execution_vs_memory.md
+++ b/ai/conc-anal/fork_thread_semantics_execution_vs_memory.md
@ -0,0 +1,281 @@
+# `fork()` in a multi-threaded program — execution-side vs. memory-side of the same coin
+
+A reference doc for readers who've encountered one of two
+opposite-sounding framings of POSIX `fork()` semantics in a
+multi-threaded program and are confused by the other.
+
+This is a sibling to
+`subint_fork_blocked_by_cpython_post_fork_issue.md` — that
+doc covers a CPython-level refusal of fork-from-subint;
+this one covers the more general POSIX layer, since
+tractor's main-thread forkserver design rests on it.
+
+## TL;DR
+
+POSIX `fork()` only preserves the *calling* thread as a
+runnable thread in the child — every other thread in the
+parent simply never executes another instruction in the
+child. trio's docs call this "leaked"; tractor's
+`_main_thread_forkserver.py` docstring calls it "gone".
+Both are correct: "gone" is the *execution* side (no
+scheduler entry, no instructions retired), "leaked" is the
+*memory* side (the dead threads' stacks and per-thread
+heap structures still ride into the child's address space
+as orphaned COW pages with no owner and no cleanup hook).
+Same POSIX reality, two halves of the same coin.
+
+## The two framings
+
+[python-trio/trio#1614][trio-1614] (the canonical "trio +
+fork" hazards thread) puts it this way:
+
+> If you use `fork()` in a process with multiple threads,
+> all the other thread stacks are just leaked: there's
+> nothing else you can reasonably do with them.
+
+`tractor.spawn._main_thread_forkserver`'s module docstring
+(specifically the "What survives the fork? — POSIX
+semantics" section) puts it this way:
+
+> POSIX `fork()` only preserves the *calling* thread as a
+> runnable thread in the child. Every other thread in the
+> parent — trio's runner thread, any `to_thread` cache
+> threads, anything else — never executes another
+> instruction post-fork.
+
+A reader bouncing between the two can be forgiven for
+asking: well, *which* is it — leaked or gone?
+
+The answer is "yes". They're describing the same POSIX
+behavior from two different angles:
+
+- trio is talking about the **bytes** the dead threads
+  leave behind — stacks, TLS slots, per-thread arena
+  metadata — and the fact that nothing in the child can
+  drive them forward, free them, or even safely walk
+  them. That's a memory leak in the strict sense: held
+  but unreachable.
+- tractor is talking about the **execution** side
+  relevant to the forkserver design: which threads
+  retire instructions in the child? Exactly one — the
+  one that called `fork()`. Everything else, regardless
+  of the bytes left behind, is dead in a scheduler
+  sense.
+
+Neither framing is wrong; they're just answering
+different questions.
+
+## POSIX `fork()` in a multi-threaded program — what actually happens
+
+Per POSIX (and concretely on Linux glibc), the contract
+of `fork()` in a multi-threaded process is:
+
+1. The kernel creates a new process whose virtual
+   address space is a COW copy of the parent's. *All*
+   pages map across — code, heap, every thread's stack,
+   every malloc arena, every mmap region.
+2. Of the parent's N threads, exactly **one** is
+   reified in the child as a runnable kernel task: the
+   thread that called `fork()`. The other N-1 threads
+   have *no* corresponding task in the child kernel. They
+   were never scheduled, never `clone()`d for the child,
+   never exist as runnable entities.
+3. Their **memory artifacts** — pthread stacks, TLS,
+   `pthread_t` structures, glibc per-thread arena
+   bookkeeping — are still mapped in the child's address
+   space, because (1) duplicates *everything* page-wise.
+   They sit there as inert COW bytes.
+4. The kernel does not clean those bytes up. There is no
+   "phantom-thread cleanup" pass post-fork. The kernel
+   doesn't know which mapped pages "belonged to" which
+   thread — at the kernel level mappings are
+   process-scoped, not thread-scoped.
+5. The surviving thread (the caller of `fork()`) cannot
+   safely access those leaked bytes either. Any state
+   they encoded — held mutexes, in-flight syscalls,
+   half-updated invariants — is frozen at whatever
+   instant the parent's fork-syscall observed it. Some
+   of those mutexes may even still be locked from the
+   child's POV (the canonical "fork-in-multithreaded-
+   program-deadlocks" hazard; see `man pthread_atfork`).
+
+So: from the kernel's PoV, the child has one thread.
+From the address-space's PoV, the child has all the
+parent's bytes — including the corpses of the N-1 dead
+threads' stacks. Both true simultaneously.
+
+## Why trio says "leaked"
+
+trio's framing makes sense from the parent's
+PoV, looking at *what those threads were doing*. In a
+running `trio.run()` process you typically have:
+
+- The trio runner thread itself — owns the `selectors`
+  epoll fd, the signal-wakeup-fd, the run-queue.
+- Threadpool worker threads (`trio.to_thread`'s cache)
+  — blocked in `wait()` on the threadpool's work
+  condvar.
+- Whatever other ad-hoc threads the application
+  started.
+
+Each of those threads owns *real work-state*: epoll
+registrations, file descriptors held in
+soon-to-be-completed reads, half-released locks, posted
+but unconsumed wakeups. After fork, that state is still
+encoded in the child's memory. None of it is invalid in
+a well-formed-bytes sense. It's just that:
+
+- The thread that was driving it is gone.
+- Nothing else in the child knows the layout well
+  enough to take over.
+- Even if it did, the kernel objects backing the work
+  (epoll fd, signalfd) have separate post-fork
+  semantics that don't compose with userland trio
+  state.
+
+So the bytes are *held* (they're in the child's
+address space, they count against RSS, they survive
+until something clobbers them), and they're
+*unreachable* in any meaningful sense — no thread can
+safely drive them forward. That is the textbook
+definition of a leak.
+
+trio's quote is reminding the user that `fork()` from a
+multi-threaded process is a one-way memory hazard:
+whatever those threads were doing, that work-state is
+now garbage you happen to still be carrying.
+
+## Why tractor says "gone"
+
+tractor's `_main_thread_forkserver` framing is concerned
+with a different question: *which thread executes in the
+child, and is it safe?*
+
+The forkserver design rests on POSIX's "calling thread
+is the sole survivor" guarantee. We pick that calling
+thread very deliberately: a dedicated worker that has
+provably never entered trio. So the thread that *does*
+run in the child is one whose locals, TLS, and stack
+contain nothing trio-related. Trio's runner thread —
+the one that owned the epoll fd and the run-queue — is
+*gone* from the child in the execution sense. It will
+never run another instruction. The fact that its stack
+bytes still exist in the child's address space (the
+"leaked" view) is irrelevant to the forkserver, because
+nothing in the child reads or writes those pages.
+
+So when the docstring says "Every other thread … is
+gone the instant `fork()` returns in the child", it's
+being precise about the surface that matters for the
+backend: scheduler-level liveness. Nothing schedules
+those threads ever again. Whether their bytes are
+hanging around is a separate (and, for the design,
+non-load-bearing) fact.
+
+## Cross-table
+
+The same tabular layout the `_main_thread_forkserver`
+docstring uses, expanded with a fourth "what handles
+it" column:
+
+| thread              | parent    | child (executing) | child (memory)               | what handles it             |
+|---------------------|-----------|-------------------|------------------------------|-----------------------------|
+| forkserver worker   | continues | sole survivor     | live stack                   | runs the child's bootstrap  |
+| `trio.run()` thread | continues | not running       | leaked stack (zombie bytes)  | overwritten by child's fresh `trio.run()` |
+| any other thread    | continues | not running       | leaked stack (zombie bytes)  | overwritten / GC'd / clobbered by `exec()` if used |
+
+The "child (executing)" column is the *execution* side
+of the coin — what tractor cares about. The "child
+(memory)" column is the *memory* side — what trio
+cares about.
+
+The "what handles it" column is the deliberate punchline
+of the design: nothing has to handle the leaked bytes
+*explicitly*. They get clobbered by ordinary forward
+progress in the child:
+
+- The fresh `trio.run()` the child boots up allocates
+  its own stack, scheduler, and run-queue, which over
+  time overlaps and overwrites the inherited zombie
+  pages.
+- Python's GC walks live objects only; the dead-thread
+  Python frames aren't reachable from any
+  `PyThreadState`, so they get freed at the next
+  collection cycle.
+- If the child eventually `exec()`s, the entire address
+  space is replaced and the leak vanishes.
+
+## What this means for the forkserver design
+
+The crucial point is that **the design doesn't and
+*can't* prevent the leak**. There is no userland fix
+for COW thread stacks. The kernel hands the child a
+duplicated address space; that's what `fork()` *is*. No
+amount of pre-fork hookery, `pthread_atfork()`
+gymnastics, or post-fork cleanup can un-COW the dead
+threads' pages without unmapping them, and unmapping
+arbitrary regions of a duplicated address space is
+neither portable nor safe.
+
+What the design *does* ensure is the orthogonal
+property: the survivor thread is one that doesn't need
+any of that leaked state to function. Concretely:
+
+- Survivor is the forkserver worker thread.
+- That worker has provably never imported, called into,
+  or held any reference to `trio`. (Enforced by keeping
+  the worker's lifecycle entirely in
+  `_main_thread_forkserver.py` and never letting trio
+  task-state cross into it.)
+- So the leaked pages — trio runner stack, threadpool
+  caches, etc. — are inert relative to the survivor.
+  No code path in the child references them.
+- The child then boots its own fresh `trio.run()`,
+  which allocates new state in new pages. Over the
+  child's lifetime the COW'd zombie pages get
+  overwritten, GC'd, or (if the child eventually
+  `exec()`s) discarded wholesale.
+
+The "leak" is real but inert. It costs RSS until
+clobbered; it doesn't cost correctness. That's exactly
+the property the forkserver pattern is built on, and
+it's also why the design needs the "calling thread is
+trio-free" precondition to be airtight: if the survivor
+were a trio thread, it *would* try to drive the leaked
+trio state, and the leak would no longer be inert.
+
+## See also
+
+- `tractor/spawn/_main_thread_forkserver.py` — module
+  docstring's "What survives the fork? — POSIX
+  semantics" section is the in-tree, code-adjacent
+  prose this doc expands on. The cross-table here is a
+  fourth-column expansion of the table there.
+
+- [python-trio/trio#1614][trio-1614] — the trio issue
+  with the "leaked" framing, and the canonical thread
+  for trio + `fork()` hazards more broadly.
+
+- [`subint_fork_blocked_by_cpython_post_fork_issue.md`](./subint_fork_blocked_by_cpython_post_fork_issue.md)
+  — sibling analysis covering CPython's *post-fork*
+  hooks (`PyOS_AfterFork_Child`,
+  `_PyInterpreterState_DeleteExceptMain`) and why
+  fork-from-non-main-subint is a CPython-level hard
+  refusal. Complementary axis: this doc is about POSIX
+  semantics; that doc is about the CPython runtime
+  layer that runs *after* POSIX `fork()` returns in
+  the child.
+
+- `man pthread_atfork(3)` — canonical "fork in a
+  multithreaded process is dangerous" reference.
+  Especially the rationale section, which is the
+  closest thing to a normative statement of "the
+  surviving thread cannot safely use anything the dead
+  threads were touching."
+
+- `man fork(2)` (Linux) — "Other than [the calling
+  thread], … no other threads are replicated …"
+  paragraph is the kernel-side statement of the
+  execution-side framing this doc opens with.
+
+[trio-1614]: https://github.com/python-trio/trio/issues/1614
--- a/ai/conc-anal/spawn_time_boot_death_dup_name_issue.md
+++ b/ai/conc-anal/spawn_time_boot_death_dup_name_issue.md
@ -0,0 +1,142 @@
+# Spawn-time boot-death (`rc=2`) under rapid same-name spawn against a registrar
+
+## Symptom
+
+Spawning N (≥4) sub-actors with the **same name** in tight
+succession against a daemon registrar surfaces as
+`ActorFailure: Sub-actor (...) died during boot (rc=2)
+before completing parent-handshake`.
+
+```
+tests/discovery/test_multi_program.py
+  ::test_dup_name_cancel_cascade_escalates_to_hard_kill[n_dups=4]
+```
+
+```
+tractor._exceptions.ActorFailure:
+  Sub-actor ('doggy', '<uuid>') died during boot (rc=2)
+  before completing parent-handshake.
+    proc: <_ForkedProc pid=<n> returncode=None>
+```
+
+The `proc` repr shows `returncode=None` because the repr is
+captured before `proc.wait()` returns; the actual
+`os.WEXITSTATUS == 2` is reported via `result['died']` in the
+race-helper.
+
+## When it surfaces
+
+- N=2 (`n_dups=2`): **always passes**.
+- N=4 (`n_dups=4`): **consistent fail** under both `tpt-proto=tcp`
+  and `tpt-proto=uds`, MTF backend.
+- N=8 (`n_dups=8`): **passes** (counter-intuitive — see "racing
+  windows").
+- Non-MTF backends: not yet exercised systematically.
+
+## What previously masked it
+
+Pre the spawn-time `wait_for_peer_or_proc_death` race-helper
+(in `tractor.spawn._spawn`), the parent's `start_actor` flow
+ended with a bare:
+
+```python
+event, chan = await ipc_server.wait_for_peer(uid)
+```
+
+That awaits an unsignalled `trio.Event` on `_peer_connected[uid]`.
+If the sub-actor process **dies during boot** (before its
+runtime executes the parent-callback handshake that sets the
+event), the wait parks forever. The dead proc becomes a zombie
+because no one ever calls `proc.wait()` to reap it.
+
+In test contexts the failure presented as a hang or a much
+later `trio.TooSlowError` from an outer `fail_after`. In
+production it'd present as a parent that never makes progress
+past `start_actor`. The death itself was silently masked.
+
+## What surfaces it now
+
+`tractor.spawn._spawn.wait_for_peer_or_proc_death` (used by
+`_main_thread_forkserver_proc`) races the handshake-wait
+against `proc.wait()`. The race-helper raises `ActorFailure`
+on death-first instead of parking, exposing the rc=2.
+
+## Hypothesis: registrar-side same-name contention
+
+The test spawns N actors with name `doggy` sequentially:
+
+```python
+for i in range(n_dups):
+    p: Portal = await an.start_actor('doggy')
+    portals.append(p)
+```
+
+Each spawned doggy:
+
+1. Forks via the forkserver.
+2. Boots its runtime in `_actor_child_main`.
+3. Connects back to the parent for handshake.
+4. Connects to the daemon registrar to call `register_actor`.
+5. Enters its RPC msg-loop.
+
+Step (4) is where the same-name contention lives. The
+registrar's `register_actor` (in
+`tractor.discovery._registry`) accepts duplicate names
+(stores `(name, uuid) -> addr`), but its internal bookkeeping
+may have a non-trivial check (e.g. `wait_for_actor` resolution,
+`_addrs2aids` map updates) that errors out under specific
+ordering between the existing entry and the incoming one.
+
+`rc=2 == os.WEXITSTATUS == 2` corresponds to `sys.exit(2)`
+in the doggy process — typically reached via an unhandled
+exception that's translated to exit code 2 by Python's top-
+level (e.g. `argparse` errors use 2; `SystemExit(2)` etc.).
+So the doggy is hitting an explicit exit path during
+`register_actor` or just-after.
+
+The non-monotonic shape (N=2 OK, N=4 BAD, N=8 OK) suggests a
+specific timing window — likely "the 3rd register-RPC arrives
+while the 1st-or-2nd is in some intermediate state". With
+N=8, the additional procs widen the registration spread
+enough that no two land in the conflicting window.
+
+## Where to dig next
+
+- Add per-actor logging in `_actor_child_main` and
+  `register_actor` to surface the actual exception that
+  triggers the rc=2 exit. Currently the doggy dies before
+  the parent ever sees its stderr (forkserver doesn't
+  marshal child stdio back).
+- Race-test the registrar's `register_actor` /
+  `unregister_actor` /  `wait_for_actor` against same-name
+  concurrent calls in isolation (no spawn).
+- Consider whether `register_actor` should be idempotent
+  under same-name re-register or should explicitly reject
+  same-name (and ideally with a clear `RemoteActorError`,
+  not `sys.exit(2)`).
+
+## Test-suite handling
+
+Currently:
+
+- `tests/discovery/test_multi_program.py
+  ::test_dup_name_cancel_cascade_escalates_to_hard_kill[n_dups=4]`
+  is `pytest.mark.xfail(strict=False, reason=...)` to keep
+  the suite green while this issue is investigated.
+- `n_dups=2` and `n_dups=8` continue to validate the
+  cancel-cascade hard-kill escalation.
+
+Once the underlying race is understood + fixed, drop the
+xfail.
+
+## Related work
+
+- The cancel-cascade fix that introduced this regression
+  test:
+  `tractor/_exceptions.py:ActorTooSlowError`,
+  `tractor/runtime/_supervise.py:_try_cancel_then_kill`,
+  `tractor/runtime/_portal.py:Portal.cancel_actor(
+   raise_on_timeout=...)`.
+- The spawn-time death-detection that exposed this:
+  `tractor/spawn/_spawn.py:wait_for_peer_or_proc_death`,
+  used by `tractor/spawn/_main_thread_forkserver.py`.
--- a/ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md
+++ b/ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md
@ -0,0 +1,273 @@
+# `test_register_duplicate_name` racy connect-failure on `daemon` fixture readiness
+
+## Symptom
+
+`tests/test_multi_program.py::test_register_duplicate_name`
+fails intermittently under BOTH transports + ALL spawn
+backends with connect-refused errors:
+
+```
+# under --tpt-proto=uds
+FAILED tests/test_multi_program.py::test_register_duplicate_name
+- ConnectionRefusedError: [Errno 111] Connection refused
+( ^^^ this exc was collapsed from a group ^^^ )
+
+# under --tpt-proto=tcp
+FAILED tests/test_multi_program.py::test_register_duplicate_name
+- OSError: all attempts to connect to 127.0.0.1:36003 failed
+( ^^^ this exc was collapsed from a group ^^^ )
+```
+
+Distinct from the cancel-cascade `TooSlowError` flake
+class — see
+`cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
+This is a **connect-time race** before the daemon is
+fully ready to `accept()`, not a teardown-cascade
+slowness.
+
+## Root cause: blind `time.sleep()` in `daemon` fixture
+
+`tests/conftest.py::daemon` boots a sub-py-process via
+`subprocess.Popen([python, '-c', 'tractor.run_daemon(...)'])`,
+then **blindly sleeps** a fixed delay before yielding
+`proc` to the test:
+
+```python
+# excerpt from tests/conftest.py::daemon
+proc = subprocess.Popen([
+    sys.executable, '-c', code,
+])
+
+bg_daemon_spawn_delay: float = _PROC_SPAWN_WAIT  # 0.6
+if tpt_proto == 'uds':
+    bg_daemon_spawn_delay += 1.6
+if _non_linux and ci_env:
+    bg_daemon_spawn_delay += 1
+
+# XXX, allow time for the sub-py-proc to boot up.
+# !TODO, see ping-polling ideas above!
+time.sleep(bg_daemon_spawn_delay)
+
+assert not proc.returncode
+yield proc
+```
+
+Inherent fragility: the delay is "long enough on dev
+boxes most of the time" but has no actual
+synchronization with the daemon's `bind()` + `listen()`
+completion. Under any of:
+
+- Loaded box (CI parallelism, big rebuild in
+  background, low-cpu-freq)
+- Cold first-run (`importlib` cache miss, JIT warmup)
+- Higher-than-expected `tractor` import cost
+- Filesystem latency (UDS sockfile create, slow
+  tmpfs)
+
+...the sleep finishes BEFORE the daemon has bound its
+listen socket → first test client call to
+`tractor.find_actor()` / `wait_for_actor()` /
+`open_nursery(registry_addrs=[reg_addr])`'s implicit
+connect → `ConnectionRefusedError` (TCP) or
+`FileNotFoundError`/`ConnectionRefusedError` (UDS).
+
+## Reproducer
+
+Easiest: run the suite under load.
+
+```bash
+# create CPU pressure on another core in parallel
+stress-ng --cpu 2 --timeout 600s &
+
+./py313/bin/python -m pytest \
+  tests/test_multi_program.py::test_register_duplicate_name \
+  --spawn-backend=main_thread_forkserver \
+  --tpt-proto=tcp -v
+```
+
+Reproduces ~30-50% of the time on a dev laptop. On a
+quiet idle box, may need 5-10 runs to hit.
+
+## Why the existing `_PROC_SPAWN_WAIT` tuning is
+inadequate
+
+Recent `bg_daemon_spawn_delay` rename
+(de-monotonic-grow fix) just-shipped removed the
+*accumulation* bug where each invocation made the
+NEXT test's wait longer too. Net effect: every
+invocation now uses the SAME `0.6 + 1.6` (UDS) or
+`0.6` (TCP) sleep, no growth. Good — but does
+NOTHING for the underlying race. Each individual
+test still relies on a blind sleep that may or may
+not be sufficient.
+
+Bumping the constant higher pushes flake rate down
+but never to zero AND adds dead time to every
+non-flaking run. Not a fix, just a knob.
+
+## Side effects
+
+- **Inter-test cascade**: a single failure can cascade
+  via leaked subprocesses (the `daemon` fixture's
+  cleanup may not fully tear down a daemon that never
+  reached "ready"). The `_reap_orphaned_subactors`
+  session-end + `_track_orphaned_uds_per_test`
+  per-test fixtures handle most of this now, but the
+  affected test itself still fails.
+- **Worsens under fork-spawn backends**: the daemon
+  has more init work
+  (`_main_thread_forkserver`-coordinator-thread
+  startup, etc.) so the sleep has to cover MORE.
+
+## Fix design — replace blind sleep with active poll
+
+The right primitive is **poll the daemon's bind
+address until it accepts a connection or we time
+out**, with the timeout being a hard ceiling rather
+than a baseline. Two implementation paths:
+
+### Path A — TCP/UDS connect-poll loop
+
+Try `socket.connect(reg_addr)` in a tight loop with
+short backoff (~50ms), succeed on the first non-error
+return, fail-loud on a hard cap (e.g. 10s). Same
+primitive works for both transports because both use
+`socket.connect()` semantics.
+
+Rough shape:
+
+```python
+def _wait_for_daemon_ready(
+    reg_addr,
+    tpt_proto: str,
+    timeout: float = 10.0,
+    poll_interval: float = 0.05,
+) -> None:
+    deadline = time.monotonic() + timeout
+    while True:
+        if tpt_proto == 'tcp':
+            sock = socket.socket(socket.AF_INET)
+            target = reg_addr  # (host, port)
+        else:  # uds
+            sock = socket.socket(socket.AF_UNIX)
+            target = os.path.join(*reg_addr)
+        try:
+            sock.settimeout(poll_interval)
+            sock.connect(target)
+        except (
+            ConnectionRefusedError,
+            FileNotFoundError,
+            socket.timeout,
+        ) as exc:
+            if time.monotonic() >= deadline:
+                raise TimeoutError(
+                    f'Daemon never accepted on {target!r} '
+                    f'within {timeout}s'
+                ) from exc
+            time.sleep(poll_interval)
+        else:
+            sock.close()
+            return
+```
+
+Pros: trivial primitive, no tractor-runtime
+dependency, works pre-yield in the fixture body,
+fail-fast on truly-broken daemon.
+Cons: doesn't actually do an IPC handshake, just
+proves listen-side is up. A daemon that bound but
+hasn't initialized its registrar table yet would
+still race.
+
+### Path B — `tractor.find_actor()` poll
+
+Use the actual discovery API the test would call:
+
+```python
+async def _wait_for_daemon_ready_via_discovery(
+    reg_addr,
+    timeout: float = 10.0,
+    poll_interval: float = 0.05,
+):
+    deadline = trio.current_time() + timeout
+    async with tractor.open_root_actor(
+        registry_addrs=[reg_addr],
+        # ephemeral root just for the probe
+    ):
+        while True:
+            try:
+                async with tractor.find_actor(
+                    'registrar',  # daemon's own name
+                    registry_addrs=[reg_addr],
+                ) as portal:
+                    if portal is not None:
+                        return
+            except Exception:
+                pass
+            if trio.current_time() >= deadline:
+                raise TimeoutError(...)
+            await trio.sleep(poll_interval)
+```
+
+Pros: actually proves the discovery path works,
+handles the "bound but not ready" case naturally.
+Cons: requires booting an ephemeral root actor JUST
+for the probe (overhead), more code, and runs in trio
+which complicates the sync-fixture context. Need a
+`trio.run()` wrapper.
+
+### Recommended: Path A with optional handshake check
+
+Path A is much simpler + handles 95% of the bug
+class. If "bound-but-not-ready" turns out to still
+race (it shouldn't — `tractor.run_daemon` doesn't
+return from `bind()` until the registrar is
+fully populated), escalate to Path B as a focused
+follow-up.
+
+## Workarounds (until fix lands)
+
+1. **Bump `_PROC_SPAWN_WAIT`** higher (current: 0.6).
+   2.0–3.0 hides most flakes at the cost of adding
+   dead time to every test. Not a fix but reduces
+   blast radius while the proper poll lands.
+2. **`pytest-rerunfailures`** with `reruns=1` on the
+   `daemon` fixture's tests specifically. Hides the
+   flake but doesn't address it.
+3. **Mark known-affected tests as `xfail(strict=False)`**
+   under `--ci`. Lets CI go green at the cost of
+   silently hiding regressions.
+
+(Recommend skipping all three — implement the active
+poll instead.)
+
+## Investigation next steps
+
+1. Implement Path A as a `_wait_for_daemon_ready()`
+   helper in `tests/conftest.py`. Replace the
+   `time.sleep(bg_daemon_spawn_delay)` call with it.
+2. Drop the `_PROC_SPAWN_WAIT` constant entirely
+   (active poll obsoletes blind sleep).
+3. Run the suite 5-10 times to validate flake rate
+   drops to 0.
+4. If flakes persist, profile whether the daemon
+   process exits with non-zero before the poll's
+   deadline hits — that'd be a different bug
+   (daemon startup crash) that the blind sleep was
+   masking.
+5. Cross-check `tests/test_multi_program.py::test_*`
+   — multiple tests use the `daemon` fixture; all
+   should benefit from the same poll primitive.
+
+## Related
+
+- `tests/conftest.py::daemon` — the fixture under
+  fix
+- `tests/conftest.py::_PROC_SPAWN_WAIT` — the
+  constant to drop
+- `cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`
+  — distinct flake class (cancel-cascade
+  `TooSlowError` at teardown, not connect-time race)
+- `trio_wakeup_socketpair_busy_loop_under_fork_issue.md`
+  — different bug entirely; this race was masked
+  pre-WakeupSocketpair-patch by the busy-loop
+  hangs.
--- a/ai/conc-anal/trio_033_cancel_cascade_slowdown_depth3_issue.md
+++ b/ai/conc-anal/trio_033_cancel_cascade_slowdown_depth3_issue.md
@ -0,0 +1,102 @@
+# `trio` 0.29 -> 0.33 slows the depth=3 cancel-cascade
+
+## Symptom
+
+After locking to `trio==0.33.0` (commit `c7741bba`, was
+`0.29.0`), this test reliably trips its `fail_after`
+deadline on the **`trio`** backend:
+
+```
+FAILED tests/test_cancellation.py::test_nested_multierrors[start_method=trio-depth=3]
+  - AssertionError: assert False
+    where False = isinstance(
+      Cancelled(source='deadline', source_task=None, reason=None),
+      tractor.RemoteActorError,
+    )
+```
+
+A `fail_after_w_trace` hang-snapshot is captured for the
+test each run (deadline-injected `Cancelled` wrapped into
+the actor-nursery `BaseExceptionGroup`).
+
+## Root cause (immediate)
+
+The test budgets `fail_after(6)` for the `trio` backend.
+That 6s was chosen (commit `32955db0`, while `trio==0.29`)
+with the assertion that trio finishes "well under" 6s.
+The `trio` 0.29 -> 0.33 bump slowed the depth=3 cascade
+past that budget, so the 6s deadline now fires mid-cascade.
+
+trio 0.33 added **cancel-reason tracking** — every
+`Cancelled` now carries `(source=, reason=, source_task=)`.
+The injected exc is `Cancelled(source='deadline')`, i.e.
+trio itself naming our `fail_after(6)` scope as the cancel
+origin. When that `Cancelled` collapses one branch of the
+nursery BEG, the test's `isinstance(subexc,
+RemoteActorError)` assertion fails. The healthy outcome is
+`BEG = [RemoteActorError, RemoteActorError]`; the
+`Cancelled` is purely an artifact of the deadline cutting
+the cascade short.
+
+## Measurements (standalone, this machine)
+
+```
+depth=1  trio   ~3.15s   PASS  (keeps 6s budget)
+depth=3  trio   ~6.8-8.2s  FAIL @ 6s  (now bumped to 12s)
+```
+
+depth=1 still fits comfortably; only depth=3 (deeper
+recursive spawn-and-error tree => more actors to reap)
+exceeds the old budget. The ~2s/depth-level cost looks
+like serialized per-actor reap / `terminate_after` waits.
+
+## Mitigation applied
+
+`test_nested_multierrors` now splits the `trio` budget:
+
+```python
+case ('trio', 1):
+    timeout = 6
+case ('trio', 3):
+    timeout = 12   # was 6; see this doc
+```
+
+This stops the deadline from firing so the cascade
+completes naturally to `[RAE, RAE]`.
+
+## Also affected — same root cause, different test
+
+`test_echoserver_detailed_mechanics[trio-raise_error=KeyboardInterrupt]`
+(`tests/test_infected_asyncio.py`) tripped the *same*
+slowdown via its much tighter `trio` budget of `1s`. The
+single-aio-subactor teardown now takes ~1s, so the `1s`
+`fail_after` raced the deadline (PASS at 0.99s / FAIL at
+1.03s across back-to-back standalone runs). On a deadline-
+fire the injected `Cancelled(source='deadline')` wraps the
+mid-stream `KeyboardInterrupt` into a `BaseExceptionGroup`,
+which is NOT a `KeyboardInterrupt` so the bare
+`pytest.raises(KeyboardInterrupt)` fails. (The sibling
+`raise_error=Exception` variant only "passes" by accident:
+an `ExceptionGroup` *is-a* `Exception`, so its
+`pytest.raises(Exception)` still matches even when wrapped.)
+
+Mitigation: bump that `trio` budget `1 -> 4s` (matching the
+forking-spawner case). Without a deadline-fire the KBI
+propagates bare and the assertion passes.
+
+## Open follow-up (the actual regression)
+
+The budget bump is a band-aid — the underlying question is
+**why** the depth=3 `trio` cancel-cascade went from <6s to
+~7-8s across `trio` 0.29 -> 0.33. Candidate avenues:
+
+- which scope owns the per-actor `terminate_after` wait,
+  and are the tree's reaps concurrent or serialized?
+- did trio 0.33's abort/reschedule or cancel-reason
+  bookkeeping change checkpoint timing on the cancel path?
+
+If/when the cascade speeds back up under-budget, depth=3
+will start completing well under 12s — at which point the
+budget can be tightened back toward 6s as a regression
+tripwire. Related (different backend, same cascade class):
+`cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
--- a/ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md
+++ b/ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md
@ -0,0 +1,221 @@
+# trio `WakeupSocketpair.drain()` busy-loop in forked child (peer-closed missed-EOF)
+
+## Reproducer
+
+```bash
+./py313/bin/python -m pytest \
+  tests/test_multi_program.py::test_register_duplicate_name \
+  --tpt-proto=tcp \
+  --spawn-backend=main_thread_forkserver \
+  -v --capture=sys
+```
+
+Subactor pegs a CPU core indefinitely; parent test
+hangs waiting for the subactor.
+
+## Empirical evidence (caught alive)
+
+```
+$ sudo strace -p <subactor-pid>
+recvfrom(6, "", 65536, 0, NULL, NULL)   = 0
+recvfrom(6, "", 65536, 0, NULL, NULL)   = 0
+recvfrom(6, "", 65536, 0, NULL, NULL)   = 0
+... (no `epoll_wait`, no other syscalls, just this back-to-back)
+```
+
+Pattern: tight C-level `recvfrom` loop returning 0
+each call. No `epoll_wait` between iterations →
+**not trio's task scheduler**. Pure synchronous C
+loop.
+
+```
+$ sudo readlink /proc/<subactor-pid>/fd/6
+socket:[<inode>]
+
+$ sudo lsof -p <subactor-pid> | grep ' 6u'
+<cmd> <pid> goodboy 6u unix 0xffff... 0t0 <inode> type=STREAM (CONNECTED)
+```
+
+fd=6 is an **AF_UNIX socket** in CONNECTED state.
+Even though the test uses `--tpt-proto=tcp`, this fd
+is NOT a tractor IPC channel — it's an internal
+trio socketpair.
+
+## Root-cause: `WakeupSocketpair.drain()`
+
+`/site-packages/trio/_core/_wakeup_socketpair.py`:
+
+```python
+class WakeupSocketpair:
+    def __init__(self) -> None:
+        self.wakeup_sock, self.write_sock = socket.socketpair()
+        self.wakeup_sock.setblocking(False)
+        self.write_sock.setblocking(False)
+        ...
+
+    def drain(self) -> None:
+        try:
+            while True:
+                self.wakeup_sock.recv(2**16)
+        except BlockingIOError:
+            pass
+```
+
+`socket.socketpair()` on Linux defaults to AF_UNIX
+SOCK_STREAM. Both ends non-blocking. Normal flow:
+
+1. Signal/wake event → `write_sock.send(b'\x00')`
+   queues a byte.
+2. `wakeup_sock` becomes readable → trio's epoll
+   triggers.
+3. Trio calls `drain()` to flush the buffer.
+4. drain loops on `wakeup_sock.recv(64KB)`.
+5. Eventually buffer empty → non-blocking socket
+   raises `BlockingIOError` → except → break.
+
+**Bug surface — peer-closed missed-EOF**:
+
+Non-blocking socket semantics:
+- buffer has data → `recv` returns N>0 bytes (loop continues)
+- buffer empty → `recv` raises `BlockingIOError`
+- **peer FIN'd → `recv` returns 0 bytes (NEITHER exception NOR
+  break — infinite tight loop)**
+
+`drain()` does not handle the `b''` return-value
+(EOF) case. If `write_sock` has been closed (or the
+process holding it is gone), every iteration returns
+0 → infinite loop → 100% CPU on a single core.
+
+## Why this triggers under `main_thread_forkserver`
+
+Under `os.fork()` from the forkserver-worker thread:
+
+1. Parent has a `WakeupSocketpair` instance with
+   `wakeup_sock=fdN`, `write_sock=fdM`. Both fds
+   open in parent.
+2. Fork → child inherits BOTH fds (kernel-level fd
+   table dup).
+3. `_close_inherited_fds()` runs in child →
+   closes everything except stdio. `wakeup_sock` and
+   `write_sock` of the parent's `WakeupSocketpair`
+   ARE closed in child.
+4. Child's trio (running fresh) creates its OWN
+   `WakeupSocketpair` → NEW fd numbers (e.g. fd 6, 7).
+5. **In `infect_asyncio` mode** the asyncio loop is
+   the host; trio runs as guest via
+   `start_guest_run`. trio still creates its
+   `WakeupSocketpair` in the I/O manager but its
+   role is different.
+
+The race window: somewhere between (3) and (5), if a
+`WakeupSocketpair` Python object reference inherited
+via COW (from parent's pre-fork heap) survives long
+enough that `drain()` is called on it AFTER its fds
+were closed but BEFORE the child's NEW socketpair
+takes over the recycled fd numbers — the recycled fd
+will be one of the child's NEW socketpair ends, whose
+peer might be FIN-flagged (e.g. parent-process
+peer-end is closed).
+
+Or simpler: the `wait_for_actor`/`find_actor` discovery
+flow in `test_register_duplicate_name` triggers an
+unusual code path where a stale `WakeupSocketpair`
+gets `drain()`-called on a fd whose peer has already
+closed.
+
+## Why `drain()` shouldn't loop indefinitely on EOF
+(upstream trio bug)
+
+Even WITHOUT fork, `drain()` should treat `b''` as
+EOF and break. The current code is correct for the
+"buffer drained on a healthy socketpair" scenario but
+incorrect for the "peer is gone" scenario. It's a
+defensive-programming gap in trio.
+
+A one-line patch upstream:
+
+```python
+def drain(self) -> None:
+    try:
+        while True:
+            data = self.wakeup_sock.recv(2**16)
+            if not data:
+                break  # peer-closed; nothing more to drain
+    except BlockingIOError:
+        pass
+```
+
+## Workarounds (until the underlying issue lands)
+
+1. **Skip-mark on the fork backend**:
+   `tests/test_multi_program.py` →
+   `pytest.mark.skipon_spawn_backend('main_thread_forkserver',
+   reason='trio WakeupSocketpair.drain busy-loop, see ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md')`.
+
+2. **Defensive monkey-patch in tractor's
+   forkserver-child prelude** — wrap
+   `WakeupSocketpair.drain` to handle `b''`:
+
+   ```python
+   # in `_actor_child_main` or `_close_inherited_fds`'s
+   # post-fork prelude:
+   from trio._core._wakeup_socketpair import WakeupSocketpair
+   _orig_drain = WakeupSocketpair.drain
+   def _safe_drain(self):
+       try:
+           while True:
+               data = self.wakeup_sock.recv(2**16)
+               if not data:
+                   return  # peer closed
+       except BlockingIOError:
+           pass
+   WakeupSocketpair.drain = _safe_drain
+   ```
+
+   Tracks upstream — remove once trio fixes.
+
+3. **Upstream the fix**: 1-line PR to `python-trio/trio`
+   adding `if not data: break` to `drain()`.
+
+## Investigation next steps
+
+1. **Confirm via py-spy**: when caught alive, detach
+   strace first then
+   `sudo py-spy dump --pid <subactor> --locals`. The
+   busy thread should show `drain` from `WakeupSocketpair`
+   in the call chain.
+2. **Identify which write-end peer is closed**: from
+   the inode of fd 6, look up the matching peer
+   inode via `ss -xp` and see whose process it
+   was/is.
+3. **Verify the missed-EOF hypothesis**: hand-craft a
+   minimal `WakeupSocketpair` repro:
+
+   ```python
+   from trio._core._wakeup_socketpair import WakeupSocketpair
+   ws = WakeupSocketpair()
+   ws.write_sock.close()  # simulate peer-gone
+   ws.drain()             # should hang forever
+   ```
+
+## Sibling bug
+
+`tests/test_infected_asyncio.py::test_aio_simple_error`
+hangs under the same backend with a DIFFERENT
+fingerprint (Mode-A deadlock, both parties in
+`epoll_wait`, no busy-loop). Distinct root cause —
+see `infected_asyncio_under_main_thread_forkserver_hang_issue.md`.
+
+Both share the broader theme: **trio internal-state
+initialization isn't fully fork-safe under
+`main_thread_forkserver`** for the more exotic
+dispatch paths.
+
+## See also
+
+- [#379](https://github.com/goodboy/tractor/issues/379) — subint umbrella
+- python-trio/trio#1614 — trio + fork hazards
+- `trio._core._wakeup_socketpair.WakeupSocketpair`
+  source (the smoking gun)
+- `ai/conc-anal/fork_thread_semantics_execution_vs_memory.md`
+- `ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md`
--- a/ai/prompt-io/claude/20260406T172848Z_02b2ef1_prompt_io.md
+++ b/ai/prompt-io/claude/20260406T172848Z_02b2ef1_prompt_io.md
@ -0,0 +1,54 @@
+---
+model: claude-opus-4-6
+service: claude
+session: (ad-hoc, not tracked via conf.toml)
+timestamp: 2026-04-06T17:28:48Z
+git_ref: 02b2ef1
+scope: tests
+substantive: true
+raw_file: 20260406T172848Z_02b2ef1_prompt_io.raw.md
+---
+
+## Prompt
+
+User asked to extend `tests/test_resource_cache.py` with a test
+that reproduces the edge case fixed in commit `02b2ef18` (per-key
+locking+user tracking in `maybe_open_context()`). The bug was
+originally triggered in piker's `brokerd.kraken` backend where the
+same `acm_func` was called with different kwargs, and the old
+global `_Cache.users` counter caused:
+
+- teardown skipped for one `ctx_key` bc another key's users kept
+  the global count > 0
+- re-entry hitting `assert not resources.get(ctx_key)` during the
+  teardown window
+
+User requested a test that would fail under the old code and pass
+with the fix.
+
+## Response summary
+
+Designed and implemented `test_per_ctx_key_resource_lifecycle`
+which verifies per-`ctx_key` resource isolation by:
+
+1. Holding resource `'a'` open in a bg task
+2. Opening+closing resource `'b'` (same `acm_func`, different
+   kwargs) while `'a'` is still alive
+3. Re-opening `'b'` and asserting cache MISS — proving `'b'` was
+   torn down independently despite `'a'` keeping its own user
+   count > 0
+
+With the old global counter, phase 3 would produce a stale cache
+HIT (leaked resource) or crash on the assert.
+
+Also added a trivial `acm_with_resource(resource_id)` ACM helper
+at module level.
+
+## Files changed
+
+- `tests/test_resource_cache.py` — add `acm_with_resource` ACM +
+  `test_per_ctx_key_resource_lifecycle` test fn
+
+## Human edits
+
+None — committed as generated (pending user review).
--- a/ai/prompt-io/claude/20260406T193125Z_85f9c5d_prompt_io.md
+++ b/ai/prompt-io/claude/20260406T193125Z_85f9c5d_prompt_io.md
@ -0,0 +1,57 @@
+---
+model: claude-opus-4-6
+service: claude
+session: (ad-hoc, not tracked via conf.toml)
+timestamp: 2026-04-06T19:31:25Z
+git_ref: 85f9c5d
+scope: tests
+substantive: true
+raw_file: 20260406T193125Z_85f9c5d_prompt_io.raw.md
+---
+
+## Prompt
+
+User asked to reproduce the exact `assert not resources.get(ctx_key)`
+crash originally seen in piker's `brokerd.kraken` backend via
+`open_cached_client('kraken')`. Key constraints from user:
+
+- In piker, kwargs were the **same** (empty) so all callers
+  share one `ctx_key = (fid, ())`
+- The root issue is `_Cache.locks` being indexed by `fid`
+  rather than `ctx_key`, plus the race window between
+  `values.pop()` and `resources.pop()` in `_Cache.run_ctx`
+  (the acm `__aexit__` has checkpoints in between)
+- Mark the test `@pytest.mark.xfail` so it can be committed
+  against the unpatched code and later adjusted once the fix
+  from `9e49eddd` is applied
+
+User pointed to piker's `open_cached_client('kraken')` usage
+as the real-world pattern to model.
+
+## Response summary
+
+Added `test_moc_reentry_during_teardown` which deterministically
+reproduces the teardown race:
+
+1. A `cached_client()` acm (no params, like
+   `kraken.api.get_client()`) signals via `in_aexit` event
+   when its `__aexit__` starts, then sleeps to hold the
+   window open
+2. `use_and_exit` task enters+exits the cached resource
+3. `reenter_during_teardown` waits on `in_aexit` then
+   re-enters — landing exactly in the window where
+   `values` is popped but `resources` still exists
+4. The old `assert not resources.get(ctx_key)` fires
+5. `@pytest.mark.xfail(raises=AssertionError)` catches it
+
+Test confirmed: `XFAIL` with the exact traceback at
+`_mngrs.py:327`.
+
+## Files changed
+
+- `tests/test_resource_cache.py` — add
+  `test_moc_reentry_during_teardown` xfail test fn
+
+## Human edits
+
+None — committed as generated (pending user review).
--- a/ai/prompt-io/claude/20260406T222648Z_f086222_prompt_io.md
+++ b/ai/prompt-io/claude/20260406T222648Z_f086222_prompt_io.md
@ -0,0 +1,52 @@
+---
+model: claude-opus-4-6
+service: claude
+session: (ad-hoc, not tracked via conf.toml)
+timestamp: 2026-04-06T22:26:48Z
+git_ref: f086222
+scope: code
+substantive: true
+raw_file: 20260406T222648Z_f086222_prompt_io.raw.md
+---
+
+## Prompt
+
+User applied the per-key locking patch (`f086222d`),
+removed the `@pytest.mark.xfail` from
+`test_moc_reentry_during_teardown`, and found the
+test STILL fails — now with `RuntimeError` instead
+of `AssertionError`. Asked to diagnose why the fix
+doesn't actually resolve this race.
+
+## Response summary
+
+Traced the exact scheduling sequence across three
+concurrent tasks to show that the per-`ctx_key`
+locking change doesn't close the fundamental gap:
+
+- `maybe_open_context`'s finally pops the lock and
+  sets `no_more_users` in the **exiting caller's
+  task**
+- But the actual resource cleanup (`values.pop`,
+  acm `__aexit__`, `resources.pop`) runs in the
+  **`run_ctx` task** inside `service_tn`
+- A re-entering task creates a fresh lock (old one
+  popped), acquires immediately, and hits stale
+  `resources[ctx_key]` while `run_ctx` is still in
+  `__aexit__`
+
+Identified three possible fix strategies:
+1. Let `run_ctx` own lock release/pop after
+   `resources.pop()`
+2. Add a "teardown complete" event that new callers
+   wait on
+3. Move `resources.pop()` into `run_ctx`'s inner
+   finally (before acm `__aexit__`)
+
+## Files changed
+
+No code changes — analysis only.
+
+## Human edits
+
+N/A — diagnostic output, no code generated.
--- a/ai/prompt-io/claude/20260406T222648Z_f086222_prompt_io.raw.md
+++ b/ai/prompt-io/claude/20260406T222648Z_f086222_prompt_io.raw.md
@ -0,0 +1,32 @@
+---
+model: claude-opus-4-6
+service: claude
+timestamp: 2026-04-06T22:26:48Z
+git_ref: f086222
+diff_cmd: git diff f086222~1..f086222
+---
+
+Here's the problem. The per-key locking doesn't fix this race because the **lock lifetime is decoupled from `run_ctx`'s lifetime**. Trace through what happens:
+
+**`maybe_open_context` finally** (lines 468-495):
+1. `users[ctx_key] -= 1` → 0
+2. `no_more_users.set()` → wakes `run_ctx` (but it doesn't run yet, just becomes ready)
+3. `_Cache.locks.pop(ctx_key)` → **lock is gone**
+4. `maybe_open_context` exits
+
+**`run_ctx`** now gets scheduled (lines 237-241):
+5. Inner finally: `values.pop(ctx_key)` → values gone
+6. `mng.__aexit__()` runs → `in_aexit.set()`, `await trio.sleep(10)` → **CHECKPOINT**
+
+**`reenter_during_teardown`** wakes:
+7. Enters `maybe_open_context` → no lock exists for `ctx_key` → creates **fresh** lock, acquires immediately
+8. `values[ctx_key]` → `KeyError`
+9. `resources.get(ctx_key)` → **still exists!** (step 6 hasn't reached `resources.pop()` in `run_ctx`'s outer finally yet)
+10. `raise RuntimeError('Caching resources ALREADY exist?!')`
+
+The core issue: `no_more_users.set()` (step 2) and `locks.pop()` (step 3) happen in the **exiting caller's task**, but the actual resource cleanup (`values.pop`, acm `__aexit__`, `resources.pop`) happens in the **`run_ctx` task** inside `service_tn`. There's no synchronization between "lock removed, new callers welcome" and "run_ctx has actually finished tearing down". The per-`ctx_key` lock change just made the lock more granular — it didn't close this gap.
+
+The fix needs to ensure a new caller can't acquire a lock for `ctx_key` until `run_ctx` has fully exited (including `resources.pop()`). Either:
+- **don't pop the lock in `maybe_open_context`** — let `run_ctx` release/pop it after `resources.pop()`
+- **wait on a "teardown complete" event** in the cache-miss path when stale `resources` exist
+- **pop `resources` before the acm `__aexit__`** (move `resources.pop()` into the inner finally alongside `values.pop()`)
--- a/ai/prompt-io/claude/20260413T192116Z_f851f28_prompt_io.md
+++ b/ai/prompt-io/claude/20260413T192116Z_f851f28_prompt_io.md
@ -0,0 +1,74 @@
+---
+model: claude-opus-4-6
+service: claude
+session: (ad-hoc, not tracked via conf.toml)
+timestamp: 2026-04-13T19:21:16Z
+git_ref: f851f28
+scope: code
+substantive: true
+---
+
+## Prompt
+
+User provided a detailed implementation plan for
+a new `test_tpt_bind_addrs.py` test suite covering
+`open_root_actor(tpt_bind_addrs=...)` — the three
+runtime code paths in `_root.py:385-450`:
+
+1. Non-registrar, no explicit bind -> random addrs
+2. Registrar, no explicit bind -> registry_addrs
+3. Explicit bind given -> `wrap_address()` + merge
+
+Plan specified 6 test functions (~10 parametrized
+variants), predicted a type-mixing bug at line 446,
+and asked for an in-flight fix if confirmed.
+
+## Response summary
+
+Created `tests/discovery/test_tpt_bind_addrs.py`
+with 9 collected test variants across 6 functions:
+
+- `test_registrar_root_tpt_bind_addrs` (3 variants:
+  `bind-eq-reg`, `bind-subset-reg`,
+  `bind-disjoint-reg`)
+- `test_non_registrar_root_tpt_bind_addrs`
+  (2 variants: `bind-same-bindspace`,
+  `bind-disjoint`)
+- `test_non_registrar_default_random_bind`
+  (baseline, no explicit bind)
+- `test_tpt_bind_addrs_as_maddr_str`
+  (multiaddr string input)
+- `test_registrar_merge_binds_union`
+  (registrar + disjoint bind -> union)
+- `test_open_nursery_forwards_tpt_bind_addrs`
+  (`open_nursery(**kwargs)` forwarding)
+
+Confirmed and fixed the predicted bug at
+`_root.py:446`: the registrar merge path mixed
+`Address` objects (`tpt_bind_addrs`) with raw tuples
+(`uw_reg_addrs`) inside `set()`, preventing
+deduplication and causing double-bind `OSError`.
+
+Fix: wrap `uw_reg_addrs` before the set union:
+```python
+# before (broken)
+tpt_bind_addrs = list(set(
+    tpt_bind_addrs + uw_reg_addrs
+))
+# after (fixed)
+tpt_bind_addrs = list(set(
+    tpt_bind_addrs
+    + [wrap_address(a) for a in uw_reg_addrs]
+))
+```
+
+All 9 tests pass after the fix.
+
+## Files changed
+
+- `tests/discovery/test_tpt_bind_addrs.py` (new)
+- `tractor/_root.py:446` (bug fix, 1 line)
+
+## Human edits
+
+N/A — pending review.
--- a/ai/prompt-io/claude/20260413T205048Z_269d939c_prompt_io.md
+++ b/ai/prompt-io/claude/20260413T205048Z_269d939c_prompt_io.md
@ -0,0 +1,50 @@
+---
+model: claude-opus-4-6
+service: claude
+session: 76154e65-d8e1-4b5f-9275-0ea45ba7e98a
+timestamp: 2026-04-13T20:50:48Z
+git_ref: 269d939c
+scope: code
+substantive: true
+raw_file: 20260413T205048Z_269d939c_prompt_io.raw.md
+---
+
+## Prompt
+
+Implement a `parse_endpoints()` API in
+`tractor.discovery._multiaddr` that lets downstream
+projects declare per-actor transport bind addresses
+as a service table of actor-name -> multiaddr
+strings (loaded from config, e.g. TOML `[network]`
+section). Add type aliases `EndpointsTable` and
+`ParsedEndpoints`, plus 7 unit tests covering TCP,
+UDS, tuples, mixed input, unsupported protocols, and
+edge cases.
+
+A detailed implementation plan was provided
+specifying insertion points, type signatures, test
+names, and verification commands.
+
+## Response summary
+
+Add `parse_endpoints()` function with two supporting
+type aliases to `_multiaddr.py`. The function
+iterates a `dict[str, list[str|tuple]]` service
+table and delegates each entry to the existing
+`wrap_address()` helper, which already handles maddr
+strings, raw tuples, and pre-wrapped `Address`
+objects. Seven pure unit tests (no trio runtime)
+validate all input variants and edge cases. All 22
+multiaddr tests pass.
+
+## Files changed
+
+- `tractor/discovery/_multiaddr.py` — add
+  `EndpointsTable`, `ParsedEndpoints` type aliases
+  and `parse_endpoints()` function after line 133
+- `tests/discovery/test_multiaddr.py` — add
+  `parse_endpoints` import and 7 new test functions
+
+## Human edits
+
+None — committed as generated.
--- a/ai/prompt-io/claude/20260413T205048Z_269d939c_prompt_io.raw.md
+++ b/ai/prompt-io/claude/20260413T205048Z_269d939c_prompt_io.raw.md
@ -0,0 +1,32 @@
+---
+model: claude-opus-4-6
+service: claude
+timestamp: 2026-04-13T20:50:48Z
+git_ref: 269d939c
+diff_cmd: git diff 269d939c~1..269d939c
+---
+
+## Implementation output
+
+### `tractor/discovery/_multiaddr.py`
+
+> `git diff 269d939c~1..269d939c -- tractor/discovery/_multiaddr.py`
+
+Type aliases `EndpointsTable` and `ParsedEndpoints`
+plus `parse_endpoints()` helper that wraps each
+entry via `wrap_address()`.
+
+### `tests/discovery/test_multiaddr.py`
+
+> `git diff 269d939c~1..269d939c -- tests/discovery/test_multiaddr.py`
+
+Add import of `parse_endpoints` to existing import
+block, then 7 new test functions.
+
+### Test run output
+
+```
+22 passed, 1 warning in 0.05s
+```
+
+All 22 tests pass (15 existing + 7 new).
--- a/ai/prompt-io/claude/20260414T163300Z_befedc49_prompt_io.md
+++ b/ai/prompt-io/claude/20260414T163300Z_befedc49_prompt_io.md
@ -0,0 +1,38 @@
+---
+model: claude-opus-4-6
+service: claude
+session: multiaddr-support-rename-prefer
+timestamp: 2026-04-14T16:33:00Z
+git_ref: befedc49
+scope: code
+substantive: true
+raw_file: 20260414T163300Z_befedc49_prompt_io.raw.md
+---
+
+## Prompt
+
+Create a helper function that determines the best transport given
+actor locality (distributed vs same host). Use PID/hostname
+comparison for locality detection, apply at registry addr selection
+only (not spawn-time).
+
+## Response summary
+
+New `prefer_addr()` + `_is_local_addr()` helpers
+in `_api.py` using `socket.getaddrinfo()` and
+`ipaddress` for PID/hostname locality detection.
+Preference: UDS > local TCP > remote TCP.
+Integrated into `query_actor()` and
+`wait_for_actor()`. Also changed
+`Registrar.find_actor()` to return full addr list
+so callers can apply preference.
+
+## Files changed
+
+- `tractor/discovery/_discovery.py` → `_api.py`
+  — renamed + added `prefer_addr()`,
+  `_is_local_addr()`; updated `query_actor()` and
+  `wait_for_actor()` call sites
+- `tractor/discovery/_registry.py`
+  — `Registrar.find_actor()` returns
+  `list[UnwrappedAddress]|None`
--- a/ai/prompt-io/claude/20260414T163300Z_befedc49_prompt_io.raw.md
+++ b/ai/prompt-io/claude/20260414T163300Z_befedc49_prompt_io.raw.md
@ -0,0 +1,62 @@
+---
+model: claude-opus-4-6
+service: claude
+timestamp: 2026-04-14T16:33:00Z
+git_ref: befedc49
+diff_cmd: git diff befedc49~1..befedc49
+---
+
+### `tractor/discovery/_api.py`
+
+> `git diff befedc49~1..befedc49 -- tractor/discovery/_api.py`
+
+Add `_is_local_addr()` and `prefer_addr()` transport
+preference helpers.
+
+#### `_is_local_addr(addr: Address) -> bool`
+
+Determines whether an `Address` is reachable on the
+local host:
+
+- `UDSAddress`: always returns `True`
+  (filesystem-bound, inherently local)
+- `TCPAddress`: checks if `._host` is a loopback IP
+  via `ipaddress.ip_address().is_loopback`, then
+  falls back to comparing against the machine's own
+  interface IPs via
+  `socket.getaddrinfo(socket.gethostname(), None)`
+
+#### `prefer_addr(addrs: list[UnwrappedAddress]) -> UnwrappedAddress`
+
+Selects the "best" transport address from a
+multihomed actor's address list. Wraps each
+candidate via `wrap_address()` to get typed
+`Address` objects, then classifies into three tiers:
+
+1. **UDS** (same-host guaranteed, lowest overhead)
+2. **TCP loopback / same-host IP** (local network)
+3. **TCP remote** (only option for distributed)
+
+Within each tier, the last-registered (latest) entry
+is preferred. Falls back to `addrs[-1]` if no
+heuristic matches.
+
+### `tractor/discovery/_registry.py`
+
+> `git diff befedc49~1..befedc49 -- tractor/discovery/_registry.py`
+
+`Registrar.find_actor()` return type broadened from
+single addr to `list[UnwrappedAddress]|None` — full
+addr list lets callers apply transport preference.
+
+#### Integration
+
+`query_actor()` and `wait_for_actor()` now call
+`prefer_addr(addrs)` instead of `addrs[-1]`.
+
+### Verification
+
+All discovery tests pass (13/13 non-daemon).
+`test_local.py` and `test_multi_program.py` also
+pass (daemon fixture teardown failures are
+pre-existing and unrelated).
--- a/ai/prompt-io/claude/20260417T034918Z_9703210_prompt_io.md
+++ b/ai/prompt-io/claude/20260417T034918Z_9703210_prompt_io.md
@ -0,0 +1,101 @@
+---
+model: claude-opus-4-7[1m]
+service: claude
+session: subints-spawner-design-kickoff
+timestamp: 2026-04-17T03:49:18Z
+git_ref: 9703210
+scope: docs
+substantive: true
+raw_file: 20260417T034918Z_9703210_prompt_io.raw.md
+---
+
+## Prompt
+
+Drive the "first big boi, from GH issue" task seeded by
+`ai/prompt-io/prompts/subints_spawner.md`: design, plan
+and implement sub-interpreter (subint) spawn-backend
+support per issue #379, including (1) modularizing
+`tractor.spawn._spawn` into per-backend submods, (2) a new
+`._subint` backend, and (3) harness parametrization via the
+existing `--spawn-backend` / `start_method` pytest fixture
+in `tractor._testing.pytest`.
+
+Follow-up clarifications from the user (this turn):
+1. Pin `<3.15` on this dev branch and feature-gate subint
+   tests — chose option (a).
+2. Split Phase A (modularization) into its own PR first.
+3. Defer the `fork()`-via-subint hack to a follow-up.
+4. Harness flag is `pytest --spawn-backend <key>` CLI →
+   `start_method` fixture (prompt file updated to match).
+
+## Response summary
+
+Produced a three-phase plan and a concrete Phase A (pure
+modularization) file-split plan for user review; no code
+written yet — the green-light to start Phase A was given
+in this same turn conditional on logging this prompt-io
+entry first.
+
+Phases:
+- **A — modularize** `tractor/spawn/_spawn.py` (847 LOC):
+  keep generic machinery in `_spawn.py`, extract
+  `trio_proc` → `spawn/_trio.py`, `mp_proc` →
+  `spawn/_mp.py`. No pin bump.
+- **B — `_subint` backend**: bump `pyproject.toml`
+  `requires-python` upper to `<3.15`; add `'subint'` to
+  `SpawnMethodKey`; reuse existing UDS transport; shm
+  escape-hatch deferred.
+- **C — harness**: drive the valid-backend tuple in
+  `tractor/_testing/pytest.py:345-349` from
+  `typing.get_args(SpawnMethodKey)`; skip subint tests on
+  Python < 3.14.
+
+Key findings surfaced to the user:
+- `pyproject.toml:12` currently pins `<3.14`; PEP 734
+  `concurrent.interpreters` only ships in 3.14 — the
+  load-bearing constraint.
+- `_testing/pytest.py:345-349` hardcodes valid backends
+  as a string tuple (`'mp_spawn'`, `'mp_forkserver'`,
+  `'trio'`) — should be `get_args(SpawnMethodKey)`.
+- `_testing/pytest.py:228` already imports
+  `try_set_start_method` from `tractor.spawn._spawn` —
+  keeping the `_spawn.py` path as the "core" module
+  avoids breaking external refs during Phase A.
+
+Phase A commit breakdown (each green via `/run-tests`):
+1. Drive `_testing/pytest.py` backend-tuple from
+   `get_args(SpawnMethodKey)` — prep.
+2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
+   registry imports it.
+3. Extract `mp_proc` → `spawn/_mp.py`; `_methods` imports
+   it.
+4. Tidy `spawn/__init__.py` docstring for the new layout.
+
+Workflow: Phase A runs in a new worktree via `/open-wkt
+spawn_modularize` (branch `wkt/spawn_modularize`) and
+lands as its own PR before Phase B/C returns to the
+current `claude_bun_opus4.7_subint_spawner_backend`
+branch.
+
+## Files changed
+
+None this turn — design/planning only. The user updated
+`ai/prompt-io/prompts/subints_spawner.md` themselves to
+complete the truncated sentence on line 32 (harness-flag
+reference).
+
+Upcoming (Phase A, next turn):
+- `tractor/spawn/_spawn.py` — trimmed to "core"
+- `tractor/spawn/_trio.py` — new; holds `trio_proc()`
+- `tractor/spawn/_mp.py` — new; holds `mp_proc()`
+- `tractor/spawn/__init__.py` — docstring refresh
+- `tractor/_testing/pytest.py` — backend tuple from
+  `get_args(SpawnMethodKey)`
+
+## Human edits
+
+None — committed as generated. User completed the
+truncated sentence on line 32 of
+`ai/prompt-io/prompts/subints_spawner.md` themselves
+*before* this prompt-io log was written; that edit was
+the user's authoring, not an edit to AI output.
--- a/ai/prompt-io/claude/20260417T034918Z_9703210_prompt_io.raw.md
+++ b/ai/prompt-io/claude/20260417T034918Z_9703210_prompt_io.raw.md
@ -0,0 +1,117 @@
+---
+model: claude-opus-4-7[1m]
+service: claude
+timestamp: 2026-04-17T03:49:18Z
+git_ref: 9703210
+diff_cmd: git diff HEAD~1..HEAD
+---
+
+No code was produced in this turn — this is a design/planning
+exchange for subint spawner support per issue #379. Captured
+verbatim below are the two substantive AI responses, which
+together fix the phased plan (A: modularize, B: subint, C:
+harness) and the concrete Phase A file split before any
+implementation begins.
+
+## Turn 1 — initial triage + gating questions
+
+> Read: issue #379, current `spawn/_spawn.py` (847 lines),
+> and pyproject pins.
+>
+> ## Task summary
+> Design/plan/implement subint backend per #379, plus
+> modularize `spawn/_spawn.py` into per-backend submods, plus
+> harness flag for test parametrization.
+>
+> ## Key finding
+> `pyproject.toml:12` pins `requires-python = ">=3.12, <3.14"`
+> — but `concurrent.interpreters` / PEP 734 **only lands in
+> 3.14**. This is the load-bearing design question before
+> coding.
+>
+> ## Proposed phasing (incremental commits)
+> - **Phase A — modularize** `_spawn.py`: extract `trio_proc`
+>   → `spawn/_trio.py`, `mp_proc` → `spawn/_mp.py`, keep
+>   generic helpers (`hard_kill`/`soft_kill`/`new_proc`
+>   dispatcher/`SpawnMethodKey`) in a slim core. Tests green
+>   at each step.
+> - **Phase B — `_subint` backend**: extend `SpawnMethodKey`
+>   with `'subint'`; reuse UDS transport by default; shm is
+>   escape-hatch follow-up.
+> - **Phase C — harness**: add backend-parametrize flag, skip
+>   subint when Python < 3.14.
+>
+> ## Gating questions before I start
+> 1. **Python pin**: bump `<3.15` and feature-gate subint
+>    tests (a), require 3.14 outright (b), or backport shim
+>    (c)? I'd recommend (a).
+> 2. **PR strategy**: one branch, commits per phase (current
+>    `claude_bun_opus4.7_subint_spawner_backend`)? Or split
+>    Phase A as its own PR first?
+> 3. **`fork()`-via-subint hack** from the issue's "Our own
+>    thoughts" section — v1 or follow-up? I'd defer to
+>    follow-up.
+> 4. **Prompt file line 32 is truncated** — what's the
+>    existing harness flag name you had in mind ("using the
+>    existing ___")?
+>
+> Which of these are already decided vs. open?
+
+## User answers (for the raw record)
+
+1. bump `<3.15`, feature-gate — option (a).
+2. split Phase A as its own PR first.
+3. defer `fork()`-via-subint hack.
+4. harness flag is `--spawn-backend` CLI → `start_method`
+   fixture, per `tractor._testing._pytest`. Prompt file
+   updated accordingly.
+
+## Turn 2 — concrete Phase A plan
+
+> Good — prompt file now ends cleanly; harness flag
+> confirmed as `--spawn-backend` CLI → `start_method`
+> fixture (via `_testing/pytest.py:193,355`). Also noted
+> `pytest.py:345-349` hardcodes valid backends as a string
+> tuple — `typing.get_args(SpawnMethodKey)` should drive
+> that list.
+>
+> ## Phase A — concrete plan (separate PR)
+>
+> **File split** in `tractor/spawn/`:
+> - `_spawn.py` stays as "core": keeps `SpawnMethodKey`,
+>   `_spawn_method`, `proc_waiter`, `try_set_start_method`,
+>   `exhaust_portal`, `cancel_on_completion`, `hard_kill`,
+>   `soft_kill`, `new_proc` dispatcher, `_methods`
+>   registry. Kept at this path so `_testing/pytest.py:228`
+>   and other external imports don't break.
+> - `_trio.py` ← `trio_proc()` (lines 448–671, ~224 lines)
+> - `_mp.py` ← `mp_proc()` (lines 672–842, ~171 lines)
+> - `_methods` dict imports the two backend fns from the
+>   new modules.
+>
+> **Commits** (each green via `/run-tests`):
+> 1. Drive `_testing/pytest.py` backend-tuple from
+>    `get_args(SpawnMethodKey)` — tiny prep.
+> 2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
+>    imports it.
+> 3. Extract `mp_proc` → `spawn/_mp.py`; `_methods`
+>    imports it.
+> 4. Tidy `spawn/__init__.py` docstring to describe new
+>    layout.
+>
+> No pyproject pin bump in Phase A (that's Phase B).
+>
+> **Workflow**: I'd run this in a new worktree via
+> `/open-wkt spawn_modularize` (branch
+> `wkt/spawn_modularize`), then open a PR before returning
+> to the current `claude_bun_opus4.7_subint_spawner_backend`
+> branch for Phase B/C.
+>
+> Green-light to `/open-wkt` and start Phase A?
+
+## User final response
+
+> all this first then yes, green light.
+
+— log this `/prompt-io` entry first, then proceed with
+Phase A in a worktree.
--- a/ai/prompt-io/claude/20260417T035800Z_61a73ba_prompt_io.md
+++ b/ai/prompt-io/claude/20260417T035800Z_61a73ba_prompt_io.md
@ -0,0 +1,86 @@
+---
+model: claude-opus-4-7[1m]
+service: claude
+session: subints-spawner-phase-a-impl
+timestamp: 2026-04-17T03:58:05Z
+git_ref: 61a73ba
+scope: code
+substantive: true
+raw_file: 20260417T035800Z_61a73ba_prompt_io.raw.md
+---
+
+## Prompt
+
+Follow-up from the design session logged in
+`20260417T034918Z_9703210_prompt_io.md`: the user
+greenlit Phase A after `/prompt-io` tracking, so
+execute the Phase A "modularize `spawn._spawn`" plan
+end-to-end in a new worktree — four logical steps
+(pytest harness prep, `trio_proc` extraction,
+`mp_proc` extraction, docstring refresh), each green
+via `/run-tests`.
+
+User directives during execution:
+- **Option 3** for the test cadence ("continue A.2–A.4
+  first, then run the full suite once at the end of
+  Phase A").
+- **One commit** for the whole phase ("can't we just
+  commit the whole patch in one commit?") instead of
+  the 3/4-commit split I initially proposed.
+- **Don't pre-draft** commit messages — wait for the
+  user to invoke `/commit-msg` (captured as feedback
+  memory `feedback_no_auto_draft_commit_msgs.md`).
+
+## Response summary
+
+Produced the cohesive Phase A modularization patch,
+landed as commit `61a73bae` (subject: `Mv
+trio_proc`/`mp_proc` to per-backend submods`). Five
+files changed, +565 / -418 lines.
+
+Key pieces of the patch (generated by claude,
+reviewed by the human before commit):
+- `tractor/spawn/_trio.py` — **new**; receives
+  `trio_proc()` verbatim from `_spawn.py`; imports
+  cross-backend helpers back from `._spawn`.
+- `tractor/spawn/_mp.py` — **new**; receives
+  `mp_proc()` verbatim; uses `from . import _spawn`
+  for late-binding access to the mutable `_ctx` /
+  `_spawn_method` globals (design decision made
+  during impl, not the original plan).
+- `tractor/spawn/_spawn.py` — shrunk 847 → 448 LOC;
+  import pruning; bottom-of-module late imports for
+  `trio_proc` / `mp_proc` with a one-line comment
+  explaining the circular-dep reason.
+- `tractor/spawn/__init__.py` — docstring refresh
+  describing the new layout.
+- `tractor/_testing/pytest.py` — the valid-backend
+  set now comes from `typing.get_args(SpawnMethodKey)`
+  so future additions (`'subint'`) don't need harness
+  edits.
+
+## Files changed
+
+See `git diff 61a73ba~1..61a73ba --stat`:
+
+```
+ tractor/_testing/pytest.py |  12 +-
+ tractor/spawn/__init__.py  |  31 +++-
+ tractor/spawn/_mp.py       | 235 ++++++++++++++++++++++++
+ tractor/spawn/_spawn.py    | 413 +-------------------------------
+ tractor/spawn/_trio.py     | 292 ++++++++++++++++++++++++++++
+ 5 files changed, 565 insertions(+), 418 deletions(-)
+```
+
+Validation:
+- import probe + `_methods` wiring check — OK
+- spawn-relevant test subset — 37 passed, 1 skipped
+- full suite — 350 passed, 14 skipped, 7 xfailed, 1
+  xpassed
+
+## Human edits
+
+None — committed as generated by claude (no diff
+between `.claude/git_commit_msg_LATEST.md` and the
+committed body, as far as the assistant could
+observe).
--- a/ai/prompt-io/claude/20260417T035800Z_61a73ba_prompt_io.raw.md
+++ b/ai/prompt-io/claude/20260417T035800Z_61a73ba_prompt_io.raw.md
@ -0,0 +1,138 @@
+---
+model: claude-opus-4-7[1m]
+service: claude
+timestamp: 2026-04-17T03:58:05Z
+git_ref: 61a73ba
+diff_cmd: git diff 61a73ba~1..61a73ba
+---
+
+Code generated in this turn was committed verbatim as
+`61a73bae` ("Mv `trio_proc`/`mp_proc` to per-backend
+submods"). Per diff-ref mode, per-file code is captured
+via the pointers below, each followed by a prose
+summary of what the AI generated. Non-code output
+(sanity-check results, design rationale) is included
+verbatim.
+
+## Per-file generated content
+
+### `tractor/spawn/_trio.py` (new, 292 lines)
+
+> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_trio.py`
+
+Pure lift-and-shift of `trio_proc()` out of
+`tractor/spawn/_spawn.py` (previously lines 448–670).
+Added AGPL header + module docstring describing the
+backend; imports include local `from ._spawn import
+cancel_on_completion, hard_kill, soft_kill` which
+creates the bottom-of-module late-import pattern in
+the core file to avoid a cycle. All call sites,
+log-format strings, and body logic are byte-identical
+to the originals — no semantic change.
+
+### `tractor/spawn/_mp.py` (new, 235 lines)
+
+> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_mp.py`
+
+Pure lift-and-shift of `mp_proc()` out of
+`tractor/spawn/_spawn.py` (previously lines 672–842).
+Same AGPL header convention. Key difference from
+`_trio.py`: uses `from . import _spawn` (module
+import, not from-import) for `_ctx` and
+`_spawn_method` references — these are mutated at
+runtime by `try_set_start_method()`, so late binding
+via `_spawn._ctx` / `_spawn._spawn_method` is required
+for correctness. Also imports `cancel_on_completion`,
+`soft_kill`, `proc_waiter` from `._spawn`.
+
+### `tractor/spawn/_spawn.py` (modified, 847 → 448 LOC)
+
+> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_spawn.py`
+
+- removed `trio_proc()` body (moved to `_trio.py`)
+- removed `mp_proc()` body (moved to `_mp.py`)
+- pruned imports now unused in core: `sys`,
+  `is_root_process`, `current_actor`,
+  `is_main_process`, `_mp_main`, `ActorFailure`,
+  `pretty_struct`, `_pformat`
+- added bottom-of-file late imports
+  `from ._trio import trio_proc` and
+  `from ._mp import mp_proc` with a one-line
+  comment explaining why (circular dep)
+- `_methods` dict unchanged structurally; still binds
+  `'trio' → trio_proc`, `'mp_spawn' → mp_proc`,
+  `'mp_forkserver' → mp_proc`
+
+### `tractor/spawn/__init__.py` (modified)
+
+> `git diff 61a73ba~1..61a73ba -- tractor/spawn/__init__.py`
+
+Docstring rewrite only — no code. New text describes
+the per-backend `Layout` with `- ._spawn` (core),
+`- ._trio` (default), `- ._mp` (two mp variants), and
+reiterates the "no eager import" NOTE.
+
+### `tractor/_testing/pytest.py` (modified)
+
+> `git diff 61a73ba~1..61a73ba -- tractor/_testing/pytest.py`
+
+Two-line logical change (plus a comment):
+- `from typing import` grows `get_args`
+- `pytest_generate_tests()` replaces the hardcoded
+  `('mp_spawn','mp_forkserver','trio')` tuple with
+  `get_args(SpawnMethodKey)` via lazy import
+  `from tractor.spawn._spawn import SpawnMethodKey`
+  inside the function body (matching the project's
+  existing lazy-import convention for the spawn
+  module)
+
+## Non-code output (verbatim)
+
+### Design rationale for the file split
+
+From my Phase A plan turn (already captured in
+`20260417T034918Z_9703210_prompt_io.md`):
+- `_spawn.py` stays as the "core" rather than renamed
+  to `_core.py` so external imports
+  (`_testing/pytest.py:228` imports
+  `try_set_start_method` from `tractor.spawn._spawn`)
+  keep working without churn.
+- Per-backend extraction chosen over alternatives
+  (e.g. splitting generic helpers further) because
+  the immediate motivation is hosting a 3rd
+  `_subint.py` sibling cleanly in Phase B.
+
+### Sanity-check output (verbatim terminal excerpts)
+
+Post-extraction import probe:
+```
+extraction OK
+_methods: {'trio': 'tractor.spawn._trio.trio_proc',
+           'mp_spawn': 'tractor.spawn._mp.mp_proc',
+           'mp_forkserver': 'tractor.spawn._mp.mp_proc'}
+```
+
+Spawn-relevant test subset (`tests/test_local.py
+test_rpc.py test_spawning.py test_multi_program.py
+test_discovery.py`):
+```
+37 passed, 1 skipped, 14 warnings in 55.37s
+```
+
+Full suite:
+```
+350 passed, 14 skipped, 7 xfailed, 1 xpassed,
+151 warnings in 437.73s (0:07:17)
+```
+
+No regressions vs. `main`. One transient `-x`
+early-stop `ERROR` on
+`test_close_channel_explicit_remote_registrar[trio-True]`
+was flaky (passed solo, passed without `-x`), not
+caused by this refactor.
+
+### Commit message
+
+Also AI-drafted (via `/commit-msg`) — the 40-line
+message on commit `61a73bae` itself. Not reproduced
+here; see `git log -1 61a73bae`.
--- a/ai/prompt-io/claude/20260601T231429Z_0e3e008b_prompt_io.md
+++ b/ai/prompt-io/claude/20260601T231429Z_0e3e008b_prompt_io.md
@ -0,0 +1,146 @@
+---
+model: claude-opus-4-7[1m]
+service: claude
+session: trio-0.33-subproc-supervisor-retroactive
+timestamp: 2026-06-01T23:14:29Z
+git_ref: 0e3e008b
+scope: code
+substantive: true
+raw_file: 20260601T231429Z_0e3e008b_prompt_io.raw.md
+---
+
+## Prompt
+
+**RETROACTIVE LOG** — original session prompts not
+preserved; reconstructed from the staged work product.
+
+The work designs a `trio.Nursery.start()`-style wrapper
+around `trio.run_process()` for SC-friendly subprocess
+supervision. From the resulting code shape, the
+prompting intent was:
+
+1. Surface rc!=0 `CalledProcessError` DETERMINISTICALLY,
+   without the nursery-eg-wrapping that complicates
+   `collapse_eg()` usage and races the relay reader on
+   trio's `check=True`-driven cancel cascade.
+2. ALWAYS isolate the parent controlling-tty so a
+   spawned child can't emit terminal control-seqs onto
+   the launching tty (clobbering scrollback). Default
+   `stdin=DEVNULL`; default `stdout=DEVNULL` unless
+   explicitly relayed/overridden; distinguish "caller
+   passed nothing" from "caller passed `None` for
+   inherit".
+3. Optional live per-line relay of child std-streams to
+   the `tractor` log — STREAMED (not
+   buffered-until-exit) so long-lived daemon output is
+   visible during the run. Pick a custom log level that
+   shows at usual `info`/`devx` console levels but is
+   separately filterable.
+4. Concurrent pipe-drain reader MANDATORY when piping
+   without `capture_*` — without it the child blocks on
+   `write()` once the OS pipe buffer fills (~64KiB),
+   causing deadlocks on output bursts.
+5. Non-blocking `tn.start()` semantics: hand the live
+   `trio.Process` to the parent immediately;
+   supervise/relay run to completion in the supervisor
+   coro.
+6. Hermetic `trio`-only unit tests (no actor-runtime)
+   covering each of: per-line relay, tty isolation,
+   no-deadlock on >64KiB unnewlined output, CPE
+   rebuild w/ stderr relay, CPE rebuild on the silent
+   drain+capture path.
+
+## Response summary
+
+Adds `tractor/trionics/_subproc.py` (296 LOC) +
+`tests/trionics/test_subproc.py` (230 LOC) + a
+re-export in `tractor/trionics/__init__.py`.
+
+**`supervise_run_process()`** (public, re-exported)
+- `check=False` is forced to `trio.run_process`; the
+  rc-check runs in the supervisor coro AFTER `own_tn`
+  unwinds (both the child AND the relay readers have
+  hit EOF + fully drained). A BARE
+  `subprocess.CalledProcessError` is rebuilt + raised
+  from there, with `.stderr` bytes passed in the
+  constructor AND attached as an `add_note()`'d
+  `|_.stderr:` block for legible teardown logs.
+- `stdin=DEVNULL` always. `stdout` default chosen via a
+  `_UNSET` sentinel: `relay_stdout=True` → PIPE,
+  explicit `stdout=...` → as given, else `DEVNULL`.
+  `stderr` defaults to PIPE whenever we relay OR need
+  the CPE note (when `check=True`), else `DEVNULL`.
+- `relay_level='io'` (custom level 21; sorts just
+  above stdlib `INFO`=20 so it shows at usual
+  `info`/`devx` levels and stays separately
+  filterable). `runtime`=15 would silently filter at
+  default levels, so it's rejected as a default.
+- `task_status.started(trio_proc)` delivers the live
+  process immediately. The internal `own_tn`
+  supervises `trio.run_process` + any relay readers to
+  completion.
+- `**run_process_kwargs` forward verbatim;
+  `stdin/stdout/stderr/check` are MANAGED keys
+  (override on conflict).
+- Crash-handling deliberately NOT baked in — compose
+  `maybe_open_crash_handler()` on top at the call-site.
+
+**`_relay_stream_lines()`** (internal helper)
+- Three modes (combinable): `emit`-only (live per-line
+  relay), `accum`-only (silent drain+capture for a CPE
+  note), or both (live relay AND capture).
+- Per-line split handles cross-chunk residuals via a
+  rolling `residual` bytes buffer; flushes any trailing
+  un-newline-term'd line at EOF.
+- `async with stream:` ensures aclose at EOF/cancel
+  (mirrors trio's internal `_subprocess` drain idiom).
+
+**`_add_stderr_note()`** (internal helper)
+- `add_note()`s a `textwrap.indent(...)`'d
+  `|_.stderr:` block onto a `CalledProcessError` for
+  teardown logs.
+
+**Tests** (5 hermetic, trio-only) — `_capture_relay`
+fixture monkeypatches `_subproc.log.<level>` to a list:
+- `test_stdout_relayed_per_line`: per-line stdout
+  relay carries each `line=N` to the records.
+- `test_parent_tty_isolated`: `readlink /proc/self/fd/0`
+  and `fd/1` from the child show `pipe:` (fd1) +
+  `/dev/null` (fd0); NO `/dev/pts/*`.
+- `test_no_deadlock_on_big_unnewlined_output`: 200KiB
+  of `x` with no newlines completes inside
+  `fail_after(2)` — exercises the concurrent drain.
+- `test_stderr_relay_and_cpe_rebuild`: rc=3 with
+  `relay_stderr=True` raises bare CPE
+  (via `collapse_eg()`) with `b'boom' in cpe.stderr`,
+  the note attached, AND per-line live relay.
+- `test_nonrelay_cpe_note`: rc=7 with no relay still
+  produces CPE with `.stderr` + note via the silent
+  drain+capture path.
+
+## Files changed
+
+- `tractor/trionics/_subproc.py` — NEW. Public
+  `supervise_run_process()` + helpers
+  `_relay_stream_lines()` / `_add_stderr_note()` + the
+  `_UNSET` sentinel.
+- `tests/trionics/test_subproc.py` — NEW. 5 hermetic
+  trio-only tests + `_capture_relay` monkeypatch
+  fixture.
+- `tractor/trionics/__init__.py` — re-export
+  `supervise_run_process`.
+
+## Human edits
+
+**RETROACTIVE**: this log is being written from the
+staged diff, not from a live session. The code as
+staged is the canonical artifact; any human edits the
+user made during the originating design session are
+already integrated and cannot be separated post-hoc.
+The `.raw.md` sibling is a diff-pointer placeholder,
+NOT a pre-edit transcript.
+
+Future prompt-io entries for in-flight work should be
+written DURING the design session per the skill
+contract so the pre-edit `.raw.md` captures the
+unedited model output for genuine provenance.
--- a/ai/prompt-io/claude/20260601T231429Z_0e3e008b_prompt_io.raw.md
+++ b/ai/prompt-io/claude/20260601T231429Z_0e3e008b_prompt_io.raw.md
@ -0,0 +1,106 @@
+---
+model: claude-opus-4-7[1m]
+service: claude
+timestamp: 2026-06-01T23:14:29Z
+git_ref: 0e3e008b
+diff_cmd: git diff HEAD~1..HEAD
+---
+
+# RETROACTIVE — original model output not preserved
+
+This `.raw.md` would normally contain the verbatim
+pre-human-edit response from the design session that
+produced the staged `_subproc.py` module + tests. That
+session's transcript is not available, so this file
+serves as a diff-pointer placeholder + transparency
+note.
+
+## Authoritative artifact
+
+The committed code IS the artifact of record. Once the
+companion commit lands, the unified diff is:
+
+> `git diff HEAD~1..HEAD -- tractor/trionics/_subproc.py`
+> `git diff HEAD~1..HEAD -- tests/trionics/test_subproc.py`
+> `git diff HEAD~1..HEAD -- tractor/trionics/__init__.py`
+
+Before committing, substitute `--cached` for the
+pre-commit form.
+
+## What is NOT here
+
+Because this is retroactive:
+- No verbatim chain-of-thought / discussion prose from
+  the design session.
+- No rejected alternatives the model considered before
+  arriving at the final shape (e.g. whether the
+  rc-check should live inside `own_tn` vs after it; the
+  `_UNSET` sentinel vs a `None`-means-DEVNULL
+  convention; `io` vs `info` as the default relay
+  level).
+- No pre-edit code blocks as the model first emitted
+  them, separable from any user cleanup applied before
+  the diff was staged.
+
+## Inferred design choices visible in the final code
+
+(Documented here because they're the kind of decision
+detail an unedited raw transcript would have captured.)
+
+1. **Post-drain rc-check in the supervisor coro body,
+   AFTER `own_tn.__aexit__`.** Placing the
+   `CalledProcessError` raise here (not inside
+   `own_tn`) means the EG-unwrap happens at the OUTER
+   `tn.start()` boundary — callers do `collapse_eg()`
+   if they want bare. Doing the raise INSIDE `own_tn`
+   would cancel the still-draining relay reader
+   mid-flight and lose stderr lines.
+
+2. **`_UNSET` sentinel for `stdout`.** A plain default
+   of `None` couldn't distinguish "use the safe
+   `DEVNULL` default" from "caller explicitly passed
+   `None` (inherit, presumably knowingly)". The
+   sentinel keeps the SAFE default while letting power
+   users opt into inherit.
+
+3. **`relay_level='io'` (custom level 21).** Chosen to
+   sort just above stdlib `INFO`=20 so a default
+   `--ll info` shows the relay, but it remains a
+   distinct level so users can filter
+   `tractor.trionics:io` separately. Picking
+   `runtime`=15 would have made the relay invisible at
+   default verbosity (a footgun for daemon supervisors
+   whose whole point is "I want to see this output").
+
+4. **Reader is MANDATORY, not opt-in cosmetic.** With
+   `stdout=PIPE` / `stderr=PIPE` we OWN the drain
+   responsibility — there's no `trio.capture_*` running
+   under the hood here. The ~64KiB OS pipe buffer
+   means a child writing more than that without us
+   reading hangs at `write()` — a deadlock that won't
+   show up in small-output tests, which is why the
+   200KiB-no-newline test is in the suite.
+
+5. **`task_status.started(trio_proc)` BEFORE the
+   `own_tn` exits.** Without this, `tn.start()` would
+   block until the child exits — losing the "start a
+   long-lived daemon and continue with parent work"
+   use case. With it, the parent gets the live process
+   handle immediately and the supervise+relay tasks
+   run in the supervisor coro until the child exits.
+
+6. **`__notes__` via `add_note()` for the CPE
+   `.stderr`.** The `.stderr` attribute is what
+   `subprocess` callers expect; the `add_note()` is
+   what trio's exception-rendering shows. Both wired so
+   programmatic AND human consumers see the stderr at
+   teardown.
+
+## Honesty statement
+
+This file's content is RECONSTRUCTED from the staged
+code, not extracted from a verbatim model transcript.
+The prompt-io skill's intent is for the `.raw.md` to
+be a pre-edit fossil; that's not possible here. Future
+work should write the prompt-io entry DURING the
+design session.
--- a/ai/prompt-io/claude/README.md
+++ b/ai/prompt-io/claude/README.md
@ -0,0 +1,27 @@
+# AI Prompt I/O Log — claude
+
+This directory tracks prompt inputs and model
+outputs for AI-assisted development using
+`claude` (Claude Code).
+
+## Policy
+
+Prompt logging follows the
+[NLNet generative AI policy][nlnet-ai].
+All substantive AI contributions are logged
+with:
+- Model name and version
+- Timestamps
+- The prompts that produced the output
+- Unedited model output (`.raw.md` files)
+
+[nlnet-ai]: https://nlnet.nl/foundation/policies/generativeAI/
+
+## Usage
+
+Entries are created by the `/prompt-io` skill
+or automatically via `/commit-msg` integration.
+
+Human contributors remain accountable for all
+code decisions. AI-generated content is never
+presented as human-authored work.
--- a/ai/prompt-io/prompts/multiaddr_declare_eps.md
+++ b/ai/prompt-io/prompts/multiaddr_declare_eps.md
@ -0,0 +1,76 @@
+ok now i want you to take a look at the most recent commit adding
+a `tpt_bind_addrs` to `open_root_actor()` and extend the existing
+tests/discovery/test_multiaddr* and friends to use this new param in
+at least one suite with parametrizations over,
+
+- `registry_addrs == tpt_bind_addrs`, as in both inputs are the same.
+- `set(registry_addrs) >= set(tpt_bind_addrs)`, as in the registry
+  addrs include the bind set.
+- `registry_addrs != tpt_bind_addrs`, where the reg set is disjoint from
+  the bind set in all possible combos you can imagine.
+
+All of the ^above cases should further be parametrized over,
+- the root being the registrar,
+- a non-registrar root using our bg `daemon` fixture.
+
+once we have a fairly thorough test suite and have flushed out all
+bugs and edge cases we want to design a wrapping API which allows
+declaring full tree's of actors tpt endpoints using multiaddrs such
+that a `dict[str, list[str]]` of actor-name -> multiaddr can be used
+to configure a tree of actors-as-services given such an input
+"endpoints-table" can be matched with the number of appropriately
+named subactore spawns in a `tractor` user-app.
+
+Here is a small example from piker,
+
+- in piker's root conf.toml we define a `[network]` section which can
+  define various actor-service-daemon names set to a maddr
+  (multiaddress str).
+
+- each actor whether part of the `pikerd` tree (as a sub) or spawned
+  in other non-registrar rooted trees (such as `piker chart`) should
+  configurable in terms of its `tractor` tpt bind addresses via
+  a simple service lookup table,
+
+  ```toml
+  [network]
+  pikerd = [
+    '/ip4/127.0.0.1/tcp/6116',  # std localhost daemon-actor tree
+    '/uds/run/user/1000/piker/pikerd@6116.sock',  # same but serving UDS
+  ]
+  chart = [
+    '/ip4/127.0.0.1/tcp/3333',  # std localhost daemon-actor tree
+    '/uds/run/user/1000/piker/chart@3333.sock',
+  ]
+  ```
+
+We should take whatever common API is needed to support this and
+distill it into a
+```python
+tractor.discovery.parse_endpoints(
+) -> dict[
+  str,
+  list[Address]
+  |dict[str, list[Address]]
+  # ^recursive case, see below
+]:
+```
+
+style API which can,
+
+- be re-used easily across dependent projects.
+- correctly raise tpt-backend support errors when a maddr specifying
+  a unsupport proto is passed.
+- be used to handle "tunnelled" maddrs per
+  https://github.com/multiformats/py-multiaddr/#tunneling such that
+  for any such tunneled maddr-`str`-entry we deliver a data-structure
+  which can easily be passed to nested `@acm`s which consecutively
+  setup nested net bindspaces for binding the endpoint addrs using
+  a combo of our `.ipc.*` machinery and, say for example something like
+  https://github.com/svinota/pyroute2, more precisely say for
+  managing tunnelled wireguard eps within network-namespaces,
+  * https://docs.pyroute2.org/
+  * https://docs.pyroute2.org/netns.html
+
+remember to include use of all default `.claude/skills` throughout
+this work!
--- a/ai/prompt-io/prompts/subints_spawner.md
+++ b/ai/prompt-io/prompts/subints_spawner.md
@ -0,0 +1,34 @@
+This is your first big boi, "from GH issue" design, plan and
+implement task.
+
+We need to try and add sub-interpreter (aka subint) support per the
+issue,
+
+https://github.com/goodboy/tractor/issues/379
+
+Part of this work should include,
+
+- modularizing and thus better organizing the `.spawn.*` subpkg by
+  breaking up various backends currently in `spawn._spawn` into
+  separate submods where it makes sense.
+
+- add a new `._subint` backend which tries to keep as much of the
+  inter-process-isolation machinery in use as possible but with plans
+  to optimize for localhost only benefits as offered by python's
+  subints where possible.
+
+  * utilizing localhost-only tpts like UDS, shm-buffers for
+    performant IPC between subactors but also leveraging the benefits from
+    the traditional OS subprocs mem/storage-domain isolation, linux
+    namespaces where possible and as available/permitted by whatever
+    is happening under the hood with how cpython implements subints.
+
+  * default configuration should encourage state isolation as with
+    subprocs, but explicit public escape hatches to enable rigorously
+    managed shm channels for high performance apps.
+
+- all tests should be (able to be) parameterized to use the new
+  `subints` backend and enabled by flag in the harness using the
+  existing `pytest --spawn-backend <spawn-backend>` support offered in
+  the `open_root_actor()` and `.testing._pytest` harness override
+  fixture.
--- a/ai/tooling-todos/logspec_leaf_module_granularity_route_b.md
+++ b/ai/tooling-todos/logspec_leaf_module_granularity_route_b.md
@ -0,0 +1,159 @@
+# Logging-spec leaf-module granularity — "Route B" (decouple
+# logger-*identity* from console-*display*)
+
+Follow-up notes recording the breaking-changes / costs of the
+deeper fix that would give the `tractor.log` logging-spec (see
+`LogSpec`/`apply_logspec()`) true **per-leaf-MODULE** level
+control — deliberately *not* taken (for now) in favour of the
+smaller sub-PACKAGE fix already landed.
+
+## Status / what already shipped
+
+The cheap, contained fix is **done**: `get_logger()`'s "strip
+#2" (`log.py`, the `pkg_path = subpkg_path` collapse) no longer
+eats a real sub-package component. It now strips the trailing
+token *only* when it duplicates the caller's leaf-*module*
+filename (which the header already shows via `{filename}`).
+
+Result:
+
+- `devx.debug` resolves to `tractor.devx.debug`, **distinct**
+  from a bare `devx` -> `tractor.devx` (its parent). So the
+  logging-spec can dial sub-package levels at any nesting depth
+  (`devx.debug:runtime` ≠ `devx:cancel`).
+- The `get_logger(__name__)` cosmetic ("don't repeat the leaf
+  module in `{name}` since `{filename}` shows it") is preserved.
+
+What is **still NOT addressable** after that fix:
+
+- **Per-leaf-MODULE** levels. Every module in a (sub-)pkg shares
+  that pkg's logger, because `get_logger()` drops the leaf
+  module-name from the logger key by design.
+- **Top-level lib modules** (eg. `tractor.to_asyncio`,
+  `__package__ == 'tractor'`) emit on the *root* `tractor`
+  logger, so a `to_asyncio:<lvl>` spec entry hits a phantom
+  child -> no-op.
+
+## What "Route B" is
+
+Make the logger's *identity* the **full dotted module path**
+(incl. the leaf module + top-level modules), eg.
+`tractor.devx.debug._tty_lock` and `tractor.to_asyncio`, and
+move the cosmetic leaf-trim out of logger-naming and into the
+**formatter's `{name}` rendering**.
+
+Net effect:
+
+- Real per-module `Logger` nodes exist in the hierarchy ->
+  the spec can target ANY module; stdlib level-inheritance and
+  propagation "just work" top-down.
+- Console headers stay clean because the formatter computes a
+  trimmed display string (drop the trailing token that equals
+  `{filename}`'s stem) instead of the logger doing it.
+
+## Why it's "broad" — breaking changes / costs
+
+The logger *name* is currently load-bearing well beyond
+display; changing it ripples:
+
+1. **Every logger name changes.**
+   Today (post sub-pkg fix) names collapse to the sub-package;
+   Route B = full module path. This touches:
+   - handler attachment points + the `getChild()` hierarchy,
+   - any `logging.getLogger('tractor.X')` string lookups,
+   - any name-based filtering,
+   - the dedup / `_strict_debug` warning logic *inside*
+     `get_logger()` itself — the `pkg_name in name`,
+     `leaf_mod in pkg_path`, "duplicate pkg-name" branches all
+     key off the *name shape* and would need re-derivation.
+
+2. **Formatter rewrite.**
+   `LOG_FORMAT` uses `{name}` == `record.name` (the full logger
+   name). To keep headers clean we must compute a *display*
+   name and inject it as a record attr (eg. `record.pkg_ns`)
+   via a `logging.Filter` or a `colorlog.ColoredFormatter`
+   subclass overriding `.format()`, then point `LOG_FORMAT` at
+   that field. The `{filename}` vs `{name}` de-dup intent has
+   to be re-implemented per-record rather than per-logger.
+
+3. **Propagation / double-emit surface grows.**
+   Full-depth loggers mean more intermediate nodes
+   (`...debug._tty_lock` -> `.debug` -> `.devx` -> `tractor`).
+   If more than one level carries a handler (spec sub-handlers
+   + a root console), records double-emit. The
+   `propagate=False` trick we already use for filter-targeted
+   sub-loggers (`apply_logspec()`) must be applied carefully
+   across a deeper tree — more levels == more places to leak a
+   dup.
+
+4. **Level-inheritance semantics shift.**
+   Today setting a level on `tractor.devx` gates *all* devx
+   emits (they share that logger). Post-Route-B,
+   `tractor.devx.debug._tty_lock` is its own `NOTSET` logger
+   that *inherits* the effective level from ancestors —
+   functionally similar via inheritance, BUT any code that does
+   `log.setLevel(...)` / reads `log.level` on a (previously
+   collapsed) logger now only affects that exact node. All
+   `setLevel`/`.level =` call sites need an audit (eg.
+   `get_logger()`'s own `log.level = rlog.level` line).
+
+5. **Downstream contract churn.**
+   `modden` / `piker` call `get_logger()` / `get_console_log()`
+   and may depend on current names — including
+   `modden.runtime.daemon.setup_tractor_logging()` which
+   asserts `'tractor' not in name` on spec parts. The header
+   `{name}` field is user-visible in everyone's logs + CI
+   output. Changing the canonical names is a public-ish
+   behavior change -> needs a version note + downstream
+   coordination (or a formatter trim that keeps the *displayed*
+   string byte-identical to today).
+
+6. **`get_logger()` refactor risk.**
+   The fn tangles two concerns: compute logger *identity* and
+   compute the *display* string. Route B forces splitting them
+   inside a ~300-line fn with multiple `_strict_debug`
+   branches, dup-warnings, and the `name=__name__` convenience.
+   High chance of subtle regressions without an exhaustive
+   name-derivation test matrix.
+
+## Migration / test plan (if pursued)
+
+- Extract a pure helper
+  `_mk_logger_name(pkg_name, mod_name, mod_pkg) -> (logger_name,
+  display_name)` and cover it with an exhaustive unit matrix:
+  auto vs explicit vs `__name__`; package-`__init__` vs leaf
+  module; nested vs flat; `pkg_name in name` vs not; top-level
+  module (`__package__ == pkg_name`).
+- Switch `get_logger()` to use it for *identity*; switch the
+  formatter to use `display_name` (via a record attr).
+- Re-run the full suite + golden-diff a sample of rendered log
+  headers to confirm zero cosmetic churn.
+- Coordinate the name change with `modden`/`piker`; bump +
+  CHANGES note.
+
+## Cheaper alternative — "Route A" (record-filter)
+
+If per-leaf control is wanted *before* committing to Route B:
+keep names collapsed, add a `logging.Filter` on the configured
+handler keyed on `record.module` / `record.pathname` that maps
+each record's source module -> its spec level. Set the base
+logger to the *minimum* level in the spec (so records aren't
+pre-dropped by the logger), and let the filter discriminate
+up/down within that floor.
+
+- Pros: no name churn, no formatter change, fully contained
+  next to `apply_logspec()`.
+- Cons: a filter can only discriminate *within* what the logger
+  admits -> base must be permissive, so `at_least_level()`
+  expensive-work guards over-admit; matching dotted spec names
+  to a `pathname` is fiddly; doesn't clean up the hierarchy
+  itself.
+
+## Recommendation
+
+- Defer Route B unless true per-module loggers are wanted as a
+  first-class feature.
+- If per-leaf control is needed soon, prefer **Route A**
+  (filter) — lower risk.
+- The shipped sub-PACKAGE fix already covers the common ask
+  (`devx.debug` vs `devx`).
--- a/docs/README.rst
+++ b/docs/README.rst
@ -420,20 +420,17 @@ Check out our experimental system for `guest`_-mode controlled


    async def aio_echo_server(
-        to_trio: trio.MemorySendChannel,
-        from_trio: asyncio.Queue,
+        chan: tractor.to_asyncio.LinkedTaskChannel,
    ) -> None:

        # a first message must be sent **from** this ``asyncio``
        # task or the ``trio`` side will never unblock from
        # ``tractor.to_asyncio.open_channel_from():``
-        to_trio.send_nowait('start')
+        chan.started_nowait('start')

-        # XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
-        # should probably offer something better.
        while True:
            # echo the msg back
-            to_trio.send_nowait(await from_trio.get())
+            chan.send_nowait(await chan.get())
            await asyncio.sleep(0)


@ -445,7 +442,7 @@ Check out our experimental system for `guest`_-mode controlled
        # message.
        async with tractor.to_asyncio.open_channel_from(
            aio_echo_server,
-        ) as (first, chan):
+        ) as (chan, first):

            assert first == 'start'
            await ctx.started(first)
@ -504,8 +501,10 @@ Yes, we spawn a python process, run ``asyncio``, start ``trio`` on the
 ``asyncio`` loop, then send commands to the ``trio`` scheduled tasks to
 tell ``asyncio`` tasks what to do XD

-We need help refining the `asyncio`-side channel API to be more
-`trio`-like. Feel free to sling your opinion in `#273`_!
+The ``asyncio``-side task receives a single
+``chan: LinkedTaskChannel`` handle providing a ``trio``-like
+API: ``.started_nowait()``, ``.send_nowait()``, ``.get()``
+and more. Feel free to sling your opinion in `#273`_!


 .. _#273: https://github.com/goodboy/tractor/issues/273
@ -641,13 +640,15 @@ Help us push toward the future of distributed `Python`.
 - Typed capability-based (dialog) protocols ( see `#196
  <https://github.com/goodboy/tractor/issues/196>`_ with draft work
  started in `#311 <https://github.com/goodboy/tractor/pull/311>`_)
- We **recently disabled CI-testing on windows** and need help getting
-  it running again! (see `#327
-  <https://github.com/goodboy/tractor/pull/327>`_). **We do have windows
-  support** (and have for quite a while) but since no active hacker
-  exists in the user-base to help test on that OS, for now we're not
-  actively maintaining testing due to the added hassle and general
-  latency..
+- **macOS is now officially supported** and tested in CI
+  alongside Linux!
+- We **recently disabled CI-testing on windows** and need
+  help getting it running again! (see `#327
+  <https://github.com/goodboy/tractor/pull/327>`_). **We do
+  have windows support** (and have for quite a while) but
+  since no active hacker exists in the user-base to help
+  test on that OS, for now we're not actively maintaining
+  testing due to the added hassle and general latency..


 Feel like saying hi?
--- a/examples/advanced_faults/ipc_failure_during_stream.py
+++ b/examples/advanced_faults/ipc_failure_during_stream.py
@ -17,6 +17,7 @@ from tractor import (
    MsgStream,
    _testing,
    trionics,
+    TransportClosed,
 )
 import trio
 import pytest
@ -208,12 +209,16 @@ async def main(
                        # TODO: is this needed or no?
                        raise

-                    except trio.ClosedResourceError:
+                    except (
+                        trio.ClosedResourceError,
+                        TransportClosed,
+                    ) as _tpt_err:
                        # NOTE: don't send if we already broke the
                        # connection to avoid raising a closed-error
                        # such that we drop through to the ctl-c
                        # mashing by user.
-                        await trio.sleep(0.01)
+                        with trio.CancelScope(shield=True):
+                            await trio.sleep(0.01)

                    # timeout: int = 1
                    # with trio.move_on_after(timeout) as cs:
@ -247,6 +252,7 @@ async def main(
                    await stream.send(i)
                    pytest.fail('stream not closed?')
                except (
+                    TransportClosed,
                    trio.ClosedResourceError,
                    trio.EndOfChannel,
                ) as send_err:
--- a/examples/debugging/asyncio_bp.py
+++ b/examples/debugging/asyncio_bp.py
@ -18,15 +18,14 @@ async def aio_sleep_forever():


 async def bp_then_error(
-    to_trio: trio.MemorySendChannel,
-    from_trio: asyncio.Queue,
+    chan: to_asyncio.LinkedTaskChannel,

    raise_after_bp: bool = True,

 ) -> None:

    # sync with `trio`-side (caller) task
-    to_trio.send_nowait('start')
+    chan.started_nowait('start')

    # NOTE: what happens here inside the hook needs some refinement..
    # => seems like it's still `.debug._set_trace()` but
@ -60,7 +59,7 @@ async def trio_ctx(
        to_asyncio.open_channel_from(
            bp_then_error,
            # raise_after_bp=not bp_before_started,
-        ) as (first, chan),
+        ) as (chan, first),

        trio.open_nursery() as tn,
    ):
--- a/examples/debugging/fast_error_in_root_after_spawn.py
+++ b/examples/debugging/fast_error_in_root_after_spawn.py
@ -20,7 +20,7 @@ async def sleep(


 async def open_ctx(
-    n: tractor._supervise.ActorNursery
+    n: tractor.runtime._supervise.ActorNursery
 ):

    # spawn both actors
--- a/examples/debugging/multi_daemon_subactors.py
+++ b/examples/debugging/multi_daemon_subactors.py
@ -27,12 +27,9 @@ async def main():
    '''
    async with tractor.open_nursery(
        debug_mode=True,
-        loglevel='cancel',
-        # loglevel='devx',
-    ) as n:
-
-        p0 = await n.start_actor('bp_forever', enable_modules=[__name__])
-        p1 = await n.start_actor('name_error', enable_modules=[__name__])
+    ) as an:
+        p0 = await an.start_actor('bp_forever', enable_modules=[__name__])
+        p1 = await an.start_actor('name_error', enable_modules=[__name__])

        # retreive results
        async with p0.open_stream_from(breakpoint_forever) as stream:
--- a/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py
+++ b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py
@ -67,7 +67,7 @@ async def main():
    """
    async with tractor.open_nursery(
        debug_mode=True,
-        # loglevel='cancel',
+        loglevel='pdb',
    ) as n:

        # spawn both actors
--- a/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py
+++ b/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py
@ -39,8 +39,8 @@ async def main():
    '''
    async with tractor.open_nursery(
        debug_mode=True,
-        loglevel='devx',
-        enable_transports=['uds'],
+        enable_transports=['uds'],  # TODO, apss this via osenv?
+        loglevel='devx',  # XXX, required for test!
    ) as n:

        # spawn both actors
--- a/examples/debugging/root_timeout_while_child_crashed.py
+++ b/examples/debugging/root_timeout_while_child_crashed.py
@ -1,4 +1,3 @@
-
 import trio
 import tractor

@ -9,16 +8,22 @@ async def key_error():


 async def main():
-    """Root dies 
+    '''
+    Root is fail-after-cancelled while blocking and child RPC fails
+    simultaneously.

-    """
+    '''
    async with tractor.open_nursery(
        debug_mode=True,
-        loglevel='debug'
+        # loglevel='debug'  # ?XXX required?
    ) as n:

        # spawn both actors
        portal = await n.run_in_actor(key_error)
+        print(
+            f'Child is up @ {portal.chan.aid.reprol()}'
+        )
+

        # XXX: originally a bug caused by this is where root would enter
        # the debugger and clobber the tty used by the repl even though
--- a/examples/debugging/shield_hang_in_sub.py
+++ b/examples/debugging/shield_hang_in_sub.py
@ -3,6 +3,7 @@ Verify we can dump a `stackscope` tree on a hang.

 '''
 import os
+import platform
 import signal

 import trio
@ -31,13 +32,28 @@ async def main(
    from_test: bool = False,
 ) -> None:

+    if platform.system() != 'Darwin':
+        tpt = 'uds'
+    else:
+        # XXX, precisely we can't use pytest's tmp-path generation
+        # for tests.. apparently because:
+        #
+        # > The OSError: AF_UNIX path too long in macOS Python occurs
+        # > because the path to the Unix domain socket exceeds the
+        # > operating system's maximum path length limit (around 104
+        #
+        # WHICH IS just, wtf hillarious XD
+        tpt = 'tcp'
+
    async with (
        tractor.open_nursery(
            debug_mode=True,
            enable_stack_on_sig=True,
-            # maybe_enable_greenback=False,
-            loglevel='devx',
-            enable_transports=['uds'],
+            loglevel='devx',  # XXX REQUIRED log level!
+            enable_transports=[tpt],
+            # maybe_enable_greenback=True,
+            # ^TODO? maybe a "smarter" way todo all this is how
+            # `modden` does with a rtv serialized through the osenv?
        ) as an,
    ):
        ptl: tractor.Portal  = await an.start_actor(
@ -49,7 +65,9 @@ async def main(
            start_n_shield_hang,
        ) as (ctx, cpid):

-            _, proc, _ = an._children[ptl.chan.uid]
+            _, proc, _ = an._children[
+                ptl.chan.aid.uid
+            ]
            assert cpid == proc.pid

            print(
--- a/examples/debugging/subactor_bp_in_ctx.py
+++ b/examples/debugging/subactor_bp_in_ctx.py
@ -1,3 +1,5 @@
+import platform
+
 import tractor
 import trio

@ -34,9 +36,27 @@ async def just_bp(

 async def main():

+    # !TODO, parametrize the --tpt-proto={key} with osenv vars just
+    # like we do for loglevel/spawn-backend!
+    # - [ ] run on both tpts for all such debugger tests?
+    # - [ ] special skip for macos!
+    #
+    if platform.system() != 'Darwin':
+        tpt = 'uds'
+    else:
+        # XXX, precisely we can't use pytest's tmp-path generation
+        # for tests.. apparently because:
+        #
+        # > The OSError: AF_UNIX path too long in macOS Python occurs
+        # > because the path to the Unix domain socket exceeds the
+        # > operating system's maximum path length limit (around 104
+        #
+        # WHICH IS just, wtf hillarious XD
+        tpt = 'tcp'
+
    async with tractor.open_nursery(
        debug_mode=True,
-        enable_transports=['uds'],
+        enable_transports=[tpt],
        loglevel='devx',
    ) as n:
        p = await n.start_actor(
--- a/examples/debugging/subactor_error.py
+++ b/examples/debugging/subactor_error.py
@ -9,7 +9,6 @@ async def name_error():
 async def main():
    async with tractor.open_nursery(
        debug_mode=True,
-        # loglevel='transport',
    ) as an:

        # TODO: ideally the REPL arrives at this frame in the parent,
--- a/examples/debugging/sync_bp.py
+++ b/examples/debugging/sync_bp.py
@ -1,9 +1,22 @@
 from functools import partial
+import os
 import time

+# ?TODO? how to make `pdbp` enforce this?
+# os.environ['PYTHON_COLORS'] = '0'
+# os.environ['NO_COLOR'] = '1'
+
 import trio
 import tractor

+# disable `pbdp` prompt colors
+# for prompt matching in test.
+def disable_pdbp_color():
+    if os.environ['PYTHON_COLORS'] == '0':
+        from tractor.devx.debug import _repl
+        _repl.TractorConfig.use_pygments = False
+
+
 # TODO: only import these when not running from test harness?
 # can we detect `pexpect` usage maybe?
 # from tractor.devx.debug import (
@ -42,6 +55,7 @@ async def start_n_sync_pause(
    ctx: tractor.Context,
 ):
    actor: tractor.Actor = tractor.current_actor()
+    disable_pdbp_color()

    # sync to parent-side task
    await ctx.started()
@ -52,13 +66,15 @@ async def start_n_sync_pause(


 async def main() -> None:
+    disable_pdbp_color()
    async with (
        tractor.open_nursery(
            debug_mode=True,
            maybe_enable_greenback=True,
-            enable_stack_on_sig=True,
-            # loglevel='warning',
-            # loglevel='devx',
+
+            # XXX flags required for test pattern matching.
+            loglevel='pdb',
+            # enable_stack_on_sig=True,
        ) as an,
        trio.open_nursery() as tn,
    ):
@ -68,8 +84,8 @@ async def main() -> None:
        p: tractor.Portal  = await an.start_actor(
            'subactor',
            enable_modules=[__name__],
-            # infect_asyncio=True,
            debug_mode=True,
+            # infect_asyncio=True,
        )

        # TODO: 3 sub-actor usage cases:
--- a/examples/full_fledged_streaming_service.py
+++ b/examples/full_fledged_streaming_service.py
@ -90,7 +90,7 @@ async def main() -> list[int]:
    # yes, a nursery which spawns `trio`-"actors" B)
    an: ActorNursery
    async with tractor.open_nursery(
-        loglevel='cancel',
+        loglevel='error',
        # debug_mode=True,
    ) as an:

@ -118,8 +118,10 @@ async def main() -> list[int]:
        cancelled: bool = await portal.cancel_actor()
        assert cancelled

-        print(f"STREAM TIME = {time.time() - start}")
-        print(f"STREAM + SPAWN TIME = {time.time() - pre_start}")
+        print(
+            f"STREAM TIME = {time.time() - start}\n"
+            f"STREAM + SPAWN TIME = {time.time() - pre_start}\n"
+        )
        assert result_stream == list(range(seed))
        return result_stream

--- a/examples/infected_asyncio_echo_server.py
+++ b/examples/infected_asyncio_echo_server.py
@ -11,21 +11,17 @@ import tractor


 async def aio_echo_server(
-    to_trio: trio.MemorySendChannel,
-    from_trio: asyncio.Queue,
-
+    chan: tractor.to_asyncio.LinkedTaskChannel,
 ) -> None:

    # a first message must be sent **from** this ``asyncio``
    # task or the ``trio`` side will never unblock from
    # ``tractor.to_asyncio.open_channel_from():``
-    to_trio.send_nowait('start')
+    chan.started_nowait('start')

-    # XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
-    # should probably offer something better.
    while True:
        # echo the msg back
-        to_trio.send_nowait(await from_trio.get())
+        chan.send_nowait(await chan.get())
        await asyncio.sleep(0)


@ -37,7 +33,7 @@ async def trio_to_aio_echo_server(
    # message.
    async with tractor.to_asyncio.open_channel_from(
        aio_echo_server,
-    ) as (first, chan):
+    ) as (chan, first):

        assert first == 'start'
        await ctx.started(first)
--- a/examples/integration/mpi4py/init.py
+++ b/examples/integration/mpi4py/init.py
--- a/examples/integration/mpi4py/_child.py
+++ b/examples/integration/mpi4py/_child.py
@ -0,0 +1,5 @@
+import os
+
+
+async def child_fn() -> str:
+    return f"child OK  pid={os.getpid()}"
--- a/examples/integration/mpi4py/inherit_parent_main.py
+++ b/examples/integration/mpi4py/inherit_parent_main.py
@ -0,0 +1,50 @@
+"""
+Integration test: spawning tractor actors from an MPI process.
+
+When a parent is launched via ``mpirun``, Open MPI sets ``OMPI_*`` env
+vars that bind ``MPI_Init`` to the ``orted`` daemon.  Tractor children
+inherit those env vars, so if ``inherit_parent_main=True`` (the default)
+the child re-executes ``__main__``, re-imports ``mpi4py``, and
+``MPI_Init_thread`` fails because the child was never spawned by
+``orted``::
+
+    getting local rank failed
+      --> Returned value No permission (-17) instead of ORTE_SUCCESS
+
+Passing ``inherit_parent_main=False`` and placing RPC functions in a
+separate importable module (``_child``) avoids the re-import entirely.
+
+Usage::
+
+    mpirun --allow-run-as-root -np 1 python -m \
+        examples.integration.mpi4py.inherit_parent_main
+"""
+
+from mpi4py import MPI
+
+import os
+import trio
+import tractor
+
+from ._child import child_fn
+
+
+async def main() -> None:
+    rank = MPI.COMM_WORLD.Get_rank()
+    print(f"[parent] rank={rank}  pid={os.getpid()}", flush=True)
+
+    async with tractor.open_nursery(start_method='trio') as an:
+        portal = await an.start_actor(
+            'mpi-child',
+            enable_modules=[child_fn.__module__],
+            # Without this the child replays __main__, which
+            # re-imports mpi4py and crashes on MPI_Init.
+            inherit_parent_main=False,
+        )
+        result = await portal.run(child_fn)
+        print(f"[parent] got: {result}", flush=True)
+        await portal.cancel_actor()
+
+
+if __name__ == "__main__":
+    trio.run(main)
--- a/examples/service_discovery.py
+++ b/examples/service_discovery.py
@ -10,7 +10,7 @@ async def main(service_name):
        await an.start_actor(service_name)

        async with tractor.get_registry() as portal:
-            print(f"Arbiter is listening on {portal.channel}")
+            print(f"Registrar is listening on {portal.channel}")

        async with tractor.wait_for_actor(service_name) as sockaddr:
            print(f"my_service is found at {sockaddr}")
--- a/flake.lock
+++ b/flake.lock
@ -0,0 +1,27 @@
+{
+  "nodes": {
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1769018530,
+        "narHash": "sha256-MJ27Cy2NtBEV5tsK+YraYr2g851f3Fl1LpNHDzDX15c=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "88d3861acdd3d2f0e361767018218e51810df8a1",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
--- a/flake.nix
+++ b/flake.nix
@ -0,0 +1,70 @@
+# An "impure" template thx to `pyproject.nix`,
+# https://pyproject-nix.github.io/pyproject.nix/templates.html#impure
+# https://github.com/pyproject-nix/pyproject.nix/blob/master/templates/impure/flake.nix
+{
+  description = "An impure overlay (w dev-shell) using `uv`";
+
+  inputs = {
+    nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
+  };
+
+  outputs =
+    { nixpkgs, ... }:
+    let
+      inherit (nixpkgs) lib;
+      forAllSystems = lib.genAttrs lib.systems.flakeExposed;
+    in
+    {
+      devShells = forAllSystems (
+        system:
+        let
+          pkgs = nixpkgs.legacyPackages.${system};
+
+          # XXX NOTE XXX, for now we overlay specific pkgs via
+          # a major-version-pinned-`cpython`
+          cpython = "python313";
+          venv_dir = "py313";
+          pypkgs = pkgs."${cpython}Packages";
+        in
+        {
+          default = pkgs.mkShell {
+
+            packages = [
+              # XXX, ensure sh completions activate!
+              pkgs.bashInteractive
+              pkgs.bash-completion
+
+              # XXX, on nix(os), use pkgs version to avoid
+              # build/sys-sh-integration issues
+              pkgs.ruff
+
+              pkgs.uv
+              pkgs.${cpython}# ?TODO^ how to set from `cpython` above?
+            ];
+
+            shellHook = ''
+              # unmask to debug **this** dev-shell-hook
+              # set -e
+
+              # link-in c++ stdlib for various AOT-ext-pkgs (numpy, etc.)
+              LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH"
+
+              export LD_LIBRARY_PATH
+
+              # RUNTIME-SETTINGS
+              # ------ uv ------
+              # - always use the ./py313/ venv-subdir
+              # - sync env with all extras
+              export UV_PROJECT_ENVIRONMENT=${venv_dir}
+              uv sync --dev --all-extras
+
+              # ------ TIPS ------
+              # NOTE, to launch the py-venv installed `xonsh` (like @goodboy)
+              # run the `nix develop` cmd with,
+              # >> nix develop -c uv run xonsh
+            '';
+          };
+        }
+      );
+    };
+}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -9,7 +9,7 @@ name = "tractor"
 version = "0.1.0a6dev0"
 description = 'structured concurrent `trio`-"actors"'
 authors = [{ name = "Tyler Goodlet", email = "goodboy_foss@protonmail.com" }]
-requires-python = ">= 3.11"
+requires-python = ">=3.13, <3.15"
 readme = "docs/README.rst"
 license = "AGPL-3.0-or-later"
 keywords = [
@ -24,11 +24,14 @@ keywords = [
 classifiers = [
  "Development Status :: 3 - Alpha",
  "Operating System :: POSIX :: Linux",
+  "Operating System :: MacOS",
  "Framework :: Trio",
  "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
  "Programming Language :: Python :: Implementation :: CPython",
  "Programming Language :: Python :: 3 :: Only",
-  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3.14",
  "Topic :: System :: Distributed Computing",
 ]
 dependencies = [
@ -42,48 +45,115 @@ dependencies = [
  "wrapt>=1.16.0,<2",
  "colorlog>=6.8.2,<7",
  # built-in multi-actor `pdb` REPL
-  "pdbp>=1.6,<2", # windows only (from `pdbp`)
+  "pdbp>=1.8.2,<2", # windows only (from `pdbp`)
  # typed IPC msging
-  "msgspec>=0.19.0",
-  "cffi>=1.17.1",
+  "msgspec>=0.20.0",
  "bidict>=0.23.1",
+  "multiaddr>=0.2.0",
+  "platformdirs>=4.4.0",
+  # per-actor `argv[0]` proc-title for OS-level diag tools
+  # (`ps`, `top`, `psutil`-backed tooling like `acli.pytree`).
+  # Optional at runtime — guarded by `try/except ImportError` in
+  # `tractor.devx._proctitle` — but listed here so default
+  # installs benefit from it. See tracking issue for follow-ups
+  # (e.g. richer formats, per-backend overrides).
+  "setproctitle>=1.3,<2",
 ]

 # ------ project ------

 [dependency-groups]
 dev = [
-  # test suite
-  # TODO: maybe some of these layout choices?
-  # https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
-  "pytest>=8.3.5",
-  "pexpect>=4.9.0,<5",
+  {include-group = 'devx'},
+  {include-group = 'testing'},
+  {include-group = 'repl'},
+  {include-group = 'sync_pause'},
+]
+devx = [
  # `tractor.devx` tooling
-  "greenback>=1.2.1,<2",
  "stackscope>=0.2.2,<0.3",
  # ^ requires this?
  "typing-extensions>=4.14.1",
-
+  # {include-group = 'sync_pause'},  # XXX, no 3.14 yet!
+]
+sync_pause = [
+  "greenback>=1.2.1,<2",  # TODO? 3.14 greenlet on nix?
+]
+testing = [
+  # test suite
+  # TODO: maybe some of these layout choices?
+  # https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
+  # bumped 8.3.5 → 9.0 per upstream security advisory + our
+  # local-only reliance on the post-9.0 capture-machinery shape
+  # (the `sys.__stderr__`-bypass print in
+  # `tractor._testing.trace._do_capture_snapshot` works on 8.x
+  # too, but standardizing on 9.x here ensures `--show-capture`
+  # interactions stay predictable across dev installs).
+  "pytest>=9.0",
+  "pexpect>=4.9.0,<5",
+  # per-test wall-clock bound (used via
+  # `@pytest.mark.timeout(..., method='thread')` on the
+  # known-hanging `subint`-backend audit tests; see
+  # `ai/conc-anal/subint_*_issue.md`).
+  "pytest-timeout>=2.3",
+  # used by `tractor._testing._reap` for the
+  # `tractor-reap` zombie-subactor + leaked-shm
+  # cleanup utility (xplatform `Process.memory_maps`,
+  # `Process.open_files`).
+  "psutil>=7.0.0",
+]
+repl = [
  "pyperclip>=1.9.0",
  "prompt-toolkit>=3.0.50",
-  "xonsh>=0.19.2",
+  "xonsh>=0.23.8",
  "psutil>=7.0.0",
 ]
+lint = [
+  "ruff>=0.9.6"
+]
+# XXX, used for linux-only hi perf eventfd+shm channels
+# now mostly moved over to `hotbaud`.
+eventfd = [
+  "cffi>=1.17.1",
+]
+subints = [
+  "msgspec>=0.21.0",
+]
 # TODO, add these with sane versions; were originally in
 # `requirements-docs.txt`..
 # docs = [
 #   "sphinx>="
 #   "sphinx_book_theme>="
 # ]
-
 # ------ dependency-groups ------

-# ------ dependency-groups ------
+[tool.uv.dependency-groups]
+# for subints, we require 3.14+ due to 2 issues,
+# - hanging behaviour for various multi-task teardown cases (see
+#   "Availability" section in the `tractor.spawn._subints` doc string).
+# - `msgspec` support which is oustanding per PEP 684 upstream tracker:
+#   https://github.com/jcrist/msgspec/issues/563
+#
+# https://docs.astral.sh/uv/concepts/projects/dependencies/#group-requires-python
+subints = {requires-python = ">=3.14"}
+eventfd = {requires-python = ">=3.13, <3.14"}
+sync_pause = {requires-python = ">=3.13, <3.14"}

 [tool.uv.sources]
 # XXX NOTE, only for @goodboy's hacking on `pprint(sort_dicts=False)`
 # for the `pp` alias..
-# pdbp = { path = "../pdbp", editable = true }
+# ------ gh upstream ------
+# xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
+# ^ https://github.com/xonsh/xonsh/pull/6048
+# xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
+# xonsh = { path = "../xonsh", editable = true }
+
+# [tool.uv.sources.pdbp]
+# XXX, in case we need to tmp patch again.
+# git = "https://github.com/goodboy/pdbp.git"
+# branch ="repair_stack_trace_frame_indexing"
+# path = "../pdbp"
+# editable = true

 # ------ tool.uv.sources ------
 # TODO, distributed (multi-host) extensions
@ -145,20 +215,69 @@ all_bullets = true

 [tool.pytest.ini_options]
 minversion = '6.0'
+# NOTE: `pytest-timeout`'s global per-test cap is intentionally
+# NOT set — both of its enforcement methods break trio's
+# runtime under our fork-based spawn backends:
+#
+# - `method='signal'` (the default; SIGALRM) raises `Failed`
+#   synchronously from the signal handler in trio's main
+#   thread, which leaves `GLOBAL_RUN_CONTEXT` half-installed
+#   ("Trio guest run got abandoned"). EVERY subsequent
+#   `trio.run()` in the same pytest session then bails with
+#   `RuntimeError: Attempted to call run() from inside a
+#   run()` — full-session poison: a single 200s hang
+#   cascades into 30+ false-positive failures across
+#   downstream test files.
+#
+# - `method='thread'` calls `_thread.interrupt_main()` which
+#   can let the resulting `KeyboardInterrupt` escape trio's
+#   `KIManager` under fork-cascade teardown races, killing
+#   the whole pytest session.
+#
+# For tests that legitimately need a wall-clock cap, use
+# `with trio.fail_after(N):` INSIDE the test — trio's own
+# Cancelled machinery handles the timeout cleanly through
+# the actor nursery without disturbing global state. See
+# `tests/test_advanced_streaming.py::test_dynamic_pub_sub`'s
+# module-level NOTE for the canonical pattern.
+#
+# CI environments should rely on job-level wall-clock
+# timeouts (e.g. GitHub Actions `timeout-minutes`) for an
+# escape hatch on genuinely-stuck suites.
+# https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
 testpaths = [
  'tests'
 ]
 addopts = [
  # TODO: figure out why this isn't working..
  '--rootdir=./tests',
-
  '--import-mode=importlib',
  # don't show frickin captured logs AGAIN in the report..
  '--show-capture=no',
+
+  # load builtin plugin since we need a boostrapping hook,
+  # `pytest_load_initial_conftests()` for `--capture=` per:
+  # https://docs.pytest.org/en/stable/reference/reference.html#bootstrapping-hooks
+  '-p tractor._testing.pytest',
+
+  # disable `xonsh` plugin
+  # https://docs.pytest.org/en/stable/how-to/plugins.html#disabling-plugins-from-autoloading
+  # https://docs.pytest.org/en/stable/how-to/plugins.html#deactivating-unregistering-a-plugin-by-name
+  '-p no:xonsh',
+
+  # XXX default on non-forking spawners
+  '--capture=fd',
+  # '--capture=sys',
+  # ^XXX NOTE^ ALWAYS SET THIS for `*_forkserver` spawner
+  # backends! see details @
+  # `tractor._testing.pytest.pytest_load_initial_conftests()`
+
 ]
 log_cli = false
 # TODO: maybe some of these layout choices?
 # https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
 # pythonpath = "src"

+# https://docs.pytest.org/en/stable/reference/reference.html#confval-console_output_style
+console_output_style = 'progress'
 # ------ tool.pytest ------
--- a/pytest.ini
+++ b/pytest.ini
@ -1,8 +0,0 @@
-# vim: ft=ini
-# pytest.ini for tractor
-
-[pytest]
-# don't show frickin captured logs AGAIN in the report..
-addopts = --show-capture='no'
-log_cli = false
-; minversion = 6.0
--- a/ruff.toml
+++ b/ruff.toml
@ -35,8 +35,8 @@ exclude = [
 line-length = 88
 indent-width = 4

-# Assume Python 3.9
-target-version = "py311"
+# assume latest minor cpython
+target-version = "py313"

 [lint]
 # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`)  codes by default.
--- a/scripts/tractor-reap
+++ b/scripts/tractor-reap
@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+# tractor: structured concurrent "actors".
+# Copyright 2018-eternity Tyler Goodlet.
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+'''
+`tractor-reap` — SC-polite zombie-subactor reaper +
+optional `/dev/shm/` orphan-segment sweep.
+
+Two cleanup phases (run in order when both are enabled):
+
+1. **process reap** — finds `tractor` subactor processes
+   left alive after a `pytest` (or any tractor-app) run
+   that failed to fully cancel its actor tree, then sends
+   SIGINT with a bounded grace window before escalating
+   to SIGKILL.
+
+2. **shm sweep** (`--shm` / `--shm-only`) — unlinks
+   `/dev/shm/<file>` entries owned by the current uid
+   that no live process has open (mmap'd or fd-held).
+   Needed because `tractor` disables
+   `mp.resource_tracker` (see `tractor.ipc._mp_bs`), so a
+   hard-crashing actor leaves leaked segments that
+   nothing else GCs.
+
+3. **UDS sweep** (`--uds` / `--uds-only`) — unlinks
+   `${XDG_RUNTIME_DIR}/tractor/<name>@<pid>.sock` files
+   whose binder pid is dead (or the `1616` registry
+   sentinel). Needed because the IPC server's
+   `os.unlink()` cleanup lives in a `finally:` block
+   that doesn't always run on hard exits (SIGKILL,
+   escaped `KeyboardInterrupt`, etc.) — see issue #452.
+
+Process-reap detection modes (auto-selected):
+
+    --parent <pid>  : descendant-mode — kill procs whose
+                      PPid == <pid>. Use when a parent
+                      is still alive and you want to
+                      scope the sweep precisely (e.g.
+                      CI wrapper calling in from outside
+                      pytest).
+
+    (default)       : orphan-mode — kill procs with
+                      PPid==1 (init-reparented) whose
+                      cwd matches the repo root AND
+                      whose cmdline contains `python`.
+                      The cwd filter is what prevents
+                      sweeping unrelated init-children.
+
+Usage:
+
+    # process reap only (default)
+    scripts/tractor-reap
+
+    # process reap + shm sweep
+    scripts/tractor-reap --shm
+
+    # only the shm sweep, skip process reap
+    scripts/tractor-reap --shm-only
+
+    # process reap + shm + UDS sweep (the works)
+    scripts/tractor-reap --shm --uds
+
+    # only UDS sweep
+    scripts/tractor-reap --uds-only
+
+    # from inside a still-live supervisor
+    scripts/tractor-reap --parent 12345
+
+    # dry-run: list what would be reaped, don't act
+    scripts/tractor-reap -n
+    scripts/tractor-reap --shm --uds -n
+
+'''
+import argparse
+import pathlib
+import subprocess
+import sys
+
+
+def _repo_root() -> pathlib.Path:
+    '''
+    Use `git rev-parse --show-toplevel` when available;
+    fall back to the repo this script lives in.
+
+    '''
+    try:
+        out: str = subprocess.check_output(
+            ['git', 'rev-parse', '--show-toplevel'],
+            stderr=subprocess.DEVNULL,
+            text=True,
+        ).strip()
+        return pathlib.Path(out)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return pathlib.Path(__file__).resolve().parent.parent
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        prog='tractor-reap',
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        '--parent', '-p',
+        type=int,
+        default=None,
+        help='descendant-mode: reap procs with PPid==<pid>',
+    )
+    parser.add_argument(
+        '--grace', '-g',
+        type=float,
+        default=3.0,
+        help='SIGINT grace window in seconds (default 3.0)',
+    )
+    parser.add_argument(
+        '--dry-run', '-n',
+        action='store_true',
+        help='list matched pids/paths but do not signal/unlink',
+    )
+    parser.add_argument(
+        '--shm',
+        action='store_true',
+        help=(
+            'after process reap, also unlink orphaned '
+            '/dev/shm segments owned by the current user '
+            'that no live process is mapping or holding open'
+        ),
+    )
+    parser.add_argument(
+        '--shm-only',
+        action='store_true',
+        help='skip process reap; only do the shm sweep',
+    )
+    parser.add_argument(
+        '--uds',
+        action='store_true',
+        help=(
+            'after process reap, also unlink orphaned '
+            '${XDG_RUNTIME_DIR}/tractor/*.sock files '
+            'whose binder pid is dead (or the 1616 '
+            'registry sentinel). See issue #452.'
+        ),
+    )
+    parser.add_argument(
+        '--uds-only',
+        action='store_true',
+        help='skip process reap + shm; only do the UDS sweep',
+    )
+    args = parser.parse_args()
+    # any *-only flag also skips the process reap phase
+    skip_proc_reap: bool = (
+        args.shm_only
+        or
+        args.uds_only
+    )
+
+    # import lazily so `--help` doesn't require the tractor
+    # package to be importable (e.g. when running from a
+    # shell not inside a venv).
+    repo = _repo_root()
+    sys.path.insert(0, str(repo))
+    from tractor._testing._reap import (
+        find_descendants,
+        find_orphans,
+        find_orphaned_shm,
+        find_orphaned_uds,
+        reap,
+        reap_shm,
+        reap_uds,
+    )
+
+    rc: int = 0
+
+    # --- phase 1: process reap (skipped under --*-only) ---
+    if not skip_proc_reap:
+        if args.parent is not None:
+            pids: list[int] = find_descendants(args.parent)
+            mode: str = f'descendants of PPid={args.parent}'
+        else:
+            pids = find_orphans(repo)
+            mode = f'orphans (PPid=1, cwd={repo})'
+
+        if not pids:
+            print(f'[tractor-reap] no {mode} to reap')
+        elif args.dry_run:
+            print(
+                f'[tractor-reap] dry-run — {mode}:\n  {pids}'
+            )
+        else:
+            _, survivors = reap(pids, grace=args.grace)
+            if survivors:
+                rc = 1
+
+    # --- phase 2: shm sweep (opt-in) ---
+    if args.shm or args.shm_only:
+        leaked: list[str] = find_orphaned_shm()
+        if not leaked:
+            print(
+                '[tractor-reap] no orphaned /dev/shm '
+                'segments to sweep'
+            )
+        elif args.dry_run:
+            print(
+                f'[tractor-reap] dry-run — {len(leaked)} '
+                f'orphaned shm segment(s):\n  {leaked}'
+            )
+        else:
+            _, errors = reap_shm(leaked)
+            if errors:
+                rc = 1
+
+    # --- phase 3: UDS sweep (opt-in) ---
+    if args.uds or args.uds_only:
+        leaked_uds: list[str] = find_orphaned_uds()
+        if not leaked_uds:
+            print(
+                '[tractor-reap] no orphaned UDS sock-files '
+                'to sweep'
+            )
+        elif args.dry_run:
+            print(
+                f'[tractor-reap] dry-run — {len(leaked_uds)} '
+                f'orphaned UDS sock-file(s):\n  {leaked_uds}'
+            )
+        else:
+            _, errors = reap_uds(leaked_uds)
+            if errors:
+                rc = 1
+
+    # exit 0 if everything cleaned cleanly, else 1 — useful
+    # for CI health-check chaining.
+    return rc
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -9,8 +9,11 @@ import os
 import signal
 import platform
 import time
+from pathlib import Path
+from typing import Literal

 import pytest
+import tractor
 from tractor._testing import (
    examples_dir as examples_dir,
    tractor_test as tractor_test,
@ -19,58 +22,111 @@ from tractor._testing import (

 pytest_plugins: list[str] = [
    'pytester',
-    'tractor._testing.pytest',
+    # NOTE, now loaded in `pytest-ini` section of `pyproject.toml`
+    # 'tractor._testing.pytest',
 ]

+_ci_env: bool = os.environ.get('CI', False)
+_non_linux: bool = platform.system() != 'Linux'

 # Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
 if platform.system() == 'Windows':
    _KILL_SIGNAL = signal.CTRL_BREAK_EVENT
    _INT_SIGNAL = signal.CTRL_C_EVENT
    _INT_RETURN_CODE = 3221225786
-    _PROC_SPAWN_WAIT = 2
 else:
    _KILL_SIGNAL = signal.SIGKILL
    _INT_SIGNAL = signal.SIGINT
    _INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
-    _PROC_SPAWN_WAIT = (
-        0.6
-        if sys.version_info < (3, 7)
-        else 0.4
-    )


 no_windows = pytest.mark.skipif(
    platform.system() == "Windows",
    reason="Test is unsupported on windows",
 )
+no_macos = pytest.mark.skipif(
+    platform.system() == "Darwin",
+    reason="Test is unsupported on MacOS",
+)


-def pytest_addoption(
-    parser: pytest.Parser,
-):
-    # ?TODO? should this be exposed from our `._testing.pytest`
-    # plugin or should we make it more explicit with `--tl` for
-    # tractor logging like we do in other client projects?
-    parser.addoption(
-        "--ll",
-        action="store",
-        dest='loglevel',
-        default='ERROR', help="logging level to set when testing"
-    )
+def get_cpu_state(
+    icpu: int = 0,
+    setting: Literal[
+        'scaling_governor',
+        '*_pstate_max_freq',
+        'scaling_max_freq',
+        # 'scaling_cur_freq',
+    ] = '*_pstate_max_freq',
+) -> tuple[
+    Path,
+    str|int,
+]|None:
+    '''
+    Attempt to read the (first) CPU's setting according
+    to the set `setting` from under the file-sys,
+
+    /sys/devices/system/cpu/cpu0/cpufreq/{setting}
+
+    Useful to determine latency headroom for various perf affected
+    test suites.
+
+    '''
+    try:
+        # Read governor for core 0 (usually same for all)
+        setting_path: Path = list(
+            Path(f'/sys/devices/system/cpu/cpu{icpu}/cpufreq/')
+            .glob(f'{setting}')
+        )[0]  # <- XXX must be single match!
+        with open(
+            setting_path,
+            'r',
+        ) as f:
+            return (
+                setting_path,
+                f.read().strip(),
+            )
+    except (FileNotFoundError, IndexError):
+        return None


-@pytest.fixture(scope='session', autouse=True)
-def loglevel(request):
-    import tractor
-    orig = tractor.log._default_loglevel
-    level = tractor.log._default_loglevel = request.config.option.loglevel
-    tractor.log.get_console_log(level)
-    yield level
-    tractor.log._default_loglevel = orig
+def cpu_scaling_factor() -> float:
+    '''
+    Return a latency-headroom multiplier (>= 1.0) reflecting how
+    much to inflate time-limits when CPU-freq scaling is active on
+    linux.
+
+    When no scaling info is available (non-linux, missing sysfs),
+    returns 1.0 (i.e. no headroom adjustment needed).
+
+    '''
+    if _non_linux:
+        return 1.
+
+    mx = get_cpu_state()
+    cur = get_cpu_state(setting='scaling_max_freq')
+    if mx is None or cur is None:
+        return 1.
+
+    _mx_pth, max_freq = mx
+    _cur_pth, cur_freq = cur
+    cpu_scaled: float = int(cur_freq) / int(max_freq)
+
+    if cpu_scaled != 1.:
+        return 1. / (
+            cpu_scaled * 2  # <- bc likely "dual threaded"
+        )
+
+    return 1.


-_ci_env: bool = os.environ.get('CI', False)
+# NOTE, the `--ll`/`--tl` CLI flags + the `loglevel`, `test_log`
+# and `testing_pkg_name` fixtures have been factored into the
+# `tractor._testing.pytest` plugin (loaded via the `-p` entry in
+# `pyproject.toml`'s `[tool.pytest.ini_options]`) so downstream
+# consuming projects (eg. `modden`) inherit them for free. The
+# plugin's `testing_pkg_name` fixture defaults to `'tractor'`, so
+# this suite keeps treating `--ll` as the runtime loglevel.


@pytest.fixture(scope='session')
@ -85,92 +141,51 @@ def ci_env() -> bool:
 def sig_prog(
    proc: subprocess.Popen,
    sig: int,
-    canc_timeout: float = 0.1,
+    canc_timeout: float = 0.2,
+    tries: int = 3,
 ) -> int:
-    "Kill the actor-process with ``sig``."
-    proc.send_signal(sig)
-    time.sleep(canc_timeout)
-    if not proc.poll():
+    '''
+    Kill the actor-process with `sig`.
+
+    Prefer to kill with the provided signal and
+    failing a `canc_timeout`, send a `SIKILL`-like
+    to ensure termination.
+
+    '''
+    for i in range(tries):
+        proc.send_signal(sig)
+        if proc.poll() is None:
+            print(
+                f'WARNING, proc still alive after,\n'
+                f'canc_timeout={canc_timeout!r}\n'
+                f'sig={sig!r}\n'
+                f'\n'
+                f'{proc.args!r}\n'
+            )
+            time.sleep(canc_timeout)
+    else:
        # TODO: why sometimes does SIGINT not work on teardown?
        # seems to happen only when trace logging enabled?
-        proc.send_signal(_KILL_SIGNAL)
+        if proc.poll() is None:
+            print(
+                f'XXX WARNING KILLING PROG WITH SIGINT XXX\n'
+                f'canc_timeout={canc_timeout!r}\n'
+                f'{proc.args!r}\n'
+            )
+            proc.send_signal(_KILL_SIGNAL)
+
    ret: int = proc.wait()
    assert ret


-# TODO: factor into @cm and move to `._testing`?
-@pytest.fixture
-def daemon(
-    debug_mode: bool,
-    loglevel: str,
-    testdir: pytest.Pytester,
-    reg_addr: tuple[str, int],
-    tpt_proto: str,
-
-) -> subprocess.Popen:
-    '''
-    Run a daemon root actor as a separate actor-process tree and
-    "remote registrar" for discovery-protocol related tests.
-
-    '''
-    if loglevel in ('trace', 'debug'):
-        # XXX: too much logging will lock up the subproc (smh)
-        loglevel: str = 'info'
-
-    code: str = (
-        "import tractor; "
-        "tractor.run_daemon([], "
-        "registry_addrs={reg_addrs}, "
-        "debug_mode={debug_mode}, "
-        "loglevel={ll})"
-    ).format(
-        reg_addrs=str([reg_addr]),
-        ll="'{}'".format(loglevel) if loglevel else None,
-        debug_mode=debug_mode,
-    )
-    cmd: list[str] = [
-        sys.executable,
-        '-c', code,
-    ]
-    # breakpoint()
-    kwargs = {}
-    if platform.system() == 'Windows':
-        # without this, tests hang on windows forever
-        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
-
-    proc: subprocess.Popen = testdir.popen(
-        cmd,
-        **kwargs,
-    )
-
-    # UDS sockets are **really** fast to bind()/listen()/connect()
-    # so it's often required that we delay a bit more starting
-    # the first actor-tree..
-    if tpt_proto == 'uds':
-        global _PROC_SPAWN_WAIT
-        _PROC_SPAWN_WAIT = 0.6
-
-    time.sleep(_PROC_SPAWN_WAIT)
-
-    assert not proc.returncode
-    yield proc
-    sig_prog(proc, _INT_SIGNAL)
-
-    # XXX! yeah.. just be reaaal careful with this bc sometimes it
-    # can lock up on the `_io.BufferedReader` and hang..
-    stderr: str = proc.stderr.read().decode()
-    if stderr:
-        print(
-            f'Daemon actor tree produced STDERR:\n'
-            f'{proc.args}\n'
-            f'\n'
-            f'{stderr}\n'
-        )
-    if proc.returncode != -2:
-        raise RuntimeError(
-            'Daemon actor tree failed !?\n'
-            f'{proc.args}\n'
-        )
+# NOTE, the `daemon` fixture (+ its `_wait_for_daemon_ready`
+# helper + the post-yield teardown drain logic) has been
+# moved to `tests/discovery/conftest.py` since 100% of its
+# consumers are discovery-protocol tests now living under
+# that subdir. See:
+# - `tests/discovery/test_multi_program.py`
+# - `tests/discovery/test_registrar.py`
+# - `tests/discovery/test_tpt_bind_addrs.py`


 # @pytest.fixture(autouse=True)
--- a/tests/devx/conftest.py
+++ b/tests/devx/conftest.py
@ -3,6 +3,10 @@

 '''
 from __future__ import annotations
+import platform
+import os
+import re
+import signal
 import time
 from typing import (
    Callable,
@ -32,14 +36,29 @@ if TYPE_CHECKING:
    from pexpect import pty_spawn


+_non_linux: bool = platform.system() != 'Linux'
+
+
+def pytest_configure(config):
+    # register custom marks to avoid warnings see,
+    # https://docs.pytest.org/en/stable/how-to/writing_plugins.html#registering-custom-markers
+    config.addinivalue_line(
+        'markers',
+        'ctlcs_bish: test will (likely) not behave under SIGINT..'
+    )
+
 # a fn that sub-instantiates a `pexpect.spawn()`
 # and returns it.
-type PexpectSpawner = Callable[[str], pty_spawn.spawn]
+type PexpectSpawner = Callable[
+    [str],
+    pty_spawn.spawn,
+]


@pytest.fixture
 def spawn(
    start_method: str,
+    loglevel: str,
    testdir: pytest.Pytester,
    reg_addr: tuple[str, int],

@ -49,9 +68,19 @@ def spawn(
    run an `./examples/..` script by name.

    '''
-    if start_method != 'trio':
+    supported_spawners: set[str] = {
+        'trio',
+        # `examples/debugging/<script>.py` picks up the spawn
+        # backend via the `TRACTOR_SPAWN_METHOD` env-var which
+        # is honored inside `tractor._root.open_root_actor()`,
+        # so no per-script edits are required.
+        'main_thread_forkserver',
+        'subint_forkserver',
+    }
+    if start_method not in supported_spawners:
        pytest.skip(
-            '`pexpect` based tests only supported on `trio` backend'
+            f'`pexpect` based tests NOT supported on spawning-backend: {start_method!r}\n'
+            f'supported-spawners: {supported_spawners!r}'
        )

    def unset_colors():
@ -63,27 +92,117 @@ def spawn(
        https://docs.python.org/3/using/cmdline.html#using-on-controlling-color

        '''
-        import os
+        # disable colored tbs
        os.environ['PYTHON_COLORS'] = '0'
+        # disable all ANSI color output
+        # os.environ['NO_COLOR'] = '1'
+        # ?TODO, doesn't seem to disable prompt color
+        # for `pdbp`?
+
+    def set_spawn_method(
+        start_method: str,
+    ):
+        '''
+        Drive the actor-spawn backend inside the spawned
+        `examples/debugging/<script>.py` subproc via env-var
+        (consumed by `tractor._root.open_root_actor()`),
+        without requiring per-script CLI plumbing.
+
+        '''
+        os.environ['TRACTOR_SPAWN_METHOD'] = start_method
+
+    def set_loglevel(
+        loglevel: str|None,
+    ):
+        '''
+        Forward the test-suite parametrized `loglevel` into the
+        spawned `examples/debugging/<script>.py` subproc via
+        env-var (consumed by `tractor._root.open_root_actor()`),
+        so console verbosity can be cranked or silenced from
+        the test harness without per-script edits.
+
+        '''
+        if loglevel:
+            os.environ['TRACTOR_LOGLEVEL'] = loglevel
+        else:
+            os.environ.pop('TRACTOR_LOGLEVEL', None)
+
+    spawned: PexpectSpawner|None = None

    def _spawn(
        cmd: str,
+        expect_timeout: float = 4,
+        start_method: str = start_method,
+        loglevel: str|None = None,
        **mkcmd_kwargs,
    ) -> pty_spawn.spawn:
+        '''
+        Inner closure handed to consumer tests to invoke
+        `pytest.Pytester.spawn`
+
+        '''
+        nonlocal spawned
        unset_colors()
-        return testdir.spawn(
+        set_spawn_method(start_method=start_method)
+        set_loglevel(
+            loglevel=loglevel,
+            # ?TODO^ when should this be set by `--ll <level>` ?
+            # by default we apply 'error' but there should be a diff
+            # vs. when the flag IS NOT passed?
+        )
+        spawned = testdir.spawn(
            cmd=mk_cmd(
                cmd,
                **mkcmd_kwargs,
            ),
-            expect_timeout=3,
+            expect_timeout=(timeout:=(
+                expect_timeout + 6
+                if _non_linux and _ci_env
+                else expect_timeout
+            )),
            # preexec_fn=unset_colors,
            # ^TODO? get `pytest` core to expose underlying
            # `pexpect.spawn()` stuff?
        )
+        # sanity
+        assert spawned.timeout == timeout
+        return spawned

    # such that test-dep can pass input script name.
-    return _spawn  # the `PexpectSpawner`, type alias.
+    yield _spawn  # the `PexpectSpawner`, type alias.
+
+    if (
+        spawned
+        and
+        (ptyproc := spawned.ptyproc)
+    ):
+        start: float = time.time()
+        timeout: float = 5
+        while (
+            ptyproc.isalive()
+            and
+            (
+                (_time_took := (time.time() - start))
+                 <
+                 timeout
+            )
+        ):
+            ptyproc.kill(signal.SIGINT)
+            time.sleep(0.01)
+
+        if ptyproc.isalive():
+            ptyproc.kill(signal.SIGKILL)
+
+    # Scope our env-var mutations to this single fixture invocation
+    # — both `TRACTOR_SPAWN_METHOD` and `TRACTOR_LOGLEVEL` are
+    # honored by `tractor._root.open_root_actor()` so leaking them
+    # past this test could inadvertently re-route a later in-process
+    # tractor test's spawn-backend / loglevel.
+    os.environ.pop('TRACTOR_SPAWN_METHOD', None)
+    os.environ.pop('TRACTOR_LOGLEVEL', None)
+
+    # TODO? ensure we've cleaned up any UDS-paths?
+    # breakpoint()


@pytest.fixture(
@ -91,25 +210,47 @@ def spawn(
    ids='ctl-c={}'.format,
 )
 def ctlc(
-    request,
+    request: pytest.FixtureRequest,
    ci_env: bool,
-
+    start_method: str,
 ) -> bool:
+    '''
+    Parametrize and optionally skip tests which handle
+    ctlc-in-`pdbp`-REPL testing scenarios; certain spawners and actor-tree depths
+    cope very poorly with this..

-    use_ctlc = request.param
+    In particular the spawning backends from `multiprocessing` are
+    fragile, as can be the default `trio` spawner under certain
+    conditions where SIGINT is relayed down the entire subproc tree.

+    '''
+    use_ctlc: bool = request.param
    node = request.node
    markers = node.own_markers
    for mark in markers:
-        if mark.name == 'has_nested_actors':
+        if (
+            mark.name == 'has_nested_actors'
+            and
+            start_method not in {
+                # TODO, any spawners we should try again?
+                # - [ ] 'trio' but WITHOUT the SIGINT handler setup
+                #      per subproc?
+                # 'main_thread_forkserver',
+            }
+        ):
            pytest.skip(
                f'Test {node} has nested actors and fails with Ctrl-C.\n'
                f'The test can sometimes run fine locally but until'
                ' we solve' 'this issue this CI test will be xfail:\n'
                'https://github.com/goodboy/tractor/issues/320'
            )
-
-        if mark.name == 'ctlcs_bish':
+        if (
+            mark.name == 'ctlcs_bish'
+            and
+            use_ctlc
+            and
+            all(mark.args)
+        ):
            pytest.skip(
                f'Test {node} prolly uses something from the stdlib (namely `asyncio`..)\n'
                f'The test and/or underlying example script can *sometimes* run fine '
@ -129,13 +270,10 @@ def ctlc(

 def expect(
    child,
-
-    # normally a `pdb` prompt by default
-    patt: str,
-
+    patt: str,  # often a `pdbp`-prompt
    **kwargs,

-) -> None:
+) -> str:
    '''
    Expect wrapper that prints last seen console
    data before failing.
@ -146,6 +284,8 @@ def expect(
            patt,
            **kwargs,
        )
+        before = str(child.before.decode())
+        return before
    except TIMEOUT:
        before = str(child.before.decode())
        print(before)
@ -155,6 +295,26 @@ def expect(
 PROMPT = r"\(Pdb\+\)"


+# Strip terminal color / ANSI-VT100 escape sequences so
+# substring matching against REPL + traceback output stays
+# robust to color leakage — Python 3.13's colored tracebacks,
+# `pdbp`'s pygments highlighting, etc. — even when
+# `PYTHON_COLORS=0` (set in the `spawn` fixture) isn't honored
+# by every renderer in the spawned subproc.
+# Regex per https://stackoverflow.com/a/14693789
+_ansi_re: re.Pattern = re.compile(
+    r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])'
+)
+
+
+def ansi_strip(text: str) -> str:
+    '''
+    Remove ANSI/VT100 escape sequences from `text`.
+
+    '''
+    return _ansi_re.sub('', text)
+
+
 def in_prompt_msg(
    child: SpawnBase,
    parts: list[str],
@ -174,7 +334,7 @@ def in_prompt_msg(
    '''
    __tracebackhide__: bool = False

-    before: str = str(child.before.decode())
+    before: str = ansi_strip(str(child.before.decode()))
    for part in parts:
        if part not in before:
            if pause_on_false:
@ -194,16 +354,19 @@ def in_prompt_msg(
    return True


-# TODO: todo support terminal color-chars stripping so we can match
-# against call stack frame output from the the 'll' command the like!
-# -[ ] SO answer for stipping ANSI codes: https://stackoverflow.com/a/14693789
+# NB: color-char stripping (so we can match against call-stack
+# frame output from the `ll` command and the like) is handled by
+# `ansi_strip()` applied inside `in_prompt_msg()` + below.
 def assert_before(
    child: SpawnBase,
    patts: list[str],
-
    **kwargs,
+) -> str:
+    '''
+    Assert a patter is in `child.before.decode() -> str`,
+    return the full `.before` output on success.

-) -> None:
+    '''
    __tracebackhide__: bool = False

    assert in_prompt_msg(
@ -214,12 +377,14 @@ def assert_before(
        err_on_false=True,
        **kwargs
    )
+    before: str = ansi_strip(str(child.before.decode()))
+    return before


 def do_ctlc(
    child,
    count: int = 3,
-    delay: float = 0.1,
+    delay: float|None = None,
    patt: str|None = None,

    # expect repl UX to reprint the prompt after every
@ -231,6 +396,7 @@ def do_ctlc(
 ) -> str|None:

    before: str|None = None
+    delay = delay or 0.1

    # make sure ctl-c sends don't do anything but repeat output
    for _ in range(count):
@ -241,7 +407,10 @@ def do_ctlc(
        # if you run this test manually it works just fine..
        if expect_prompt:
            time.sleep(delay)
-            child.expect(PROMPT)
+            child.expect(
+                PROMPT,
+                timeout=(child.timeout * 2) if _ci_env else child.timeout,
+            )
            before = str(child.before.decode())
            time.sleep(delay)

--- a/tests/devx/test_debugger.py
+++ b/tests/devx/test_debugger.py
@ -24,6 +24,7 @@ from pexpect.exceptions import (
    TIMEOUT,
    EOF,
 )
+import tractor

 from .conftest import (
    do_ctlc,
@ -37,6 +38,9 @@ from .conftest import (
    in_prompt_msg,
    assert_before,
 )
+from ..conftest import (
+    _ci_env,
+)

 if TYPE_CHECKING:
    from ..conftest import PexpectSpawner
@ -51,13 +55,14 @@ if TYPE_CHECKING:
 # - recurrent root errors


+_non_linux: bool = platform.system() != 'Linux'
+
 if platform.system() == 'Windows':
    pytest.skip(
        'Debugger tests have no windows support (yet)',
        allow_module_level=True,
    )

-
 # TODO: was trying to this xfail style but some weird bug i see in CI
 # that's happening at collect time.. pretty soon gonna dump actions i'm
 # thinkin...
@ -193,6 +198,11 @@ def test_root_actor_bp_forever(
    child.expect(EOF)


+# skip on non-Linux CI
+@pytest.mark.ctlcs_bish(
+    _non_linux,
+    _ci_env,
+)
@pytest.mark.parametrize(
    'do_next',
    (True, False),
@ -258,6 +268,11 @@ def test_subactor_error(
    child.expect(EOF)


+# skip on non-Linux CI
+@pytest.mark.ctlcs_bish(
+    _non_linux,
+    _ci_env,
+)
 def test_subactor_breakpoint(
    spawn,
    ctlc: bool,
@ -329,6 +344,7 @@ def test_subactor_breakpoint(
 def test_multi_subactors(
    spawn,
    ctlc: bool,
+    set_fork_aware_capture,
 ):
    '''
    Multiple subactors, both erroring and
@ -473,15 +489,32 @@ def test_multi_subactors(
 def test_multi_daemon_subactors(
    spawn,
    loglevel: str,
-    ctlc: bool
+    ctlc: bool,
+    set_fork_aware_capture,
 ):
    '''
-    Multiple daemon subactors, both erroring and breakpointing within a
-    stream.
+    Multiple daemon subactors, both erroring and breakpointing within
+    a stream.

    '''
-    child = spawn('multi_daemon_subactors')
+    non_linux = _non_linux
+    if non_linux and ctlc:
+        pytest.skip(
+            'Ctl-c + MacOS is too unreliable/racy for this test..\n'
+        )
+        # !TODO, if someone with more patience then i wants to muck
+        # with the timings on this please feel free to see all the
+        # `non_linux` branching logic i added on my first attempt
+        # below!
+        #
+        # my conclusion was that if i were to run the script
+        # manually, and thus as slowly as a human would, the test
+        # would and should pass as described in this test fn, however
+        # after fighting with it for >= 1hr. i decided more then
+        # likely the more extensive `linux` testing should cover most
+        # regressions.

+    child = spawn('multi_daemon_subactors')
    child.expect(PROMPT)

    # there can be a race for which subactor will acquire
@ -511,8 +544,19 @@ def test_multi_daemon_subactors(
    else:
        raise ValueError('Neither log msg was found !?')

+    non_linux_delay: float = 0.3
    if ctlc:
-        do_ctlc(child)
+        do_ctlc(
+            child,
+            delay=(
+                non_linux_delay
+                if non_linux
+                else None
+            ),
+        )
+
+        if non_linux:
+            time.sleep(1)

    # NOTE: previously since we did not have clobber prevention
    # in the root actor this final resume could result in the debugger
@ -543,33 +587,69 @@ def test_multi_daemon_subactors(
    # assert "in use by child ('bp_forever'," in before

    if ctlc:
-        do_ctlc(child)
+        do_ctlc(
+            child,
+            delay=(
+                non_linux_delay
+                if non_linux
+                else None
+            ),
+        )
+
+        if non_linux:
+            time.sleep(1)

    # expect another breakpoint actor entry
    child.sendline('c')
    child.expect(PROMPT)
-
    try:
-        assert_before(
+        before: str = assert_before(
            child,
            bp_forev_parts,
        )
-    except AssertionError:
-        assert_before(
+    except (
+        # AssertionError,  # TODO? rm since never raised?
+        ValueError,
+    ):
+        before: str = assert_before(
            child,
            name_error_parts,
        )

    else:
        if ctlc:
-            do_ctlc(child)
+            before: str = do_ctlc(
+                child,
+                delay=(
+                    non_linux_delay
+                    if non_linux
+                    else None
+                ),
+            )
+
+            if non_linux:
+                time.sleep(1)

        # should crash with the 2nd name error (simulates
        # a retry) and then the root eventually (boxed) errors
        # after 1 or more further bp actor entries.

        child.sendline('c')
-        child.expect(PROMPT)
+        try:
+            child.expect(
+                PROMPT,
+                timeout=3,
+            )
+        except EOF:
+            before: str = child.before.decode()
+            print(
+                f'\n'
+                f'??? NEVER RXED `pdb` PROMPT ???\n'
+                f'\n'
+                f'{before}\n'
+            )
+            raise
+
        assert_before(
            child,
            name_error_parts,
@ -689,7 +769,10 @@ def test_multi_subactors_root_errors(

@has_nested_actors
 def test_multi_nested_subactors_error_through_nurseries(
-    spawn,
+    ci_env: bool,
+    spawn: PexpectSpawner,
+    is_forking_spawner: bool,
+    test_log: tractor.log.StackLevelAdapter,

    # TODO: address debugger issue for nested tree:
    # https://github.com/goodboy/tractor/issues/320
@ -706,51 +789,105 @@ def test_multi_nested_subactors_error_through_nurseries(
    # A test (below) has now been added to explicitly verify this is
    # fixed.

-    child = spawn('multi_nested_subactors_error_up_through_nurseries')
+    child = spawn(
+        'multi_nested_subactors_error_up_through_nurseries',
+        loglevel='pdb',
+    )
+    last_send_char: str|None = None
+    for (
+        i,
+        send_char,
+    ) in enumerate(itertools.cycle(['c', 'q'])):

-    # timed_out_early: bool = False
+        timeout: float = child.timeout
+        if (
+            _non_linux
+            and
+            ci_env
+        ):
+            timeout: float = 6
+
+        # XXX linux but the first crash sequence
+        # can take longer to arrive at a prompt.
+        elif i == 0:
+            timeout = 5
+
+        # XXX forking backends may take longer due to
+        # determinstic IPC cancellation.
+        if is_forking_spawner:
+            timeout += 4

-    for send_char in itertools.cycle(['c', 'q']):
        try:
-            child.expect(PROMPT)
+            child.expect(
+                PROMPT,
+                timeout=timeout,
+            )
+            delay: float = 0.1
+            test_log.info('Sleeping {delay!r} before next send-chart..')
+            time.sleep(delay)
+            last_send_char: str = send_char
            child.sendline(send_char)
-            time.sleep(0.01)
+            time.sleep(delay)

+        # script finally exited with tb on console.
        except EOF:
+            test_log.info(
+                f'Breaking from send-char loop'
+                f'last_send_char: {last_send_char!r}\n'
+            )
            break

+    # boxed source errors
+    expect_patts: list[str] = [
+        "NameError: name 'doggypants' is not defined",
+        "tractor._exceptions.RemoteActorError:",
+        "('name_error'",
+
+        # first level subtrees
+        # "tractor._exceptions.RemoteActorError: ('spawner0'",
+        "src_uid=('spawner0'",
+
+        # "tractor._exceptions.RemoteActorError: ('spawner1'",
+
+        # propagation of errors up through nested subtrees
+        # "tractor._exceptions.RemoteActorError: ('spawn_until_0'",
+        # "tractor._exceptions.RemoteActorError: ('spawn_until_1'",
+        # "tractor._exceptions.RemoteActorError: ('spawn_until_2'",
+        # ^-NOTE-^ old RAE repr, new one is below with a field
+        # showing the src actor's uid.
+        "src_uid=('spawn_until_2'",
+    ]
+    # XXX, I HAVE NO IDEA why these patts only show on the
+    # `trio`-spawner but it seems to have something to do with
+    # what gets dumped in prior-prompt latches somehow??
+    # TODO for claude, explain and or work through how this is
+    # happening but ONLY WHEN RUN FROM THE TEST, bc when i try to
+    # run the test script manually the correct output ALWAYS seems
+    # to be in the last `str(child.before.decode())` output !?!?
+    if (
+        not is_forking_spawner
+        and
+        last_send_char == 'q'
+    ):
+        expect_patts += [
+            # expect the pdb-quit exc.
+            "bdb.BdbQuit",
+            # BUT WHY these dude!?
+            "src_uid=('spawn_until_0'",
+            "relay_uid=('spawn_until_1'",
+        ]
+
    assert_before(
        child,
-        [ # boxed source errors
-            "NameError: name 'doggypants' is not defined",
-            "tractor._exceptions.RemoteActorError:",
-            "('name_error'",
-            "bdb.BdbQuit",
-
-            # first level subtrees
-            # "tractor._exceptions.RemoteActorError: ('spawner0'",
-            "src_uid=('spawner0'",
-
-            # "tractor._exceptions.RemoteActorError: ('spawner1'",
-
-            # propagation of errors up through nested subtrees
-            # "tractor._exceptions.RemoteActorError: ('spawn_until_0'",
-            # "tractor._exceptions.RemoteActorError: ('spawn_until_1'",
-            # "tractor._exceptions.RemoteActorError: ('spawn_until_2'",
-            # ^-NOTE-^ old RAE repr, new one is below with a field
-            # showing the src actor's uid.
-            "src_uid=('spawn_until_0'",
-            "relay_uid=('spawn_until_1'",
-            "src_uid=('spawn_until_2'",
-        ]
+        expect_patts,
    )
+    expect(child, EOF)


-@pytest.mark.timeout(15)
+# @pytest.mark.timeout(15)
@has_nested_actors
 def test_root_nursery_cancels_before_child_releases_tty_lock(
    spawn,
-    start_method,
    ctlc: bool,
 ):
    '''
@ -889,6 +1026,11 @@ def test_different_debug_mode_per_actor(
    )


+# skip on non-Linux CI
+@pytest.mark.ctlcs_bish(
+    _non_linux,
+    _ci_env,
+)
 def test_post_mortem_api(
    spawn,
    ctlc: bool,
@ -1044,7 +1186,12 @@ def test_shield_pause(
            "('cancelled_before_pause'",  # actor name
            _repl_fail_msg,
            "trio.Cancelled",
-            "raise Cancelled._create()",
+            # trio >=0.30 raises via a multi-line
+            # `raise Cancelled._create(source=.., reason=..,
+            #  source_task=..)` (cancel-reason metadata), so
+            # match the open-paren form only, NOT the legacy
+            # bare `()`.
+            "raise Cancelled._create(",

            # we should be handling a taskc inside
            # the first `.port_mortem()` sin-shield!
@ -1062,7 +1209,12 @@ def test_shield_pause(
            "('root'",  # actor name
            _repl_fail_msg,
            "trio.Cancelled",
-            "raise Cancelled._create()",
+            # trio >=0.30 raises via a multi-line
+            # `raise Cancelled._create(source=.., reason=..,
+            #  source_task=..)` (cancel-reason metadata), so
+            # match the open-paren form only, NOT the legacy
+            # bare `()`.
+            "raise Cancelled._create(",

            # handling a taskc inside the first unshielded
            # `.port_mortem()`.
@ -1087,7 +1239,11 @@ def test_ctxep_pauses_n_maybe_ipc_breaks(
    mashed and zombie reaper kills sub with no hangs.

    '''
-    child = spawn('subactor_bp_in_ctx')
+    child = spawn(
+        'subactor_bp_in_ctx',
+        loglevel='devx'
+        # ^XXX REQUIRED for below patt matching!
+    )
    child.expect(PROMPT)

    # 3 iters for the `gen()` pause-points
@ -1133,12 +1289,21 @@ def test_ctxep_pauses_n_maybe_ipc_breaks(
            # closed so verify we see error reporting as well as
            # a failed crash-REPL request msg and can CTL-c our way
            # out.
+
+            # ?TODO, match depending on `tpt_proto(s)`?
+            # - [ ] how can we pass it into the script tho?
+            tpt: str = 'UDS'
+            if _non_linux:
+                tpt: str = 'TCP'
+
            assert_before(
                child,
                ['peer IPC channel closed abruptly?',
                 'another task closed this fd',
                 'Debug lock request was CANCELLED?',
-                 "TransportClosed: 'MsgpackUDSStream' was already closed locally ?",]
+                 f"'Msgpack{tpt}Stream' was already closed locally?",
+                 f"TransportClosed: 'Msgpack{tpt}Stream' was already closed 'by peer'?",
+                ]

                # XXX races on whether these show/hit?
                 # 'Failed to REPl via `_pause()` You called `tractor.pause()` from an already cancelled scope!',
@ -1168,7 +1333,11 @@ def test_crash_handling_within_cancelled_root_actor(
    call.

    '''
-    child = spawn('root_self_cancelled_w_error')
+    child = spawn(
+        'root_self_cancelled_w_error',
+        loglevel='cancel',
+        # ^XXX REQUIRED for below patt matching!
+    )
    child.expect(PROMPT)

    assert_before(
--- a/tests/devx/test_pause_from_non_trio.py
+++ b/tests/devx/test_pause_from_non_trio.py
@ -63,19 +63,31 @@ def test_pause_from_sync(
    `examples/debugging/sync_bp.py`

    '''
-    child = spawn('sync_bp')
+    # XXX required for `breakpoint()` overload and
+    # thus`tractor.devx.pause_from_sync()`.
+    pytest.importorskip('greenback')
+    child = spawn(
+        'sync_bp',
+        loglevel='pdb',  # XXX pattern matching
+    )

    # first `sync_pause()` after nurseries open
    child.expect(PROMPT)
-    assert_before(
+    _before: str = assert_before(
        child,
        [
-            # pre-prompt line
-            _pause_msg,
-            "<Task '__main__.main'",
+            # devx-loglevel
+            # "imported <module 'greenback' from",
+            # "successfully scheduled `._pause()` in `trio` thread on behalf of <Task",
+
+            _pause_msg,  # pre-prompt line
            "('root'",
+            "<Task '__main__.main'",
+            "tractor.pause_from_sync()",
        ]
    )
+    # XXX `enable_stack_on_sig=False` in script
+    assert 'stackscope' not in _before
    if ctlc:
        do_ctlc(child)
        # ^NOTE^ subactor not spawned yet; don't need extra delay.
@ -85,18 +97,18 @@ def test_pause_from_sync(
    # first `await tractor.pause()` inside `p.open_context()` body
    child.expect(PROMPT)

-    # XXX shouldn't see gb loaded message with PDB loglevel!
-    # assert not in_prompt_msg(
-    #     child,
-    #     ['`greenback` portal opened!'],
-    # )
    # should be same root task
    assert_before(
        child,
        [
+            # XXX should see gb loaded with devx-loglevel.
+            # "`greenback` portal opened!",
+            # "Activated `greenback` for `tractor.pause_from_sync()` support!",
+
            _pause_msg,
-            "<Task '__main__.main'",
            "('root'",
+            "<Task '__main__.main'",
+            "tractor.pause()",
        ]
    )

@ -127,17 +139,17 @@ def test_pause_from_sync(
    # `Lock.acquire()`-ed
    # (NOT both, which will result in REPL clobbering!)
    attach_patts: dict[str, list[str]] = {
-        'subactor': [
-            "'start_n_sync_pause'",
-            "('subactor'",
+        "|_<Task 'start_n_sync_pause'": [
+            "|_('subactor'",
+            "tractor.pause_from_sync()",
        ],
-        'inline_root_bg_thread': [
-            "<Thread(inline_root_bg_thread",
+        "|_<Thread(inline_root_bg_thread": [
            "('root'",
+            "breakpoint(hide_tb=hide_tb)",
        ],
-        'start_soon_root_bg_thread': [
-            "<Thread(start_soon_root_bg_thread",
-            "('root'",
+        "|_<Thread(start_soon_root_bg_thread": [
+            "|_('root'",
+            "tractor.pause_from_sync()",
        ],
    }
    conts: int = 0  # for debugging below matching logic on failure
@ -260,6 +272,9 @@ def test_sync_pause_from_aio_task(
    `examples/debugging/asycio_bp.py`

    '''
+    # XXX required for `breakpoint()` overload and
+    # thus`tractor.devx.pause_from_sync()`.
+    pytest.importorskip('greenback')
    child = spawn('asyncio_bp')

    # RACE on whether trio/asyncio task bps first
--- a/tests/devx/test_proctitle.py
+++ b/tests/devx/test_proctitle.py
@ -0,0 +1,178 @@
+'''
+Tests for `tractor.devx._proctitle` (per-actor `setproctitle`)
+and the intrinsic-signal sub-actor detection in
+`tractor._testing._reap`.
+
+The proctitle is set in `tractor._child._actor_child_main()`
+after `Actor` construction, so any spawned sub-actor process
+should:
+
+  - have `argv[0]` (== `/proc/<pid>/cmdline`) start with
+    `<_def_prefix>[<aid.reprol()>]` (currently `_subactor[…]`)
+  - have `/proc/<pid>/comm` start with `<_def_prefix>[`
+    (kernel truncates to ~15 bytes)
+  - be detected as a tractor sub-actor by
+    `_is_tractor_subactor(pid)` via the cmdline marker.
+
+`set_actor_proctitle()` itself is also unit-tested in-process
+to verify the format string.
+
+'''
+from __future__ import annotations
+import platform
+
+import psutil
+import pytest
+import trio
+import tractor
+
+from tractor.runtime._runtime import Actor
+from tractor.devx._proctitle import (
+    set_actor_proctitle,
+    _def_prefix,
+)
+from tractor._testing._reap import (
+    _is_tractor_subactor,
+    _read_cmdline,
+    _read_comm,
+)
+
+
+_non_linux: bool = platform.system() != 'Linux'
+
+
+def test_set_actor_proctitle_format():
+    '''
+    `set_actor_proctitle()` returns the canonical
+    `<_def_prefix>[<aid.reprol()>]` form (currently
+    `_subactor[…]`) and actually mutates the running
+    proc's title.
+
+    '''
+    pytest.importorskip(
+        'setproctitle',
+        reason='`setproctitle` is an optional runtime dep',
+    )
+    import setproctitle
+
+    # save + restore so we don't pollute pytest's own title
+    saved: str = setproctitle.getproctitle()
+    try:
+        actor = Actor(
+            name='unit_test_actor',
+            uuid='1027301b-a0e3-430e-8806-a5279f21abe6',
+        )
+        title: str = set_actor_proctitle(actor)
+
+        # canonical wrapping: `<_def_prefix>[<aid.reprol()>]`.
+        # We source BOTH the prefix (`_def_prefix`) and the
+        # runtime-computed `reprol()` rather than hard-coding,
+        # so the test stays decoupled from the prefix shape
+        # (flipped to `_subactor` in `3a45dbd5`) AND from
+        # `Aid.reprol()`'s internal format (currently
+        # `<name>@<pid>`, but could evolve).
+        expected: str = f'{_def_prefix}[{actor.aid.reprol()}]'
+        assert title == expected
+        # sanity: the actor's name must be in the title
+        # somewhere (so a future `reprol()` change that
+        # drops the name is also caught).
+        assert 'unit_test_actor' in title
+
+        # actually set on the running proc
+        assert setproctitle.getproctitle() == title
+
+    finally:
+        setproctitle.setproctitle(saved)
+
+
+@pytest.mark.skipif(
+    _non_linux,
+    reason=(
+        'detection helpers read `/proc/<pid>/{cmdline,comm}` '
+        'which is Linux-specific'
+    ),
+)
+def test_subactor_proctitle_visible_via_proc():
+    '''
+    Spawn a sub-actor and verify its proc-title is visible
+    via both `/proc/<pid>/cmdline` AND `/proc/<pid>/comm`,
+    AND that `_is_tractor_subactor()` correctly identifies
+    it.
+
+    '''
+    pytest.importorskip('setproctitle')
+
+    async def main() -> dict:
+        async with tractor.open_nursery() as an:
+            portal = await an.start_actor('proctitle_boi')
+            # let the child finish setproctitle in
+            # `_actor_child_main`
+            await trio.sleep(0.3)
+
+            # the sub-actor's pid is on the portal's chan
+            # repr; psutil-walk `me.children()` is simpler.
+            me = psutil.Process()
+            sub_pids: list[int] = [
+                p.pid for p in me.children(recursive=True)
+            ]
+            assert sub_pids, (
+                'expected at least one spawned sub-actor pid'
+            )
+
+            results: dict = {}
+            for pid in sub_pids:
+                results[pid] = {
+                    'cmdline': _read_cmdline(pid),
+                    'comm': _read_comm(pid),
+                    'is_tractor': _is_tractor_subactor(pid),
+                }
+
+            await portal.cancel_actor()
+            return results
+
+    found: dict = trio.run(main)
+
+    # at least one of the spawned procs should match the
+    # `proctitle_boi` actor we started; assert the proc-
+    # title shape on it specifically.
+    matched: list[tuple[int, dict]] = [
+        (pid, info)
+        for pid, info in found.items()
+        if 'proctitle_boi' in info['cmdline']
+    ]
+    assert matched, (
+        f'no sub-actor pid had a `proctitle_boi` cmdline; '
+        f'all={found}'
+    )
+
+    pid, info = matched[0]
+    # canonical proctitle prefix in cmdline (full form);
+    # prefix sourced from `_def_prefix` so it tracks the
+    # `3a45dbd5` flip (`tractor[` -> `_subactor[`).
+    assert info['cmdline'].startswith(f'{_def_prefix}[proctitle_boi@'), (
+        f'cmdline missing `{_def_prefix}[proctitle_boi@…]` prefix: '
+        f'{info["cmdline"]!r}'
+    )
+    # comm is kernel-truncated to ~15 bytes — just check the
+    # `<_def_prefix>[` prefix made it.
+    assert info['comm'].startswith(f'{_def_prefix}['), (
+        f'comm missing `{_def_prefix}[` prefix: {info["comm"]!r}'
+    )
+    # intrinsic-signal detector should match.
+    assert info['is_tractor'] is True
+
+
+@pytest.mark.skipif(
+    _non_linux,
+    reason='reads /proc/<pid>/{cmdline,comm}',
+)
+def test_is_tractor_subactor_negative():
+    '''
+    `_is_tractor_subactor()` returns False for non-tractor
+    procs (e.g. the pytest test-runner pid itself, which
+    is `python -m pytest …` — no `tractor[` proctitle, no
+    `tractor._child` cmdline).
+
+    '''
+    import os
+    assert _is_tractor_subactor(os.getpid()) is False
--- a/tests/devx/test_tooling.py
+++ b/tests/devx/test_tooling.py
@ -21,6 +21,7 @@ import os
 import signal
 import time
 from typing import (
+    Callable,
    TYPE_CHECKING,
 )

@ -31,6 +32,9 @@ from .conftest import (
    PROMPT,
    _pause_msg,
 )
+from ..conftest import (
+    no_macos,
+)

 import pytest
 from pexpect.exceptions import (
@ -42,8 +46,14 @@ if TYPE_CHECKING:
    from ..conftest import PexpectSpawner


+@no_macos
 def test_shield_pause(
-    spawn: PexpectSpawner,
+    spawn: Callable[
+        ...,
+        PexpectSpawner,
+    ],
+    start_method: str,
+    request: pytest.FixtureRequest,
 ):
    '''
    Verify the `tractor.pause()/.post_mortem()` API works inside an
@ -51,12 +61,15 @@ def test_shield_pause(
    next checkpoint wherein the cancelled will get raised.

    '''
-    child = spawn(
-        'shield_hang_in_sub'
+    child: PexpectSpawner = spawn(
+        'shield_hang_in_sub',
+        loglevel='devx',
+        # ^XXX REQUIRED for below patt matching!
    )
    expect(
        child,
        'Yo my child hanging..?',
+        timeout=3,
    )
    assert_before(
        child,
@ -81,38 +94,82 @@ def test_shield_pause(
        # end-of-tree delimiter
        "end-of-\('root'",
    )
-    assert_before(
+    _before: str = assert_before(
        child,
        [
            # 'Srying to dump `stackscope` tree..',
            # 'Dumping `stackscope` tree for actor',
            "('root'",  # uid line

-            # TODO!? this used to show?
+            # TODO!? this in-task-code used to show??
            # -[ ] mk reproducable for @oremanj?
+            # => SOLVED? by our `trio_token.run_sync_soon()`
+            #    approach?
            #
            # parent block point (non-shielded)
            # 'await trio.sleep_forever()  # in root',
        ]
    )
-    expect(
-        child,
-        # end-of-tree delimiter
-        "end-of-\('hanger'",
-    )
-    assert_before(
-        child,
-        [
-            # relay to the sub should be reported
-            'Relaying `SIGUSR1`[10] to sub-actor',

-            "('hanger'",  # uid line
+    # NOTE, hierarchical-ordering invariant restored by
+    # `_dump_then_relay` (co-scheduled dump+relay on the
+    # trio loop, see `tractor.devx._stackscope`): the
+    # parent's full task-tree prints BEFORE the 'Relaying
+    # `SIGUSR1`' log msg, which prints BEFORE any sub-
+    # actor receives the signal and dumps its own tree.
+    # So the relay log appears BETWEEN `end-of-('root'`
+    # (above) and `end-of-('hanger'` (below).
+    handle_out_of_order: bool = False
+
+    # XXX, when capfd is NOT used we don't expect to
+    # see the logging output from the subactor.
+    if (no_capfd := (start_method in [
+            'main_thread_forkserver',
+        ])
+    ):
+        opts = request.config.option
+        assert opts.spawn_backend == start_method
+        # ?XXX? i guess the `testdir` fixture "pretends to" reset
+        # this to the default 'fd'??
+        # assert opts.capture in [
+        #     'sys',
+        #     'no',
+        # ]
+
+    if (
+        handle_out_of_order
+        and
+        "end-of-('hanger'" in _before
+    ):
+         assert "('hanger'" in _before
+         assert 'Relaying `SIGUSR1`[10] to sub-actor' in _before
+
+    else:
+        _before = expect(
+            child,
+            'Relaying `SIGUSR1`\\[10\\] to sub-actor',
+        )
+        # _before: str = assert_before(
+        #     child,
+        #     ["('hanger'",]  # uid line
+        # )
+        if not no_capfd:
+            expect(
+                child,
+                # end-of-subactor's-tree delimiter
+                "end-of-\('hanger'",
+            )
+            _before: str = assert_before(
+                child,
+                [
+                    "('hanger'",  # uid line
+
+                    # TODO!? SEE ABOVE
+                    # hanger LOC where it's shield-halted
+                    # 'await trio.sleep_forever()  # in subactor',
+                ]
+            )

-            # TODO!? SEE ABOVE
-            # hanger LOC where it's shield-halted
-            # 'await trio.sleep_forever()  # in subactor',
-        ]
-    )

    # simulate the user sending a ctl-c to the hanging program.
    # this should result in the terminator kicking in since
@ -121,21 +178,26 @@ def test_shield_pause(
        child.pid,
        signal.SIGINT,
    )
-    from tractor._supervise import _shutdown_msg
+    from tractor.runtime._supervise import _shutdown_msg
    expect(
        child,
        # 'Shutting down actor runtime',
        _shutdown_msg,
        timeout=6,
    )
-    assert_before(
-        child,
-        [
-            'raise KeyboardInterrupt',
+    expect_on_teardown: list[str] = [
+        'raise KeyboardInterrupt',
+        'Root actor terminated',
+    ]
+    if not no_capfd:
+        expect_on_teardown += [
            # 'Shutting down actor runtime',
            '#T-800 deployed to collect zombie B0',
            "'--uid', \"('hanger',",
        ]
+    assert_before(
+        child,
+        expect_on_teardown,
    )


@ -151,8 +213,10 @@ def test_breakpoint_hook_restored(
    calls used.

    '''
+    # XXX required for `breakpoint()` overload and
+    # thus`tractor.devx.pause_from_sync()`.
+    pytest.importorskip('greenback')
    child = spawn('restore_builtin_breakpoint')
-
    child.expect(PROMPT)
    try:
        assert_before(
--- a/tests/discovery/init.py
+++ b/tests/discovery/init.py
--- a/tests/discovery/conftest.py
+++ b/tests/discovery/conftest.py
@ -0,0 +1,223 @@
+'''
+Discovery-suite fixtures, including the `daemon`
+remote-registrar subprocess used by the multi-program
+discovery tests.
+
+Lives here (vs. the parent `tests/conftest.py`)
+because `daemon` is a discovery-protocol primitive —
+boots a separate `tractor.run_daemon()` process whose
+sole purpose is to serve as a registrar peer for
+discovery-roundtrip tests. Pytest fixtures inherit
+DOWNWARD through conftest hierarchy, so anything
+under `tests/discovery/` automatically picks this up.
+
+'''
+from __future__ import annotations
+import os
+import platform
+import socket
+import subprocess
+import sys
+import time
+
+import pytest
+import tractor
+
+from ..conftest import (
+    sig_prog,
+    _INT_SIGNAL,
+    _non_linux,
+)
+
+
+def _wait_for_daemon_ready(
+    reg_addr: tuple,
+    tpt_proto: str,
+    *,
+    deadline: float = 10.0,
+    poll_interval: float = 0.05,
+    proc: subprocess.Popen|None = None,
+) -> None:
+    '''
+    Active-poll the daemon's bind address until it
+    accepts a connection (proving it has called
+    `bind() + listen()` and is ready to handle IPC).
+
+    Replaces the historical blind `time.sleep()` in the
+    `daemon` fixture which was racy under load — see
+    `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`.
+
+    Uses stdlib `socket` directly (no trio runtime
+    bootstrap cost) — sufficient because
+    `tractor.run_daemon()` doesn't return from
+    bootstrap until the runtime is fully ready to
+    accept IPC.
+
+    Raises `TimeoutError` on `deadline` exceeded. If
+    `proc` is given, ALSO raises early if the daemon
+    process exits non-zero before the deadline (catches
+    daemon-startup-crash that the blind sleep used to
+    silently mask).
+
+    '''
+    end: float = time.monotonic() + deadline
+    last_exc: Exception|None = None
+    while time.monotonic() < end:
+        # Daemon-died-during-startup early-exit. Without
+        # this, a crashed-on-import daemon would just
+        # eat the full deadline before raising opaque
+        # TimeoutError.
+        if proc is not None and proc.poll() is not None:
+            raise RuntimeError(
+                f'Daemon proc exited (rc={proc.returncode}) '
+                f'before becoming ready to accept on '
+                f'{reg_addr!r}'
+            )
+        try:
+            if tpt_proto == 'tcp':
+                # `socket.create_connection` does the
+                # `socket() + connect()` dance with a
+                # builtin timeout — perfect primitive
+                # for a one-shot probe.
+                with socket.create_connection(
+                    reg_addr,
+                    timeout=poll_interval,
+                ):
+                    return
+            else:
+                # UDS — `reg_addr` is a `(filedir, sockname)`
+                # tuple per `tractor.ipc._uds.UDSAddress.unwrap`.
+                sockpath: str = os.path.join(*reg_addr)
+                sock = socket.socket(socket.AF_UNIX)
+                try:
+                    sock.settimeout(poll_interval)
+                    sock.connect(sockpath)
+                    return
+                finally:
+                    sock.close()
+        except (
+            ConnectionRefusedError,
+            FileNotFoundError,
+            OSError,
+            socket.timeout,
+        ) as exc:
+            last_exc = exc
+            time.sleep(poll_interval)
+    raise TimeoutError(
+        f'Daemon never accepted on {reg_addr!r} within '
+        f'{deadline}s (last connect-attempt exc: '
+        f'{last_exc!r})'
+    )
+
+
+# TODO: factor into @cm and move to `._testing`?
+@pytest.fixture
+def daemon(
+    debug_mode: bool,
+    loglevel: str,
+    testdir: pytest.Pytester,
+    reg_addr: tuple[str, int],
+    tpt_proto: str,
+    ci_env: bool,
+    test_log: tractor.log.StackLevelAdapter,
+
+) -> subprocess.Popen:
+    '''
+    Run a daemon root actor as a separate actor-process
+    tree and "remote registrar" for discovery-protocol
+    related tests.
+
+    '''
+    # XXX: too much logging will lock up the subproc (smh)
+    if loglevel in ('trace', 'debug'):
+        test_log.warning(
+            f'Test harness log level is too verbose: {loglevel!r}\n'
+            f'Reducing to INFO level..'
+        )
+        loglevel: str = 'info'
+
+    code: str = (
+        "import tractor; "
+        "tractor.run_daemon([], "
+        "registry_addrs={reg_addrs}, "
+        "enable_transports={enable_tpts}, "
+        "debug_mode={debug_mode}, "
+        "loglevel={ll})"
+    ).format(
+        reg_addrs=str([reg_addr]),
+        enable_tpts=str([tpt_proto]),
+        ll="'{}'".format(loglevel) if loglevel else None,
+        debug_mode=debug_mode,
+    )
+    cmd: list[str] = [
+        sys.executable,
+        '-c', code,
+    ]
+    kwargs = {}
+    if platform.system() == 'Windows':
+        # without this, tests hang on windows forever
+        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
+
+    proc: subprocess.Popen = testdir.popen(
+        cmd,
+        **kwargs,
+    )
+
+    # Active-poll the daemon's bind address until it's
+    # ready to accept connections — replaces the legacy
+    # blind `time.sleep(2.2)` which was racy under load
+    # (see
+    # `ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md`).
+    #
+    # Per-test deadline scales with platform: macOS/CI
+    # gets extra headroom; Linux dev boxes need very
+    # little.
+    deadline: float = (
+        15.0 if (_non_linux and ci_env)
+        else 10.0
+    )
+    _wait_for_daemon_ready(
+        reg_addr=reg_addr,
+        tpt_proto=tpt_proto,
+        deadline=deadline,
+        proc=proc,
+    )
+
+    assert not proc.returncode
+    yield proc
+    sig_prog(proc, _INT_SIGNAL)
+
+    # XXX! yeah.. just be reaaal careful with this bc
+    # sometimes it can lock up on the `_io.BufferedReader`
+    # and hang..
+    #
+    # NB, drain happens at TEARDOWN (post-yield), so the
+    # test body has its chance to read `proc.stderr`
+    # FIRST. Reading here AFTER would silently swallow
+    # the daemon's stderr output and break tests that
+    # assert on it (e.g. `test_abort_on_sigint`).
+    stderr: str = proc.stderr.read().decode()
+    stdout: str = proc.stdout.read().decode()
+    if (
+        stderr
+        or
+        stdout
+    ):
+        print(
+            f'Daemon actor tree produced output:\n'
+            f'{proc.args}\n'
+            f'\n'
+            f'stderr: {stderr!r}\n'
+            f'stdout: {stdout!r}\n'
+        )
+
+    if (rc := proc.returncode) != -2:
+        msg: str = (
+            f'Daemon actor tree was not cancelled !?\n'
+            f'proc.args: {proc.args!r}\n'
+            f'proc.returncode: {rc!r}\n'
+        )
+        if rc < 0:
+            raise RuntimeError(msg)
+
+        test_log.error(msg)
--- a/tests/discovery/test_multi_program.py
+++ b/tests/discovery/test_multi_program.py
@ -0,0 +1,355 @@
+"""
+Multiple python programs invoking the runtime.
+"""
+from __future__ import annotations
+import platform
+import subprocess
+import time
+from typing import (
+    TYPE_CHECKING,
+)
+
+import pytest
+import trio
+import tractor
+from tractor._testing import (
+    tractor_test,
+)
+from tractor import (
+    current_actor,
+    Actor,
+    Context,
+    Portal,
+)
+from tractor.runtime import _state
+from ..conftest import (
+    sig_prog,
+    _INT_SIGNAL,
+    _INT_RETURN_CODE,
+)
+
+if TYPE_CHECKING:
+    from tractor.msg import Aid
+    from tractor.discovery._addr import (
+        UnwrappedAddress,
+    )
+
+
+_non_linux: bool = platform.system() != 'Linux'
+
+
+# NOTE, multi-program tests historically triggered both
+# UDS sock-file leaks (daemon-subproc SIGKILL paths) AND
+# trio `WakeupSocketpair.drain()` busy-loops
+# (`test_register_duplicate_name`). Track + detect
+# per-test as a regression net.
+pytestmark = pytest.mark.usefixtures(
+    'track_orphaned_uds_per_test',
+    'detect_runaway_subactors_per_test',
+)
+
+
+def test_abort_on_sigint(
+    daemon: subprocess.Popen,
+):
+    assert daemon.returncode is None
+    time.sleep(0.1)
+    sig_prog(daemon, _INT_SIGNAL)
+    assert daemon.returncode == _INT_RETURN_CODE
+
+    # XXX: oddly, couldn't get capfd.readouterr() to work here?
+    if platform.system() != 'Windows':
+        # don't check stderr on windows as its empty when sending CTRL_C_EVENT
+        assert "KeyboardInterrupt" in str(daemon.stderr.read())
+
+
+@tractor_test
+async def test_cancel_remote_registrar(
+    daemon: subprocess.Popen,
+    reg_addr: UnwrappedAddress,
+):
+    assert not current_actor().is_registrar
+    async with tractor.get_registry(reg_addr) as portal:
+        await portal.cancel_actor()
+
+    time.sleep(0.1)
+    # the registrar channel server is cancelled but not its main task
+    assert daemon.returncode is None
+
+    # no registrar socket should exist
+    with pytest.raises(OSError):
+        async with tractor.get_registry(reg_addr) as portal:
+            pass
+
+
+def test_register_duplicate_name(
+    daemon: subprocess.Popen,
+    reg_addr: UnwrappedAddress,
+):
+    # bug-class-3 breadcrumbs: the *last* `[CANCEL]` line that
+    # appears under `--ll cancel`/`TRACTOR_LOG_FILE=...` names the
+    # cancel-cascade boundary that's parked. Pair with
+    # `_trio_main` entry/exit breadcrumbs in
+    # `tractor/spawn/_entry.py` to triangulate the swallow point.
+    log = tractor.log.get_logger('tractor.tests.test_multi_program')
+
+    async def main():
+        log.cancel('test_register_duplicate_name: enter `main()`')
+        try:
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as an:
+                log.cancel(
+                    'test_register_duplicate_name: '
+                    'actor nursery opened'
+                )
+
+                assert not current_actor().is_registrar
+
+                p1 = await an.start_actor('doggy')
+                log.cancel(
+                    'test_register_duplicate_name: '
+                    'spawned doggy #1'
+                )
+                p2 = await an.start_actor('doggy')
+                log.cancel(
+                    'test_register_duplicate_name: '
+                    'spawned doggy #2'
+                )
+
+                async with tractor.wait_for_actor('doggy') as portal:
+                    log.cancel(
+                        'test_register_duplicate_name: '
+                        '`wait_for_actor` returned'
+                    )
+                    assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)
+
+                log.cancel(
+                    'test_register_duplicate_name: '
+                    'ABOUT TO CALL `an.cancel()`'
+                )
+                await an.cancel()
+                log.cancel(
+                    'test_register_duplicate_name: '
+                    '`an.cancel()` returned'
+                )
+        finally:
+            log.cancel(
+                'test_register_duplicate_name: '
+                '`open_nursery.__aexit__` returned, leaving `main()`'
+            )
+
+    # XXX, run manually since we want to start this root **after**
+    # the other "daemon" program with it's own root.
+    trio.run(main)
+
+
+# `n_dups` in {4, 8} both expose the SAME pre-existing race:
+# under rapid same-name spawning against a forkserver +
+# registrar, ONE of the spawned doggies `sys.exit(2)`s during
+# boot before completing parent-handshake. Surfaces now (post
+# the spawn-time `wait_for_peer_or_proc_death` fix) as
+# `ActorFailure rc=2`; previously it was silently masked by
+# the handshake-wait parking forever.
+#
+# Larger `n_dups` widens the race window so the boot-race
+# fires more often — n_dups=4 hits ~always, n_dups=8 hits
+# occasionally. Both xfail(strict=False) so the cancel-cascade
+# regression-check still passes when the boot-race happens
+# NOT to fire.
+#
+# Tracked separately in,
+# https://github.com/goodboy/tractor/issues/456
+_DOGGY_BOOT_RACE_XFAIL = pytest.mark.xfail(
+    strict=False,
+    reason=(
+        'doggy boot-race rc=2 under rapid same-name '
+        'spawn — separate bug from cancel-cascade'
+    ),
+)
+
+
+@pytest.mark.parametrize(
+    'n_dups',
+    [
+        2,
+        pytest.param(4, marks=_DOGGY_BOOT_RACE_XFAIL),
+        pytest.param(8, marks=_DOGGY_BOOT_RACE_XFAIL),
+    ],
+    ids=lambda n: f'n_dups={n}',
+)
+def test_dup_name_cancel_cascade_escalates_to_hard_kill(
+    daemon: subprocess.Popen,
+    reg_addr: UnwrappedAddress,
+    n_dups: int,
+):
+    '''
+    Regression for the duplicate-name cancel-cascade hang under
+    `tcp+main_thread_forkserver`.
+
+    When N actors share a single name and the parent calls
+    `an.cancel()`, the daemon registrar gets N `register_actor` RPCs
+    in tight succession. Under TCP+MTF, kernel-level socket-buffer
+    contention can push at least one sub-actor's cancel-RPC ack past
+    `Portal.cancel_timeout` (default 0.5s).
+
+    Pre-fix, `Portal.cancel_actor()` silently returned `False` on
+    that timeout, the supervisor's outer `move_on_after(3)` never
+    fired (each per-portal task always returned ≤0.5s, never
+    exceeded 3s), and `soft_kill()`'s `await wait_func(proc)` parked
+    forever — deadlocking nursery `__aexit__`.
+
+    Post-fix, `Portal.cancel_actor()` raises `ActorTooSlowError` on
+    the bounded-wait timeout, and `ActorNursery.cancel()`'s
+    per-child wrapper escalates to `proc.terminate()` (hard-kill).
+    The full nursery teardown therefore stays bounded even under
+    pathological timing.
+
+    `n_dups` is parametrized to widen the race window — more
+    same-name siblings = more concurrent register-RPCs at the
+    daemon = higher probability of hitting the contention path.
+
+    '''
+    log = tractor.log.get_logger(
+        'tractor.tests.test_multi_program'
+    )
+
+    # outer hard ceiling: a regression should fail-fast, NOT hang
+    # the test session for minutes. Budget scales with `n_dups`
+    # since each extra same-name sibling adds ~spawn-cost +
+    # potential cancel-ack-timeout escalation latency under
+    # TCP+forkserver. ~5s/sibling + 15s baseline gives plenty of
+    # headroom while still failing-loud on a real hang.
+    fail_after_s: int = 15 + (5 * n_dups)
+
+    async def main():
+        log.cancel(
+            f'enter `main()` n_dups={n_dups}'
+        )
+        with trio.fail_after(fail_after_s):
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as an:
+                portals: list[Portal] = []
+                for i in range(n_dups):
+                    p: Portal = await an.start_actor('doggy')
+                    portals.append(p)
+                    log.cancel(
+                        f'spawned doggy #{i + 1}/{n_dups}'
+                    )
+
+                # at least one of the N must be discoverable by
+                # name; doesn't matter which one (registrar will
+                # have last-wins semantics under same-name).
+                async with tractor.wait_for_actor('doggy') as portal:
+                    expected_uids = {p.channel.uid for p in portals}
+                    assert portal.channel.uid in expected_uids
+
+                # critical section: this MUST return within
+                # `fail_after_s` even when one or more cancel-RPC
+                # acks time out. Pre-fix, this hangs forever.
+                log.cancel('about to call `an.cancel()`')
+                await an.cancel()
+                log.cancel('`an.cancel()` returned')
+
+        # post-teardown sanity: every child proc must be reaped.
+        # If escalation worked, even timed-out cancel-RPCs would
+        # have triggered `proc.terminate()` and the procs are dead.
+        for p in portals:
+            # `Portal.channel.connected()` -> False once the
+            # underlying chan disconnected (clean exit OR
+            # hard-killed proc both produce disconnect).
+            assert not p.channel.connected(), (
+                f'Portal chan still connected post-teardown?\n'
+                f'{p.channel}'
+            )
+
+    trio.run(main)
+
+
+@tractor.context
+async def get_root_portal(
+    ctx: Context,
+):
+    '''
+    Connect back to the root actor manually (using `._discovery` API)
+    and ensure it's contact info is the same as our immediate parent.
+
+    '''
+    sub: Actor = current_actor()
+    rtvs: dict = _state._runtime_vars
+    raddrs: list[UnwrappedAddress] = rtvs['_root_addrs']
+
+    # await tractor.pause()
+    # XXX, in case the sub->root discovery breaks you might need
+    # this (i know i did Xp)!!
+    # from tractor.devx import mk_pdb
+    # mk_pdb().set_trace()
+
+    assert (
+        len(raddrs) == 1
+        and
+        list(sub._parent_chan.raddr.unwrap()) in raddrs
+    )
+
+    # connect back to our immediate parent which should also
+    # be the actor-tree's root.
+    from tractor.discovery._api import get_root
+    ptl: Portal
+    async with get_root() as ptl:
+        root_aid: Aid = ptl.chan.aid
+        parent_ptl: Portal = current_actor().get_parent()
+        assert (
+            root_aid.name == 'root'
+            and
+            parent_ptl.chan.aid == root_aid
+        )
+        await ctx.started()
+
+
+def test_non_registrar_spawns_child(
+    daemon: subprocess.Popen,
+    reg_addr: UnwrappedAddress,
+    loglevel: str,
+    debug_mode: bool,
+    ci_env: bool,
+):
+    '''
+    Ensure a non-regristar (serving) root actor can spawn a sub and
+    that sub can connect back (manually) to it's rent that is the
+    root without issue.
+
+    More or less this audits the global contact info in
+    `._state._runtime_vars`.
+
+    '''
+    async def main():
+
+        # XXX, since apparently on macos in GH's CI it can be a race
+        # with the `daemon` registrar on grabbing the socket-addr..
+        if ci_env and _non_linux:
+            await trio.sleep(.5)
+
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+            loglevel=loglevel,
+            debug_mode=debug_mode,
+        ) as an:
+
+            actor: Actor = tractor.current_actor()
+            assert not actor.is_registrar
+            sub_ptl: Portal = await an.start_actor(
+                name='sub',
+                enable_modules=[__name__],
+            )
+
+            async with sub_ptl.open_context(
+                get_root_portal,
+            ) as (ctx, _):
+                print('Waiting for `sub` to connect back to us..')
+
+            await an.cancel()
+
+    # XXX, run manually since we want to start this root **after**
+    # the other "daemon" program with it's own root.
+    trio.run(main)
--- a/tests/discovery/test_multiaddr.py
+++ b/tests/discovery/test_multiaddr.py
@ -0,0 +1,376 @@
+'''
+Multiaddr construction, parsing, and round-trip tests for
+`tractor.discovery._multiaddr.mk_maddr()` and
+`tractor.discovery._multiaddr.parse_maddr()`.
+
+'''
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+from multiaddr import Multiaddr
+
+from tractor.ipc._tcp import TCPAddress
+from tractor.ipc._uds import UDSAddress
+from tractor.discovery._multiaddr import (
+    mk_maddr,
+    parse_maddr,
+    parse_endpoints,
+    _tpt_proto_to_maddr,
+    _maddr_to_tpt_proto,
+)
+from tractor.discovery._addr import wrap_address
+
+
+def test_tpt_proto_to_maddr_mapping():
+    '''
+    `_tpt_proto_to_maddr` maps all supported `proto_key`
+    values to their correct multiaddr protocol names.
+
+    '''
+    assert _tpt_proto_to_maddr['tcp'] == 'tcp'
+    assert _tpt_proto_to_maddr['uds'] == 'unix'
+    assert len(_tpt_proto_to_maddr) == 2
+
+
+def test_mk_maddr_tcp_ipv4():
+    '''
+    `mk_maddr()` on a `TCPAddress` with an IPv4 host
+    produces the correct `/ip4/<host>/tcp/<port>` multiaddr.
+
+    '''
+    addr = TCPAddress('127.0.0.1', 1234)
+    result: Multiaddr = mk_maddr(addr)
+
+    assert isinstance(result, Multiaddr)
+    assert str(result) == '/ip4/127.0.0.1/tcp/1234'
+
+    protos = result.protocols()
+    assert protos[0].name == 'ip4'
+    assert protos[1].name == 'tcp'
+
+    assert result.value_for_protocol('ip4') == '127.0.0.1'
+    assert result.value_for_protocol('tcp') == '1234'
+
+
+def test_mk_maddr_tcp_ipv6():
+    '''
+    `mk_maddr()` on a `TCPAddress` with an IPv6 host
+    produces the correct `/ip6/<host>/tcp/<port>` multiaddr.
+
+    '''
+    addr = TCPAddress('::1', 5678)
+    result: Multiaddr = mk_maddr(addr)
+
+    assert str(result) == '/ip6/::1/tcp/5678'
+
+    protos = result.protocols()
+    assert protos[0].name == 'ip6'
+    assert protos[1].name == 'tcp'
+
+
+def test_mk_maddr_uds():
+    '''
+    `mk_maddr()` on a `UDSAddress` produces a `/unix/<path>`
+    multiaddr containing the full socket path.
+
+    '''
+    # NOTE, use an absolute `filedir` to match real runtime
+    # UDS paths; `mk_maddr()` strips the leading `/` to avoid
+    # the double-slash `/unix//run/..` that py-multiaddr
+    # rejects as "empty protocol path".
+    filedir = '/tmp/tractor_test'
+    filename = 'test_sock.sock'
+    addr = UDSAddress(
+        filedir=filedir,
+        filename=filename,
+    )
+    result: Multiaddr = mk_maddr(addr)
+
+    assert isinstance(result, Multiaddr)
+
+    result_str: str = str(result)
+    assert result_str.startswith('/unix/')
+    # verify the leading `/` was stripped to avoid double-slash
+    assert '/unix/tmp/tractor_test/' in result_str
+
+    sockpath_rel: str = str(
+        Path(filedir) / filename
+    ).lstrip('/')
+    unix_val: str = result.value_for_protocol('unix')
+    assert unix_val.endswith(sockpath_rel)
+
+
+def test_mk_maddr_unsupported_proto_key():
+    '''
+    `mk_maddr()` raises `ValueError` for an unsupported
+    `proto_key`.
+
+    '''
+    fake_addr = SimpleNamespace(proto_key='quic')
+    with pytest.raises(
+        ValueError,
+        match='Unsupported proto_key',
+    ):
+        mk_maddr(fake_addr)
+
+
+@pytest.mark.parametrize(
+    'addr',
+    [
+        pytest.param(
+            TCPAddress('127.0.0.1', 9999),
+            id='tcp-ipv4',
+        ),
+        pytest.param(
+            UDSAddress(
+                filedir='/tmp/tractor_rt',
+                filename='roundtrip.sock',
+            ),
+            id='uds',
+        ),
+    ],
+)
+def test_mk_maddr_roundtrip(addr):
+    '''
+    `mk_maddr()` output is valid multiaddr syntax that the
+    library can re-parse back into an equivalent `Multiaddr`.
+
+    '''
+    maddr: Multiaddr = mk_maddr(addr)
+    reparsed = Multiaddr(str(maddr))
+
+    assert reparsed == maddr
+    assert str(reparsed) == str(maddr)
+
+
+# ------ parse_maddr() tests ------
+
+def test_maddr_to_tpt_proto_mapping():
+    '''
+    `_maddr_to_tpt_proto` is the exact inverse of
+    `_tpt_proto_to_maddr`.
+
+    '''
+    assert _maddr_to_tpt_proto == {
+        'tcp': 'tcp',
+        'unix': 'uds',
+    }
+
+
+def test_parse_maddr_tcp_ipv4():
+    '''
+    `parse_maddr()` on an IPv4 TCP multiaddr string
+    produce a `TCPAddress` with the correct host and port.
+
+    '''
+    result = parse_maddr('/ip4/127.0.0.1/tcp/1234')
+
+    assert isinstance(result, TCPAddress)
+    assert result.unwrap() == ('127.0.0.1', 1234)
+
+
+def test_parse_maddr_tcp_ipv6():
+    '''
+    `parse_maddr()` on an IPv6 TCP multiaddr string
+    produce a `TCPAddress` with the correct host and port.
+
+    '''
+    result = parse_maddr('/ip6/::1/tcp/5678')
+
+    assert isinstance(result, TCPAddress)
+    assert result.unwrap() == ('::1', 5678)
+
+
+def test_parse_maddr_uds():
+    '''
+    `parse_maddr()` on a `/unix/...` multiaddr string
+    produce a `UDSAddress` with the correct dir and filename,
+    preserving absolute path semantics.
+
+    '''
+    result = parse_maddr('/unix/tmp/tractor_test/test.sock')
+
+    assert isinstance(result, UDSAddress)
+    filedir, filename = result.unwrap()
+    assert filename == 'test.sock'
+    assert str(filedir) == '/tmp/tractor_test'
+
+
+def test_parse_maddr_unsupported():
+    '''
+    `parse_maddr()` raise `ValueError` for an unsupported
+    protocol combination like UDP.
+
+    '''
+    with pytest.raises(
+        ValueError,
+        match='Unsupported multiaddr protocol combo',
+    ):
+        parse_maddr('/ip4/127.0.0.1/udp/1234')
+
+
+@pytest.mark.parametrize(
+    'addr',
+    [
+        pytest.param(
+            TCPAddress('127.0.0.1', 9999),
+            id='tcp-ipv4',
+        ),
+        pytest.param(
+            UDSAddress(
+                filedir='/tmp/tractor_rt',
+                filename='roundtrip.sock',
+            ),
+            id='uds',
+        ),
+    ],
+)
+def test_parse_maddr_roundtrip(addr):
+    '''
+    Full round-trip: `addr -> mk_maddr -> str -> parse_maddr`
+    produce an `Address` whose `.unwrap()` matches the original.
+
+    '''
+    maddr: Multiaddr = mk_maddr(addr)
+    maddr_str: str = str(maddr)
+    parsed = parse_maddr(maddr_str)
+
+    assert type(parsed) is type(addr)
+    assert parsed.unwrap() == addr.unwrap()
+
+
+def test_wrap_address_maddr_str():
+    '''
+    `wrap_address()` accept a multiaddr-format string and
+    return the correct `Address` type.
+
+    '''
+    result = wrap_address('/ip4/127.0.0.1/tcp/9999')
+
+    assert isinstance(result, TCPAddress)
+    assert result.unwrap() == ('127.0.0.1', 9999)
+
+
+# ------ parse_endpoints() tests ------
+
+def test_parse_endpoints_tcp_only():
+    '''
+    `parse_endpoints()` with a single TCP maddr per actor
+    produce the correct `TCPAddress` instances.
+
+    '''
+    table = {
+        'registry': ['/ip4/127.0.0.1/tcp/1616'],
+        'data_feed': ['/ip4/0.0.0.0/tcp/5555'],
+    }
+    result = parse_endpoints(table)
+
+    assert set(result.keys()) == {'registry', 'data_feed'}
+
+    reg_addr = result['registry'][0]
+    assert isinstance(reg_addr, TCPAddress)
+    assert reg_addr.unwrap() == ('127.0.0.1', 1616)
+
+    feed_addr = result['data_feed'][0]
+    assert isinstance(feed_addr, TCPAddress)
+    assert feed_addr.unwrap() == ('0.0.0.0', 5555)
+
+
+def test_parse_endpoints_mixed_tpts():
+    '''
+    `parse_endpoints()` with both TCP and UDS maddrs for
+    the same actor produce the correct mixed `Address` list.
+
+    '''
+    table = {
+        'broker': [
+            '/ip4/127.0.0.1/tcp/4040',
+            '/unix/tmp/tractor/broker.sock',
+        ],
+    }
+    result = parse_endpoints(table)
+    addrs = result['broker']
+
+    assert len(addrs) == 2
+    assert isinstance(addrs[0], TCPAddress)
+    assert addrs[0].unwrap() == ('127.0.0.1', 4040)
+
+    assert isinstance(addrs[1], UDSAddress)
+    filedir, filename = addrs[1].unwrap()
+    assert filename == 'broker.sock'
+    assert str(filedir) == '/tmp/tractor'
+
+
+def test_parse_endpoints_unwrapped_tuples():
+    '''
+    `parse_endpoints()` accept raw `(host, port)` tuples
+    and wrap them as `TCPAddress`.
+
+    '''
+    table = {
+        'ems': [('127.0.0.1', 6666)],
+    }
+    result = parse_endpoints(table)
+
+    addr = result['ems'][0]
+    assert isinstance(addr, TCPAddress)
+    assert addr.unwrap() == ('127.0.0.1', 6666)
+
+
+def test_parse_endpoints_mixed_str_and_tuple():
+    '''
+    `parse_endpoints()` accept a mix of maddr strings and
+    raw tuples in the same actor entry list.
+
+    '''
+    table = {
+        'quoter': [
+            '/ip4/127.0.0.1/tcp/7777',
+            ('127.0.0.1', 8888),
+        ],
+    }
+    result = parse_endpoints(table)
+    addrs = result['quoter']
+
+    assert len(addrs) == 2
+    assert isinstance(addrs[0], TCPAddress)
+    assert addrs[0].unwrap() == ('127.0.0.1', 7777)
+
+    assert isinstance(addrs[1], TCPAddress)
+    assert addrs[1].unwrap() == ('127.0.0.1', 8888)
+
+
+def test_parse_endpoints_unsupported_proto():
+    '''
+    `parse_endpoints()` raise `ValueError` when a maddr
+    string uses an unsupported protocol like `/udp/`.
+
+    '''
+    table = {
+        'bad_actor': ['/ip4/127.0.0.1/udp/9999'],
+    }
+    with pytest.raises(
+        ValueError,
+        match='Unsupported multiaddr protocol combo',
+    ):
+        parse_endpoints(table)
+
+
+def test_parse_endpoints_empty_table():
+    '''
+    `parse_endpoints()` on an empty table return an empty
+    dict.
+
+    '''
+    assert parse_endpoints({}) == {}
+
+
+def test_parse_endpoints_empty_actor_list():
+    '''
+    `parse_endpoints()` with an actor mapped to an empty
+    list preserve the key with an empty list value.
+
+    '''
+    result = parse_endpoints({'x': []})
+    assert result == {'x': []}
--- a/tests/discovery/test_registrar.py
+++ b/tests/discovery/test_registrar.py
@ -0,0 +1,673 @@
+'''
+Discovery subsystem via a "registrar" actor scenarios.
+
+'''
+import os
+import signal
+import platform
+from functools import partial
+import itertools
+import time
+from typing import Callable
+
+import psutil
+import pytest
+import subprocess
+import tractor
+from tractor.devx import dump_on_hang
+from tractor.trionics import collapse_eg
+from tractor._testing import tractor_test
+from tractor.discovery._addr import wrap_address
+from tractor.discovery._multiaddr import mk_maddr
+import trio
+
+
+pytestmark = pytest.mark.usefixtures(
+    'reap_subactors_per_test',
+    # NOTE, registrar tests stress the discovery
+    # roundtrip (find_actor / wait_for_actor) which
+    # historically left orphaned UDS sock-files when
+    # subactor `hard_kill` SIGKILL'd, and which
+    # exercises the same trio `WakeupSocketpair`
+    # peer-disconnect path that triggered the
+    # busy-loop bug class.
+    'track_orphaned_uds_per_test',
+    'detect_runaway_subactors_per_test',
+)
+
+
+@tractor_test
+async def test_reg_then_unreg(
+    reg_addr: tuple,
+):
+    actor = tractor.current_actor()
+    assert actor.is_registrar
+    assert len(actor._registry) == 1  # only self is registered
+
+    async with tractor.open_nursery(
+        registry_addrs=[reg_addr],
+    ) as n:
+
+        portal = await n.start_actor('actor', enable_modules=[__name__])
+        uid = portal.channel.aid.uid
+
+        async with tractor.get_registry(reg_addr) as aportal:
+            # this local actor should be the registrar
+            assert actor is aportal.actor
+
+            async with tractor.wait_for_actor('actor'):
+                # sub-actor uid should be in the registry
+                assert uid in aportal.actor._registry
+                sockaddrs = actor._registry[uid]
+                # XXX: can we figure out what the listen addr will be?
+                assert sockaddrs
+
+        await n.cancel()  # tear down nursery
+
+        await trio.sleep(0.1)
+        assert uid not in aportal.actor._registry
+        sockaddrs = actor._registry.get(uid)
+        assert not sockaddrs
+
+
+@tractor_test
+async def test_reg_then_unreg_maddr(
+    reg_addr: tuple,
+):
+    '''
+    Same as `test_reg_then_unreg` but pass the registry
+    address as a multiaddr string to verify `wrap_address()`
+    multiaddr parsing end-to-end through the runtime.
+
+    '''
+    # tuple -> Address -> multiaddr string
+    addr_obj = wrap_address(reg_addr)
+    maddr_str: str = str(mk_maddr(addr_obj))
+
+    actor = tractor.current_actor()
+    assert actor.is_registrar
+
+    async with tractor.open_nursery(
+        registry_addrs=[maddr_str],
+    ) as n:
+
+        portal = await n.start_actor(
+            'actor_maddr',
+            enable_modules=[__name__],
+        )
+        uid = portal.channel.aid.uid
+
+        async with tractor.get_registry(maddr_str) as aportal:
+            assert actor is aportal.actor
+
+            async with tractor.wait_for_actor('actor_maddr'):
+                assert uid in aportal.actor._registry
+                sockaddrs = actor._registry[uid]
+                assert sockaddrs
+
+        await n.cancel()
+
+        await trio.sleep(0.1)
+        assert uid not in aportal.actor._registry
+        sockaddrs = actor._registry.get(uid)
+        assert not sockaddrs
+
+
+the_line = 'Hi my name is {}'
+
+
+async def hi():
+    return the_line.format(tractor.current_actor().name)
+
+
+async def say_hello_use_wait(
+    other_actor: str,
+    reg_addr: tuple[str, int],
+):
+    async with tractor.wait_for_actor(
+        other_actor,
+        registry_addr=reg_addr,
+    ) as portal:
+        assert portal is not None
+        result = await portal.run(__name__, 'hi')
+        return result
+
+
+@tractor_test(
+    timeout=7,
+)
+@pytest.mark.parametrize(
+    'ria_fn',
+    [
+        say_hello_use_wait,
+    ]
+)
+async def test_trynamic_trio(
+    ria_fn: Callable,
+    start_method: str,
+    reg_addr: tuple,
+):
+    '''
+    Root actor acting as the "director" and running one-shot-task-actors
+    for the directed subs.
+
+    '''
+    async with tractor.open_nursery() as n:
+        print("Alright... Action!")
+
+        donny = await n.run_in_actor(
+            ria_fn,
+            other_actor='gretchen',
+            reg_addr=reg_addr,
+            name='donny',
+        )
+        gretchen = await n.run_in_actor(
+            ria_fn,
+            other_actor='donny',
+            reg_addr=reg_addr,
+            name='gretchen',
+        )
+        print(await gretchen.result())
+        print(await donny.result())
+        print("CUTTTT CUUTT CUT!!?! Donny!! You're supposed to say...")
+
+
+async def stream_forever():
+    for i in itertools.count():
+        yield i
+        await trio.sleep(0.01)
+
+
+async def cancel(
+    use_signal: bool,
+    delay: float = 0,
+):
+    # hold on there sally
+    await trio.sleep(delay)
+
+    # trigger cancel
+    if use_signal:
+        if platform.system() == 'Windows':
+            pytest.skip("SIGINT not supported on windows")
+        os.kill(os.getpid(), signal.SIGINT)
+    else:
+        raise KeyboardInterrupt
+
+
+async def stream_from(portal: tractor.Portal):
+    async with portal.open_stream_from(stream_forever) as stream:
+        async for value in stream:
+            print(value)
+
+
+async def unpack_reg(
+    actor_or_portal: tractor.Portal|tractor.Actor,
+):
+    '''
+    Get and unpack a "registry" RPC request from the registrar
+    system.
+
+    '''
+    if getattr(actor_or_portal, 'get_registry', None):
+        msg = await actor_or_portal.get_registry()
+    else:
+        msg = await actor_or_portal.run_from_ns('self', 'get_registry')
+
+    return {
+        tuple(key.split('.')): val
+        for key, val in msg.items()
+    }
+
+
+async def spawn_and_check_registry(
+    reg_addr: tuple,
+    use_signal: bool,
+    debug_mode: bool = False,
+    remote_arbiter: bool = False,
+    with_streaming: bool = False,
+    maybe_daemon: tuple[
+        subprocess.Popen,
+        psutil.Process,
+    ]|None = None,
+
+) -> None:
+
+    if maybe_daemon:
+        popen, proc = maybe_daemon
+        # breakpoint()
+
+    async with tractor.open_root_actor(
+        registry_addrs=[reg_addr],
+        debug_mode=debug_mode,
+    ):
+        async with tractor.get_registry(
+            addr=reg_addr,
+        ) as portal:
+            # runtime needs to be up to call this
+            actor = tractor.current_actor()
+
+            if remote_arbiter:
+                assert not actor.is_registrar
+
+            if actor.is_registrar:
+                extra = 1  # registrar is local root actor
+                get_reg = partial(unpack_reg, actor)
+
+            else:
+                get_reg = partial(unpack_reg, portal)
+                extra = 2  # local root actor + remote registrar
+
+            # ensure current actor is registered
+            registry: dict = await get_reg()
+            assert actor.aid.uid in registry
+
+            try:
+                async with tractor.open_nursery() as an:
+                    async with (
+                        collapse_eg(),
+                        trio.open_nursery() as trion,
+                    ):
+                        portals = {}
+                        for i in range(3):
+                            name = f'a{i}'
+                            if with_streaming:
+                                portals[name] = await an.start_actor(
+                                    name=name, enable_modules=[__name__])
+
+                            else:  # no streaming
+                                portals[name] = await an.run_in_actor(
+                                    trio.sleep_forever, name=name)
+
+                        # wait on last actor to come up
+                        async with tractor.wait_for_actor(name):
+                            registry = await get_reg()
+                            for uid in an._children:
+                                assert uid in registry
+
+                        assert len(portals) + extra == len(registry)
+
+                        if with_streaming:
+                            await trio.sleep(0.1)
+
+                            pts = list(portals.values())
+                            for p in pts[:-1]:
+                                trion.start_soon(stream_from, p)
+
+                            # stream for 1 sec
+                            trion.start_soon(cancel, use_signal, 1)
+
+                            last_p = pts[-1]
+                            await stream_from(last_p)
+
+                        else:
+                            await cancel(use_signal)
+
+            finally:
+                await trio.sleep(0.5)
+
+                # all subactors should have de-registered
+                registry = await get_reg()
+                start: float = time.time()
+                while (
+                    not (len(registry) == extra)
+                    and
+                    (time.time() - start) < 5
+                ):
+                    print(
+                        f'Waiting for remaining subs to dereg..\n'
+                        f'{registry!r}\n'
+                    )
+                    await trio.sleep(0.3)
+                else:
+                    assert len(registry) == extra
+
+                assert actor.aid.uid in registry
+
+
+async def with_timeout(
+    main: Callable,
+    timeout: float = 6,
+):
+    with trio.fail_after(timeout):
+        await main()
+
+
+@pytest.mark.parametrize('use_signal', [False, True])
+@pytest.mark.parametrize('with_streaming', [False, True])
+def test_subactors_unregister_on_cancel(
+    debug_mode: bool,
+    start_method: str,
+    use_signal: bool,
+    reg_addr: tuple,
+    with_streaming: bool,
+):
+    '''
+    Verify that cancelling a nursery results in all subactors
+    deregistering themselves with the registrar.
+
+    '''
+    with pytest.raises(KeyboardInterrupt):
+        trio.run(
+            # with_timeout,
+            partial(
+                spawn_and_check_registry,
+                reg_addr,
+                use_signal,
+                debug_mode=debug_mode,
+                remote_arbiter=False,
+                with_streaming=with_streaming,
+            ),
+        )
+
+
+@pytest.mark.parametrize('use_signal', [False, True])
+@pytest.mark.parametrize('with_streaming', [False, True])
+def test_subactors_unregister_on_cancel_remote_daemon(
+    daemon: subprocess.Popen,
+    debug_mode: bool,
+    start_method: str,
+    use_signal: bool,
+    reg_addr: tuple,
+    with_streaming: bool,
+):
+    '''
+    Verify that cancelling a nursery results in all subactors
+    deregistering themselves with a **remote** (not in the local
+    process tree) registrar.
+
+    '''
+    with pytest.raises(KeyboardInterrupt):
+        trio.run(
+            with_timeout,
+            partial(
+                spawn_and_check_registry,
+                reg_addr,
+                use_signal,
+                debug_mode=debug_mode,
+                remote_arbiter=True,
+                with_streaming=with_streaming,
+                maybe_daemon=(
+                    daemon,
+                    psutil.Process(daemon.pid)
+                ),
+            ),
+        )
+
+
+async def streamer(agen):
+    async for item in agen:
+        print(item)
+
+
+async def close_chans_before_nursery(
+    reg_addr: tuple,
+    use_signal: bool,
+    remote_arbiter: bool = False,
+) -> None:
+
+    # logic for how many actors should still be
+    # in the registry at teardown.
+    if remote_arbiter:
+        entries_at_end = 2
+    else:
+        entries_at_end = 1
+
+    async with tractor.open_root_actor(
+        registry_addrs=[reg_addr],
+    ):
+        async with tractor.get_registry(reg_addr) as aportal:
+            try:
+                get_reg = partial(unpack_reg, aportal)
+
+                async with tractor.open_nursery() as an:
+                    portal1 = await an.start_actor(
+                        name='consumer1',
+                        enable_modules=[__name__],
+                    )
+                    portal2 = await an.start_actor(
+                        'consumer2',
+                        enable_modules=[__name__],
+                    )
+
+                    async with (
+                        portal1.open_stream_from(
+                            stream_forever
+                        ) as agen1,
+                        portal2.open_stream_from(
+                            stream_forever
+                        ) as agen2,
+                    ):
+                            async with (
+                                collapse_eg(),
+                                trio.open_nursery() as tn,
+                            ):
+                                tn.start_soon(streamer, agen1)
+                                tn.start_soon(cancel, use_signal, .5)
+                                try:
+                                    await streamer(agen2)
+                                finally:
+                                    # Kill the root nursery thus resulting in
+                                    # normal registrar channel ops to fail during
+                                    # teardown. It doesn't seem like this is
+                                    # reliably triggered by an external SIGINT.
+                                    # tractor.current_actor()._root_nursery.cancel_scope.cancel()
+
+                                    # XXX: THIS IS THE KEY THING that
+                                    # happens **before** exiting the
+                                    # actor nursery block
+
+                                    # also kill off channels cuz why not
+                                    await agen1.aclose()
+                                    await agen2.aclose()
+
+            finally:
+                with trio.CancelScope(shield=True):
+                    await trio.sleep(1)
+
+                    # all subactors should have de-registered
+                    registry = await get_reg()
+                    assert portal1.channel.aid.uid not in registry
+                    assert portal2.channel.aid.uid not in registry
+                    assert len(registry) == entries_at_end
+
+
+@pytest.mark.parametrize('use_signal', [False, True])
+def test_close_channel_explicit(
+    start_method: str,
+    use_signal: bool,
+    reg_addr: tuple,
+):
+    '''
+    Verify that closing a stream explicitly and killing the actor's
+    "root nursery" **before** the containing nursery tears down also
+    results in subactor(s) deregistering from the registrar.
+
+    '''
+    with pytest.raises(KeyboardInterrupt):
+        trio.run(
+            partial(
+                close_chans_before_nursery,
+                reg_addr,
+                use_signal,
+                remote_arbiter=False,
+            ),
+        )
+
+
+@pytest.mark.parametrize('use_signal', [False, True])
+def test_close_channel_explicit_remote_registrar(
+    daemon: subprocess.Popen,
+    start_method: str,
+    use_signal: bool,
+    reg_addr: tuple,
+):
+    '''
+    Verify that closing a stream explicitly and killing the actor's
+    "root nursery" **before** the containing nursery tears down also
+    results in subactor(s) deregistering from the registrar.
+
+    '''
+    with pytest.raises(KeyboardInterrupt):
+        trio.run(
+            partial(
+                close_chans_before_nursery,
+                reg_addr,
+                use_signal,
+                remote_arbiter=True,
+            ),
+        )
+
+
+@tractor.context
+async def kill_transport(
+    ctx: tractor.Context,
+) -> None:
+
+    await ctx.started()
+    actor: tractor.Actor = tractor.current_actor()
+    actor.ipc_server.cancel()
+    await trio.sleep_forever()
+
+
+
+# ?TODO, do a OSc style signalling test on this?
+# -[ ] doesn't work for fork backends
+# @pytest.mark.parametrize('use_signal', [False, True])
+#
+# Wall-clock bound via `pytest-timeout` (`method='thread'`).
+# Under `--spawn-backend=subint` this test can wedge in an
+# un-Ctrl-C-able state (abandoned-subint + shared-GIL
+# starvation → signal-wakeup-fd pipe fills → SIGINT silently
+# dropped; see `ai/conc-anal/subint_sigint_starvation_issue.md`).
+# `method='thread'` is specifically required because `signal`-
+# method SIGALRM suffers the same GIL-starvation path and
+# wouldn't fire the Python-level handler.
+# At timeout the plugin hard-kills the pytest process — that's
+# the intended behavior here; the alternative is an unattended
+# suite run that never returns.
+# @pytest.mark.timeout(
+#     30,
+#     # NOTE should be a 2.1s happy path.
+#     # XXX for `main_thread_forkserver` this is SUPER SENSITIVE
+#     # so keep it higher to avoid flaky runs..
+#     method='thread',
+# )
+@pytest.mark.skipon_spawn_backend(
+    'subint',
+    # 'main_thread_forkserver',
+    reason=(
+        'XXX SUBINT HANGING TEST XXX\n'
+        'See outstanding issue(s)\n'
+        # TODO, put issue link!
+    )
+)
+def test_stale_entry_is_deleted(
+    debug_mode: bool,
+    daemon: subprocess.Popen,
+    start_method: str,
+    reg_addr: tuple,
+    # set_fork_aware_capture,
+):
+    '''
+    Ensure that when a stale entry is detected in the registrar's
+    table that the `find_actor()` API takes care of deleting the
+    stale entry and not delivering a bad portal.
+
+    '''
+    async def main():
+        name: str = 'transport_fails_actor'
+        _reg_ptl: tractor.Portal
+        an: tractor.ActorNursery
+        async with (
+            tractor.open_nursery(
+                debug_mode=debug_mode,
+                registry_addrs=[reg_addr],
+            ) as an,
+            tractor.get_registry(reg_addr) as _reg_ptl,
+        ):
+            ptl: tractor.Portal = await an.start_actor(
+                name,
+                enable_modules=[__name__],
+            )
+            async with ptl.open_context(
+                kill_transport,
+            ) as (first, ctx):
+                async with tractor.find_actor(
+                    name,
+                    registry_addrs=[reg_addr],
+                ) as maybe_portal:
+                    # because the transitive
+                    # `._api.maybe_open_portal()` call should
+                    # fail and implicitly call `.delete_addr()`
+                    assert maybe_portal is None
+                    registry: dict = await unpack_reg(_reg_ptl)
+                    assert ptl.chan.aid.uid not in registry
+
+                # should fail since we knocked out the IPC tpt XD
+                await ptl.cancel_actor()
+                await an.cancel()
+
+    # XXX, for tracing if this starts being flaky again..
+    #
+    timeout: float = 4
+    async def _timeout_main():
+        with trio.move_on_after(timeout) as cs:
+            await main()
+
+        if (
+            cs.cancel_called
+            and
+            debug_mode
+        ):
+            await tractor.pause()
+
+    # TODO, remove once the `[subint]` variant no longer hangs.
+    #
+    # Status (as of Phase B hard-kill landing):
+    #
+    # - `[trio]`/`[mp_*]` variants: completes normally; `dump_on_hang`
+    #   is a no-op safety net here.
+    #
+    # - `[subint]` variant: hangs indefinitely AND is un-Ctrl-C-able.
+    #   `strace -p <pytest_pid>` while in the hang reveals a silently-
+    #   dropped SIGINT — the C signal handler tries to write the
+    #   signum byte to Python's signal-wakeup fd and gets `EAGAIN`,
+    #   meaning the pipe is full (nobody's draining it).
+    #
+    #   Root-cause chain: our hard-kill in `spawn._subint` abandoned
+    #   the driver OS-thread (which is `daemon=True`) after the soft-
+    #   kill timeout, but the *sub-interpreter* inside that thread is
+    #   still running `trio.run()` — `_interpreters.destroy()` can't
+    #   force-stop a running subint (raises `InterpreterError`), and
+    #   legacy-config subints share the main GIL. The abandoned subint
+    #   starves the parent's trio event loop from iterating often
+    #   enough to drain its wakeup pipe → SIGINT silently drops.
+    #
+    #   This is structurally a CPython-level limitation: there's no
+    #   public force-destroy primitive for a running subint. We
+    #   escape on the harness side via a SIGINT-loop in the `daemon`
+    #   fixture teardown (killing the bg registrar subproc closes its
+    #   end of the IPC, which eventually unblocks a recv in main trio,
+    #   which lets the loop drain the wakeup pipe). Long-term fix path:
+    #   msgspec PEP 684 support (jcrist/msgspec#563) → isolated-mode
+    #   subints with per-interp GIL.
+    #
+    #   Full analysis:
+    #   `ai/conc-anal/subint_sigint_starvation_issue.md`
+    #
+    #   See also the *sibling* hang class documented in
+    #   `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — same
+    #   subint backend, different root cause (Ctrl-C-able hang, main
+    #   trio loop iterating fine; ours to fix, not CPython's).
+    #   Reproduced by `tests/test_subint_cancellation.py
+    #   ::test_subint_non_checkpointing_child`.
+    #
+    # Kept here (and not behind a `pytestmark.skip`) so we can still
+    # inspect the dump file if the hang ever returns after a refactor.
+    # `pytest`'s stderr capture eats `faulthandler` output otherwise,
+    # so we route `dump_on_hang` to a file.
+    with dump_on_hang(
+        seconds=timeout*2,
+        path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump',
+    ):
+        trio.run(_timeout_main)
--- a/tests/discovery/test_tpt_bind_addrs.py
+++ b/tests/discovery/test_tpt_bind_addrs.py
@ -0,0 +1,345 @@
+'''
+`open_root_actor(tpt_bind_addrs=...)` test suite.
+
+Verify all three runtime code paths for explicit IPC-server
+bind-address selection in `_root.py`:
+
+1. Non-registrar, no explicit bind -> random addrs from registry proto
+2. Registrar, no explicit bind -> binds to registry_addrs
+3. Explicit bind given -> wraps via `wrap_address()` and uses them
+
+'''
+import pytest
+import trio
+import tractor
+from tractor.discovery._addr import (
+    wrap_address,
+)
+from tractor.discovery._multiaddr import mk_maddr
+from tractor._testing.addr import get_rando_addr
+
+
+# ------------------------------------------------------------------
+# helpers
+# ------------------------------------------------------------------
+def _bound_bindspaces(
+    actor: tractor.Actor,
+) -> set[str]:
+    '''
+    Collect the set of bindspace strings from the actor's
+    currently bound IPC-server accept addresses.
+
+    '''
+    return {
+        wrap_address(a).bindspace
+        for a in actor.accept_addrs
+    }
+
+
+def _bound_wrapped(
+    actor: tractor.Actor,
+) -> list:
+    '''
+    Return the actor's accept addrs as wrapped `Address` objects.
+
+    '''
+    return [
+        wrap_address(a)
+        for a in actor.accept_addrs
+    ]
+
+
+# ------------------------------------------------------------------
+# 1) Registrar + explicit tpt_bind_addrs
+# ------------------------------------------------------------------
+@pytest.mark.parametrize(
+    'addr_combo',
+    [
+        'bind-eq-reg',
+        'bind-subset-reg',
+        'bind-disjoint-reg',
+    ],
+    ids=lambda v: v,
+)
+def test_registrar_root_tpt_bind_addrs(
+    reg_addr: tuple,
+    tpt_proto: str,
+    debug_mode: bool,
+    addr_combo: str,
+):
+    '''
+    Registrar root-actor with explicit `tpt_bind_addrs`:
+    bound set must include all registry + all bind addr bindspaces
+    (merge behavior).
+
+    '''
+    reg_wrapped = wrap_address(reg_addr)
+
+    if addr_combo == 'bind-eq-reg':
+        bind_addrs = [reg_addr]
+        # extra secondary reg addr for subset test
+        extra_reg = []
+
+    elif addr_combo == 'bind-subset-reg':
+        second_reg = get_rando_addr(tpt_proto)
+        bind_addrs = [reg_addr]
+        extra_reg = [second_reg]
+
+    elif addr_combo == 'bind-disjoint-reg':
+        # port=0 on same host -> completely different addr
+        rando = wrap_address(reg_addr).get_random(
+            bindspace=reg_wrapped.bindspace,
+        )
+        bind_addrs = [rando.unwrap()]
+        extra_reg = []
+
+    all_reg = [reg_addr] + extra_reg
+
+    async def _main():
+        async with tractor.open_root_actor(
+            registry_addrs=all_reg,
+            tpt_bind_addrs=bind_addrs,
+            debug_mode=debug_mode,
+        ):
+            actor = tractor.current_actor()
+            assert actor.is_registrar
+
+            bound = actor.accept_addrs
+            bound_bs = _bound_bindspaces(actor)
+
+            # all registry bindspaces must appear in bound set
+            for ra in all_reg:
+                assert wrap_address(ra).bindspace in bound_bs
+
+            # all bind-addr bindspaces must appear
+            for ba in bind_addrs:
+                assert wrap_address(ba).bindspace in bound_bs
+
+            # registry addr must appear verbatim in bound
+            # (after wrapping both sides for comparison)
+            bound_w = _bound_wrapped(actor)
+            assert reg_wrapped in bound_w
+
+            if addr_combo == 'bind-disjoint-reg':
+                assert len(bound) >= 2
+
+    trio.run(_main)
+
+
+@pytest.mark.parametrize(
+    'addr_combo',
+    [
+        'bind-same-bindspace',
+        'bind-disjoint',
+    ],
+    ids=lambda v: v,
+)
+def test_non_registrar_root_tpt_bind_addrs(
+    daemon,
+    reg_addr: tuple,
+    tpt_proto: str,
+    debug_mode: bool,
+    addr_combo: str,
+):
+    '''
+    Non-registrar root with explicit `tpt_bind_addrs`:
+    bound set must exactly match the requested bind addrs
+    (no merge with registry).
+
+    '''
+    reg_wrapped = wrap_address(reg_addr)
+
+    if addr_combo == 'bind-same-bindspace':
+        # same bindspace as reg but port=0 so we get a random port
+        rando = reg_wrapped.get_random(
+            bindspace=reg_wrapped.bindspace,
+        )
+        bind_addrs = [rando.unwrap()]
+
+    elif addr_combo == 'bind-disjoint':
+        rando = reg_wrapped.get_random(
+            bindspace=reg_wrapped.bindspace,
+        )
+        bind_addrs = [rando.unwrap()]
+
+    async def _main():
+        async with tractor.open_root_actor(
+            registry_addrs=[reg_addr],
+            tpt_bind_addrs=bind_addrs,
+            debug_mode=debug_mode,
+        ):
+            actor = tractor.current_actor()
+            assert not actor.is_registrar
+
+            bound = actor.accept_addrs
+            assert len(bound) == len(bind_addrs)
+
+            # bindspaces must match
+            bound_bs = _bound_bindspaces(actor)
+            for ba in bind_addrs:
+                assert wrap_address(ba).bindspace in bound_bs
+
+            # TCP port=0 should resolve to a real port
+            for uw_addr in bound:
+                w = wrap_address(uw_addr)
+                if w.proto_key == 'tcp':
+                    _host, port = uw_addr
+                    assert port > 0
+
+    trio.run(_main)
+
+
+# ------------------------------------------------------------------
+# 3) Non-registrar, default random bind (baseline)
+# ------------------------------------------------------------------
+def test_non_registrar_default_random_bind(
+    daemon,
+    reg_addr: tuple,
+    debug_mode: bool,
+):
+    '''
+    Baseline: no `tpt_bind_addrs`, daemon running.
+    Bound bindspace matches registry bindspace,
+    but bound addr differs from reg_addr (random).
+
+    '''
+    reg_wrapped = wrap_address(reg_addr)
+
+    async def _main():
+        async with tractor.open_root_actor(
+            registry_addrs=[reg_addr],
+            debug_mode=debug_mode,
+        ):
+            actor = tractor.current_actor()
+            assert not actor.is_registrar
+
+            bound_bs = _bound_bindspaces(actor)
+            assert reg_wrapped.bindspace in bound_bs
+
+            # bound addr should differ from the registry addr
+            # (the runtime picks a random port/path)
+            bound_w = _bound_wrapped(actor)
+            assert reg_wrapped not in bound_w
+
+    trio.run(_main)
+
+
+# ------------------------------------------------------------------
+# 4) Multiaddr string input
+# ------------------------------------------------------------------
+def test_tpt_bind_addrs_as_maddr_str(
+    reg_addr: tuple,
+    debug_mode: bool,
+):
+    '''
+    Pass multiaddr strings as `tpt_bind_addrs`.
+    Runtime should parse and bind successfully.
+
+    '''
+    reg_wrapped = wrap_address(reg_addr)
+    # build a port-0 / random maddr string for binding
+    rando = reg_wrapped.get_random(
+        bindspace=reg_wrapped.bindspace,
+    )
+    maddr_str: str = str(mk_maddr(rando))
+
+    async def _main():
+        async with tractor.open_root_actor(
+            registry_addrs=[reg_addr],
+            tpt_bind_addrs=[maddr_str],
+            debug_mode=debug_mode,
+        ):
+            actor = tractor.current_actor()
+            assert actor.is_registrar
+
+            for uw_addr in actor.accept_addrs:
+                w = wrap_address(uw_addr)
+                if w.proto_key == 'tcp':
+                    _host, port = uw_addr
+                    assert port > 0
+
+    trio.run(_main)
+
+
+# ------------------------------------------------------------------
+# 5) Registrar merge produces union of binds
+# ------------------------------------------------------------------
+def test_registrar_merge_binds_union(
+    tpt_proto: str,
+    debug_mode: bool,
+):
+    '''
+    Registrar + disjoint bind addr: bound set must include
+    both registry and explicit bind addresses.
+
+    '''
+    reg_addr = get_rando_addr(tpt_proto)
+    reg_wrapped = wrap_address(reg_addr)
+
+    rando = reg_wrapped.get_random(
+        bindspace=reg_wrapped.bindspace,
+    )
+    bind_addrs = [rando.unwrap()]
+
+    # NOTE: for UDS, `get_random()` produces the same
+    # filename for the same pid+actor-state, so the
+    # "disjoint" premise only holds when the addrs
+    # actually differ (always true for TCP, may
+    # collide for UDS).
+    expect_disjoint: bool = (
+        tuple(reg_addr) != rando.unwrap()
+    )
+
+    async def _main():
+        async with tractor.open_root_actor(
+            registry_addrs=[reg_addr],
+            tpt_bind_addrs=bind_addrs,
+            debug_mode=debug_mode,
+        ):
+            actor = tractor.current_actor()
+            assert actor.is_registrar
+
+            bound = actor.accept_addrs
+            bound_w = _bound_wrapped(actor)
+
+            if expect_disjoint:
+                # must have at least 2 (registry + bind)
+                assert len(bound) >= 2
+
+            # registry addr must appear in bound set
+            assert reg_wrapped in bound_w
+
+    trio.run(_main)
+
+
+# ------------------------------------------------------------------
+# 6) open_nursery forwards tpt_bind_addrs
+# ------------------------------------------------------------------
+def test_open_nursery_forwards_tpt_bind_addrs(
+    reg_addr: tuple,
+    debug_mode: bool,
+):
+    '''
+    `open_nursery(tpt_bind_addrs=...)` forwards through
+    `**kwargs` to `open_root_actor()`.
+
+    '''
+    reg_wrapped = wrap_address(reg_addr)
+    rando = reg_wrapped.get_random(
+        bindspace=reg_wrapped.bindspace,
+    )
+    bind_addrs = [rando.unwrap()]
+
+    async def _main():
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+            tpt_bind_addrs=bind_addrs,
+            debug_mode=debug_mode,
+        ):
+            actor = tractor.current_actor()
+            bound_bs = _bound_bindspaces(actor)
+
+            for ba in bind_addrs:
+                assert wrap_address(ba).bindspace in bound_bs
+
+    trio.run(_main)
--- a/tests/ipc/test_each_tpt.py
+++ b/tests/ipc/test_each_tpt.py
@ -8,17 +8,16 @@ from pathlib import Path
 import pytest
 import trio
 import tractor
-from tractor import (
-    Actor,
-    _state,
-    _addr,
-)
+from tractor import Actor
+from tractor.runtime import _state
+from tractor.discovery import _addr


@pytest.fixture
 def bindspace_dir_str() -> str:

-    rt_dir: Path = tractor._state.get_rt_dir()
+    from tractor.runtime._state import get_rt_dir
+    rt_dir: Path = get_rt_dir()
    bs_dir: Path = rt_dir / 'doggy'
    bs_dir_str: str = str(bs_dir)
    assert not bs_dir.is_dir()
--- a/tests/ipc/test_multi_tpt.py
+++ b/tests/ipc/test_multi_tpt.py
@ -13,9 +13,9 @@ from tractor import (
    Portal,
    ipc,
    msg,
-    _state,
-    _addr,
 )
+from tractor.runtime import _state
+from tractor.discovery import _addr

@tractor.context
 async def chk_tpts(
@ -59,9 +59,19 @@ async def chk_tpts(
 )
 def test_root_passes_tpt_to_sub(
    tpt_proto_key: str,
+    tpt_proto: str,
    reg_addr: tuple,
    debug_mode: bool,
 ):
+    # `reg_addr` is sourced from the CLI `--tpt-proto={tpt_proto}`,
+    # so when the parametrized `tpt_proto_key` differs, the test
+    # asks the runtime to `enable_transports=[<other_proto>]` while
+    # pointing `registry_addrs` at a `reg_addr` of the wrong proto.
+    # The layer-2 guard in `open_root_actor` is expected to fail
+    # fast with `ValueError` on this mismatch (rather than the prior
+    # silent hang during the registrar handshake).
+    proto_mismatch: bool = (tpt_proto_key != tpt_proto)
+
    async def main():
        async with tractor.open_nursery(
            enable_transports=[tpt_proto_key],
@ -92,4 +102,14 @@ def test_root_passes_tpt_to_sub(
            # shudown sub-actor(s)
            await an.cancel()

-    trio.run(main)
+    if proto_mismatch:
+        # mismatched proto must raise `ValueError` from the
+        # `open_root_actor` runtime guard before any subactor spawn.
+        with pytest.raises(ValueError) as excinfo:
+            trio.run(main)
+        msg: str = str(excinfo.value)
+        assert 'enable_transports' in msg
+        assert 'registry_addrs' in msg
+        assert tpt_proto_key in msg or tpt_proto in msg
+    else:
+        trio.run(main)
--- a/tests/msg/init.py
+++ b/tests/msg/init.py
@ -0,0 +1,4 @@
+'''
+`tractor.msg.*` sub-sys test suite.
+
+'''
--- a/tests/msg/conftest.py
+++ b/tests/msg/conftest.py
@ -0,0 +1,4 @@
+'''
+`tractor.msg.*` test sub-pkg conf.
+
+'''
--- a/tests/msg/test_ext_types_msgspec.py
+++ b/tests/msg/test_ext_types_msgspec.py
@ -57,6 +57,7 @@ from tractor.msg._ops import (
    limit_plds,
 )

+
 def enc_nsp(obj: Any) -> Any:
    actor: Actor = tractor.current_actor(
        err_on_no_runtime=False,
@ -617,6 +618,17 @@ def test_ext_types_over_ipc(
    debug_mode: bool,
    pld_spec: Union[Type],
    add_hooks: bool,
+
+    set_fork_aware_capture,
+    # ^^XXX? for forking spawners
+
+    # capfd: pytest.CaptureFixture,
+    # ^^NOTE, super interesting that if
+    # we disable this below then the tpt-layer
+    # suffers as an "unclean EOF"??
+    # ?TODO, determine why/how that mks sense when addressing,
+    # https://github.com/pytest-dev/pytest/issues/14444
+    #
 ):
    '''
    Ensure we can support extension types coverted using
@ -725,18 +737,26 @@ def test_ext_types_over_ipc(

            await p.cancel_actor()

+    async def fa_main():
+        with (
+            trio.fail_after(2),
+            # ?TODO, investigate? see NOTE above..
+            # capfd.disabled(),
+        ):
+            await main()
+
    if (
        NamespacePath in pld_types
        and
        add_hooks
    ):
-        trio.run(main)
+        trio.run(fa_main)

    else:
        with pytest.raises(
            expected_exception=tractor.RemoteActorError,
        ) as excinfo:
-            trio.run(main)
+            trio.run(fa_main)

        exc = excinfo.value
        # bc `.started(nsp: NamespacePath)` will raise
--- a/tests/msg/test_pldrx_limiting.py
+++ b/tests/msg/test_pldrx_limiting.py
@ -55,13 +55,38 @@ async def maybe_expect_raises(
    raises: BaseException|None = None,
    ensure_in_message: list[str]|None = None,
    post_mortem: bool = False,
-    timeout: int = 3,
+    # NOTE, `None` selects a backend-aware default below —
+    # see `_BACKEND_TIMEOUT_DEFAULTS` for rationale. Caller
+    # can override with an explicit value to opt out.
+    timeout: int|None = None,
 ) -> None:
    '''
    Async wrapper for ensuring errors propagate from the inner scope.

    '''
-    if tractor._state.debug_mode():
+    if timeout is None:
+        # Pick a backend-aware default. Fork-based backends
+        # (`main_thread_forkserver`) need much more headroom
+        # because actor spawn + IPC ctx-exit + msg-validation
+        # error path takes longer than under `trio` backend
+        # — especially under cross-pytest-stream contention
+        # (#451). `test_basic_payload_spec` empirically:
+        #   - 3s flaked all-valid variant (`TooSlowError`)
+        #   - 8s flaked `invalid-return` variant
+        #     (`Cancelled` surfaced instead of `MsgTypeError`
+        #     because `fail_after` fired mid-error-path)
+        #   - 15s flaked under cross-stream contention
+        # 30s for fork-based gives plenty of headroom while
+        # still failing-loud on a genuine hang. Other
+        # backends keep the original 3s.
+        from tractor.spawn import _spawn as _spawn_mod
+        timeout = (
+            30
+            if _spawn_mod._spawn_method == 'main_thread_forkserver'
+            else 3
+        )
+
+    if tractor.debug_mode():
        timeout += 999

    with trio.fail_after(timeout):
@ -259,6 +284,11 @@ def test_basic_payload_spec(
    return_value: str|None,
    started_value: int|PldMsg,
    pld_check_started_value: bool,
+
+    set_fork_aware_capture,
+    # ^XXX TODO? for forking spawners, seems to prevent hangs when
+    # --capture=sys not set, but only for a while then the problem
+    # accumulates?
 ):
    '''
    Validate the most basic `PldRx` msg-type-spec semantics around
--- a/tests/msg/test_pretty_struct.py
+++ b/tests/msg/test_pretty_struct.py
@ -0,0 +1,240 @@
+'''
+Unit tests for `tractor.msg.pretty_struct`
+private-field filtering in `pformat()`.
+
+'''
+import pytest
+
+from tractor.msg.pretty_struct import (
+    Struct,
+    pformat,
+    iter_struct_ppfmt_lines,
+)
+from tractor.msg._codec import (
+    MsgDec,
+    mk_dec,
+)
+
+
+# ------ test struct definitions ------ #
+
+class PublicOnly(Struct):
+    '''
+    All-public fields for baseline testing.
+
+    '''
+    name: str = 'alice'
+    age: int = 30
+
+
+class PrivateOnly(Struct):
+    '''
+    Only underscore-prefixed (private) fields.
+
+    '''
+    _secret: str = 'hidden'
+    _internal: int = 99
+
+
+class MixedFields(Struct):
+    '''
+    Mix of public and private fields.
+
+    '''
+    name: str = 'bob'
+    _hidden: int = 42
+    value: float = 3.14
+    _meta: str = 'internal'
+
+
+class Inner(
+    Struct,
+    frozen=True,
+):
+    '''
+    Frozen inner struct with a private field,
+    for nesting tests.
+
+    '''
+    x: int = 1
+    _secret: str = 'nope'
+
+
+class Outer(Struct):
+    '''
+    Outer struct nesting an `Inner`.
+
+    '''
+    label: str = 'outer'
+    inner: Inner = Inner()
+
+
+class EmptyStruct(Struct):
+    '''
+    Struct with zero fields.
+
+    '''
+    pass
+
+
+# ------ tests ------ #
+
+@pytest.mark.parametrize(
+    'struct_and_expected',
+    [
+        (
+            PublicOnly(),
+            {
+                'shown': ['name', 'age'],
+                'hidden': [],
+            },
+        ),
+        (
+            MixedFields(),
+            {
+                'shown': ['name', 'value'],
+                'hidden': ['_hidden', '_meta'],
+            },
+        ),
+        (
+            PrivateOnly(),
+            {
+                'shown': [],
+                'hidden': ['_secret', '_internal'],
+            },
+        ),
+    ],
+    ids=[
+        'all-public',
+        'mixed-pub-priv',
+        'all-private',
+    ],
+)
+def test_field_visibility_in_pformat(
+    struct_and_expected: tuple[
+        Struct,
+        dict[str, list[str]],
+    ],
+):
+    '''
+    Verify `pformat()` shows public fields
+    and hides `_`-prefixed private fields.
+
+    '''
+    (
+        struct,
+        expected,
+    ) = struct_and_expected
+    output: str = pformat(struct)
+
+    for field_name in expected['shown']:
+        assert field_name in output, (
+            f'{field_name!r} should appear in:\n'
+            f'{output}'
+        )
+
+    for field_name in expected['hidden']:
+        assert field_name not in output, (
+            f'{field_name!r} should NOT appear in:\n'
+            f'{output}'
+        )
+
+
+def test_iter_ppfmt_lines_skips_private():
+    '''
+    Directly verify `iter_struct_ppfmt_lines()`
+    never yields tuples with `_`-prefixed field
+    names.
+
+    '''
+    struct = MixedFields()
+    lines: list[tuple[str, str]] = list(
+        iter_struct_ppfmt_lines(
+            struct,
+            field_indent=2,
+        )
+    )
+    # should have lines for public fields only
+    assert len(lines) == 2
+
+    for _prefix, line_content in lines:
+        field_name: str = (
+            line_content.split(':')[0].strip()
+        )
+        assert not field_name.startswith('_'), (
+            f'private field leaked: {field_name!r}'
+        )
+
+
+def test_nested_struct_filters_inner_private():
+    '''
+    Verify that nested struct's private fields
+    are also filtered out during recursion.
+
+    '''
+    outer = Outer()
+    output: str = pformat(outer)
+
+    # outer's public field
+    assert 'label' in output
+
+    # inner's public field (recursed into)
+    assert 'x' in output
+
+    # inner's private field must be hidden
+    assert '_secret' not in output
+
+
+def test_empty_struct_pformat():
+    '''
+    An empty struct should produce a valid
+    `pformat()` result with no field lines.
+
+    '''
+    output: str = pformat(EmptyStruct())
+    assert 'EmptyStruct(' in output
+    assert output.rstrip().endswith(')')
+
+    # no field lines => only struct header+footer
+    lines: list[tuple[str, str]] = list(
+        iter_struct_ppfmt_lines(
+            EmptyStruct(),
+            field_indent=2,
+        )
+    )
+    assert lines == []
+
+
+def test_real_msgdec_pformat_hides_private():
+    '''
+    Verify `pformat()` on a real `MsgDec`
+    hides the `_dec` internal field.
+
+    NOTE: `MsgDec.__repr__` is custom and does
+    NOT call `pformat()`, so we call it directly.
+
+    '''
+    dec: MsgDec = mk_dec(spec=int)
+    output: str = pformat(dec)
+
+    # the private `_dec` field should be filtered
+    assert '_dec' not in output
+
+    # but the struct type name should be present
+    assert 'MsgDec(' in output
+
+
+def test_pformat_repr_integration():
+    '''
+    Verify that `Struct.__repr__()` (which calls
+    `pformat()`) also hides private fields for
+    custom structs that do NOT override `__repr__`.
+
+    '''
+    mixed = MixedFields()
+    output: str = repr(mixed)
+
+    assert 'name' in output
+    assert 'value' in output
+    assert '_hidden' not in output
+    assert '_meta' not in output
--- a/tests/test_2way.py
+++ b/tests/test_2way.py
@ -1,7 +1,12 @@
-"""
-Bidirectional streaming.
+'''
+Audit the simplest inter-actor bidirectional (streaming)
+msg patterns.

-"""
+'''
+from __future__ import annotations
+from typing import (
+    Callable,
+)
 import pytest
 import trio
 import tractor
@ -9,10 +14,8 @@ import tractor

@tractor.context
 async def simple_rpc(
-
    ctx: tractor.Context,
    data: int,
-
 ) -> None:
    '''
    Test a small ping-pong server.
@ -39,15 +42,13 @@ async def simple_rpc(

@tractor.context
 async def simple_rpc_with_forloop(
-
    ctx: tractor.Context,
    data: int,
-
 ) -> None:
-    """Same as previous test but using ``async for`` syntax/api.
-
-    """
+    '''
+    Same as previous test but using `async for` syntax/api.

+    '''
    # signal to parent that we're up
    await ctx.started(data + 1)

@ -68,62 +69,78 @@ async def simple_rpc_with_forloop(

@pytest.mark.parametrize(
    'use_async_for',
-    [True, False],
+    [
+        True,
+        False,
+    ],
+    ids='use_async_for={}'.format,
 )
@pytest.mark.parametrize(
    'server_func',
-    [simple_rpc, simple_rpc_with_forloop],
+    [
+        simple_rpc,
+        simple_rpc_with_forloop,
+    ],
+    ids='server_func={}'.format,
 )
-def test_simple_rpc(server_func, use_async_for):
+def test_simple_rpc(
+    server_func: Callable,
+    use_async_for: bool,
+    loglevel: str,
+    debug_mode: bool,
+):
    '''
    The simplest request response pattern.

    '''
    async def main():
-        async with tractor.open_nursery() as n:
+        with trio.fail_after(6):
+            async with tractor.open_nursery(
+                loglevel=loglevel,
+                debug_mode=debug_mode,
+            ) as an:
+                portal: tractor.Portal = await an.start_actor(
+                    'rpc_server',
+                    enable_modules=[__name__],
+                )

-            portal = await n.start_actor(
-                'rpc_server',
-                enable_modules=[__name__],
-            )
+                async with portal.open_context(
+                    server_func,  # taken from pytest parameterization
+                    data=10,
+                ) as (ctx, sent):

-            async with portal.open_context(
-                server_func,  # taken from pytest parameterization
-                data=10,
-            ) as (ctx, sent):
+                    assert sent == 11

-                assert sent == 11
+                    async with ctx.open_stream() as stream:

-                async with ctx.open_stream() as stream:
+                        if use_async_for:

-                    if use_async_for:
-
-                        count = 0
-                        # receive msgs using async for style
-                        print('ping')
-                        await stream.send('ping')
-
-                        async for msg in stream:
-                            assert msg == 'pong'
+                            count = 0
+                            # receive msgs using async for style
                            print('ping')
                            await stream.send('ping')
-                            count += 1

-                            if count >= 9:
-                                break
+                            async for msg in stream:
+                                assert msg == 'pong'
+                                print('ping')
+                                await stream.send('ping')
+                                count += 1

-                    else:
-                        # classic send/receive style
-                        for _ in range(10):
+                                if count >= 9:
+                                    break

-                            print('ping')
-                            await stream.send('ping')
-                            assert await stream.receive() == 'pong'
+                        else:
+                            # classic send/receive style
+                            for _ in range(10):

-                # stream should terminate here
+                                print('ping')
+                                await stream.send('ping')
+                                assert await stream.receive() == 'pong'

-            # final context result(s) should be consumed here in __aexit__()
+                    # stream should terminate here

-            await portal.cancel_actor()
+                # final context result(s) should be consumed here in __aexit__()
+
+                await portal.cancel_actor()

    trio.run(main)
--- a/tests/test_advanced_faults.py
+++ b/tests/test_advanced_faults.py
@ -98,7 +98,8 @@ def test_ipc_channel_break_during_stream(
        expect_final_exc = TransportClosed

    mod: ModuleType = import_path(
-        examples_dir() / 'advanced_faults'
+        examples_dir()
+        / 'advanced_faults'
        / 'ipc_failure_during_stream.py',
        root=examples_dir(),
        consider_namespace_packages=False,
@ -113,8 +114,9 @@ def test_ipc_channel_break_during_stream(
    if (
        # only expect EoC if trans is broken on the child side,
        ipc_break['break_child_ipc_after'] is not False
+        and
        # AND we tell the child to call `MsgStream.aclose()`.
-        and pre_aclose_msgstream
+        pre_aclose_msgstream
    ):
        # expect_final_exc = trio.EndOfChannel
        # ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this
@ -144,9 +146,6 @@ def test_ipc_channel_break_during_stream(
        # a user sending ctl-c by raising a KBI.
        if pre_aclose_msgstream:
            expect_final_exc = KeyboardInterrupt
-            if tpt_proto == 'uds':
-                expect_final_exc = TransportClosed
-                expect_final_cause = trio.BrokenResourceError

            # XXX OLD XXX
            # if child calls `MsgStream.aclose()` then expect EoC.
@ -160,16 +159,13 @@ def test_ipc_channel_break_during_stream(
        ipc_break['break_child_ipc_after'] is not False
        and (
            ipc_break['break_parent_ipc_after']
-            > ipc_break['break_child_ipc_after']
+            >
+            ipc_break['break_child_ipc_after']
        )
    ):
        if pre_aclose_msgstream:
            expect_final_exc = KeyboardInterrupt

-            if tpt_proto == 'uds':
-                expect_final_exc = TransportClosed
-                expect_final_cause = trio.BrokenResourceError
-
    # NOTE when the parent IPC side dies (even if the child does as well
    # but the child fails BEFORE the parent) we always expect the
    # IPC layer to raise a closed-resource, NEVER do we expect
@ -248,8 +244,15 @@ def test_ipc_channel_break_during_stream(
    # get raw instance from pytest wrapper
    value = excinfo.value
    if isinstance(value, ExceptionGroup):
-        excs = value.exceptions
-        assert len(excs) == 1
+        excs: tuple[Exception] = value.exceptions
+        assert (
+            len(excs) <= 2
+            and
+            all(
+                isinstance(exc, TransportClosed)
+                for exc in excs
+            )
+        )
        final_exc = excs[0]
        assert isinstance(final_exc, expect_final_exc)

--- a/tests/test_advanced_streaming.py
+++ b/tests/test_advanced_streaming.py
@ -5,10 +5,15 @@ Advanced streaming patterns using bidirectional streams and contexts.
 from collections import Counter
 import itertools
 import platform
+from typing import Type

 import pytest
 import trio
 import tractor
+from tractor._testing.trace import (
+    AfkAlarmWTraceFactory,
+    FailAfterWTraceFactory,
+)


 def is_win():
@ -76,9 +81,7 @@ async def subscribe(


 async def consumer(
-
    subs: list[str],
-
 ) -> None:

    uid = tractor.current_actor().uid
@ -108,59 +111,193 @@ async def consumer(
                        print(f'{uid} got: {value}')


-def test_dynamic_pub_sub():
+# NOTE: deliberately NOT using `@pytest.mark.timeout(...)` —
+# both pytest-timeout enforcement modes break trio under
+# fork-based backends:
+#
+# - `method='signal'` (SIGALRM): the handler synchronously
+#   raises `Failed` in trio's main thread mid-`epoll.poll()`,
+#   leaves `GLOBAL_RUN_CONTEXT` half-installed ("Trio guest
+#   run got abandoned"), and EVERY subsequent `trio.run()`
+#   in the same pytest process bails with
+#   `RuntimeError: Attempted to call run() from inside a
+#   run()` — session-wide poison.
+#
+# - `method='thread'`: calls `_thread.interrupt_main()`
+#   raising `KeyboardInterrupt` into the main thread. Under
+#   fork-based backends with mid-cascade fd-juggling the KBI
+#   can escape trio's `KIManager` and bubble out of pytest
+#   itself — kills the WHOLE session.
+#
+# Instead we use `trio.fail_after()` INSIDE `main()` below:
+# trio's own `Cancelled`/`TooSlowError` machinery handles the
+# timeout, cleanly unwinds the actor nursery's cancel
+# cascade, and only fails the single test (no cross-test
+# state corruption either way).
+#
+# `pyproject.toml`'s default `timeout = 200` is still a
+# last-resort safety net.
+@pytest.mark.parametrize(
+    'expect_cancel_exc', [
+        KeyboardInterrupt,
+        trio.TooSlowError,
+    ],
+    ids=lambda item:
+        f'expect_user_exc_raised={item.__name__}'
+)
+def test_dynamic_pub_sub(
+    reg_addr: tuple,
+    debug_mode: bool,
+    test_log: tractor.log.StackLevelAdapter,
+    reap_subactors_per_test: int,
+    expect_cancel_exc: Type[BaseException],
+
+    is_forking_spawner: bool,
+    set_fork_aware_capture,
+
+    fail_after_w_trace: FailAfterWTraceFactory,
+    afk_alarm_w_trace: AfkAlarmWTraceFactory,
+):
+    failed_to_raise_report: str = (
+        f'Never got a {expect_cancel_exc!r} ??'
+    )

    global _registry

    from multiprocessing import cpu_count
    cpus = cpu_count()

+    # Hard safety cap via trio's own cancellation. NOTE see the
+    # module-level note on why we avoid `pytest-timeout` for this
+    # test. Picked backend-aware: under `trio` backend spawn is
+    # cheap (~1s for `cpus` actors) but fork-based backends pay
+    # a per-spawn cost (forkserver round-trip + IPC peer-handshake)
+    # that can stack up over `cpus - 1` sequential `n.run_in_actor()`
+    # calls — especially on UDS under cross-pytest contention
+    # (#451 / #452). 4s was flaking right at the edge under fork
+    # backends — bumped to 8s with diag-snapshot-on-timeout via
+    # `fail_after_w_trace` so a borderline run still fails loud
+    # but lands a ptree/wchan/py-spy dump in
+    # `$XDG_CACHE_HOME/tractor/hung-dumps/` for inspection.
+    #
+    # XXX caveat: this is an *inner* trio cancel — its `Cancelled`
+    # cannot reach a task parked in a shielded `await` (e.g. inside
+    # actor-nursery teardown). When the in-band cancel path is
+    # itself buggy (the bug-class-3 `raise KBI` swallow we're
+    # currently chasing) this guard does NOT fire and the test
+    # sits forever until external SIGINT. The `afk_alarm_w_trace`
+    # outer guard below is the AFK-safety counterpart (SIGALRM
+    # raises in the main thread regardless of trio scope state).
+    fail_after_s: int = (
+        8
+        if is_forking_spawner
+        else 20
+    )
+
    async def main():
-        async with tractor.open_nursery() as n:
-
-            # name of this actor will be same as target func
-            await n.run_in_actor(publisher)
-
-            for i, sub in zip(
-                range(cpus - 2),
-                itertools.cycle(_registry.keys())
-            ):
-                await n.run_in_actor(
-                    consumer,
-                    name=f'consumer_{sub}',
-                    subs=[sub],
+        # bug-class-3 breadcrumb: tag each level of the cancel path
+        # so when the run hangs and we capture cancel-level logs, the
+        # *last* breadcrumb that fired names the swallow point.
+        test_log.cancel('test_dynamic_pub_sub: enter main()')
+        try:
+            async with fail_after_w_trace(fail_after_s):
+                test_log.cancel(
+                    f'test_dynamic_pub_sub: '
+                    f'enter `fail_after_w_trace({fail_after_s})` scope'
                )
+                try:
+                    async with tractor.open_nursery(
+                        registry_addrs=[reg_addr],
+                        debug_mode=debug_mode,
+                    ) as n:
+                        test_log.cancel(
+                            'test_dynamic_pub_sub: '
+                            'actor nursery opened'
+                        )

-            # make one dynamic subscriber
-            await n.run_in_actor(
-                consumer,
-                name='consumer_dynamic',
-                subs=list(_registry.keys()),
+                        # name of this actor will be same as target func
+                        await n.run_in_actor(publisher)
+
+                        for i, sub in zip(
+                            range(cpus - 2),
+                            itertools.cycle(_registry.keys())
+                        ):
+                            await n.run_in_actor(
+                                consumer,
+                                name=f'consumer_{sub}',
+                                subs=[sub],
+                            )
+
+                        # make one dynamic subscriber
+                        await n.run_in_actor(
+                            consumer,
+                            name='consumer_dynamic',
+                            subs=list(_registry.keys()),
+                        )
+
+                        # block until "cancelled by user"
+                        await trio.sleep(3)
+                        test_log.warning(
+                            f'Raising user cancel exc: '
+                            f'{expect_cancel_exc!r}'
+                        )
+                        test_log.cancel(
+                            f'test_dynamic_pub_sub: '
+                            f'ABOUT TO RAISE {expect_cancel_exc!r}'
+                        )
+                        raise expect_cancel_exc('simulate user cancel!')
+                finally:
+                    test_log.cancel(
+                        'test_dynamic_pub_sub: '
+                        'actor nursery `__aexit__` returned'
+                    )
+            test_log.cancel(
+                'test_dynamic_pub_sub: `fail_after` scope exited'
+            )
+        finally:
+            test_log.cancel(
+                'test_dynamic_pub_sub: leaving `main()`'
            )

-            # block until cancelled by user
-            with trio.fail_after(3):
-                await trio.sleep_forever()
+    def _run_and_match():
+        try:
+            trio.run(main)
+            pytest.fail(failed_to_raise_report)
+        except expect_cancel_exc:
+            # parent-side raised the user-cancel exc directly and
+            # it propagated unwrapped; clean path.
+            test_log.exception('Got user-cancel exc AS EXPECTED')
+        except BaseExceptionGroup as err:
+            # under fork-based backends the user-raised cancel
+            # can race with subactor-side stream teardown
+            # (`trio.EndOfChannel` from a publisher's `send()`
+            # whose remote half got cut). The expected exc may
+            # then be nested deeper in the group rather than at
+            # the top level. `BaseExceptionGroup.split()` walks
+            # the exc tree recursively (Python 3.11+).
+            matched, _ = err.split(expect_cancel_exc)
+            if matched is None:
+                pytest.fail(failed_to_raise_report)

-    try:
-        trio.run(main)
-    except (
-        trio.TooSlowError,
-        ExceptionGroup,
-    ) as err:
-        if isinstance(err, ExceptionGroup):
-            for suberr in err.exceptions:
-                if isinstance(suberr, trio.TooSlowError):
-                    break
-            else:
-                pytest.fail('Never got a `TooSlowError` ?')
+            test_log.exception('Got user-cancel exc AS EXPECTED')
+
+    # outer SIGALRM-based guard — survives a shielded-await
+    # deadlock since `signal.alarm` raises in the main thread
+    # regardless of trio's scope state, AND captures a full diag
+    # snapshot to `$XDG_CACHE_HOME/tractor/hung-dumps/` before
+    # re-raising. ONLY armed under fork-based backends since the
+    # bug we're chasing is MTF-specific. Cap = `fail_after_s + 5`
+    # so the trio-native path always wins when it works.
+    if is_forking_spawner:
+        with afk_alarm_w_trace(fail_after_s + 5):
+            _run_and_match()
+    else:
+        _run_and_match()


@tractor.context
 async def one_task_streams_and_one_handles_reqresp(
-
    ctx: tractor.Context,
-
 ) -> None:

    await ctx.started()
@ -257,7 +394,8 @@ async def echo_ctx_stream(


 def test_sigint_both_stream_types():
-    '''Verify that running a bi-directional and recv only stream
+    '''
+    Verify that running a bi-directional and recv only stream
    side-by-side will cancel correctly from SIGINT.

    '''
@ -287,9 +425,11 @@ def test_sigint_both_stream_types():
                            assert resp == msg
                            raise KeyboardInterrupt

+    # TODO, use pytest.raises() here instead?
+    # (why weren't we originally?)
    try:
        trio.run(main)
-        assert 0, "Didn't receive KBI!?"
+        pytest.fail("Didn't receive KBI!?")
    except KeyboardInterrupt:
        pass

@ -356,7 +496,12 @@ async def inf_streamer(
    print('streamer exited .open_streamer() block')


+# @pytest.mark.timeout(
+#     6,
+#     method='signal',
+# )
 def test_local_task_fanout_from_stream(
+    reg_addr: tuple,
    debug_mode: bool,
 ):
    '''
@ -421,4 +566,9 @@ def test_local_task_fanout_from_stream(

            await p.cancel_actor()

-    trio.run(main)
+    async def w_timeout():
+        with trio.fail_after(6):
+            await main()
+
+    # trio.run(main)
+    trio.run(w_timeout)
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@ -7,6 +7,7 @@ import signal
 import platform
 import time
 from itertools import repeat
+from typing  import Type

 import pytest
 import trio
@ -14,11 +15,52 @@ import tractor
 from tractor._testing import (
    tractor_test,
 )
+from tractor._testing.trace import FailAfterWTraceFactory
 from .conftest import no_windows


-def is_win():
-    return platform.system() == 'Windows'
+_non_linux: bool = platform.system() != 'Linux'
+_friggin_windows: bool = platform.system() == 'Windows'
+
+
+pytestmark = [
+    # Multi-actor cancel cascades under
+    # `--spawn-backend=subint` trip the abandoned-subint
+    # GIL-hostage class — a stuck subint can starve the
+    # parent's trio loop and block cancel-delivery.
+    # Apply the skip module-wide rather than per-test
+    # since every test here exercises the same cascade.
+    pytest.mark.skipon_spawn_backend(
+        'subint',
+        reason=(
+            'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
+            'Cancel cascades under '
+            '`--spawn-backend=subint` trip the abandoned-subint '
+            'GIL-hostage class — see\n'
+            '  - `ai/conc-anal/subint_sigint_starvation_issue.md` '
+            '(GIL-hostage, SIGINT-unresponsive)\n'
+            '  - `ai/conc-anal/subint_cancel_delivery_hang_issue.md` '
+            '(sibling: parent parks on dead chan)\n'
+            '  - https://github.com/goodboy/tractor/issues/379 '
+            '(subint umbrella)\n'
+        )
+    ),
+    pytest.mark.usefixtures(
+        'reap_subactors_per_test',
+        # NOTE, cancellation tests stress the SIGKILL
+        # `hard_kill` path which leaks UDS sock-files when
+        # the subactor's IPC server `finally:` cleanup
+        # doesn't run. Track per-test for blame attribution.
+        'track_orphaned_uds_per_test',
+        # NOTE, cancel-cascade timing races (see
+        # `test_nested_multierrors`) can also leave a
+        # subactor spinning at 100% CPU when its cancel
+        # signal got swallowed mid-handshake. Catches the
+        # runaway-loop class that doesn't leak UDS socks
+        # but burns the box.
+        'detect_runaway_subactors_per_test',
+    ),
+]


 async def assert_err(delay=0):
@ -45,7 +87,11 @@ async def do_nuthin():
    ],
    ids=['no_args', 'unexpected_args'],
 )
-def test_remote_error(reg_addr, args_err):
+def test_remote_error(
+    reg_addr: tuple,
+    args_err: tuple[dict, Type[Exception]],
+    set_fork_aware_capture,
+):
    '''
    Verify an error raised in a subactor that is propagated
    to the parent nursery, contains the underlying boxed builtin
@ -112,6 +158,8 @@ def test_remote_error(reg_addr, args_err):

 def test_multierror(
    reg_addr: tuple[str, int],
+    start_method: str,  # parametrized
+    set_fork_aware_capture, #: Callable,
 ):
    '''
    Verify we raise a ``BaseExceptionGroup`` out of a nursery where
@ -141,31 +189,68 @@ def test_multierror(
        trio.run(main)


-@pytest.mark.parametrize('delay', (0, 0.5))
@pytest.mark.parametrize(
-    'num_subactors', range(25, 26),
+    'delay',
+    (0, 0.5),
+    ids='delays={}'.format,
 )
-def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay):
-    """Verify we raise a ``BaseExceptionGroup`` out of a nursery where
+@pytest.mark.parametrize(
+    'num_subactors',
+    range(25, 26),
+    ids= 'num_subs={}'.format,
+)
+def test_multierror_fast_nursery(
+    reg_addr: tuple,
+    start_method: str,
+    num_subactors: int,
+    delay: float,
+    set_fork_aware_capture,
+    fail_after_w_trace: FailAfterWTraceFactory,
+):
+    '''
+    Verify we raise a ``BaseExceptionGroup`` out of a nursery where
    more then one actor errors and also with a delay before failure
    to test failure during an ongoing spawning.
-    """
-    async def main():
-        async with tractor.open_nursery(
-            registry_addrs=[reg_addr],
-        ) as nursery:

-            for i in range(num_subactors):
-                await nursery.run_in_actor(
-                    assert_err,
-                    name=f'errorer{i}',
-                    delay=delay
-                )
+    '''
+    async def main():
+        # budget = 2× natural trio-backend cascade time for
+        # 25 errorer subactors (~14s observed). on-timeout
+        # diag snapshot → if the cancel cascade hangs
+        # (observed under MTF backend with N>=14 errorer
+        # subactors) we get a fresh ptree/wchan/py-spy dump
+        # on disk INSTEAD of an opaque pytest timeout-kill.
+        # See `tractor/_testing/trace.py` for the helper.
+        async with fail_after_w_trace(30.0):
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as nursery:
+
+                for i in range(num_subactors):
+                    await nursery.run_in_actor(
+                        assert_err,
+                        name=f'errorer{i}',
+                        delay=delay
+                    )

    # with pytest.raises(trio.MultiError) as exc_info:
-    with pytest.raises(BaseExceptionGroup) as exc_info:
+    # NOTE, `trio.TooSlowError` from `fail_after_w_trace`
+    # bubbles UN-wrapped if `open_nursery.__aexit__` never
+    # gets re-entered; wrapped inside a `BaseExceptionGroup`
+    # if it did. Accept both shapes so the matcher itself
+    # doesn't lie about *what* failed.
+    with pytest.raises(
+        (BaseExceptionGroup, trio.TooSlowError),
+    ) as exc_info:
        trio.run(main)

+    if isinstance(exc_info.value, trio.TooSlowError):
+        pytest.fail(
+            f'cancel cascade hung past 12s '
+            f'(num_subactors={num_subactors}, delay={delay}); '
+            f'see stderr for `fail_after_w_trace` snapshot path'
+        )
+
    assert exc_info.type == ExceptionGroup
    err = exc_info.value
    exceptions = err.exceptions
@ -189,8 +274,15 @@ async def do_nothing():
    pass


-@pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt])
-def test_cancel_single_subactor(reg_addr, mechanism):
+@pytest.mark.parametrize(
+    'mechanism', [
+    'nursery_cancel',
+    KeyboardInterrupt,
+])
+def test_cancel_single_subactor(
+    reg_addr: tuple,
+    mechanism: str|KeyboardInterrupt,
+):
    '''
    Ensure a ``ActorNursery.start_actor()`` spawned subactor
    cancels when the nursery is cancelled.
@ -232,9 +324,14 @@ async def stream_forever():
        await trio.sleep(0.01)


-@tractor_test
-async def test_cancel_infinite_streamer(start_method):
-
+@tractor_test(
+    timeout=6,
+)
+async def test_cancel_infinite_streamer(
+    reg_addr: tuple,
+    start_method: str,
+    set_fork_aware_capture,
+):
    # stream for at most 1 seconds
    with (
        trio.fail_after(4),
@ -286,11 +383,15 @@ async def test_cancel_infinite_streamer(start_method):
        'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail',
    ],
 )
-@tractor_test
+@tractor_test(
+    timeout=10,
+)
 async def test_some_cancels_all(
    num_actors_and_errs: tuple,
+    reg_addr: tuple,
    start_method: str,
    loglevel: str,
+    set_fork_aware_capture, #: Callable,
 ):
    '''
    Verify a subset of failed subactors causes all others in
@ -370,7 +471,10 @@ async def test_some_cancels_all(
        pytest.fail("Should have gotten a remote assertion error?")


-async def spawn_and_error(breadth, depth) -> None:
+async def spawn_and_error(
+    breadth: int,
+    depth: int,
+) -> None:
    name = tractor.current_actor().name
    async with tractor.open_nursery() as nursery:
        for i in range(breadth):
@ -395,28 +499,140 @@ async def spawn_and_error(breadth, depth) -> None:
            await nursery.run_in_actor(*args, **kwargs)


-@tractor_test
-async def test_nested_multierrors(loglevel, start_method):
+# NOTE: `main_thread_forkserver` capture-fd hang class is no
+# longer skipped here — `--capture=sys` (the new `pyproject.toml`
+# default) sidesteps the pipe-buffer-fill deadlock for
+# `test_nested_multierrors`. See
+# `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
+# / #449 for the post-mortem.
+# @pytest.mark.timeout(
+#     10,
+#     method='thread',
+# )
+@pytest.mark.parametrize(
+    'depth',
+    [1, 3],
+    ids='depth={}'.format,
+)
+@tractor_test(
+    # bumped from the 30s default to cover fork-based
+    # cancel-cascade flakes; 2 spawners × 2 errorers × depth 1+
+    # cascade through 6 portal-wait_for_result paths each
+    # paying `terminate_after=1.6s` + UDS sock-unlink under
+    # MTF/UDS contention can easily blow past 30s.
+    # Trio backend is fast and won't notice the extra budget.
+    # See `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
+    timeout=10,
+)
+async def test_nested_multierrors(
+    reg_addr: tuple,
+    loglevel: str,
+    start_method: str,
+    set_fork_aware_capture,
+    fail_after_w_trace: FailAfterWTraceFactory,
+    request: pytest.FixtureRequest,
+    depth: int,
+):
    '''
-    Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
-    test goes only 2 nurseries deep but we should eventually have tests
-    for arbitrary n-depth actor trees.
+    Test that failed actor sets are wrapped in `BaseExceptionGroup`s.
+
+    Parametrized over recursion `depth ∈ {1, 3}`:
+
+      - `depth=1`: shallow tree (2 spawners × 2 errorers, 2
+        levels). Cascade completes well within budget on ALL
+        backends including MTF — regression-safety green case.
+
+      - `depth=3`: deep tree (2 spawners × recursive depth-3
+        spawn-and-error). On `main_thread_forkserver` this
+        trips the cancel-cascade shape-mismatch bug class
+        (see `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`)
+        — xfailed below.

    '''
-    if start_method == 'trio':
-        depth = 3
-        subactor_breadth = 2
-    else:
-        # XXX: multiprocessing can't seem to handle any more then 2 depth
-        # process trees for whatever reason.
-        # Any more process levels then this and we see bugs that cause
-        # hangs and broken pipes all over the place...
-        if start_method == 'forkserver':
-            pytest.skip("Forksever sux hard at nested spawning...")
-        depth = 1  # means an additional actor tree of spawning (2 levels deep)
-        subactor_breadth = 2
+    # XXX: `multiprocessing.forkserver` can't handle nested
+    # spawning at any depth — hangs / broken-pipes. Pre-existing
+    # backend limitation, NOT depth-specific.
+    if start_method == 'forkserver':
+        pytest.skip("Forksever sux hard at nested spawning...")

-    with trio.fail_after(120):
+    subactor_breadth = 2
+
+    # MTF backend trips a probabilistic timing race in the
+    # cancel-cascade — NOT depth-gated; depth amplifies the
+    # variance so depth=3 misses nearly every run while
+    # depth=1 misses occasionally. Both get the xfail mark
+    # (with `strict=False`) since the bug class can fire at
+    # either depth.
+    #
+    # The scenario in detail:
+    #
+    #     T=0      spawn spawner_0 + spawner_1 in parallel
+    #     T=t1     spawner_0's child errors →
+    #              RemoteActorError reaches root nursery
+    #     T=t1+ε   root nursery starts cancelling
+    #              spawner_1's portal-wait
+    #     T=t2     spawner_1's child errors → tries to send
+    #              RemoteActorError back
+    #
+    #     if t2 < t1+ε:  BEG = [RAE, RAE]        ← clean (xpass)
+    #     if t2 > t1+ε:  BEG = [RAE, Cancelled]  ← race tripped (xfail)
+    #
+    # i.e. the assertion below (`isinstance(_, RemoteActorError)`)
+    # fails iff cancel-delivery beats the other tree's natural
+    # error-propagation. Depth amplifies `t2-t1` variance
+    # (longer per-tree paths = more skew); under MTF the
+    # fork-spawn jitter + UDS-contention widens both `t1` and
+    # `t2` further.
+    #
+    # With `strict=False` the clean-cascade cases (most
+    # depth=1 runs, rare depth=3 runs) report as `xpassed`
+    # while the race-tripped cases report as `xfailed` —
+    # neither flakes `--lf`. When MTF cancel-cascade
+    # eventually speeds up enough to close the race even at
+    # depth=3, BOTH variants will reliably `xpass` and
+    # pytest will yell — our signal to drop the marker. See
+    # `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
+    if start_method == 'main_thread_forkserver':
+        request.node.add_marker(
+            pytest.mark.xfail(
+                strict=False,
+                reason=(
+                    f'MTF cancel-cascade shape-mismatch at '
+                    f'depth={depth} (Cancelled races '
+                    f'RemoteActorError in BEG); see conc-anal/'
+                    'cancel_cascade_too_slow_under_main_thread_forkserver_issue.md'
+                ),
+            )
+        )
+
+    # Per-backend/-depth budgets: in the non-hang case the
+    # whole spawn + cancel-cascade should complete in well
+    # under these. On the borderline hang case the
+    # `fail_after_w_trace` fires `TooSlowError` AND captures a
+    # ptree/wchan/py-spy snapshot to
+    # `$XDG_CACHE_HOME/tractor/hung-dumps/` for offline
+    # inspection. See
+    # `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
+    #
+    # NOTE: the `trio` depth=3 budget was bumped 6 -> 12s after
+    # the `trio` 0.29 -> 0.33 lock bump (commit c7741bba) slowed
+    # the depth-3 cancel-cascade from <6s to ~7-8s; the 6s
+    # deadline was firing and its `Cancelled(source='deadline')`
+    # (trio 0.33 cancel-reason metadata) collapsed a BEG branch,
+    # breaking the `RemoteActorError` assertion below. depth=1
+    # still finishes in ~3s so keeps the 6s budget. See
+    # `ai/conc-anal/trio_033_cancel_cascade_slowdown_depth3_issue.md`.
+    match (start_method, depth):
+        case ('trio', 1):
+            timeout = 6
+        case ('trio', 3):
+            timeout = 12
+        case ('main_thread_forkserver', 1):
+            timeout = 16
+        case ('main_thread_forkserver', 3):
+            timeout = 30
+
+    async with fail_after_w_trace(timeout):
        try:
            async with tractor.open_nursery() as nursery:
                for i in range(subactor_breadth):
@ -431,7 +647,7 @@ async def test_nested_multierrors(loglevel, start_method):
            for subexc in err.exceptions:

                # verify first level actor errors are wrapped as remote
-                if is_win():
+                if _friggin_windows:

                    # windows is often too slow and cancellation seems
                    # to happen before an actor is spawned
@ -464,7 +680,7 @@ async def test_nested_multierrors(loglevel, start_method):
                    # XXX not sure what's up with this..
                    # on windows sometimes spawning is just too slow and
                    # we get back the (sent) cancel signal instead
-                    if is_win():
+                    if _friggin_windows:
                        if isinstance(subexc, tractor.RemoteActorError):
                            assert subexc.boxed_type in (
                                BaseExceptionGroup,
@ -483,20 +699,24 @@ async def test_nested_multierrors(loglevel, start_method):

@no_windows
 def test_cancel_via_SIGINT(
-    loglevel,
-    start_method,
-    spawn_backend,
+    reg_addr: tuple,
+    loglevel: str,
+    start_method: str,
 ):
-    """Ensure that a control-C (SIGINT) signal cancels both the parent and
+    '''
+    Ensure that a control-C (SIGINT) signal cancels both the parent and
    child processes in trionic fashion
-    """
-    pid = os.getpid()
+
+    '''
+    pid: int = os.getpid()

    async def main():
        with trio.fail_after(2):
-            async with tractor.open_nursery() as tn:
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as tn:
                await tn.start_actor('sucka')
-                if 'mp' in spawn_backend:
+                if 'mp' in start_method:
                    time.sleep(0.1)
                os.kill(pid, signal.SIGINT)
                await trio.sleep_forever()
@ -507,23 +727,38 @@ def test_cancel_via_SIGINT(

@no_windows
 def test_cancel_via_SIGINT_other_task(
-    loglevel,
-    start_method,
-    spawn_backend,
+    reg_addr: tuple,
+    loglevel: str,
+    start_method: str,
+    spawn_backend: str,
 ):
-    """Ensure that a control-C (SIGINT) signal cancels both the parent
-    and child processes in trionic fashion even a subprocess is started
-    from a seperate ``trio`` child  task.
-    """
-    pid = os.getpid()
-    timeout: float = 2
-    if is_win():  # smh
+    '''
+    Ensure that a control-C (SIGINT) signal cancels both the parent
+    and child processes in trionic fashion even a subprocess is
+    started from a seperate ``trio`` child  task.
+
+    '''
+    from .conftest import cpu_scaling_factor
+
+    pid: int = os.getpid()
+    timeout: float = (
+        4 if _non_linux
+        else 2
+    )
+    if _friggin_windows:  # smh
        timeout += 1

+    # add latency headroom for CPU freq scaling (auto-cpufreq et al.)
+    headroom: float = cpu_scaling_factor()
+    if headroom != 1.:
+        timeout *= headroom
+
    async def spawn_and_sleep_forever(
        task_status=trio.TASK_STATUS_IGNORED
    ):
-        async with tractor.open_nursery() as tn:
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+        ) as tn:
            for i in range(3):
                await tn.run_in_actor(
                    sleep_forever,
@ -587,7 +822,7 @@ async def spawn_sub_with_sync_blocking_task():
 def test_cancel_while_childs_child_in_sync_sleep(
    loglevel: str,
    start_method: str,
-    spawn_backend: str,
+    is_forking_spawner: bool,
    debug_mode: bool,
    reg_addr: tuple,
    man_cancel_outer: bool,
@ -603,7 +838,10 @@ def test_cancel_while_childs_child_in_sync_sleep(

    '''
    if start_method == 'forkserver':
-        pytest.skip("Forksever sux hard at resuming from sync sleep...")
+        pytest.skip(
+            "`multiprocessing`'s forkserver sux hard at "
+            "resuming from sync sleep..."
+        )

    async def main():
        #
@ -644,7 +882,15 @@ def test_cancel_while_childs_child_in_sync_sleep(
        #
        # delay = 1  # no AssertionError in eg, TooSlowError raised.
        # delay = 2  # is AssertionError in eg AND no TooSlowError !?
-        delay = 4  # is AssertionError in eg AND no _cs cancellation.
+        # is AssertionError in eg AND no _cs cancellation.
+        delay = (
+            6 if (
+                _non_linux
+                or
+                is_forking_spawner
+            )
+            else 4 
+        )

        with trio.fail_after(delay) as _cs:
        # with trio.CancelScope() as cs:
@ -678,7 +924,7 @@ def test_cancel_while_childs_child_in_sync_sleep(


 def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
-    start_method,
+    start_method: str,
 ):
    '''
    This is a very subtle test which demonstrates how cancellation
@ -696,7 +942,7 @@ def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
    kbi_delay = 0.5
    timeout: float = 2.9

-    if is_win():  # smh
+    if _friggin_windows:  # smh
        timeout += 1

    async def main():
--- a/tests/test_child_manages_service_nursery.py
+++ b/tests/test_child_manages_service_nursery.py
@ -18,16 +18,15 @@ from tractor import RemoteActorError


 async def aio_streamer(
-    from_trio: asyncio.Queue,
-    to_trio: trio.abc.SendChannel,
+    chan: tractor.to_asyncio.LinkedTaskChannel,
 ) -> trio.abc.ReceiveChannel:

    # required first msg to sync caller
-    to_trio.send_nowait(None)
+    chan.started_nowait(None)

    from itertools import cycle
    for i in cycle(range(10)):
-        to_trio.send_nowait(i)
+        chan.send_nowait(i)
        await asyncio.sleep(0.01)


@ -69,7 +68,7 @@ async def wrapper_mngr(
        else:
            async with tractor.to_asyncio.open_channel_from(
                aio_streamer,
-            ) as (first, from_aio):
+            ) as (from_aio, first):
                assert not first

                # cache it so next task uses broadcast receiver
--- a/tests/test_clustering.py
+++ b/tests/test_clustering.py
@ -10,7 +10,19 @@ from tractor._testing import tractor_test
 MESSAGE = 'tractoring at full speed'


-def test_empty_mngrs_input_raises() -> None:
+def test_empty_mngrs_input_raises(
+    tpt_proto: str,
+) -> None:
+    # TODO, the `open_actor_cluster()` teardown hangs
+    # intermittently on UDS when `gather_contexts(mngrs=())`
+    # raises `ValueError` mid-setup; likely a race in the
+    # actor-nursery cleanup vs UDS socket shutdown. Needs
+    # a deeper look at `._clustering`/`._supervise` teardown
+    # paths with the UDS transport.
+    if tpt_proto == 'uds':
+        pytest.skip(
+            'actor-cluster teardown hangs intermittently on UDS'
+        )

    async def main():
        with trio.fail_after(3):
@ -56,25 +68,44 @@ async def worker(
            print(msg)
            assert msg == MESSAGE

-        # TODO: does this ever cause a hang
+        # ?TODO, does this ever cause a hang?
        # assert 0


+# ?TODO, but needs a fn-scoped tpt_proto fixture..
+# @pytest.mark.no_tpt('uds')
@tractor_test
-async def test_streaming_to_actor_cluster() -> None:
+async def test_streaming_to_actor_cluster(
+    tpt_proto: str,
+    is_forking_spawner: bool,
+):
+    '''
+    Open an actor "cluster" using the (experimental) `._clustering`
+    API and conduct standard inter-task-ctx streaming.

-    async with (
-        open_actor_cluster(modules=[__name__]) as portals,
+    '''
+    if tpt_proto == 'uds':
+        pytest.skip(
+            f'Test currently fails with tpt-proto={tpt_proto!r}\n'
+        )

-        gather_contexts(
-            mngrs=[p.open_context(worker) for p in portals.values()],
-        ) as contexts,
+    delay: float = (
+        10 if is_forking_spawner
+        else 6
+    )
+    with trio.fail_after(delay):
+        async with (
+            open_actor_cluster(modules=[__name__]) as portals,

-        gather_contexts(
-            mngrs=[ctx[0].open_stream() for ctx in contexts],
-        ) as streams,
+            gather_contexts(
+                mngrs=[p.open_context(worker) for p in portals.values()],
+            ) as contexts,

-    ):
-        with trio.move_on_after(1):
-            for stream in itertools.cycle(streams):
-                await stream.send(MESSAGE)
+            gather_contexts(
+                mngrs=[ctx[0].open_stream() for ctx in contexts],
+            ) as streams,
+
+        ):
+            with trio.move_on_after(1):
+                for stream in itertools.cycle(streams):
+                    await stream.send(MESSAGE)
--- a/tests/test_context_stream_semantics.py
+++ b/tests/test_context_stream_semantics.py
@ -9,6 +9,7 @@ from itertools import count
 import math
 import platform
 from pprint import pformat
+import sys
 from typing import (
    Callable,
 )
@ -25,7 +26,7 @@ from tractor._exceptions import (
    StreamOverrun,
    ContextCancelled,
 )
-from tractor._state import current_ipc_ctx
+from tractor.runtime._state import current_ipc_ctx

 from tractor._testing import (
    tractor_test,
@ -114,10 +115,12 @@ async def not_started_but_stream_opened(
 )
 def test_started_misuse(
    target: Callable,
+    reg_addr: tuple,
    debug_mode: bool,
 ):
    async def main():
        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ) as an:
            portal = await an.start_actor(
@ -183,15 +186,24 @@ def test_simple_context(
    error_parent,
    child_blocks_forever,
    pointlessly_open_stream,
+    reg_addr: tuple,
    debug_mode: bool,
+    is_forking_spawner: bool,
 ):

-    timeout = 1.5 if not platform.system() == 'Windows' else 4
+    timeout: float = 1.5
+    # windows and forking-spawner both have "slower but more
+    # deterministic" cancel teardown.
+    if platform.system() == 'Windows':
+        timeout = 4
+    elif is_forking_spawner:
+        timeout = 3

    async def main():

        with trio.fail_after(timeout):
            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
                debug_mode=debug_mode,
            ) as an:
                portal = await an.start_actor(
@ -277,6 +289,7 @@ def test_parent_cancels(
    cancel_method: str,
    chk_ctx_result_before_exit: bool,
    child_returns_early: bool,
+    reg_addr: tuple,
    debug_mode: bool,
 ):
    '''
@ -354,6 +367,7 @@ def test_parent_cancels(
    async def main():

        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ) as an:
            portal = await an.start_actor(
@ -930,6 +944,7 @@ async def keep_sending_from_child(
 )
 def test_one_end_stream_not_opened(
    overrun_by: tuple[str, int, Callable],
+    reg_addr: tuple,
    debug_mode: bool,
 ):
    '''
@ -938,11 +953,17 @@ def test_one_end_stream_not_opened(

    '''
    overrunner, buf_size_increase, entrypoint = overrun_by
-    from tractor._runtime import Actor
+    from tractor.runtime._runtime import Actor
    buf_size = buf_size_increase + Actor.msg_buffer_size

+    timeout: float = (
+        1 if sys.platform == 'linux'
+        else 3
+    )
+
    async def main():
        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ) as an:
            portal = await an.start_actor(
@ -950,7 +971,7 @@ def test_one_end_stream_not_opened(
                enable_modules=[__name__],
            )

-            with trio.fail_after(1):
+            with trio.fail_after(timeout):
                async with portal.open_context(
                    entrypoint,
                ) as (ctx, sent):
@ -1107,6 +1128,7 @@ def test_maybe_allow_overruns_stream(

    # conftest wide
    loglevel: str,
+    reg_addr: tuple,
    debug_mode: bool,
 ):
    '''
@ -1127,6 +1149,7 @@ def test_maybe_allow_overruns_stream(
    '''
    async def main():
        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ) as an:
            portal = await an.start_actor(
@ -1243,6 +1266,7 @@ def test_maybe_allow_overruns_stream(

 def test_ctx_with_self_actor(
    loglevel: str,
+    reg_addr: tuple,
    debug_mode: bool,
 ):
    '''
@ -1257,6 +1281,7 @@ def test_ctx_with_self_actor(
    '''
    async def main():
        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
            enable_modules=[__name__],
        ) as an:
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@ -1,415 +0,0 @@
-"""
-Actor "discovery" testing
-"""
-import os
-import signal
-import platform
-from functools import partial
-import itertools
-
-import psutil
-import pytest
-import subprocess
-import tractor
-from tractor.trionics import collapse_eg
-from tractor._testing import tractor_test
-import trio
-
-
-@tractor_test
-async def test_reg_then_unreg(reg_addr):
-    actor = tractor.current_actor()
-    assert actor.is_arbiter
-    assert len(actor._registry) == 1  # only self is registered
-
-    async with tractor.open_nursery(
-        registry_addrs=[reg_addr],
-    ) as n:
-
-        portal = await n.start_actor('actor', enable_modules=[__name__])
-        uid = portal.channel.uid
-
-        async with tractor.get_registry(reg_addr) as aportal:
-            # this local actor should be the arbiter
-            assert actor is aportal.actor
-
-            async with tractor.wait_for_actor('actor'):
-                # sub-actor uid should be in the registry
-                assert uid in aportal.actor._registry
-                sockaddrs = actor._registry[uid]
-                # XXX: can we figure out what the listen addr will be?
-                assert sockaddrs
-
-        await n.cancel()  # tear down nursery
-
-        await trio.sleep(0.1)
-        assert uid not in aportal.actor._registry
-        sockaddrs = actor._registry.get(uid)
-        assert not sockaddrs
-
-
-the_line = 'Hi my name is {}'
-
-
-async def hi():
-    return the_line.format(tractor.current_actor().name)
-
-
-async def say_hello(
-    other_actor: str,
-    reg_addr: tuple[str, int],
-):
-    await trio.sleep(1)  # wait for other actor to spawn
-    async with tractor.find_actor(
-        other_actor,
-        registry_addrs=[reg_addr],
-    ) as portal:
-        assert portal is not None
-        return await portal.run(__name__, 'hi')
-
-
-async def say_hello_use_wait(
-    other_actor: str,
-    reg_addr: tuple[str, int],
-):
-    async with tractor.wait_for_actor(
-        other_actor,
-        registry_addr=reg_addr,
-    ) as portal:
-        assert portal is not None
-        result = await portal.run(__name__, 'hi')
-        return result
-
-
-@tractor_test
-@pytest.mark.parametrize('func', [say_hello, say_hello_use_wait])
-async def test_trynamic_trio(
-    func,
-    start_method,
-    reg_addr,
-):
-    '''
-    Root actor acting as the "director" and running one-shot-task-actors
-    for the directed subs.
-
-    '''
-    async with tractor.open_nursery() as n:
-        print("Alright... Action!")
-
-        donny = await n.run_in_actor(
-            func,
-            other_actor='gretchen',
-            reg_addr=reg_addr,
-            name='donny',
-        )
-        gretchen = await n.run_in_actor(
-            func,
-            other_actor='donny',
-            reg_addr=reg_addr,
-            name='gretchen',
-        )
-        print(await gretchen.result())
-        print(await donny.result())
-        print("CUTTTT CUUTT CUT!!?! Donny!! You're supposed to say...")
-
-
-async def stream_forever():
-    for i in itertools.count():
-        yield i
-        await trio.sleep(0.01)
-
-
-async def cancel(use_signal, delay=0):
-    # hold on there sally
-    await trio.sleep(delay)
-
-    # trigger cancel
-    if use_signal:
-        if platform.system() == 'Windows':
-            pytest.skip("SIGINT not supported on windows")
-        os.kill(os.getpid(), signal.SIGINT)
-    else:
-        raise KeyboardInterrupt
-
-
-async def stream_from(portal):
-    async with portal.open_stream_from(stream_forever) as stream:
-        async for value in stream:
-            print(value)
-
-
-async def unpack_reg(actor_or_portal):
-    '''
-    Get and unpack a "registry" RPC request from the "arbiter" registry
-    system.
-
-    '''
-    if getattr(actor_or_portal, 'get_registry', None):
-        msg = await actor_or_portal.get_registry()
-    else:
-        msg = await actor_or_portal.run_from_ns('self', 'get_registry')
-
-    return {tuple(key.split('.')): val for key, val in msg.items()}
-
-
-async def spawn_and_check_registry(
-    reg_addr: tuple,
-    use_signal: bool,
-    debug_mode: bool = False,
-    remote_arbiter: bool = False,
-    with_streaming: bool = False,
-    maybe_daemon: tuple[
-        subprocess.Popen,
-        psutil.Process,
-    ]|None = None,
-
-) -> None:
-
-    if maybe_daemon:
-        popen, proc = maybe_daemon
-        # breakpoint()
-
-    async with tractor.open_root_actor(
-        registry_addrs=[reg_addr],
-        debug_mode=debug_mode,
-    ):
-        async with tractor.get_registry(reg_addr) as portal:
-            # runtime needs to be up to call this
-            actor = tractor.current_actor()
-
-            if remote_arbiter:
-                assert not actor.is_arbiter
-
-            if actor.is_arbiter:
-                extra = 1  # arbiter is local root actor
-                get_reg = partial(unpack_reg, actor)
-
-            else:
-                get_reg = partial(unpack_reg, portal)
-                extra = 2  # local root actor + remote arbiter
-
-            # ensure current actor is registered
-            registry: dict = await get_reg()
-            assert actor.uid in registry
-
-            try:
-                async with tractor.open_nursery() as an:
-                    async with (
-                        collapse_eg(),
-                        trio.open_nursery() as trion,
-                    ):
-                        portals = {}
-                        for i in range(3):
-                            name = f'a{i}'
-                            if with_streaming:
-                                portals[name] = await an.start_actor(
-                                    name=name, enable_modules=[__name__])
-
-                            else:  # no streaming
-                                portals[name] = await an.run_in_actor(
-                                    trio.sleep_forever, name=name)
-
-                        # wait on last actor to come up
-                        async with tractor.wait_for_actor(name):
-                            registry = await get_reg()
-                            for uid in an._children:
-                                assert uid in registry
-
-                        assert len(portals) + extra == len(registry)
-
-                        if with_streaming:
-                            await trio.sleep(0.1)
-
-                            pts = list(portals.values())
-                            for p in pts[:-1]:
-                                trion.start_soon(stream_from, p)
-
-                            # stream for 1 sec
-                            trion.start_soon(cancel, use_signal, 1)
-
-                            last_p = pts[-1]
-                            await stream_from(last_p)
-
-                        else:
-                            await cancel(use_signal)
-
-            finally:
-                await trio.sleep(0.5)
-
-                # all subactors should have de-registered
-                registry = await get_reg()
-                assert len(registry) == extra
-                assert actor.uid in registry
-
-
-@pytest.mark.parametrize('use_signal', [False, True])
-@pytest.mark.parametrize('with_streaming', [False, True])
-def test_subactors_unregister_on_cancel(
-    debug_mode: bool,
-    start_method,
-    use_signal,
-    reg_addr,
-    with_streaming,
-):
-    '''
-    Verify that cancelling a nursery results in all subactors
-    deregistering themselves with the arbiter.
-
-    '''
-    with pytest.raises(KeyboardInterrupt):
-        trio.run(
-            partial(
-                spawn_and_check_registry,
-                reg_addr,
-                use_signal,
-                debug_mode=debug_mode,
-                remote_arbiter=False,
-                with_streaming=with_streaming,
-            ),
-        )
-
-
-@pytest.mark.parametrize('use_signal', [False, True])
-@pytest.mark.parametrize('with_streaming', [False, True])
-def test_subactors_unregister_on_cancel_remote_daemon(
-    daemon: subprocess.Popen,
-    debug_mode: bool,
-    start_method,
-    use_signal,
-    reg_addr,
-    with_streaming,
-):
-    """Verify that cancelling a nursery results in all subactors
-    deregistering themselves with a **remote** (not in the local process
-    tree) arbiter.
-    """
-    with pytest.raises(KeyboardInterrupt):
-        trio.run(
-            partial(
-                spawn_and_check_registry,
-                reg_addr,
-                use_signal,
-                debug_mode=debug_mode,
-                remote_arbiter=True,
-                with_streaming=with_streaming,
-                maybe_daemon=(
-                    daemon,
-                    psutil.Process(daemon.pid)
-                ),
-            ),
-        )
-
-
-async def streamer(agen):
-    async for item in agen:
-        print(item)
-
-
-async def close_chans_before_nursery(
-    reg_addr: tuple,
-    use_signal: bool,
-    remote_arbiter: bool = False,
-) -> None:
-
-    # logic for how many actors should still be
-    # in the registry at teardown.
-    if remote_arbiter:
-        entries_at_end = 2
-    else:
-        entries_at_end = 1
-
-    async with tractor.open_root_actor(
-        registry_addrs=[reg_addr],
-    ):
-        async with tractor.get_registry(reg_addr) as aportal:
-            try:
-                get_reg = partial(unpack_reg, aportal)
-
-                async with tractor.open_nursery() as tn:
-                    portal1 = await tn.start_actor(
-                        name='consumer1', enable_modules=[__name__])
-                    portal2 = await tn.start_actor(
-                        'consumer2', enable_modules=[__name__])
-
-                    # TODO: compact this back as was in last commit once
-                    # 3.9+, see https://github.com/goodboy/tractor/issues/207
-                    async with portal1.open_stream_from(
-                        stream_forever
-                    ) as agen1:
-                        async with portal2.open_stream_from(
-                            stream_forever
-                        ) as agen2:
-                            async with (
-                                collapse_eg(),
-                                trio.open_nursery() as tn,
-                            ):
-                                tn.start_soon(streamer, agen1)
-                                tn.start_soon(cancel, use_signal, .5)
-                                try:
-                                    await streamer(agen2)
-                                finally:
-                                    # Kill the root nursery thus resulting in
-                                    # normal arbiter channel ops to fail during
-                                    # teardown. It doesn't seem like this is
-                                    # reliably triggered by an external SIGINT.
-                                    # tractor.current_actor()._root_nursery.cancel_scope.cancel()
-
-                                    # XXX: THIS IS THE KEY THING that
-                                    # happens **before** exiting the
-                                    # actor nursery block
-
-                                    # also kill off channels cuz why not
-                                    await agen1.aclose()
-                                    await agen2.aclose()
-            finally:
-                with trio.CancelScope(shield=True):
-                    await trio.sleep(1)
-
-                    # all subactors should have de-registered
-                    registry = await get_reg()
-                    assert portal1.channel.uid not in registry
-                    assert portal2.channel.uid not in registry
-                    assert len(registry) == entries_at_end
-
-
-@pytest.mark.parametrize('use_signal', [False, True])
-def test_close_channel_explicit(
-    start_method,
-    use_signal,
-    reg_addr,
-):
-    """Verify that closing a stream explicitly and killing the actor's
-    "root nursery" **before** the containing nursery tears down also
-    results in subactor(s) deregistering from the arbiter.
-    """
-    with pytest.raises(KeyboardInterrupt):
-        trio.run(
-            partial(
-                close_chans_before_nursery,
-                reg_addr,
-                use_signal,
-                remote_arbiter=False,
-            ),
-        )
-
-
-@pytest.mark.parametrize('use_signal', [False, True])
-def test_close_channel_explicit_remote_arbiter(
-    daemon: subprocess.Popen,
-    start_method,
-    use_signal,
-    reg_addr,
-):
-    """Verify that closing a stream explicitly and killing the actor's
-    "root nursery" **before** the containing nursery tears down also
-    results in subactor(s) deregistering from the arbiter.
-    """
-    with pytest.raises(KeyboardInterrupt):
-        trio.run(
-            partial(
-                close_chans_before_nursery,
-                reg_addr,
-                use_signal,
-                remote_arbiter=True,
-            ),
-        )
--- a/tests/test_docs_examples.py
+++ b/tests/test_docs_examples.py
@ -9,12 +9,17 @@ import sys
 import subprocess
 import platform
 import shutil
+from typing import Callable

 import pytest
+import tractor
 from tractor._testing import (
    examples_dir,
 )

+_non_linux: bool = platform.system() != 'Linux'
+_friggin_macos: bool = platform.system() == 'Darwin'
+

@pytest.fixture
 def run_example_in_subproc(
@ -89,8 +94,10 @@ def run_example_in_subproc(
        for f in p[2]

        if (
-            '__' not in f
-            and f[0] != '_'
+            '__' not in f  # ignore any pkg-mods
+            # ignore any `__pycache__` subdir
+            and '__pycache__' not in str(p[0])
+            and f[0] != '_'  # ignore any WIP "examplel mods"
            and 'debugging' not in p[0]
            and 'integration' not in p[0]
            and 'advanced_faults' not in p[0]
@ -101,8 +108,10 @@ def run_example_in_subproc(
    ids=lambda t: t[1],
 )
 def test_example(
-    run_example_in_subproc,
-    example_script,
+    run_example_in_subproc: Callable,
+    example_script: str,
+    test_log: tractor.log.StackLevelAdapter,
+    ci_env: bool,
 ):
    '''
    Load and run scripts from this repo's ``examples/`` dir as a user
@ -116,9 +125,39 @@ def test_example(
    '''
    ex_file: str = os.path.join(*example_script)

-    if 'rpc_bidir_streaming' in ex_file and sys.version_info < (3, 9):
+    if (
+        'rpc_bidir_streaming' in ex_file
+        and
+        sys.version_info < (3, 9)
+    ):
        pytest.skip("2-way streaming example requires py3.9 async with syntax")

+    if (
+        'full_fledged_streaming_service' in ex_file
+        and
+        _friggin_macos
+        and
+        ci_env
+    ):
+        pytest.skip(
+            'Streaming example is too flaky in CI\n'
+            'AND their competitor runs this CI service..\n'
+            'This test does run just fine "in person" however..'
+        )
+
+    from .conftest import cpu_scaling_factor
+
+    timeout: float = (
+        60
+        if ci_env and _non_linux
+        else 16
+    )
+
+    # add latency headroom for CPU freq scaling (auto-cpufreq et al.)
+    headroom: float = cpu_scaling_factor()
+    if headroom != 1.:
+        timeout *= headroom
+
    with open(ex_file, 'r') as ex:
        code = ex.read()

@ -126,9 +165,12 @@ def test_example(
            err = None
            try:
                if not proc.poll():
-                    _, err = proc.communicate(timeout=15)
+                    _, err = proc.communicate(timeout=timeout)

            except subprocess.TimeoutExpired as e:
+                test_log.exception(
+                    f'Example failed to finish within {timeout}s ??\n'
+                )
                proc.kill()
                err = e.stderr

--- a/tests/test_infected_asyncio.py
+++ b/tests/test_infected_asyncio.py
@ -26,10 +26,36 @@ from tractor import (
    to_asyncio,
    RemoteActorError,
    ContextCancelled,
-    _state,
 )
+from tractor.runtime import _state
 from tractor.trionics import BroadcastReceiver
 from tractor._testing import expect_ctxc
+from tractor._testing.trace import (
+    AfkAlarmWTraceFactory,
+    FailAfterWTraceFactory,
+)
+
+
+# Per-test zombie-subactor reaper. Opt-in (NOT autouse) —
+# see `tractor._testing.pytest.reap_subactors_per_test`'s
+# docstring for the full rationale. This module specifically
+# needs it because tests like
+# `test_echoserver_detailed_mechanics[KeyboardInterrupt]`
+# and the `test_sigint_closes_lifetime_stack[*]` matrix have
+# been observed to hang past pytest's wall-clock under
+# `main_thread_forkserver`, leaving subactor forks that
+# squat on registrar resources and cascade-fail every
+# subsequent test (`test_inter_peer_cancellation`,
+# `test_legacy_one_way_streaming`, etc.).
+pytestmark = pytest.mark.usefixtures(
+    'reap_subactors_per_test',
+    # NOTE, asyncio cancel cascade has historically
+    # triggered both UDS sockfile leaks (SIGKILL path)
+    # AND the trio `WakeupSocketpair.drain()` busy-loop
+    # — see `test_aio_simple_error`'s history.
+    'track_orphaned_uds_per_test',
+    'detect_runaway_subactors_per_test',
+)


@pytest.fixture(
@ -47,12 +73,11 @@ async def sleep_and_err(

    # just signature placeholders for compat with
    # ``to_asyncio.open_channel_from()``
-    to_trio: trio.MemorySendChannel|None = None,
-    from_trio: asyncio.Queue|None = None,
+    chan: to_asyncio.LinkedTaskChannel|None = None,

 ):
-    if to_trio:
-        to_trio.send_nowait('start')
+    if chan:
+        chan.started_nowait('start')

    await asyncio.sleep(sleep_for)
    assert 0
@ -184,6 +209,7 @@ def test_tractor_cancels_aio(
    async def main():
        async with tractor.open_nursery(
            debug_mode=debug_mode,
+            registry_addrs=[reg_addr],
        ) as an:
            portal = await an.run_in_actor(
                asyncio_actor,
@ -206,11 +232,11 @@ def test_trio_cancels_aio(

    '''
    async def main():
-
+        # cancel the nursery shortly after boot
        with trio.move_on_after(1):
-            # cancel the nursery shortly after boot
-
-            async with tractor.open_nursery() as tn:
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as tn:
                await tn.run_in_actor(
                    asyncio_actor,
                    target='aio_sleep_forever',
@ -238,7 +264,7 @@ async def trio_ctx(
                trio.open_nursery() as tn,
                tractor.to_asyncio.open_channel_from(
                    sleep_and_err,
-                ) as (first, chan),
+                ) as (chan, first),
            ):

                assert first == 'start'
@ -278,7 +304,9 @@ def test_context_spawns_aio_task_that_errors(
    '''
    async def main():
        with trio.fail_after(1 + delay):
-            async with tractor.open_nursery() as an:
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as an:
                p = await an.start_actor(
                    'aio_daemon',
                    enable_modules=[__name__],
@ -361,7 +389,9 @@ def test_aio_cancelled_from_aio_causes_trio_cancelled(
    async def main():

        an: tractor.ActorNursery
-        async with tractor.open_nursery() as an:
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+        ) as an:
            p: tractor.Portal = await an.run_in_actor(
                asyncio_actor,
                target='aio_cancel',
@ -399,7 +429,7 @@ async def no_to_trio_in_args():

 async def push_from_aio_task(
    sequence: Iterable,
-    to_trio: trio.abc.SendChannel,
+    chan: to_asyncio.LinkedTaskChannel,
    expect_cancel: False,
    fail_early: bool,
    exit_early: bool,
@ -407,15 +437,12 @@ async def push_from_aio_task(
 ) -> None:

    try:
-        # print('trying breakpoint')
-        # breakpoint()
-
        # sync caller ctx manager
-        to_trio.send_nowait(True)
+        chan.started_nowait(True)

        for i in sequence:
            print(f'asyncio sending {i}')
-            to_trio.send_nowait(i)
+            chan.send_nowait(i)
            await asyncio.sleep(0.001)

            if (
@ -478,7 +505,7 @@ async def stream_from_aio(
                trio_exit_early
            ))

-        ) as (first, chan):
+        ) as (chan, first):

            assert first is True

@ -573,7 +600,9 @@ def test_basic_interloop_channel_stream(
    async def main():
        # TODO, figure out min timeout here!
        with trio.fail_after(6):
-            async with tractor.open_nursery() as an:
+            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
+            ) as an:
                portal = await an.run_in_actor(
                    stream_from_aio,
                    infect_asyncio=True,
@ -586,9 +615,13 @@ def test_basic_interloop_channel_stream(


 # TODO: parametrize the above test and avoid the duplication here?
-def test_trio_error_cancels_intertask_chan(reg_addr):
+def test_trio_error_cancels_intertask_chan(
+    reg_addr: tuple[str, int],
+):
    async def main():
-        async with tractor.open_nursery() as an:
+        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
+        ) as an:
            portal = await an.run_in_actor(
                stream_from_aio,
                trio_raise_err=True,
@ -623,6 +656,7 @@ def test_trio_closes_early_causes_aio_checkpoint_raise(
            async with tractor.open_nursery(
                debug_mode=debug_mode,
                # enable_stack_on_sig=True,
+                registry_addrs=[reg_addr],
            ) as an:
                portal = await an.run_in_actor(
                    stream_from_aio,
@ -671,6 +705,7 @@ def test_aio_exits_early_relays_AsyncioTaskExited(
    async def main():
        with trio.fail_after(1 + delay):
            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
                debug_mode=debug_mode,
                # enable_stack_on_sig=True,
            ) as an:
@ -711,6 +746,7 @@ def test_aio_errors_and_channel_propagates_and_closes(
 ):
    async def main():
        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ) as an:
            portal = await an.run_in_actor(
@ -732,15 +768,21 @@ def test_aio_errors_and_channel_propagates_and_closes(


 async def aio_echo_server(
-    to_trio: trio.MemorySendChannel,
-    from_trio: asyncio.Queue,
+    chan: to_asyncio.LinkedTaskChannel,
 ) -> None:
+    '''
+    An IPC-msg "echo server" with msgs received and relayed by
+    a parent `trio.Task` into a child `asyncio.Task`
+    and then repeated back to that local parent (`trio.Task`)
+    and sent again back to the original calling remote actor.

-    to_trio.send_nowait('start')
+    '''
+    # same semantics as `trio.TaskStatus.started()`
+    chan.started_nowait('start')

    while True:
        try:
-            msg = await from_trio.get()
+            msg = await chan.get()
        except to_asyncio.TrioTaskExited:
            print(
                'breaking aio echo loop due to `trio` exit!'
@ -748,7 +790,7 @@ async def aio_echo_server(
            break

        # echo the msg back
-        to_trio.send_nowait(msg)
+        chan.send_nowait(msg)

        # if we get the terminate sentinel
        # break the echo loop
@ -765,7 +807,10 @@ async def trio_to_aio_echo_server(
 ):
    async with to_asyncio.open_channel_from(
        aio_echo_server,
-    ) as (first, chan):
+    ) as (
+        chan,
+        first,  # value from `chan.started_nowait()` above
+    ):
        assert first == 'start'

        await ctx.started(first)
@ -776,7 +821,8 @@ async def trio_to_aio_echo_server(
                await chan.send(msg)

                out = await chan.receive()
-                # echo back to parent actor-task
+
+                # echo back to parent-actor's remote parent-ctx-task!
                await stream.send(out)

                if out is None:
@ -790,16 +836,47 @@ async def trio_to_aio_echo_server(

@pytest.mark.parametrize(
    'raise_error_mid_stream',
-    [False, Exception, KeyboardInterrupt],
+    [
+        False,
+        Exception,
+        KeyboardInterrupt,
+    ],
    ids='raise_error={}'.format,
 )
 def test_echoserver_detailed_mechanics(
    reg_addr: tuple[str, int],
    debug_mode: bool,
    raise_error_mid_stream,
+
+    is_forking_spawner: bool,
+    fail_after_w_trace: FailAfterWTraceFactory,
 ):
-    async def main():
+    # NOTE: under fork-based backends the cancel-cascade
+    # path is structurally slower than `trio`'s subproc-exec
+    # (per-spawn forkserver-handshake compounds during
+    # teardown). Bump the cap so cross-test contamination
+    # doesn't flake this — see
+    # `ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`.
+    timeout: float = (
+        999 if tractor.debug_mode()
+        else 4 if is_forking_spawner
+        # was 1; the `trio` 0.29 -> 0.33 bump slowed the
+        # cancel-cascade so a 1s budget raced the ~1s teardown
+        # deadline. On a deadline-fire the injected
+        # `Cancelled(source='deadline')` wraps the mid-stream
+        # KBI in a `BaseExceptionGroup`, breaking the bare
+        # `pytest.raises(KeyboardInterrupt)` below. See
+        # `ai/conc-anal/trio_033_cancel_cascade_slowdown_depth3_issue.md`.
+        else 4
+    )
+
+    # body factored out so the `fail_after_w_trace`-wrapping
+    # `main()` stays a 2-liner — keeps the deep `open_nursery`
+    # /`open_context`/`open_stream` block at its natural indent
+    # level instead of pushing it under yet another `async with`.
+    async def _body():
        async with tractor.open_nursery(
+            registry_addrs=[reg_addr],
            debug_mode=debug_mode,
        ) as an:
            p = await an.start_actor(
@ -843,6 +920,15 @@ def test_echoserver_detailed_mechanics(
            # is cancelled by kbi or out of task cancellation
            await p.cancel_actor()

+    async def main():
+        # on-timeout diag snapshot via `fail_after_w_trace`
+        # — when the cancel cascade hangs under MTF we get a
+        # fresh `ptree`/`wchan`/`py-spy` dump on disk INSTEAD
+        # of an opaque pytest timeout-kill. See
+        # `tractor/_testing/trace.py`.
+        async with fail_after_w_trace(timeout):
+            await _body()
+
    if raise_error_mid_stream:
        with pytest.raises(raise_error_mid_stream):
            trio.run(main)
@ -978,7 +1064,7 @@ async def manage_file(
    ],
    ids=[
        'bg_aio_task',
-        'just_trio_slee',
+        'just_trio_sleep',
    ],
 )
@pytest.mark.parametrize(
@ -994,11 +1080,15 @@ async def manage_file(
 )
 def test_sigint_closes_lifetime_stack(
    tmp_path: Path,
+    reg_addr: tuple,
+    debug_mode: bool,
+
    wait_for_ctx: bool,
    bg_aio_task: bool,
    trio_side_is_shielded: bool,
-    debug_mode: bool,
    send_sigint_to: str,
+    is_forking_spawner: bool,
+    afk_alarm_w_trace: AfkAlarmWTraceFactory,
 ):
    '''
    Ensure that an infected child can use the `Actor.lifetime_stack`
@ -1008,12 +1098,30 @@ def test_sigint_closes_lifetime_stack(
    '''
    async def main():

-        delay = 999 if tractor.debug_mode() else 1
+        delay: float = (
+            999
+            if debug_mode
+            else 1
+        )
+        # pre-init so the `except (KeyboardInterrupt, ContextCancelled)`
+        # handler below doesn't `UnboundLocalError` if KBI fires BEFORE
+        # we ever enter the `as (ctx, first)` body (e.g. when
+        # `p.open_context().__aenter__` is hung waiting for the
+        # subactor's `StartAck` due to a fork-child IPC race —
+        # see `dynamic_pub_sub_spawn_time_transport_close_under_mtf_issue.md`).
+        tmp_file: Path|None = None
+        ctx: tractor.Context|None = None
        try:
            an: tractor.ActorNursery
            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
                debug_mode=debug_mode,
            ) as an:
+
+                # sanity
+                if debug_mode:
+                    assert tractor.debug_mode()
+
                p: tractor.Portal = await an.start_actor(
                    'file_mngr',
                    enable_modules=[__name__],
@ -1028,7 +1136,7 @@ def test_sigint_closes_lifetime_stack(
                ) as (ctx, first):

                    path_str, cpid = first
-                    tmp_file: Path = Path(path_str)
+                    tmp_file = Path(path_str)
                    assert tmp_file.exists()

                    # XXX originally to simulate what (hopefully)
@ -1048,6 +1156,10 @@ def test_sigint_closes_lifetime_stack(
                        cpid if send_sigint_to == 'child'
                        else os.getpid()
                    )
+                    print(
+                        f'Sending SIGINT to {send_sigint_to!r}\n'
+                        f'pid: {pid!r}\n'
+                    )
                    os.kill(
                        pid,
                        signal.SIGINT,
@ -1058,13 +1170,37 @@ def test_sigint_closes_lifetime_stack(
                    # timeout should trigger!
                    if wait_for_ctx:
                        print('waiting for ctx outcome in parent..')
+
+                        if debug_mode:
+                            assert delay == 999
+
                        try:
-                            with trio.fail_after(1 + delay):
+                            with trio.fail_after(
+                                1 + delay
+                            ):
                                await ctx.wait_for_result()
                        except tractor.ContextCancelled as ctxc:
                            assert ctxc.canceller == ctx.chan.uid
                            raise

+                        except trio.TooSlowError:
+                            if (
+                                send_sigint_to == 'child'
+                                and
+                                is_forking_spawner
+                            ):
+                                pytest.xfail(
+                                    reason=(
+                                        'SIGINT delivery to fork-child subactor is known '
+                                        'to NOT SUCCEED, precisely bc we have not wired up a'
+                                        '"trio SIGINT mode" in the child pre-fork.\n'
+                                        'Also see `test_orphaned_subactor_sigint_cleanup_DRAFT` for'
+                                        'a dedicated suite demonstrating this expected limitation as '
+                                        'well as the detailed doc:\n'
+                                        '`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.\n'
+                                    ),
+                                )
+
                    # XXX CASE 2: this seems to be the source of the
                    # original issue which exhibited BEFORE we put
                    # a `Actor.cancel_soon()` inside
@ -1078,6 +1214,21 @@ def test_sigint_closes_lifetime_stack(
            KeyboardInterrupt,
            ContextCancelled,
        ):
+            # If we got here BEFORE entering the ctx body (e.g.
+            # spawn-time IPC race hung `open_context.__aenter__` and
+            # the AFK-guard `signal.alarm` fired KBI from outside the
+            # trio loop), `tmp_file`/`ctx` are still `None` — surface
+            # that fact directly instead of `UnboundLocalError`.
+            if tmp_file is None:
+                pytest.fail(
+                    'KBI/ctxc fired BEFORE `p.open_context()` returned '
+                    "the child's `started` value — likely fork-child "
+                    'IPC race; see '
+                    '`ai/conc-anal/'
+                    'dynamic_pub_sub_spawn_time_transport_close_'
+                    'under_mtf_issue.md`'
+                )
+
            # XXX CASE 2: without the bug fixed, in the
            # KBI-raised-in-parent case, the actor teardown should
            # never get run (silently abaondoned by `asyncio`..) and
@ -1085,29 +1236,45 @@ def test_sigint_closes_lifetime_stack(
            assert not tmp_file.exists()
            assert ctx.maybe_error

-    trio.run(main)
+    # outer hard wall-clock backstop via `afk_alarm_w_trace`:
+    # when the in-band trio cancel path doesn't fire (e.g.
+    # parent is parked in a shielded `await` inside actor-
+    # nursery teardown, or `open_context.__aenter__` hangs
+    # waiting for a child's `StartAck` that never comes), the
+    # `signal.alarm` inside the CM raises `AFKAlarmTimeout`
+    # in the main thread regardless of trio's scope state —
+    # AND captures a full diag snapshot to
+    # `$XDG_CACHE_HOME/tractor/hung-dumps/` before re-raising.
+    # Only armed under fork-based backends since this hang-
+    # class is MTF-specific.
+    if (
+        not debug_mode
+        and
+        is_forking_spawner
+    ):
+        with afk_alarm_w_trace(10):
+            trio.run(main)
+    else:
+        trio.run(main)



 # ?TODO asyncio.Task fn-deco?
-# -[ ] do sig checkingat import time like @context?
-# -[ ] maybe name it @aio_task ??
 # -[ ] chan: to_asyncio.InterloopChannel ??
+# -[ ] do fn-sig checking at import time like @context?
+#  |_[ ] maybe name it @a(sync)io_task ??
+# @asyncio_task  <- not bad ??
 async def raise_before_started(
-    # from_trio: asyncio.Queue,
-    # to_trio: trio.abc.SendChannel,
    chan: to_asyncio.LinkedTaskChannel,
-
 ) -> None:
    '''
    `asyncio.Task` entry point which RTEs before calling
-    `to_trio.send_nowait()`.
+    `chan.started_nowait()`.

    '''
    await asyncio.sleep(0.2)
    raise RuntimeError('Some shite went wrong before `.send_nowait()`!!')

-    # to_trio.send_nowait('Uhh we shouldve RTE-d ^^ ??')
    chan.started_nowait('Uhh we shouldve RTE-d ^^ ??')
    await asyncio.sleep(float('inf'))

@ -1167,6 +1334,7 @@ def test_aio_side_raises_before_started(
        with trio.fail_after(3):
            an: tractor.ActorNursery
            async with tractor.open_nursery(
+                registry_addrs=[reg_addr],
                debug_mode=debug_mode,
                loglevel=loglevel,
            ) as an:
--- a/tests/test_inter_peer_cancellation.py
+++ b/tests/test_inter_peer_cancellation.py
@ -11,18 +11,46 @@ import trio
 import tractor
 from tractor import (  # typing
    Actor,
-    current_actor,
-    open_nursery,
-    Portal,
    Context,
    ContextCancelled,
+    MsgStream,
+    Portal,
    RemoteActorError,
+    current_actor,
+    open_nursery,
 )
 from tractor._testing import (
    # tractor_test,
    expect_ctxc,
 )

+from .conftest import cpu_scaling_factor
+
+pytestmark = [
+    pytest.mark.skipon_spawn_backend(
+        'subint',
+        reason=(
+            'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
+            'Inter-peer cancel cascades under '
+            '`--spawn-backend=subint` trip the abandoned-subint '
+            'GIL-hostage class — see\n'
+            '  - `ai/conc-anal/subint_sigint_starvation_issue.md` '
+            '(GIL-hostage, SIGINT-unresponsive)\n'
+            '  - `ai/conc-anal/subint_cancel_delivery_hang_issue.md` '
+            '(sibling: parent parks on dead chan)\n'
+            '  - https://github.com/goodboy/tractor/issues/379 '
+            '(subint umbrella)\n'
+        )
+    ),
+    # NOTE, inter-peer cancellation tests stress the
+    # multi-actor cancel cascade which under SIGKILL
+    # leaves UDS sock-files orphaned. Track per-test
+    # for blame attribution.
+    pytest.mark.usefixtures(
+        'track_orphaned_uds_per_test',
+    ),
+]
+
 # XXX TODO cases:
 # - [x] WE cancelled the peer and thus should not see any raised
 #   `ContextCancelled` as it should be reaped silently?
@ -200,7 +228,7 @@ async def stream_from_peer(
 ) -> None:

    # sanity
-    assert tractor._state.debug_mode() == debug_mode
+    assert tractor.debug_mode() == debug_mode

    peer: Portal
    try:
@ -580,7 +608,7 @@ def test_peer_canceller(
                            assert (
                                re.canceller
                                ==
-                                root.uid
+                                root.aid.uid
                            )

                        else:  # the other 2 ctxs
@ -589,7 +617,7 @@ def test_peer_canceller(
                                and (
                                    re.canceller
                                    ==
-                                    canceller.channel.uid
+                                    canceller.channel.aid.uid
                                )
                            )

@ -744,7 +772,7 @@ def test_peer_canceller(
                        # -> each context should have received
                        # a silently absorbed context cancellation
                        # in its remote nursery scope.
-                        # assert ctx.chan.uid == ctx.canceller
+                        # assert ctx.chan.aid.uid == ctx.canceller

                    # NOTE: when an inter-peer cancellation
                    # occurred, we DO NOT expect this
@ -796,12 +824,12 @@ async def basic_echo_server(

 ) -> None:
    '''
-    Just the simplest `MsgStream` echo server which resays what
-    you told it but with its uid in front ;)
+    Just the simplest `MsgStream` echo server which resays what you
+    told it but with its uid in front ;)

    '''
    actor: Actor = tractor.current_actor()
-    uid: tuple = actor.uid
+    uid: tuple = actor.aid.uid
    await ctx.started(uid)
    async with ctx.open_stream() as ipc:
        async for msg in ipc:
@ -840,7 +868,7 @@ async def serve_subactors(
    async with open_nursery() as an:

        # sanity
-        assert tractor._state.debug_mode() == debug_mode
+        assert tractor.debug_mode() == debug_mode

        await ctx.started(peer_name)
        async with ctx.open_stream() as ipc:
@ -856,7 +884,7 @@ async def serve_subactors(
                    f'|_{peer}\n'
                )
                await ipc.send((
-                    peer.chan.uid,
+                    peer.chan.aid.uid,
                    peer.chan.raddr.unwrap(),
                ))

@ -879,7 +907,7 @@ async def client_req_subactor(
 ) -> None:
    # sanity
    if debug_mode:
-        assert tractor._state.debug_mode()
+        assert tractor.debug_mode()

    # TODO: other cases to do with sub lifetimes:
    # -[ ] test that we can have the server spawn a sub
@ -966,9 +994,14 @@ async def tell_little_bro(

    caller: str = '',
    err_after: float|None = None,
-    rng_seed: int = 50,
+    rng_seed: int = 100,
+    # NOTE, ensure ^ is large enough (on fast hw anyway)
+    # to ensure the peer cancel req arrives before the
+    # echoing dialog does itself Bp
 ):
    # contact target actor, do a stream dialog.
+    lb: Portal
+    echo_ipc: MsgStream
    async with (
        tractor.wait_for_actor(
            name=actor_name
@ -983,17 +1016,17 @@ async def tell_little_bro(
                else None
            ),
        ) as (sub_ctx, first),
-
        sub_ctx.open_stream() as echo_ipc,
    ):
        actor: Actor = current_actor()
-        uid: tuple = actor.uid
+        uid: tuple = actor.aid.uid
        for i in range(rng_seed):
            msg: tuple = (
                uid,
                i,
            )
            await echo_ipc.send(msg)
+            await trio.sleep(0.001)
            resp = await echo_ipc.receive()
            print(
                f'{caller} => {actor_name}: {msg}\n'
@ -1006,6 +1039,9 @@ async def tell_little_bro(
            assert sub_uid != uid
            assert _i == i

+    # XXX, usually should never get here!
+    # await tractor.pause()
+

@pytest.mark.parametrize(
    'raise_client_error',
@ -1020,6 +1056,10 @@ def test_peer_spawns_and_cancels_service_subactor(
    raise_client_error: str,
    reg_addr: tuple[str, int],
    raise_sub_spawn_error_after: float|None,
+    loglevel: str,
+    test_log: tractor.log.StackLevelAdapter,
+    # ^XXX, set to 'warning' to see masked-exc warnings
+    # that may transpire during actor-nursery teardown.
 ):
    # NOTE: this tests for the modden `mod wks open piker` bug
    # discovered as part of implementing workspace ctx
@ -1049,6 +1089,7 @@ def test_peer_spawns_and_cancels_service_subactor(
            # NOTE: to halt the peer tasks on ctxc, uncomment this.
            debug_mode=debug_mode,
            registry_addrs=[reg_addr],
+            loglevel=loglevel,
        ) as an:
            server: Portal = await an.start_actor(
                (server_name := 'spawn_server'),
@ -1084,7 +1125,7 @@ def test_peer_spawns_and_cancels_service_subactor(
                    ) as (client_ctx, client_says),
                ):
                    root: Actor = current_actor()
-                    spawner_uid: tuple = spawn_ctx.chan.uid
+                    spawner_uid: tuple = spawn_ctx.chan.aid.uid
                    print(
                        f'Server says: {first}\n'
                        f'Client says: {client_says}\n'
@ -1103,7 +1144,7 @@ def test_peer_spawns_and_cancels_service_subactor(
                    print(
                        'Sub-spawn came online\n'
                        f'portal: {sub}\n'
-                        f'.uid: {sub.actor.uid}\n'
+                        f'.uid: {sub.actor.aid.uid}\n'
                        f'chan.raddr: {sub.chan.raddr}\n'
                    )

@ -1137,7 +1178,7 @@ def test_peer_spawns_and_cancels_service_subactor(

                        assert isinstance(res, ContextCancelled)
                        assert client_ctx.cancel_acked
-                        assert res.canceller == root.uid
+                        assert res.canceller == root.aid.uid
                        assert not raise_sub_spawn_error_after

                        # cancelling the spawner sub should
@ -1171,8 +1212,8 @@ def test_peer_spawns_and_cancels_service_subactor(
                        # little_bro: a `RuntimeError`.
                        #
                        check_inner_rte(rae)
-                        assert rae.relay_uid == client.chan.uid
-                        assert rae.src_uid == sub.chan.uid
+                        assert rae.relay_uid == client.chan.aid.uid
+                        assert rae.src_uid == sub.chan.aid.uid

                        assert not client_ctx.cancel_acked
                        assert (
@ -1201,12 +1242,12 @@ def test_peer_spawns_and_cancels_service_subactor(
            except ContextCancelled as ctxc:
                _ctxc = ctxc
                print(
-                    f'{root.uid} caught ctxc from ctx with {client_ctx.chan.uid}\n'
+                    f'{root.aid.uid} caught ctxc from ctx with {client_ctx.chan.aid.uid}\n'
                    f'{repr(ctxc)}\n'
                )

                if not raise_sub_spawn_error_after:
-                    assert ctxc.canceller == root.uid
+                    assert ctxc.canceller == root.aid.uid
                else:
                    assert ctxc.canceller == spawner_uid

@ -1237,9 +1278,20 @@ def test_peer_spawns_and_cancels_service_subactor(

                # assert spawn_ctx.cancelled_caught

+
    async def _main():
+        headroom: float = cpu_scaling_factor()
+        this_fast_on_linux: float = 3
+        this_fast = this_fast_on_linux * headroom
+        if headroom != 1.:
+            test_log.warning(
+                f'Adding latency headroom on linux bc CPU scaling,\n'
+                f'headroom: {headroom}\n'
+                f'this_fast_on_linux: {this_fast_on_linux} -> {this_fast}\n'
+            )
        with trio.fail_after(
-            3 if not debug_mode
+            this_fast
+            if not debug_mode
            else 999
        ):
            await main()
--- a/tests/test_legacy_one_way_streaming.py
+++ b/tests/test_legacy_one_way_streaming.py
@ -1,15 +1,22 @@
-"""
-Streaming via async gen api
-"""
+'''
+Streaming via the, now legacy, "async-gen API".
+
+'''
 import time
 from functools import partial
 import platform
+from typing import Callable

 import trio
 import tractor
 import pytest

 from tractor._testing import tractor_test
+from tractor._exceptions import ActorTooSlowError
+
+_non_linux: bool = (
+    _sys := platform.system()
+) != 'Linux'


 def test_must_define_ctx():
@ -19,7 +26,11 @@ def test_must_define_ctx():
        async def no_ctx():
            pass

-    assert "no_ctx must be `ctx: tractor.Context" in str(err.value)
+    assert (
+        "no_ctx must be `ctx: tractor.Context"
+        in
+        str(err.value)
+    )

    @tractor.stream
    async def has_ctx(ctx):
@ -62,21 +73,23 @@ async def stream_from_single_subactor(
    start_method,
    stream_func,
 ):
-    """Verify we can spawn a daemon actor and retrieve streamed data.
-    """
+    '''
+    Verify we can spawn a daemon actor and retrieve streamed data.
+
+    '''
    # only one per host address, spawns an actor if None

    async with tractor.open_nursery(
        registry_addrs=[reg_addr],
        start_method=start_method,
-    ) as nursery:
+    ) as an:

        async with tractor.find_actor('streamerd') as portals:

            if not portals:

                # no brokerd actor found
-                portal = await nursery.start_actor(
+                portal = await an.start_actor(
                    'streamerd',
                    enable_modules=[__name__],
                )
@ -116,11 +129,22 @@ async def stream_from_single_subactor(


@pytest.mark.parametrize(
-    'stream_func', [async_gen_stream, context_stream]
+    'stream_func',
+    [
+        async_gen_stream,
+        context_stream,
+    ],
+    ids='stream_func={}'.format
 )
-def test_stream_from_single_subactor(reg_addr, start_method, stream_func):
-    """Verify streaming from a spawned async generator.
-    """
+def test_stream_from_single_subactor(
+    reg_addr: tuple,
+    start_method: str,
+    stream_func: Callable,
+):
+    '''
+    Verify streaming from a spawned async generator.
+
+    '''
    trio.run(
        partial(
            stream_from_single_subactor,
@ -132,10 +156,9 @@ def test_stream_from_single_subactor(reg_addr, start_method, stream_func):


 # this is the first 2 actors, streamer_1 and streamer_2
-async def stream_data(seed):
+async def stream_data(seed: int):

    for i in range(seed):
-
        yield i

        # trigger scheduler to simulate practical usage
@ -143,15 +166,17 @@ async def stream_data(seed):


 # this is the third actor; the aggregator
-async def aggregate(seed):
-    """Ensure that the two streams we receive match but only stream
+async def aggregate(seed: int):
+    '''
+    Ensure that the two streams we receive match but only stream
    a single set of values to the parent.
-    """
-    async with tractor.open_nursery() as nursery:
+
+    '''
+    async with tractor.open_nursery() as an:
        portals = []
        for i in range(1, 3):
            # fork point
-            portal = await nursery.start_actor(
+            portal = await an.start_actor(
                name=f'streamer_{i}',
                enable_modules=[__name__],
            )
@ -164,20 +189,28 @@ async def aggregate(seed):
            async with send_chan:

                async with portal.open_stream_from(
-                    stream_data, seed=seed,
+                    stream_data,
+                    seed=seed,
                ) as stream:

                    async for value in stream:
                        # leverage trio's built-in backpressure
                        await send_chan.send(value)

-            print(f"FINISHED ITERATING {portal.channel.uid}")
+            print(
+                f'FINISHED ITERATING!\n'
+                f'peer: {portal.channel.aid.uid}'
+            )

        # spawn 2 trio tasks to collect streams and push to a local queue
-        async with trio.open_nursery() as n:
+        async with trio.open_nursery() as tn:

            for portal in portals:
-                n.start_soon(push_to_chan, portal, send_chan.clone())
+                tn.start_soon(
+                    push_to_chan,
+                    portal,
+                    send_chan.clone(),
+                )

            # close this local task's reference to send side
            await send_chan.aclose()
@ -194,20 +227,21 @@ async def aggregate(seed):

            print("FINISHED ITERATING in aggregator")

-        await nursery.cancel()
+        await an.cancel()
        print("WAITING on `ActorNursery` to finish")
    print("AGGREGATOR COMPLETE!")


-# this is the main actor and *arbiter*
-async def a_quadruple_example():
-    # a nursery which spawns "actors"
-    async with tractor.open_nursery() as nursery:
+async def a_quadruple_example() -> list[int]:
+    '''
+    Open the root-actor which is also a "registrar".

+    '''
+    async with tractor.open_nursery() as an:
        seed = int(1e3)
        pre_start = time.time()

-        portal = await nursery.start_actor(
+        portal = await an.start_actor(
            name='aggregator',
            enable_modules=[__name__],
        )
@ -215,23 +249,45 @@ async def a_quadruple_example():
        start = time.time()
        # the portal call returns exactly what you'd expect
        # as if the remote "aggregate" function was called locally
-        result_stream = []
+        result_stream: list[int] = []

-        async with portal.open_stream_from(aggregate, seed=seed) as stream:
+        async with portal.open_stream_from(
+            aggregate,
+            seed=seed,
+        ) as stream:
            async for value in stream:
                result_stream.append(value)

-        print(f"STREAM TIME = {time.time() - start}")
-        print(f"STREAM + SPAWN TIME = {time.time() - pre_start}")
+        print(
+            f"STREAM TIME = {time.time() - start}\n"
+            f"STREAM + SPAWN TIME = {time.time() - pre_start}\n"
+        )
        assert result_stream == list(range(seed))
        await portal.cancel_actor()
        return result_stream


-async def cancel_after(wait, reg_addr):
-    async with tractor.open_root_actor(registry_addrs=[reg_addr]):
-        with trio.move_on_after(wait):
-            return await a_quadruple_example()
+async def cancel_after(
+    wait: float,
+    reg_addr: tuple,
+    expect_cancel: bool,
+) -> list[int]:
+
+    async with tractor.open_root_actor(
+        registry_addrs=[reg_addr],
+    ):
+        res: list[int]|None = None
+        with trio.move_on_after(wait) as cs:
+            res: list[int] = await a_quadruple_example()
+            return res
+
+        if (
+            not expect_cancel
+            and
+            cs.cancelled_caught
+        ):
+            assert not res
+            raise ActorTooSlowError


@pytest.fixture(scope='module')
@ -239,7 +295,16 @@ def time_quad_ex(
    reg_addr: tuple,
    ci_env: bool,
    spawn_backend: str,
+    is_forking_spawner: bool,
+    tpt_proto: str,
 ):
+    if (
+        ci_env
+        and
+        _non_linux
+    ):
+        pytest.skip(f'Test is too flaky on {_sys!r} in CI')
+
    if spawn_backend == 'mp':
        '''
        no idea but the  mp *nix runs are flaking out here often...
@ -247,32 +312,79 @@ def time_quad_ex(
        '''
        pytest.skip("Test is too flaky on mp in CI")

-    timeout = 7 if platform.system() in ('Windows', 'Darwin') else 4
-    start = time.time()
-    results = trio.run(cancel_after, timeout, reg_addr)
-    diff = time.time() - start
-    assert results
+    timeout: float = (
+        7 if _non_linux
+        else 4
+    )
+
+    if (
+        is_forking_spawner
+        and
+        tpt_proto in [
+            'uds',
+        ]
+    ):
+        timeout += 1
+
+    start: float = time.time()
+    results: list[int] = trio.run(partial(
+        cancel_after,
+        wait=timeout,
+        reg_addr=reg_addr,
+        expect_cancel=True,
+    ))
+    diff: float = time.time() - start
+    if results is None:
+        raise ActorTooSlowError(
+            f'Streaming example took longer then timeout ??\n'
+            f'timeout={timeout!r}\n'
+            f'diff={diff!r}\n'
+            f'results={results!r}\n'
+        )
+
    return results, diff


 def test_a_quadruple_example(
-    time_quad_ex: tuple,
+    time_quad_ex: tuple[list[int], float],
    ci_env: bool,
    spawn_backend: str,
+    test_log: tractor.log.StackLevelAdapter,
 ):
    '''
-    This also serves as a kind of "we'd like to be this fast test".
+    This also serves as a "we'd like to be this fast" smoke test
+    given past empirical eval of this suite.

    '''
+
+    this_fast_on_linux: float = 3
+    this_fast = (
+        6 if _non_linux
+        else this_fast_on_linux
+    )
+    # ^ XXX NOTE,
+    # i've noticed that tweaking the CPU governor setting
+    # to not "always" enable "turbo" mode can result in latency
+    # which causes this limit to be too little. Not sure if it'd
+    # be worth it to adjust the linux value based on reading the
+    # CPU conf from the sys?
+    #
+    # For ex, see the `auto-cpufreq` docs on such settings,
+    # https://github.com/AdnanHodzic/auto-cpufreq?tab=readme-ov-file#example-config-file-contents
+    #
+    # HENCE this below latency-headroom compensation logic..
+    from .conftest import cpu_scaling_factor
+    headroom: float = cpu_scaling_factor()
+    if headroom != 1.:
+        this_fast = this_fast_on_linux * headroom
+        test_log.warning(
+            f'Adding latency headroom on linux bc CPU scaling,\n'
+            f'headroom: {headroom}\n'
+            f'this_fast_on_linux: {this_fast_on_linux} -> {this_fast}\n'
+        )
+
    results, diff = time_quad_ex
    assert results
-    this_fast = (
-        6 if platform.system() in (
-            'Windows',
-            'Darwin',
-        )
-        else 3
-    )
    assert diff < this_fast


@ -281,43 +393,77 @@ def test_a_quadruple_example(
    list(map(lambda i: i/10, range(3, 9)))
 )
 def test_not_fast_enough_quad(
-    reg_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend
+    reg_addr: tuple,
+    time_quad_ex: tuple[list[int], float],
+    cancel_delay: float,
+
+    ci_env: bool,
+    spawn_backend: str,
+    is_forking_spawner: bool,
+    tpt_proto: str,
+    test_log: tractor.log.StackLevelAdapter,
 ):
-    """Verify we can cancel midway through the quad example and all actors
-    cancel gracefully.
-    """
+    '''
+    Verify we can cancel midway through `a_quadruple_example()`, at
+    various delays, and all subactors cancel gracefully.
+
+    '''
    results, diff = time_quad_ex
    delay = max(diff - cancel_delay, 0)
-    results = trio.run(cancel_after, delay, reg_addr)
-    system = platform.system()
-    if system in ('Windows', 'Darwin') and results is not None:
+    results: list[int] = trio.run(partial(
+        cancel_after,
+        wait=delay,
+        reg_addr=reg_addr,
+        expect_cancel=True,
+    ))
+    system: str = platform.system()
+    if (
+        system in ('Windows', 'Darwin')
+        and
+        results is not None
+    ):
        # In CI envoirments it seems later runs are quicker then the first
        # so just ignore these
-        print(f"Woa there {system} caught your breath eh?")
+        print(f'Woa there {system} caught your breath eh?')
    else:
+        if (
+            results
+            and
+            is_forking_spawner
+            and
+            tpt_proto in [
+                'uds',
+            ]
+        ):
+            pytest.xfail(
+                f'Spawning backend + tpt-proto is too fast XD\n'
+                f'{spawn_backend!r} + {tpt_proto!r}\n'
+            )
+
        # should be cancelled mid-streaming
        assert results is None


-@tractor_test
+@tractor_test(timeout=20)
 async def test_respawn_consumer_task(
-    reg_addr,
-    spawn_backend,
-    loglevel,
+    reg_addr: tuple,
+    spawn_backend: str,
+    loglevel: str,
 ):
-    """Verify that ``._portal.ReceiveStream.shield()``
+    '''
+    Verify that ``._portal.ReceiveStream.shield()``
    sucessfully protects the underlying IPC channel from being closed
    when cancelling and respawning a consumer task.

    This also serves to verify that all values from the stream can be
    received despite the respawns.

-    """
+    '''
    stream = None

-    async with tractor.open_nursery() as n:
+    async with tractor.open_nursery() as an:

-        portal = await n.start_actor(
+        portal = await an.start_actor(
            name='streamer',
            enable_modules=[__name__]
        )
--- a/tests/test_local.py
+++ b/tests/test_local.py
@ -1,5 +1,5 @@
 """
-Arbiter and "local" actor api
+Registrar and "local" actor api
 """
 import time

@ -10,24 +10,28 @@ import tractor
 from tractor._testing import tractor_test


-@pytest.mark.trio
-async def test_no_runtime():
-    """An arbitter must be established before any nurseries
+def test_no_runtime():
+    '''
+    A registrar must be established before any nurseries
    can be created.

-    (In other words ``tractor.open_root_actor()`` must be engaged at
-    some point?)
-    """
-    with pytest.raises(RuntimeError) :
+    (In other words ``tractor.open_root_actor()`` must be
+    engaged at some point?)
+
+    '''
+    async def main():
        async with tractor.find_actor('doggy'):
            pass

+    with pytest.raises(tractor._exceptions.NoRuntime) :
+        trio.run(main)
+

@tractor_test
 async def test_self_is_registered(reg_addr):
-    "Verify waiting on the arbiter to register itself using the standard api."
+    "Verify waiting on the registrar to register itself using the standard api."
    actor = tractor.current_actor()
-    assert actor.is_arbiter
+    assert actor.is_registrar
    with trio.fail_after(0.2):
        async with tractor.wait_for_actor('root') as portal:
            assert portal.channel.uid[0] == 'root'
@ -35,11 +39,11 @@ async def test_self_is_registered(reg_addr):

@tractor_test
 async def test_self_is_registered_localportal(reg_addr):
-    "Verify waiting on the arbiter to register itself using a local portal."
+    "Verify waiting on the registrar to register itself using a local portal."
    actor = tractor.current_actor()
-    assert actor.is_arbiter
+    assert actor.is_registrar
    async with tractor.get_registry(reg_addr) as portal:
-        assert isinstance(portal, tractor._portal.LocalPortal)
+        assert isinstance(portal, tractor.runtime._portal.LocalPortal)

        with trio.fail_after(0.2):
            sockaddr = await portal.run_from_ns(
@ -57,8 +61,8 @@ def test_local_actor_async_func(reg_addr):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
-            # arbiter is started in-proc if dne
-            assert tractor.current_actor().is_arbiter
+            # registrar is started in-proc if dne
+            assert tractor.current_actor().is_registrar

            for i in range(10):
                nums.append(i)
--- a/tests/test_log_sys.py
+++ b/tests/test_log_sys.py
@ -0,0 +1,260 @@
+'''
+`tractor.log`-wrapping unit tests.
+
+'''
+from pathlib import Path
+import shutil
+from types import ModuleType
+
+import pytest
+import tractor
+from tractor import (
+    _code_load,
+    log,
+)
+
+
+def test_root_pkg_not_duplicated_in_logger_name():
+    '''
+    When both `pkg_name` and `name` are passed and they have
+    a common `<root_name>.< >` prefix, ensure that it is not
+    duplicated in the child's `StackLevelAdapter.name: str`.
+
+    Also pins the explicit-`name` contract: an explicitly passed
+    dotted `name` is treated as a *literal* sub-logger path and is
+    NOT leaf-collapsed. The leaf-module is only dropped when the
+    trailing token duplicates the *caller's own* `__name__` leaf (the
+    `{filename}` field) — see `test_implicit_mod_name_applied_for_child`
+    for that (auto-naming) path. This is what keeps a real (possibly
+    nested) sub-PACKAGE like `subpkg.mod` -> `devx.debug` addressable
+    by the `tractor.log` logging-spec, instead of collapsing to its
+    parent.
+
+    '''
+    project_name: str = 'pylib'
+    pkg_path: str = 'pylib.subpkg.mod'
+
+    assert not tractor.current_actor(
+        err_on_no_runtime=False,
+    )
+    proj_log = log.get_logger(
+        pkg_name=project_name,
+        mk_sublog=False,
+    )
+
+    sublog = log.get_logger(
+        pkg_name=project_name,
+        name=pkg_path,
+    )
+
+    assert proj_log is not sublog
+    # the root pkg-name appears exactly once (no `pylib.pylib...`)
+    assert sublog.name.count(proj_log.name) == 1
+    # explicit dotted `name` is preserved literally (NOT collapsed);
+    # the trailing token survives since it's not the *caller's* own
+    # leaf-module (`test_log_sys`), so this is treated as a literal
+    # sub-pkg path.
+    assert sublog.name == f'{project_name}.subpkg.mod'
+
+
+def test_implicit_mod_name_applied_for_child(
+    testdir: pytest.Pytester,
+    loglevel: str,
+):
+    '''
+    Verify that when `.log.get_logger(pkg_name='pylib')` is called
+    from a given sub-mod from within the `pylib` pkg-path, we
+    implicitly set the equiv of `name=__name__` from the caller's
+    module.
+
+    '''
+    # tractor.log.get_console_log(level=loglevel)
+    proj_name: str = 'snakelib'
+    mod_code: str = (
+        f'import tractor\n'
+        f'\n'
+        # if you need to trace `testdir` stuff @ import-time..
+        # f'breakpoint()\n'
+        f'log = tractor.log.get_logger(pkg_name="{proj_name}")\n'
+    )
+
+    # create a sub-module for each pkg layer
+    _lib = testdir.mkpydir(proj_name)
+    pkg: Path = Path(_lib)
+    pkg_init_mod: Path = pkg / "__init__.py"
+    pkg_init_mod.write_text(mod_code)
+
+    subpkg: Path = pkg / 'subpkg'
+    subpkg.mkdir()
+    subpkgmod: Path = subpkg / "__init__.py"
+    subpkgmod.touch()
+    subpkgmod.write_text(mod_code)
+
+    _submod: Path = testdir.makepyfile(
+        _mod=mod_code,
+    )
+
+    pkg_submod = pkg / 'mod.py'
+    pkg_subpkg_submod = subpkg / 'submod.py'
+    shutil.copyfile(
+        _submod,
+        pkg_submod,
+    )
+    shutil.copyfile(
+        _submod,
+        pkg_subpkg_submod,
+    )
+    testdir.chdir()
+    # NOTE, to introspect the py-file-module-layout use (in .xsh
+    # syntax): `ranger @str(testdir)`
+
+    # XXX NOTE, once the "top level" pkg mod has been
+    # imported, we can then use `import` syntax to
+    # import it's sub-pkgs and modules.
+    subpkgmod: ModuleType = _code_load.load_module_from_path(
+        Path(pkg / '__init__.py'),
+        module_name=proj_name,
+    )
+
+    pkg_root_log = log.get_logger(
+        pkg_name=proj_name,
+        mk_sublog=False,
+    )
+    # the top level pkg-mod, created just now,
+    # by above API call.
+    assert pkg_root_log.name == proj_name
+    assert not pkg_root_log.logger.getChildren()
+    #
+    # ^TODO! test this same output but created via a `get_logger()`
+    # call in the `snakelib.__init__py`!!
+
+    # NOTE, the pkg-level "init mod" should of course
+    # have the same name as the package ns-path.
+    import snakelib as init_mod
+    assert init_mod.log.name == proj_name
+
+    # NOTE, a first-pkg-level sub-module should only
+    # use the package-name since the leaf-node-module
+    # will be included in log headers by default.
+    from snakelib import mod
+    assert mod.log.name == proj_name
+
+    from snakelib import subpkg
+    assert (
+        subpkg.log.name
+        ==
+        subpkg.__package__ 
+        ==
+        f'{proj_name}.subpkg'
+    )
+
+    from snakelib.subpkg import submod
+    assert (
+        submod.log.name
+        ==
+        submod.__package__ 
+        ==
+        f'{proj_name}.subpkg'
+    )
+
+    sub_logs = pkg_root_log.logger.getChildren()
+    assert len(sub_logs) == 1  # only one nested sub-pkg module
+    assert submod.log.logger in sub_logs
+
+
+def test_io_custom_level_registered():
+    '''
+    The `IO`(21) level (registered via `add_log_level()` at
+    import, for `tractor.trionics._subproc`'s std-stream relay)
+    is fully wired and SHOWN BY DEFAULT at `info`-level consoles
+    since `21 >= INFO(20)`.
+
+    '''
+    import logging
+    assert log.CUSTOM_LEVELS.get('IO') == 21
+    assert logging.getLevelName(21) == 'IO'
+    assert log.STD_PALETTE.get('IO')
+    assert log.BOLD_PALETTE['bold'].get('IO')
+
+    iolog = log.get_logger('io_lvl_test')
+    assert callable(getattr(iolog, 'io', None))
+    # emit must not raise
+    iolog.io('hello from the IO level')
+
+    # 21 >= INFO(20) -> shown when console set to `info`
+    assert 21 >= logging.INFO
+
+
+def test_add_log_level_pluggable():
+    '''
+    `add_log_level()` is the single pluggable entry-point: one
+    call wires `CUSTOM_LEVELS` + `addLevelName` + both palettes +
+    a same-named `StackLevelAdapter` emit method (so
+    `get_logger()`'s per-level audit passes).
+
+    '''
+    import logging
+    name: str = 'XLVL'
+    val: int = 19
+    try:
+        log.add_log_level(name, val, 'cyan')
+
+        assert log.CUSTOM_LEVELS[name] == val
+        assert logging.getLevelName(val) == name
+        assert log.STD_PALETTE[name] == 'cyan'
+        assert log.BOLD_PALETTE['bold'][name] == 'bold_cyan'
+
+        # the audit in `get_logger()` (asserts a method per
+        # `CUSTOM_LEVELS` entry) must still pass.
+        xlog = log.get_logger('xlvl_test')
+        emit = getattr(xlog, name.lower(), None)
+        assert callable(emit)
+        emit('hello from a plugged-in level')
+
+    finally:
+        # best-effort cleanup of our module-global mutations so
+        # later `get_logger()` audits don't see a half-removed
+        # level.
+        log.CUSTOM_LEVELS.pop(name, None)
+        log.STD_PALETTE.pop(name, None)
+        log.BOLD_PALETTE['bold'].pop(name, None)
+        if hasattr(log.StackLevelAdapter, name.lower()):
+            delattr(log.StackLevelAdapter, name.lower())
+
+
+# TODO, moar tests against existing feats:
+# ------ - ------
+# - [ ] color settings?
+# - [ ] header contents like,
+#   - actor + thread + task names from various conc-primitives,
+# - [ ] `StackLevelAdapter` extensions,
+#   - our custom levels/methods: `transport|runtime|cance|pdb|devx`
+# - [ ] custom-headers support?
+#
+
+# TODO, test driven dev of new-ideas/long-wanted feats,
+# ------ - ------
+# - [ ] https://github.com/goodboy/tractor/issues/244
+#  - [ ] @catern mentioned using a sync / deterministic sys
+#       and in particular `svlogd`?
+#       |_ https://smarden.org/runit/svlogd.8
+
+# - [ ] using adapter vs. filters?
+#    - https://stackoverflow.com/questions/60691759/add-information-to-every-log-message-in-python-logging/61830838#61830838
+
+# - [ ] `.at_least_level()` optimization which short circuits wtv
+#      `logging` is doing behind the scenes when the level filters
+#      the emission..?
+
+# - [ ] use of `.log.get_console_log()` in subactors and the
+#    subtleties of ensuring it actually emits from a subproc.
+
+# - [ ] this idea of activating per-subsys emissions with some
+#    kind of `.name` filter passed to the runtime or maybe configured
+#    via the root `StackLevelAdapter`?
+
+# - [ ] use of `logging.dict.dictConfig()` to simplify the impl
+#      of any of ^^ ??
+#    - https://stackoverflow.com/questions/7507825/where-is-a-complete-example-of-logging-config-dictconfig
+#    - https://docs.python.org/3/library/logging.config.html#configuration-dictionary-schema
+#    - https://docs.python.org/3/library/logging.config.html#logging.config.dictConfig
--- a/tests/test_multi_program.py
+++ b/tests/test_multi_program.py
@ -1,68 +0,0 @@
-"""
-Multiple python programs invoking the runtime.
-"""
-import platform
-import time
-
-import pytest
-import trio
-import tractor
-from tractor._testing import (
-    tractor_test,
-)
-from .conftest import (
-    sig_prog,
-    _INT_SIGNAL,
-    _INT_RETURN_CODE,
-)
-
-
-def test_abort_on_sigint(daemon):
-    assert daemon.returncode is None
-    time.sleep(0.1)
-    sig_prog(daemon, _INT_SIGNAL)
-    assert daemon.returncode == _INT_RETURN_CODE
-
-    # XXX: oddly, couldn't get capfd.readouterr() to work here?
-    if platform.system() != 'Windows':
-        # don't check stderr on windows as its empty when sending CTRL_C_EVENT
-        assert "KeyboardInterrupt" in str(daemon.stderr.read())
-
-
-@tractor_test
-async def test_cancel_remote_arbiter(daemon, reg_addr):
-    assert not tractor.current_actor().is_arbiter
-    async with tractor.get_registry(reg_addr) as portal:
-        await portal.cancel_actor()
-
-    time.sleep(0.1)
-    # the arbiter channel server is cancelled but not its main task
-    assert daemon.returncode is None
-
-    # no arbiter socket should exist
-    with pytest.raises(OSError):
-        async with tractor.get_registry(reg_addr) as portal:
-            pass
-
-
-def test_register_duplicate_name(daemon, reg_addr):
-
-    async def main():
-
-        async with tractor.open_nursery(
-            registry_addrs=[reg_addr],
-        ) as n:
-
-            assert not tractor.current_actor().is_arbiter
-
-            p1 = await n.start_actor('doggy')
-            p2 = await n.start_actor('doggy')
-
-            async with tractor.wait_for_actor('doggy') as portal:
-                assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)
-
-            await n.cancel()
-
-    # run it manually since we want to start **after**
-    # the other "daemon" program
-    trio.run(main)
--- a/tests/test_pubsub.py
+++ b/tests/test_pubsub.py
@ -7,6 +7,14 @@ import tractor
 from tractor.experimental import msgpub
 from tractor._testing import tractor_test

+pytestmark = pytest.mark.skipon_spawn_backend(
+    'subint',
+    reason=(
+        'XXX SUBINT HANGING TEST XXX\n'
+        'See oustanding issue(s)\n'
+        # TODO, put issue link!
+    )
+)

 def test_type_checks():

--- a/tests/test_reg_err_types.py
+++ b/tests/test_reg_err_types.py
@ -0,0 +1,333 @@
+'''
+Verify that externally registered remote actor error
+types are correctly relayed, boxed, and re-raised across
+IPC actor hops via `reg_err_types()`.
+
+Also ensure that when custom error types are NOT registered
+the framework indicates the lookup failure to the user.
+
+'''
+import pytest
+import trio
+import tractor
+from tractor import (
+    Context,
+    Portal,
+    RemoteActorError,
+)
+from tractor._exceptions import (
+    get_err_type,
+    reg_err_types,
+)
+
+
+# -- custom app-level errors for testing --
+class CustomAppError(Exception):
+    '''
+    A hypothetical user-app error that should be
+    boxed+relayed by `tractor` IPC when registered.
+
+    '''
+
+
+class AnotherAppError(Exception):
+    '''
+    A second custom error for multi-type registration.
+
+    '''
+
+
+class UnregisteredAppError(Exception):
+    '''
+    A custom error that is intentionally NEVER
+    registered via `reg_err_types()` so we can
+    verify the framework's failure indication.
+
+    '''
+
+
+# -- remote-task endpoints --
+@tractor.context
+async def raise_custom_err(
+    ctx: Context,
+) -> None:
+    '''
+    Remote ep that raises a `CustomAppError`
+    after sync-ing with the caller.
+
+    '''
+    await ctx.started()
+    raise CustomAppError(
+        'the app exploded remotely'
+    )
+
+
+@tractor.context
+async def raise_another_err(
+    ctx: Context,
+) -> None:
+    '''
+    Remote ep that raises `AnotherAppError`.
+
+    '''
+    await ctx.started()
+    raise AnotherAppError(
+        'another app-level kaboom'
+    )
+
+
+@tractor.context
+async def raise_unreg_err(
+    ctx: Context,
+) -> None:
+    '''
+    Remote ep that raises an `UnregisteredAppError`
+    which has NOT been `reg_err_types()`-registered.
+
+    '''
+    await ctx.started()
+    raise UnregisteredAppError(
+        'this error type is unknown to tractor'
+    )
+
+
+# -- unit tests for the type-registry plumbing --
+
+class TestRegErrTypesPlumbing:
+    '''
+    Low-level checks on `reg_err_types()` and
+    `get_err_type()` without requiring IPC.
+
+    '''
+
+    def test_unregistered_type_returns_none(self):
+        '''
+        An unregistered custom error name should yield
+        `None` from `get_err_type()`.
+
+        '''
+        result = get_err_type('CustomAppError')
+        assert result is None
+
+    def test_register_and_lookup(self):
+        '''
+        After `reg_err_types()`, the custom type should
+        be discoverable via `get_err_type()`.
+
+        '''
+        reg_err_types([CustomAppError])
+        result = get_err_type('CustomAppError')
+        assert result is CustomAppError
+
+    def test_register_multiple_types(self):
+        '''
+        Registering a list of types should make each
+        one individually resolvable.
+
+        '''
+        reg_err_types([
+            CustomAppError,
+            AnotherAppError,
+        ])
+        assert (
+            get_err_type('CustomAppError')
+            is CustomAppError
+        )
+        assert (
+            get_err_type('AnotherAppError')
+            is AnotherAppError
+        )
+
+    def test_builtin_types_always_resolve(self):
+        '''
+        Builtin error types like `RuntimeError` and
+        `ValueError` should always be found without
+        any prior registration.
+
+        '''
+        assert (
+            get_err_type('RuntimeError')
+            is RuntimeError
+        )
+        assert (
+            get_err_type('ValueError')
+            is ValueError
+        )
+
+    def test_tractor_native_types_resolve(self):
+        '''
+        `tractor`-internal exc types (e.g.
+        `ContextCancelled`) should always resolve.
+
+        '''
+        assert (
+            get_err_type('ContextCancelled')
+            is tractor.ContextCancelled
+        )
+
+    def test_boxed_type_str_without_ipc_msg(self):
+        '''
+        When a `RemoteActorError` is constructed
+        without an IPC msg (and no resolvable type),
+        `.boxed_type_str` should return `'<unknown>'`.
+
+        '''
+        rae = RemoteActorError('test')
+        assert rae.boxed_type_str == '<unknown>'
+
+
+# -- IPC-level integration tests --
+
+def test_registered_custom_err_relayed(
+    debug_mode: bool,
+    tpt_proto: str,
+):
+    '''
+    When a custom error type is registered via
+    `reg_err_types()` on BOTH sides of an IPC dialog,
+    the parent should receive a `RemoteActorError`
+    whose `.boxed_type` matches the original custom
+    error type.
+
+    '''
+    reg_err_types([CustomAppError])
+
+    async def main():
+        async with tractor.open_nursery(
+            debug_mode=debug_mode,
+            enable_transports=[tpt_proto],
+        ) as an:
+            ptl: Portal = await an.start_actor(
+                'custom-err-raiser',
+                enable_modules=[__name__],
+            )
+            async with ptl.open_context(
+                raise_custom_err,
+            ) as (ctx, sent):
+                assert not sent
+                try:
+                    await ctx.wait_for_result()
+                except RemoteActorError as rae:
+                    assert rae.boxed_type is CustomAppError
+                    assert rae.src_type is CustomAppError
+                    assert 'the app exploded remotely' in str(
+                        rae.tb_str
+                    )
+                    raise
+
+    with pytest.raises(RemoteActorError) as excinfo:
+        trio.run(main)
+
+    rae = excinfo.value
+    assert rae.boxed_type is CustomAppError
+
+
+def test_registered_another_err_relayed(
+    debug_mode: bool,
+    tpt_proto: str,
+):
+    '''
+    Same as above but for a different custom error
+    type to verify multi-type registration works
+    end-to-end over IPC.
+
+    '''
+    reg_err_types([AnotherAppError])
+
+    async def main():
+        async with tractor.open_nursery(
+            debug_mode=debug_mode,
+            enable_transports=[tpt_proto],
+        ) as an:
+            ptl: Portal = await an.start_actor(
+                'another-err-raiser',
+                enable_modules=[__name__],
+            )
+            async with ptl.open_context(
+                raise_another_err,
+            ) as (ctx, sent):
+                assert not sent
+                try:
+                    await ctx.wait_for_result()
+                except RemoteActorError as rae:
+                    assert (
+                        rae.boxed_type
+                        is AnotherAppError
+                    )
+                    raise
+
+            await an.cancel()
+
+    with pytest.raises(RemoteActorError) as excinfo:
+        trio.run(main)
+
+    rae = excinfo.value
+    assert rae.boxed_type is AnotherAppError
+
+
+def test_unregistered_err_still_relayed(
+    debug_mode: bool,
+    tpt_proto: str,
+):
+    '''
+    Verify that even when a custom error type is NOT registered via
+    `reg_err_types()`, the remote error is still relayed as
+    a `RemoteActorError` with all string-level info preserved
+    (traceback, type name, source actor uid).
+
+    The `.boxed_type` will be `None` (type obj can't be resolved) but
+    `.boxed_type_str` and `.src_type_str` still report the original
+    type name from the IPC msg.
+
+    This documents the expected limitation: without `reg_err_types()`
+    the `.boxed_type` property can NOT resolve to the original Python
+    type.
+
+    '''
+    # NOTE: intentionally do NOT call
+    # `reg_err_types([UnregisteredAppError])`
+
+    async def main():
+        async with tractor.open_nursery(
+            debug_mode=debug_mode,
+            enable_transports=[tpt_proto],
+        ) as an:
+            ptl: Portal = await an.start_actor(
+                'unreg-err-raiser',
+                enable_modules=[__name__],
+            )
+            async with ptl.open_context(
+                raise_unreg_err,
+            ) as (ctx, sent):
+                assert not sent
+                await ctx.wait_for_result()
+
+            await an.cancel()
+
+    with pytest.raises(RemoteActorError) as excinfo:
+        trio.run(main)
+
+    rae = excinfo.value
+
+    # the error IS relayed even without
+    # registration; type obj is unresolvable but
+    # all string-level info is preserved.
+    assert rae.boxed_type is None # NOT `UnregisteredAppError`
+    assert rae.src_type is None
+
+    # string names survive the IPC round-trip
+    # via the `Error` msg fields.
+    assert (
+        rae.src_type_str
+        ==
+        'UnregisteredAppError'
+    )
+    assert (
+        rae.boxed_type_str
+        ==
+        'UnregisteredAppError'
+    )
+
+    # original traceback content is preserved
+    assert 'this error type is unknown' in rae.tb_str
+    assert 'UnregisteredAppError' in rae.tb_str
--- a/tests/test_resource_cache.py
+++ b/tests/test_resource_cache.py
@ -12,14 +12,14 @@ import trio
 import tractor
 from tractor.trionics import (
    maybe_open_context,
+    collapse_eg,
 )
 from tractor.log import (
    get_console_log,
    get_logger,
 )
-log = get_logger(__name__)
-

+log = get_logger()

 _resource: int = 0

@ -213,9 +213,12 @@ def test_open_local_sub_to_stream(
    N local tasks using `trionics.maybe_open_context()`.

    '''
-    timeout: float = 3.6
-    if platform.system() == "Windows":
-        timeout: float = 10
+    from .conftest import cpu_scaling_factor
+    timeout: float = (
+        4
+        if not platform.system() == "Windows"
+        else 10
+    ) * cpu_scaling_factor()

    if debug_mode:
        timeout = 999
@ -319,7 +322,7 @@ def test_open_local_sub_to_stream(


@acm
-async def cancel_outer_cs(
+async def maybe_cancel_outer_cs(
    cs: trio.CancelScope|None = None,
    delay: float = 0,
 ):
@ -333,12 +336,31 @@ async def cancel_outer_cs(
    if cs:
        log.info('task calling cs.cancel()')
        cs.cancel()
-    trio.lowlevel.checkpoint()
+
    yield
-    await trio.sleep_forever()
+
+    if cs:
+        await trio.sleep_forever()
+
+    # XXX, if not cancelled we'll leak this inf-blocking
+    # subtask to the actor's service tn..
+    else:
+        await trio.lowlevel.checkpoint()


+@pytest.mark.parametrize(
+    'delay',
+    [0.05, 0.5, 1],
+    ids="pre_sleep_delay={}".format,
+)
+@pytest.mark.parametrize(
+    'cancel_by_cs',
+    [True, False],
+    ids="cancel_by_cs={}".format,
+)
 def test_lock_not_corrupted_on_fast_cancel(
+    delay: float,
+    cancel_by_cs: bool,
    debug_mode: bool,
    loglevel: str,
 ):
@ -355,17 +377,14 @@ def test_lock_not_corrupted_on_fast_cancel(
      due to it having erronously exited without calling
      `lock.release()`.

-
    '''
-    delay: float = 1.
-
    async def use_moc(
-        cs: trio.CancelScope|None,
        delay: float,
+        cs: trio.CancelScope|None = None,
    ):
        log.info('task entering moc')
        async with maybe_open_context(
-            cancel_outer_cs,
+            maybe_cancel_outer_cs,
            kwargs={
                'cs': cs,
                'delay': delay,
@ -376,7 +395,13 @@ def test_lock_not_corrupted_on_fast_cancel(
            else:
                log.info('1st task entered')

-            await trio.sleep_forever()
+            if cs:
+                await trio.sleep_forever()
+
+            else:
+                await trio.sleep(delay)
+
+        # ^END, exit shared ctx.

    async def main():
        with trio.fail_after(delay + 2):
@ -385,6 +410,7 @@ def test_lock_not_corrupted_on_fast_cancel(
                    debug_mode=debug_mode,
                    loglevel=loglevel,
                ),
+                # ?TODO, pass this as the parent tn?
                trio.open_nursery() as tn,
            ):
                get_console_log('info')
@ -392,15 +418,206 @@ def test_lock_not_corrupted_on_fast_cancel(
                cs = tn.cancel_scope
                tn.start_soon(
                    use_moc,
-                    cs,
                    delay,
+                    cs if cancel_by_cs else None,
                    name='child',
                )
                with trio.CancelScope() as rent_cs:
                    await use_moc(
-                        cs=rent_cs,
                        delay=delay,
+                        cs=rent_cs if cancel_by_cs else None,
                    )

+    trio.run(main)
+
+
+@acm
+async def acm_with_resource(resource_id: str):
+    '''
+    Yield `resource_id` as the cached value.
+
+    Used to verify per-`ctx_key` isolation when the same
+    `acm_func` is called with different kwargs.
+
+    '''
+    yield resource_id
+
+
+def test_per_ctx_key_resource_lifecycle(
+    debug_mode: bool,
+    loglevel: str,
+):
+    '''
+    Verify that `maybe_open_context()` correctly isolates resource
+    lifecycle **per `ctx_key`** when the same `acm_func` is called
+    with different kwargs.
+
+    Previously `_Cache.users` was a single global `int` and
+    `_Cache.locks` was keyed on `fid` (function ID), so calling
+    the same `acm_func` with different kwargs (producing different
+    `ctx_key`s) meant:
+
+    - teardown for one key was skipped bc the *other* key's users
+      kept the global count > 0,
+    - and re-entry could hit the old
+      `assert not resources.get(ctx_key)` crash during the
+      teardown window.
+
+    This was the root cause of a long-standing bug in piker's
+    `brokerd.kraken` backend.
+
+    '''
+    timeout: float = 6
+    if debug_mode:
+        timeout = 999
+
+    async def main():
+        a_ready = trio.Event()
+        a_exit = trio.Event()
+
+        async def hold_resource_a():
+            '''
+            Open resource 'a' and keep it alive until signalled.
+
+            '''
+            async with maybe_open_context(
+                acm_with_resource,
+                kwargs={'resource_id': 'a'},
+            ) as (cache_hit, value):
+                assert not cache_hit
+                assert value == 'a'
+                log.info("resource 'a' entered (holding)")
+                a_ready.set()
+                await a_exit.wait()
+                log.info("resource 'a' exiting")
+
+        with trio.fail_after(timeout):
+            async with (
+                tractor.open_root_actor(
+                    debug_mode=debug_mode,
+                    loglevel=loglevel,
+                ),
+                trio.open_nursery() as tn,
+            ):
+                # Phase 1: bg task holds resource 'a' open.
+                tn.start_soon(hold_resource_a)
+                await a_ready.wait()
+
+                # Phase 2: open resource 'b' (different kwargs,
+                # same acm_func) then exit it while 'a' is still
+                # alive.
+                async with maybe_open_context(
+                    acm_with_resource,
+                    kwargs={'resource_id': 'b'},
+                ) as (cache_hit, value):
+                    assert not cache_hit
+                    assert value == 'b'
+                    log.info("resource 'b' entered")
+
+                log.info("resource 'b' exited, waiting for teardown")
+                await trio.lowlevel.checkpoint()
+
+                # Phase 3: re-open 'b'; must be a fresh cache MISS
+                # proving 'b' was torn down independently of 'a'.
+                #
+                # With the old global `_Cache.users` counter this
+                # would be a stale cache HIT (leaked resource) or
+                # trigger `assert not resources.get(ctx_key)`.
+                async with maybe_open_context(
+                    acm_with_resource,
+                    kwargs={'resource_id': 'b'},
+                ) as (cache_hit, value):
+                    assert not cache_hit, (
+                        "resource 'b' was NOT torn down despite "
+                        "having zero users! (global user count bug)"
+                    )
+                    assert value == 'b'
+                    log.info(
+                        "resource 'b' re-entered "
+                        "(cache miss, correct)"
+                    )
+
+                # Phase 4: let 'a' exit, clean shutdown.
+                a_exit.set()
+
+    trio.run(main)
+
+
+def test_moc_reentry_during_teardown(
+    debug_mode: bool,
+    loglevel: str,
+):
+    '''
+    Reproduce the piker `open_cached_client('kraken')` race:
+
+    - same `acm_func`, NO kwargs (identical `ctx_key`)
+    - multiple tasks share the cached resource
+    - all users exit -> teardown starts
+    - a NEW task enters during `_Cache.run_ctx.__aexit__`
+    - `values[ctx_key]` is gone (popped in inner finally)
+      but `resources[ctx_key]` still exists (outer finally
+      hasn't run yet bc the acm cleanup has checkpoints)
+    - old code: `assert not resources.get(ctx_key)` FIRES
+
+    This models the real-world scenario where `brokerd.kraken`
+    tasks concurrently call `open_cached_client('kraken')`
+    (same `acm_func`, empty kwargs, shared `ctx_key`) and
+    the teardown/re-entry race triggers intermittently.
+
+    '''
+    async def main():
+        in_aexit = trio.Event()
+
+        @acm
+        async def cached_client():
+            '''
+            Simulates `kraken.api.get_client()`:
+            - no params (all callers share one `ctx_key`)
+            - slow-ish cleanup to widen the race window
+              between `values.pop()` and `resources.pop()`
+              inside `_Cache.run_ctx`.
+
+            '''
+            yield 'the-client'
+            # Signal that we're in __aexit__ — at this
+            # point `values` has already been popped by
+            # `run_ctx`'s inner finally, but `resources`
+            # is still alive (outer finally hasn't run).
+            in_aexit.set()
+            await trio.sleep(10)
+
+        first_done = trio.Event()
+
+        async def use_and_exit():
+            async with maybe_open_context(
+                cached_client,
+            ) as (cache_hit, value):
+                assert value == 'the-client'
+            first_done.set()
+
+        async def reenter_during_teardown():
+            '''
+            Wait for the acm's `__aexit__` to start (meaning
+            `values` is popped but `resources` still exists),
+            then re-enter — triggering the assert.
+
+            '''
+            await in_aexit.wait()
+            async with maybe_open_context(
+                cached_client,
+            ) as (cache_hit, value):
+                assert value == 'the-client'
+
+        with trio.fail_after(5):
+            async with (
+                tractor.open_root_actor(
+                    debug_mode=debug_mode,
+                    loglevel=loglevel,
+                ),
+                collapse_eg(),
+                trio.open_nursery() as tn,
+            ):
+                tn.start_soon(use_and_exit)
+                tn.start_soon(reenter_during_teardown)

    trio.run(main)
--- a/tests/test_ringbuf.py
+++ b/tests/test_ringbuf.py
@ -4,6 +4,10 @@ import trio
 import pytest

 import tractor
+
+# XXX `cffi` dun build on py3.14 yet..
+cffi = pytest.importorskip("cffi")
+
 from tractor.ipc._ringbuf import (
    open_ringbuf,
    RBToken,
@ -14,7 +18,7 @@ from tractor._testing.samples import (
    generate_sample_messages,
 )

-# in case you don't want to melt your cores, uncomment dis!
+# XXX, in case you want to melt your cores, comment this skip line XD
 pytestmark = pytest.mark.skip


--- a/tests/test_root_infect_asyncio.py
+++ b/tests/test_root_infect_asyncio.py
@ -49,7 +49,7 @@ def test_infected_root_actor(
                ),
                to_asyncio.open_channel_from(
                    aio_echo_server,
-                ) as (first, chan),
+                ) as (chan, first),
            ):
                assert first == 'start'

@ -91,13 +91,12 @@ def test_infected_root_actor(
 async def sync_and_err(
    # just signature placeholders for compat with
    # ``to_asyncio.open_channel_from()``
-    to_trio: trio.MemorySendChannel,
-    from_trio: asyncio.Queue,
+    chan: tractor.to_asyncio.LinkedTaskChannel,
    ev: asyncio.Event,

 ):
-    if to_trio:
-        to_trio.send_nowait('start')
+    if chan:
+        chan.started_nowait('start')

    await ev.wait()
    raise RuntimeError('asyncio-side')
@ -174,7 +173,7 @@ def test_trio_prestarted_task_bubbles(
                            sync_and_err,
                            ev=aio_ev,
                        )
-                    ) as (first, chan),
+                    ) as (chan, first),
                ):

                    for i in range(5):
--- a/tests/test_root_runtime.py
+++ b/tests/test_root_runtime.py
@ -94,15 +94,15 @@ def test_runtime_vars_unset(
    after the root actor-runtime exits!

    '''
-    assert not tractor._state._runtime_vars['_debug_mode']
+    assert not tractor.runtime._state._runtime_vars['_debug_mode']
    async def main():
-        assert not tractor._state._runtime_vars['_debug_mode']
+        assert not tractor.runtime._state._runtime_vars['_debug_mode']
        async with tractor.open_nursery(
            debug_mode=True,
        ):
-            assert tractor._state._runtime_vars['_debug_mode']
+            assert tractor.runtime._state._runtime_vars['_debug_mode']

        # after runtime closure, should be reverted!
-        assert not tractor._state._runtime_vars['_debug_mode']
+        assert not tractor.runtime._state._runtime_vars['_debug_mode']

    trio.run(main)
--- a/tests/test_rpc.py
+++ b/tests/test_rpc.py
@ -110,7 +110,7 @@ def test_rpc_errors(
        ) as n:

            actor = tractor.current_actor()
-            assert actor.is_arbiter
+            assert actor.is_registrar
            await n.run_in_actor(
                sleep_back_actor,
                actor_name=subactor_requests_to,
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@ -22,6 +22,10 @@ def unlink_file():
 async def crash_and_clean_tmpdir(
    tmp_file_path: str,
    error: bool = True,
+    rent_cancel: bool = True,
+
+    # XXX unused, but do we really need to test these cases?
+    self_cancel: bool = False,
 ):
    global _file_path
    _file_path = tmp_file_path
@ -32,43 +36,75 @@ async def crash_and_clean_tmpdir(
    assert os.path.isfile(tmp_file_path)
    await trio.sleep(0.1)
    if error:
+        print('erroring in subactor!')
        assert 0
-    else:
+
+    elif self_cancel:
+        print('SELF-cancelling subactor!')
        actor.cancel_soon()

+    elif rent_cancel:
+        await trio.sleep_forever()
+
+    print('subactor exiting task!')
+

@pytest.mark.parametrize(
    'error_in_child',
    [True, False],
+    ids='error_in_child={}'.format,
 )
@tractor_test
 async def test_lifetime_stack_wipes_tmpfile(
    tmp_path,
    error_in_child: bool,
+    loglevel: str,
+    # log: tractor.log.StackLevelAdapter,
+    # ^TODO, once landed via macos support!
 ):
    child_tmp_file = tmp_path / "child.txt"
    child_tmp_file.touch()
    assert child_tmp_file.exists()
    path = str(child_tmp_file)

+    # NOTE, this is expected to cancel the sub
+    # in the `error_in_child=False` case!
+    timeout: float = (
+        1.6 if error_in_child
+        else 1
+    )
    try:
-        with trio.move_on_after(0.5):
-            async with tractor.open_nursery() as n:
-                    await (  # inlined portal
-                        await n.run_in_actor(
-                            crash_and_clean_tmpdir,
-                            tmp_file_path=path,
-                            error=error_in_child,
-                        )
-                    ).result()
-
+        with trio.move_on_after(timeout) as cs:
+            async with tractor.open_nursery(
+                loglevel=loglevel,
+            ) as an:
+                await (  # inlined `tractor.Portal`
+                    await an.run_in_actor(
+                        crash_and_clean_tmpdir,
+                        tmp_file_path=path,
+                        error=error_in_child,
+                    )
+                ).result()
    except (
        tractor.RemoteActorError,
-        # tractor.BaseExceptionGroup,
        BaseExceptionGroup,
-    ):
-        pass
+    ) as _exc:
+        exc = _exc
+        from tractor.log import get_console_log
+        log = get_console_log(
+            level=loglevel,
+            name=__name__,
+        )
+        log.exception(
+            f'Subactor failed as expected with {type(exc)!r}\n'
+        )

    # tmp file should have been wiped by
    # teardown stack.
    assert not child_tmp_file.exists()
+
+    if error_in_child:
+        assert not cs.cancel_called
+    else:
+        # expect timeout in some cases?
+        assert cs.cancel_called
--- a/tests/test_shm.py
+++ b/tests/test_shm.py
@ -2,6 +2,7 @@
 Shared mem primitives and APIs.

 """
+import platform
 import uuid

 # import numpy
@ -13,6 +14,20 @@ from tractor.ipc._shm import (
    attach_shm_list,
 )

+pytestmark = pytest.mark.skipon_spawn_backend(
+    'subint',
+    # NOTE, `main_thread_forkserver` works for these tests
+    # via the `mp.SharedMemory(track=False)` +
+    # `mp.resource_tracker` monkey-patch in `.ipc._mp_bs`.
+    # Without that workaround the fork-inherited
+    # `resource_tracker` fd would EBADF on first shm op +
+    # cascade into `FileExistsError` across parametrize
+    # variants. Tracker doc:
+    # `ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`.
+    reason=(
+        'subint: GIL-contention hanging class.\n'
+    )
+)

@tractor.context
 async def child_attach_shml_alot(
@ -53,7 +68,18 @@ def test_child_attaches_alot():
                    shm_key=shml.key,
                ) as (ctx, start_val),
            ):
-                assert start_val == key
+                assert (_key := shml.key) == start_val
+
+                if platform.system() != 'Darwin':
+                    # XXX, macOS has a char limit..
+                    # see `ipc._shm._shorten_key_for_macos`
+                    assert (
+                        start_val
+                        ==
+                        key
+                        ==
+                        _key
+                    )
                await ctx.result()

            await portal.cancel_actor()
--- a/Show More
+++ b/Show More