tractor/tractor/_testing/trace.py

# tractor: distributed structured concurrency.
# Copyright 2018-eternity Tyler Goodlet.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

'''
Pure-Python diagnostic state-capture for hung
`pytest`/`tractor` process trees.

This module is the load-bearing core for two consumers:

1. The `xontrib/tractor_diag.xsh::acli.*` xonsh aliases
   (`acli.ptree`, `acli.hung_dump`, `acli.bindspace_scan`,
   `acli.dump_all`) — interactive terminal diag tools.

2. In-test "capture-on-hang" helpers like
   `fail_after_w_trace()` / `afk_alarm_w_trace()` that drop a
   full diag snapshot to disk when a test exceeds its timeout
   budget instead of just emitting a context-less
   `trio.TooSlowError`.

All public dump-* functions RETURN formatted text rather than
printing, so callers can render to a terminal OR write to a
file. `dump_all()` does the file-writing for snapshot-archive
use cases.

Sudo policy:
  Per-pid kernel `stack` + `py-spy dump` need `CAP_SYS_PTRACE`,
  invoked via `sudo -n`. Two modes:

  - `allow_sudo_prompt=True` (terminal CLI default):
    `ensure_sudo_cached()` prompts the user once via `sudo -v`
    if creds aren't cached, then re-uses them per-pid.

  - `allow_sudo_prompt=False` (pytest / in-test default):
    silently skip sudo-required diagnostics; emit a banner
    pointing the human at `sudo -v && acli.hung_dump <pid>`
    for a follow-up manual capture.

'''
from __future__ import annotations

import json
import os
import re
import signal
import subprocess as sp
from contextlib import (
    AbstractAsyncContextManager,
    AbstractContextManager,
    asynccontextmanager,
    contextmanager,
)
from datetime import datetime
from io import StringIO
from pathlib import Path
from typing import (
    AsyncIterator,
    Callable,
    Iterator,
    TypeAlias,
)


# Public type aliases for the `fail_after_w_trace` /
# `afk_alarm_w_trace` fixture-returned CM-factory callables.
# Test signatures can annotate the fixture param directly::
#
#     def test_foo(
#         fail_after_w_trace: FailAfterWTraceFactory,
#     ):
#         async with fail_after_w_trace(5.0):
#             ...
#
# NOTE the fixture name intentionally shadows the underlying
# `fail_after_w_trace` function at test-fn scope; pytest's
# param-resolution overrides the module-level import, so the
# fixture-returned CM-factory wins inside the test body.
#
# `Callable[..., ...]` keeps the kwargs surface loose (caller
# can pass `label=`, `pid=`, `out_dir=`); precise checking of
# the first-arg `seconds` is left to runtime since most callers
# pass an `int|float` literal.
# Factory whose return value is an *async* context manager yielding
# `None`, i.e. used as `async with factory(...):`.
FailAfterWTraceFactory: TypeAlias = Callable[
    ...,
    AbstractAsyncContextManager[None],
]
# Factory whose return value is a *sync* context manager yielding
# `None`, i.e. used as `with factory(...):`.
AfkAlarmWTraceFactory: TypeAlias = Callable[
    ...,
    AbstractContextManager[None],
]

try:
    import psutil
except ImportError:
    psutil = None

try:
    import pytest as _pytest
except ImportError:
    # `trace.py`'s pure-Python core (proc-tree + bindspace +
    # dump_*) is intentionally pytest-free so the `xontrib`
    # CLI can `import` it from any venv. The fixtures at
    # the bottom of this module require `pytest` and are
    # only defined when it's importable.
    _pytest = None


# matches tractor's UDS sock naming: `<actor_name>@<pid>.sock`
#
# Named groups:
# - `name`: everything before the separator; the match is greedy so
#   a name that itself contains `@` is split on the LAST `@` that is
#   followed by digits + `.sock`.
# - `pid`: the run of digits between that `@` and the `.sock` suffix.
_UDS_SOCK_RE = re.compile(
    r'^(?P<name>.+)@(?P<pid>\d+)\.sock$'
)


# ---------------------------------------------------------------
# pid + proc-tree resolution
# ---------------------------------------------------------------

def resolve_pids(arg: str) -> list[int]:
    '''
    Resolve a numeric pid OR a `pgrep -f` pattern to a list of
    pids.

    - An all-decimal `arg` is returned as a single-element list
      without checking that such a process exists.
    - Anything else is handed to `pgrep -f` as a pattern matched
      against full command lines.

    Returns `[]` on no match, on any non-zero `pgrep` exit, or
    when `pgrep` is not installed.

    '''
    # `.isdecimal()` rather than `.isdigit()`: the latter is also
    # true for chars like superscript digits which `int()` then
    # rejects with a `ValueError`.
    if arg.isdecimal():
        return [int(arg)]
    try:
        out: str = sp.check_output(
            ['pgrep', '-f', arg],
            text=True,
        )
    except (
        sp.CalledProcessError,  # no match / bad pattern
        FileNotFoundError,  # `pgrep` not on PATH
    ):
        return []
    # `str.split()` never yields empty tokens.
    return [int(tok) for tok in out.split()]


def walk_tree_psutil(pid: int) -> list:
    '''
    Flat `[Process, *descendants]` via `psutil` (or `[]`).

    Returns `[]` when `psutil` is not importable, when `pid` does
    not exist, or when the process disappears while its
    descendants are being collected.

    '''
    if psutil is None:
        return []
    try:
        root = psutil.Process(pid)
        # kept inside the `try`: the proc can exit between the
        # `Process()` lookup and the descendant scan.
        return [root, *root.children(recursive=True)]
    except psutil.NoSuchProcess:
        return []


def _walk_tree_with_depth(pid: int) -> Iterator[tuple]:
    '''
    Yield `(proc, depth)` pairs walking `pid`'s subtree in
    pre-order (each proc is immediately followed by its own
    descendants), with the root at depth `0`.

    Pre-order matters to consumers which render the pairs as a
    depth-indented flat list: every row then sits directly below
    its real ancestor chain.

    Yields nothing when `psutil` is unavailable or `pid` does not
    exist. A proc that vanishes mid-walk is still yielded (it was
    already discovered) but contributes no children.

    '''
    if psutil is None:
        return
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return

    # pids already scheduled; guards against yielding a pid twice.
    seen: set = {pid}
    stack: list = [(root, 0)]
    while stack:
        proc, depth = stack.pop()
        yield proc, depth
        try:
            kids = proc.children()
        except psutil.NoSuchProcess:
            continue
        fresh: list = []
        for kid in kids:
            if kid.pid in seen:
                continue
            seen.add(kid.pid)
            fresh.append((kid, depth + 1))
        # push reversed so the FIRST child is popped (and thus
        # yielded together with its subtree) first.
        stack.extend(reversed(fresh))


def _walk_tree_pgrep(pid: int) -> list[int]:
    '''
    psutil-less fallback — recursive `pgrep -P`.

    Returns `[pid, *descendant_pids]` in pre-order. `pid` itself is
    always included, without checking that it exists. If `pgrep`
    reports no children (non-zero exit) or is not installed, the
    walk stops at that node.

    '''
    out: list[int] = [pid]
    try:
        kids: list = sp.check_output(
            ['pgrep', '-P', str(pid)],
            text=True,
        ).split()
    except (
        sp.CalledProcessError,  # no children / pgrep error
        FileNotFoundError,  # `pgrep` not on PATH
    ):
        return out
    for k in kids:
        out.extend(_walk_tree_pgrep(int(k)))
    return out


def _which_cgroup_slice(pid: int) -> str | None:
    '''
    Classify `pid` by the top-level systemd slice named in its
    `/proc/<pid>/cgroup` content.

    Returns `'system'` when the text contains `/system.slice/`,
    else `'user'` when it contains `/user.slice/`, else `None`.
    `None` is also returned when the file can't be read (proc
    gone, no permission, non-linux, ...).

    '''
    try:
        cgroup_txt: str = Path(f'/proc/{pid}/cgroup').read_text()
    except OSError:
        # covers FileNotFoundError / PermissionError /
        # ProcessLookupError, all `OSError` subclasses.
        return None

    # order matters: `system` wins if both markers are present.
    markers: tuple = (
        ('/system.slice/', 'system'),
        ('/user.slice/', 'user'),
    )
    for needle, kind in markers:
        if needle in cgroup_txt:
            return kind
    return None


def _find_tractor_strays(seen: set[int]) -> list[int]:
    '''
    Return the sorted pids of every numeric `/proc/<pid>` entry
    which is NOT in `seen` and which `_reap._is_tractor_subactor`
    accepts.

    `dump_proc_tree(include_strays=True)` uses this to pick up
    subactor procs that are not descendants of the roots it was
    given, e.g. leftovers from an earlier, killed test run. The
    caller decides what to do with the pids (typically: walk them
    as extra roots and classify as usual).

    The membership test is delegated to the reaper's own predicate
    so both tools agree on what counts as a tractor subactor.

    Returns `[]` when `/proc` is not a directory.

    '''
    # imported lazily to dodge an import cycle: `_reap.py` is a
    # pytest plugin that imports from this module's siblings.
    from ._reap import _is_tractor_subactor

    procfs = Path('/proc')
    if not procfs.is_dir():
        return []

    numeric_pids = (
        int(ent.name)
        for ent in procfs.iterdir()
        if ent.name.isdigit()
    )
    return sorted(
        cand
        for cand in numeric_pids
        if cand not in seen and _is_tractor_subactor(cand)
    )


def _ppid_from_proc(pid: int) -> int | None:
    '''
    Read `ppid` from `/proc/<pid>/stat`. Returns None on race
    (proc died) / permission / non-linux / unparseable content.

    NB: stat field [1] is `(comm)` which can contain spaces +
    parens — `rsplit(')', 1)` is the safe way to skip past it.
    `comm` may also hold bytes that are not valid UTF-8, so the
    file is read and parsed as raw bytes.

    '''
    try:
        with open(f'/proc/{pid}/stat', 'rb') as f:
            stat: bytes = f.read()
        after_comm: bytes = stat.rsplit(b')', 1)[1]
        return int(after_comm.split()[1])  # state(0) ppid(1)
    except (
        OSError,  # gone / no permission / no procfs
        IndexError,  # truncated or empty read
        ValueError,  # non-numeric ppid field
    ):
        return None


# ---------------------------------------------------------------
# sudo probe / prompt
# ---------------------------------------------------------------

def is_sudo_cached() -> bool:
    '''
    Non-interactive check for usable `sudo` creds: runs
    `sudo -n true` with output captured and reports whether it
    exited `0`.

    `-n` means `sudo` never prompts, so this is safe to call from
    non-interactive contexts. Returns `False` when `sudo` is not
    on PATH.

    '''
    try:
        probe = sp.run(
            ['sudo', '-n', 'true'],
            capture_output=True,
        )
    except FileNotFoundError:
        return False
    return probe.returncode == 0


def ensure_sudo_cached() -> bool:
    '''
    Make sure `sudo` creds are cached, PROMPTING via `sudo -v`
    when `is_sudo_cached()` says they are not.

    Terminal-CLI use only — DO NOT call from inside a pytest run.

    Returns `True` if creds were already cached or `sudo -v`
    exited `0`; `False` if the user hit ctrl-c, `sudo` is not on
    PATH, or `sudo -v` failed.

    '''
    if is_sudo_cached():
        return True

    print(
        '[tractor-trace] needs `sudo` for '
        '/proc/<pid>/stack and `py-spy dump`; caching creds '
        'via `sudo -v`...'
    )
    try:
        refresh = sp.run(['sudo', '-v'])
    except KeyboardInterrupt:
        print('  cancelled — proceeding without sudo')
        return False
    except FileNotFoundError:
        print('  sudo not on PATH — proceeding without sudo')
        return False
    return refresh.returncode == 0


# ---------------------------------------------------------------
# dump_proc_tree (== acli.ptree)
# ---------------------------------------------------------------

def dump_proc_tree(
    roots: list[int],
    *,
    flag_tree: bool = False,
    include_strays: bool = True,
) -> str:
    '''
    Severity-classified proc-tree rendering of `roots` and
    their descendants. Returns formatted text.

    Buckets (severity-ordered):
      - zombies:       `status in (Z, X)`
      - orphans:       `ppid==1`, NOT in a systemd cgroup slice
      - system-slice:  `ppid==1`, under `/system.slice/`
      - user-slice:    `ppid==1`, under `/user.slice/.../*.scope`
      - live:          real (`ppid > 1`) parent

    `flag_tree=True` additionally prepends a flat walk-order
    `## tree` section preserving parent-child shape.

    `include_strays=True` (default) additionally scans
    `/proc/*/cmdline` for `tractor._child` / `tractor[<aid>]`
    procs that are NOT descendants of any provided root — these
    are typically ghost subactor trees from PRIOR test runs
    (cross-test launchpad contamination). Their subtrees are
    walked and classified normally; the bucket counts then
    include them. See `_find_tractor_strays()`.

    Pids which disappear between being discovered and having
    their status/ppid probed land in no bucket; they are listed
    in a trailing `## gone-during-walk` section instead.

    Without `psutil` installed the returned text is just a
    one-line install hint.

    '''
    buf = StringIO()

    def echo(line: str = '') -> None:
        # append one line (newline-terminated) to the output buffer
        buf.write(line + '\n')

    if psutil is None:
        echo(
            'ptree requires `psutil`; '
            'install via `uv pip install psutil`'
        )
        return buf.getvalue()

    # statuses considered "defunct"
    defunct_statuses: set = {
        psutil.STATUS_ZOMBIE,
        getattr(psutil, 'STATUS_DEAD', 'dead'),
    }

    # shared state mutated by `_classify_walk()`:
    # - `seen`: every pid visited so far (dedupes across roots)
    # - `walk_order`: all classified `(proc, depth)` in visit order
    # - one list of `(proc, depth)` per bucket
    # - `gone`: bare pids that vanished before classification
    # - `pid_to_bucket`: pid -> bucket name, for parent annotations
    seen: set = set()
    walk_order: list = []
    live: list = []
    orphans: list = []
    system_slice: list = []
    user_slice: list = []
    zombies: list = []
    gone: list = []
    pid_to_bucket: dict = {}

    # lazy-imported, used to override cgroup-slice classification
    # for `tractor._child` strays (they're orphans regardless of
    # whether they happen to be in the user.slice / system.slice
    # cgroup — `desktop-launched app` is the *wrong* read for a
    # leaked subactor that just happens to inherit user-session
    # cgroup membership from its now-dead parent).
    from ._reap import _is_tractor_subactor

    def _classify_walk(walk_roots: list[int]) -> None:
        '''Walk + classify into the closure-shared bucket lists.'''
        for r in walk_roots:
            for (p, depth) in _walk_tree_with_depth(r):
                if p.pid in seen:
                    continue
                seen.add(p.pid)
                try:
                    status: str = p.status()
                    ppid: int = p.ppid()
                except psutil.NoSuchProcess:
                    gone.append(p.pid)
                    continue
                entry = (p, depth)
                # severity order: defunct first, then init-adopted,
                # then everything with a real parent.
                if status in defunct_statuses:
                    zombies.append(entry)
                    pid_to_bucket[p.pid] = 'zombies'
                elif ppid == 1:
                    # `tractor._child` procs reparented to init are
                    # leaked subactors regardless of cgroup-slice —
                    # short-circuit to `orphans` before falling back
                    # to the systemd-slice categorization (which is
                    # only meaningful for NON-tractor procs).
                    if _is_tractor_subactor(p.pid):
                        orphans.append(entry)
                        pid_to_bucket[p.pid] = 'orphans'
                    else:
                        slice_kind: str | None = _which_cgroup_slice(p.pid)
                        if slice_kind == 'system':
                            system_slice.append(entry)
                            pid_to_bucket[p.pid] = 'system-slice'
                        elif slice_kind == 'user':
                            user_slice.append(entry)
                            pid_to_bucket[p.pid] = 'user-slice'
                        else:
                            orphans.append(entry)
                            pid_to_bucket[p.pid] = 'orphans'
                else:
                    live.append(entry)
                    pid_to_bucket[p.pid] = 'live'
                walk_order.append(entry)

    _classify_walk(roots)
    # snapshot of the pids reached from the explicit `roots`, taken
    # BEFORE the stray walk so the stray-only count can be derived
    # as a set-size difference below.
    explicit_seen: set = set(seen)

    stray_roots: list[int] = []
    if include_strays:
        stray_roots = _find_tractor_strays(seen)
        if stray_roots:
            _classify_walk(stray_roots)

    # NOTE: `total` counts bucketed procs only (not `gone`), and
    # includes any stray procs classified above.
    total: int = (
        len(live)
        + len(orphans)
        + len(system_slice)
        + len(user_slice)
        + len(zombies)
    )
    echo(f'# ptree: {total} procs across roots {roots}')
    if stray_roots:
        # NOTE: derived from `seen`, so this also counts stray
        # pids that ended up in `gone`.
        n_stray_proc: int = len(seen) - len(explicit_seen)
        echo(
            f'#  + {n_stray_proc} `tractor._child` stray proc(s) '
            f'NOT descendants of {roots} '
            f'(likely cross-test ghosts; see bindspace dump for '
            f'their UDS sock state):'
        )
        for sr in stray_roots:
            echo(f'#    stray-root: {sr}')

    # column header shared by every section's table
    hdr: str = (
        '  ' + 'PID'.rjust(7)
        + '  ' + 'PPID'.rjust(7)
        + '  ' + 'STATUS'.ljust(10)
        + '  CMD'
    )

    def _row(entry, bucket: str | None = None) -> str:
        '''
        Format one `(proc, depth)` entry as a table row.

        When `bucket` is given and the proc's (non-init) parent
        was classified into a DIFFERENT bucket, the row is
        suffixed with a `[parent: <pid> (in `<bucket>`)]` note so
        the relationship isn't lost by the per-bucket grouping.

        '''
        p, depth = entry
        tree_pfx: str = ('   ' * depth) + ('└─ ' if depth > 0 else '')

        parent_anno: str = ''
        if (
            bucket is not None
            and depth > 0
        ):
            try:
                parent_pid: int = p.ppid()
            except psutil.NoSuchProcess:
                parent_pid = 0
            if parent_pid and parent_pid != 1:
                parent_bucket: str | None = pid_to_bucket.get(parent_pid)
                if (
                    parent_bucket is not None
                    and parent_bucket != bucket
                ):
                    parent_anno = (
                        f'  [parent: {parent_pid} '
                        f'(in `{parent_bucket}`)]'
                    )

        try:
            # empty cmdline -> fall back to `[name]`; cmd text is
            # capped at 140 chars.
            cmd: str = (
                ' '.join(p.cmdline())[:140]
                or '[' + p.name() + ']'
            )
            r: str = '  ' + str(p.pid).rjust(7)
            r += '  ' + str(p.ppid()).rjust(7)
            r += '  ' + p.status().ljust(10)
            r += '  ' + tree_pfx + cmd + parent_anno
            return r
        # NOTE(review): this handler is listed before the
        # `NoSuchProcess` one below — per the psutil docs
        # `ZombieProcess` subclasses `NoSuchProcess`, so the order
        # is significant; confirm before reordering.
        except psutil.ZombieProcess:
            try:
                ppid_str: str = str(p.ppid())
                name: str = p.name()
            except psutil.NoSuchProcess:
                ppid_str, name = '?', '?'
            r = '  ' + str(p.pid).rjust(7)
            r += '  ' + ppid_str.rjust(7)
            r += '  ' + 'zombie'.ljust(10)
            r += (
                '  ' + tree_pfx
                + '[' + name + ' <defunct>]'
                + parent_anno
            )
            return r
        except psutil.NoSuchProcess:
            return (
                '  ' + str(p.pid).rjust(7)
                + '  (gone mid-walk)'
            )

    def _section(
        title: str,
        procs: list,
        hint: str = '',
        bucket: str | None = None,
    ) -> None:
        '''
        Emit a `## <title> (<count>)` heading (plus optional
        `hint`), then either `(none)` or the header row followed
        by one `_row()` per entry in `procs`.

        '''
        echo()
        echo(
            f'## {title} ({len(procs)})'
            + (f'  — {hint}' if hint else '')
        )
        if not procs:
            echo('  (none)')
            return
        echo(hdr)
        for p in procs:
            echo(_row(p, bucket=bucket))

    if flag_tree:
        # no `bucket=` here, so rows carry no parent annotation
        _section(
            'tree', walk_order,
            'flat walk-order, parent-child preserved',
        )

    _section(
        'zombies', zombies,
        'status `Z`/`X`, parent has not reaped',
        bucket='zombies',
    )
    _section(
        'orphans', orphans,
        '`ppid==1` + leaked: either NOT in a `system.slice`/'
        '`user.slice` cgroup, OR a known `tractor._child` '
        'proc (leaked subactor, regardless of cgroup-slice)',
        bucket='orphans',
    )
    _section(
        'system-slice', system_slice,
        '`ppid==1`, rooted under `/system.slice/` '
        '(real systemd-managed service — daemon, login '
        'session manager, etc; not a leak)',
        bucket='system-slice',
    )
    _section(
        'user-slice', user_slice,
        '`ppid==1`, rooted under `/user.slice/.../*.scope` '
        '(desktop-launched app wrapped by systemd-user — '
        'browser, editor, etc; not a leak)',
        bucket='user-slice',
    )
    _section('live', live, bucket='live')

    if gone:
        echo()
        echo(f'## gone-during-walk ({len(gone)}): {gone}')

    return buf.getvalue()


# ---------------------------------------------------------------
# dump_hung_state (== acli.hung_dump)
# ---------------------------------------------------------------

def dump_hung_state(
    roots: list[int],
    *,
    allow_sudo_prompt: bool = False,
) -> str:
    '''
    Per-pid kernel + python state for a hung pytest/tractor
    process tree. Walks descendants of each root and returns the
    formatted report as text.

    Captures per-pid:
      - `/proc/<pid>/wchan` (world-readable)
      - `/proc/<pid>/stack` (CAP_SYS_PTRACE — needs sudo)
      - `py-spy dump --pid <N> --locals` (needs sudo)

    Sudo policy controlled by `allow_sudo_prompt`:

    - `True`: call `ensure_sudo_cached()` which prompts via
      `sudo -v` if creds aren't cached. Use from terminal CLI.

    - `False` (default): only probe via `is_sudo_cached()` —
      never prompts. If not cached, skip stack+py-spy and emit
      a banner pointing the human at the manual follow-up cmd.
      Use from inside a pytest run.

    The pid list is built with `psutil` when available, else via
    the recursive `pgrep -P` fallback; pids reachable from more
    than one root are reported once. Procs that exit while being
    inspected are reported as `proc gone` instead of aborting
    the dump.

    '''
    buf = StringIO()

    def echo(line: str = '') -> None:
        buf.write(line + '\n')

    have_sudo: bool = (
        ensure_sudo_cached()
        if allow_sudo_prompt
        else is_sudo_cached()
    )

    # flatten every root's subtree, first-seen order, de-duped
    pids: list[int] = []
    seen: set = set()
    for r in roots:
        if psutil is not None:
            walk: list[int] = [p.pid for p in walk_tree_psutil(r)]
        else:
            walk = _walk_tree_pgrep(r)
        for pid in walk:
            if pid not in seen:
                seen.add(pid)
                pids.append(pid)

    echo(f'# tree: {pids}')

    if not have_sudo:
        hint_pid = pids[0] if pids else '<pid>'
        echo()
        echo(
            '💡 sudo creds NOT cached — '
            '`/proc/<pid>/stack` + `py-spy dump` SKIPPED '
            'for all pids below.'
        )
        echo(
            '   For full kernel-stack + py-spy frames, '
            're-run manually with sudo cached:'
        )
        echo(f'     sudo -v && acli.hung_dump {hint_pid}')

    echo()
    echo('## ps forest')
    if pids:
        try:
            ps_out: str = sp.check_output(
                [
                    'ps',
                    '-o', 'pid,ppid,pgid,stat,cmd',
                    '-p', ','.join(map(str, pids)),
                ],
                text=True,
            )
            echo(ps_out.rstrip())
        except (sp.CalledProcessError, FileNotFoundError) as e:
            echo(f'  (ps failed: {e})')

    for pid in pids:
        echo()
        echo(f'## pid {pid}' + (
            ''
            if have_sudo
            else '  (sudo NOT cached — stack/py-spy SKIPPED)'
        ))

        for f in ('wchan', 'stack'):
            path = Path(f'/proc/{pid}/{f}')
            try:
                txt: str = path.read_text().rstrip()
                echo(f'-- /proc/{pid}/{f} --')
                echo(txt)
            except PermissionError:
                if not have_sudo:
                    echo(
                        f'-- /proc/{pid}/{f}: '
                        'PermissionError (no sudo) --'
                    )
                    continue
                # unprivileged read denied: retry through the
                # cached (non-prompting) sudo creds.
                try:
                    txt = sp.check_output(
                        ['sudo', '-n', 'cat', str(path)],
                        text=True,
                        stderr=sp.DEVNULL,
                    ).rstrip()
                    echo(f'-- /proc/{pid}/{f} (sudo) --')
                    echo(txt)
                except sp.CalledProcessError:
                    echo(
                        f'-- /proc/{pid}/{f}: '
                        'sudo cred expired? rerun --'
                    )
            except (FileNotFoundError, ProcessLookupError):
                # `FileNotFoundError`: proc already gone at open.
                # `ProcessLookupError` (ESRCH): proc exited
                # between the open and the read.
                echo(f'-- /proc/{pid}/{f}: proc gone --')

        echo(f'-- py-spy {pid} --')
        if not have_sudo:
            echo('  (skipped — no sudo)')
            continue
        try:
            py_spy_out: str = sp.check_output(
                ['sudo', '-n', 'py-spy', 'dump', '--pid', str(pid), '--locals'],
                text=True,
                stderr=sp.STDOUT,
            )
            echo(py_spy_out.rstrip())
        except (sp.CalledProcessError, FileNotFoundError) as e:
            echo(f'  (py-spy failed: {e})')

    return buf.getvalue()


# ---------------------------------------------------------------
# scan_bindspace (== acli.bindspace_scan)
# ---------------------------------------------------------------

def scan_bindspace(arg: str | None = None) -> str:
    '''
    Scan a tractor UDS bindspace dir for orphan sock files and
    return the report as text.

    `arg` semantics:
      - `None`        -> `$XDG_RUNTIME_DIR/tractor`
      - bare `<name>` -> `$XDG_RUNTIME_DIR/<name>` (e.g. `piker`)
      - path          -> use as-is (anything containing a `/`)

    `$XDG_RUNTIME_DIR` falls back to `/run/user/<uid>` when unset.

    Output buckets: `live-active`, `orphaned-alive`,
    `orphaned-dead`, `non-tractor`.

    Liveness is probed with `os.kill(pid, 0)` using the pid
    embedded in the `<name>@<pid>.sock` filename:

      - no such process            -> `orphaned-dead`
      - pid `0` / pid out of range -> `orphaned-dead` (can't be
        a real proc; never handed to / accepted by `kill`)
      - exists but not ours (`PermissionError`) -> `live-active`
        with unknown ppid
      - exists, `ppid == 1`        -> `orphaned-alive`
      - exists otherwise           -> `live-active`

    If the dir doesn't exist a single `(no bindspace at ...)`
    line is returned.

    '''
    buf = StringIO()

    def echo(line: str = '') -> None:
        buf.write(line + '\n')

    runtime: str = os.environ.get(
        'XDG_RUNTIME_DIR',
        f'/run/user/{os.getuid()}',
    )
    if not arg:
        bs_dir = Path(runtime) / 'tractor'
    elif '/' in arg:
        bs_dir = Path(arg)
    else:
        bs_dir = Path(runtime) / arg

    if not bs_dir.exists():
        echo(f'(no bindspace at {bs_dir})')
        return buf.getvalue()

    socks: list = sorted(bs_dir.glob('*.sock'))
    echo(f'## bindspace {bs_dir} ({len(socks)} sock file(s))')

    live_active: list = []  # (path, pid, name, ppid | None)
    live_orphaned: list = []  # (path, pid, name, ppid)
    dead_orphans: list = []  # (path, pid, name)
    bogus: list = []  # paths not matching `<name>@<pid>.sock`

    for s in socks:
        m = _UDS_SOCK_RE.match(s.name)
        if not m:
            bogus.append(s)
            continue
        pid = int(m['pid'])
        name = m['name']
        if pid == 0:
            # `kill(0, sig)` addresses the CALLER's process group,
            # so probing it would always "succeed"; no proc has
            # pid 0, treat the sock as stale.
            dead_orphans.append((s, pid, name))
            continue
        try:
            os.kill(pid, 0)
        except (
            ProcessLookupError,
            # pid too large for the platform's pid type
            OverflowError,
        ):
            dead_orphans.append((s, pid, name))
            continue
        except PermissionError:
            # proc exists but belongs to someone else
            live_active.append((s, pid, name, None))
            continue

        ppid: int | None = _ppid_from_proc(pid)
        if ppid == 1:
            live_orphaned.append((s, pid, name, ppid))
        else:
            live_active.append((s, pid, name, ppid))

    echo()
    echo(
        f'## live-active ({len(live_active)})  '
        f'— PID alive, parent still own it'
    )
    if not live_active:
        echo('  (none)')
    for s, pid, name, ppid in live_active:
        row: str = '  ' + str(pid).rjust(7)
        row += '  ' + name.ljust(32)
        row += '  ' + s.name
        if ppid is not None:
            row += f'  (ppid={ppid})'
        echo(row)

    echo()
    echo(
        f'## orphaned-alive ({len(live_orphaned)})  '
        f'— PID alive but `ppid==1`, parent reaped; '
        f'`acli.reap` candidate'
    )
    if not live_orphaned:
        echo('  (none)')
    for s, pid, name, ppid in live_orphaned:
        row = '  ' + str(pid).rjust(7)
        row += '  ' + name.ljust(32)
        row += '  ' + s.name + '  (adopted by init)'
        echo(row)

    echo()
    echo(
        f'## orphaned-dead ({len(dead_orphans)})  '
        f'— PID gone, sock stale'
    )
    if not dead_orphans:
        echo('  (none)')
    for s, pid, name in dead_orphans:
        row = '  ' + str(pid).rjust(7)
        row += '  ' + name.ljust(32)
        row += '  ' + s.name + '  (no live proc)'
        echo(row)

    if bogus:
        echo()
        echo(
            f'## non-tractor ({len(bogus)})  '
            f'— filename lacks `@<pid>` suffix, '
            f'cannot determine liveness intrinsically'
        )
        for s in bogus:
            echo(f'  {s.name}')
        echo()
        echo('to check liveness manually (needs `iproute2`/`ss`):')
        for s in bogus:
            echo(f"  ss -lpx 'src = {s}'")

    if dead_orphans or live_orphaned:
        echo()
        echo(
            'to sweep BOTH orphaned-alive subs '
            '(graceful SIGINT -> SIGKILL) AND dead-orphan '
            'socks in one shot:'
        )
        echo('  acli.reap --uds')

    if dead_orphans:
        unlink_cmd: str = ' '.join(str(o[0]) for o in dead_orphans)
        echo()
        echo(
            '(or to unlink dead-orphan socks manually, '
            "skipping `acli.reap`'s graceful-cancel ladder:)"
        )
        echo(f'  rm {unlink_cmd}')

    return buf.getvalue()


# ---------------------------------------------------------------
# dump_all — file-writing snapshot capture
# ---------------------------------------------------------------

def _default_dump_root() -> Path:
    '''
    `$XDG_CACHE_HOME/tractor/hung-dumps/` with
    `~/.cache/tractor/hung-dumps/` fallback.

    The fallback is used when `$XDG_CACHE_HOME` is unset OR empty
    (the XDG base-dir spec treats both the same); an empty value
    would otherwise produce a path relative to the cwd.

    '''
    cache: str | None = os.environ.get('XDG_CACHE_HOME')
    base: Path = (
        Path(cache)
        if cache
        else Path.home() / '.cache'
    )
    return base / 'tractor' / 'hung-dumps'


def dump_all(
    pid: int,
    out_dir: Path | None = None,
    *,
    label: str,
    allow_sudo_prompt: bool = False,
) -> Path:
    '''
    Capture full diag snapshot for the proc tree rooted at
    `pid` into a sub-directory under `out_dir`.

    Layout:
      `<out_dir>/<label>__<iso-timestamp>/`
      ├─ trace.txt        # ptree + hung_state merged
      ├─ bindspace.txt    # bindspace_scan output
      └─ meta.json        # {pid, label, captured_at, sudo_cached}

    Returns the snapshot directory `Path`.

    `out_dir` defaults to
    `$XDG_CACHE_HOME/tractor/hung-dumps/` (fallback
    `~/.cache/tractor/hung-dumps/`).

    `label` is sanitized for use in the directory name; the
    unmodified value is what gets recorded in `meta.json`.

    See `dump_hung_state()` for `allow_sudo_prompt` semantics
    — defaults to False (test-safe).

    All files are written as UTF-8 regardless of the locale: the
    report text contains non-ASCII glyphs which a non-UTF-8
    locale encoding could not represent.

    '''
    if out_dir is None:
        out_dir = _default_dump_root()
    out_dir = Path(out_dir)

    # second-resolution stamp, `:`-free so it is filename-safe
    ts: str = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
    # sanitize label for filesystem: collapse each run of chars
    # other than word chars, `.` and `-` into a single underscore,
    # then strip leading/trailing underscores.
    safe_label: str = re.sub(r'[^\w.\-]+', '_', label).strip('_')
    dump_dir: Path = out_dir / f'{safe_label}__{ts}'
    dump_dir.mkdir(parents=True, exist_ok=True)

    sudo_ok: bool = (
        ensure_sudo_cached()
        if allow_sudo_prompt
        else is_sudo_cached()
    )

    # combined trace.txt: ptree first (classified buckets),
    # then hung_state (per-pid wchan/stack/py-spy)
    trace_txt: str = (
        '# ===== ptree =====\n'
        + dump_proc_tree([pid])
        + '\n# ===== hung_state =====\n'
        + dump_hung_state(
            [pid],
            allow_sudo_prompt=False,  # already prompted above
        )
    )
    (dump_dir / 'trace.txt').write_text(
        trace_txt,
        encoding='utf-8',
    )

    (dump_dir / 'bindspace.txt').write_text(
        scan_bindspace(),
        encoding='utf-8',
    )

    meta: dict = {
        'pid': pid,
        'label': label,
        'captured_at': ts,
        'sudo_cached': sudo_ok,
    }
    (dump_dir / 'meta.json').write_text(
        json.dumps(meta, indent=2) + '\n',
        encoding='utf-8',
    )

    return dump_dir


# ---------------------------------------------------------------
# in-test capture-on-hang helpers
# ---------------------------------------------------------------
#
# Pair of CMs that combine a tight cooperative/hard timeout with
# a forced `dump_all()` snapshot BEFORE the failure propagates.
# The goal: when a test hangs, the human (or AI reviewer) gets a
# fresh ptree + per-pid wchan/stack + bindspace state captured to
# disk at the exact moment of the timeout — no need to recreate
# it after the fact (which is often impossible since the procs
# have moved on / been reaped).
#
# Two variants for two failure shapes:
#
#  - `fail_after_w_trace` — async CM wrapping `trio.fail_after`.
#    Cooperative: cancellation is delivered at the next trio
#    checkpoint. Use when the hang is at the trio/python level
#    and the runtime is still scheduling normally.
#
#  - `afk_alarm_w_trace` — sync CM wrapping `signal.alarm`.
#    Hard backstop: raises into the python frame at the next
#    bytecode boundary regardless of trio's state. Use as a wall-
#    clock cap when something *below* the trio scheduler is
#    locking up (e.g. forkserver-launchpad in `os.read`, native-
#    lock held by a C extension, GIL-hostage class hangs).
#    Must run on the main thread (signal.alarm constraint).
#
# Both default to dumping the CURRENT process tree (i.e. the
# pytest worker + its subactor descendants). Override `pid=` to
# scope to a specific actor root.
# ---------------------------------------------------------------


class AFKAlarmTimeout(TimeoutError):
    '''
    Timeout signalled from the SIGALRM handler installed by
    `afk_alarm_w_trace` once its alarm goes off.

    Derives from the builtin `TimeoutError`, so any pre-existing
    `except TimeoutError:` clause keeps catching it.

    '''


# Session-scoped list of snapshot (label, dump_dir) tuples
# captured by `fail_after_w_trace` / `afk_alarm_w_trace` during
# the current process lifetime. Populated by
# `_do_capture_snapshot()` on each successful dump. The
# `pytest_terminal_summary` hook in `tractor._testing.pytest`
# reads this at end-of-session to print an index of all
# snapshot dirs so the human doesn't have to scroll back through
# captured-stderr lines to find paths.
_SNAPSHOT_INDEX: list[tuple[str, Path]] = []


# TODO: follow-up — `TRACTOR_TRACE_HOLD=1` pause-on-hang mode.
# When env-var-enabled, `_do_capture_snapshot` would block on
# `input('press Enter to continue...')` reading from
# `sys.__stdin__` AFTER the dump succeeds, BEFORE re-raising the
# original exception. This lets a human invoke
# `acli.ptree`/`acli.bindspace_scan` from a second terminal
# while the cancel-cascade is frozen mid-flight — currently
# impossible because the per-test reaper fixture sweeps
# orphans within ~0.6s of the timeout firing. See discussion
# 2026-05-13: orphans visible in snapshot's `trace.txt`
# (depth_3 / depth_1 init-adopted procs) but invisible to any
# post-test `acli.*` invocation.


def _do_capture_snapshot(
    *,
    label: str,
    pid: int | None,
    out_dir: Path | None,
    seconds: float,
    timeout_kind: str,  # 'fail_after' | 'afk_alarm'
) -> Path | None:
    '''
    Best-effort `dump_all()` wrapper shared by the timeout CMs.

    Any `Exception` raised by `dump_all()` is swallowed and
    reported as a one-line banner, so a broken capture can not
    replace the timeout exception the caller is about to
    (re-)raise.

    Parameters
    ----------
    label:
        forwarded to `dump_all(label=...)` and echoed in the
        banner.
    pid:
        root pid to snapshot; `None` means "this process"
        (`os.getpid()`).
    out_dir:
        forwarded untouched to `dump_all(out_dir=...)`.
    seconds:
        the timeout budget that expired; only used in the
        success banner.
    timeout_kind:
        `'fail_after'` or `'afk_alarm'`; used as the banner
        prefix `[<timeout_kind>_w_trace]`.

    Returns
    -------
    The snapshot `Path` returned by `dump_all()` on success,
    else `None` when the capture itself raised.

    On success the `(label, snapshot_path)` pair is also
    appended to the module-level `_SNAPSHOT_INDEX`.

    '''
    # NOTE: banners go to `sys.__stderr__` (the ORIGINAL,
    # un-redirected stderr) instead of `sys.stderr` so they
    # bypass pytest's `--capture=sys` redirection. Under pytest
    # the captured streams of xfailed/passed tests are
    # suppressed entirely (`--show-capture` only affects FAILED
    # tests), so `sys.stderr` would hide the diag info from the
    # human running the suite. Outside pytest (e.g. the xontrib
    # CLI) `sys.__stderr__ is sys.stderr`, so no difference.
    import sys

    root_pid: int = os.getpid() if pid is None else pid
    tag: str = f'[{timeout_kind}_w_trace] '

    try:
        snap_dir: Path = dump_all(
            root_pid,
            out_dir=out_dir,
            label=label,
            # in-test default: never prompt for sudo (would
            # deadlock pytest); the dump_hung_state banner
            # points the human at `sudo -v && acli.hung_dump`
            # for a follow-up manual capture.
            allow_sudo_prompt=False,
        )
    except Exception as exc:
        # capture failed: report it and signal "no snapshot".
        print(
            f'{tag}'
            f'⚠️  dump_all() failed: {exc!r} '
            f'(label={label!r}, pid={root_pid})',
            file=sys.__stderr__,
        )
        return None
    else:
        print(
            f'{tag}'
            f'⏰ timed out after {seconds}s (label={label!r}, '
            f'pid={root_pid}); snapshot at: {snap_dir}',
            file=sys.__stderr__,
        )
        # remember the snapshot for the end-of-session index.
        _SNAPSHOT_INDEX.append((label, snap_dir))
        return snap_dir


@asynccontextmanager
async def fail_after_w_trace(
    seconds: float,
    *,
    label: str,
    pid: int | None = None,
    out_dir: Path | None = None,
) -> AsyncIterator[None]:
    '''
    Async CM: run the body under `trio.fail_after(seconds)` and
    drop a `dump_all()` snapshot when that timeout trips, BEFORE
    the resulting exception leaves this CM.

    Parameters
    ----------
    seconds:
        timeout budget passed to `trio.fail_after`.
    label:
        snapshot dir prefix (e.g. test name).
    pid:
        root pid to snapshot. Defaults to current process —
        which under pytest is the test worker, and its
        descendants are the spawned subactor tree.
    out_dir:
        snapshot parent dir. Defaults to
        `$XDG_CACHE_HOME/tractor/hung-dumps/`.

    A snapshot is taken (at most once) when EITHER:
      1. an exception escapes the body while the
         `fail_after` scope reports `cancel_called` — this
         covers the case where e.g. an
         `open_nursery.__aexit__` wraps the timeout-induced
         `Cancelled` into a `BaseExceptionGroup` which then
         escapes instead of a `TooSlowError`; or
      2. a `trio.TooSlowError` propagates out of the
         `trio.fail_after` block and case 1 did not already
         capture.

    The capture is best-effort: a failing dump is logged to
    stderr by `_do_capture_snapshot()` and never replaces the
    in-flight exception, which is always re-raised.

    Example
    -------
    >>> async with fail_after_w_trace(
    ...     5.0,
    ...     label='test_multierror_fast_nursery',
    ... ):
    ...     await some_hangy_thing()

    '''
    # local import — trio is a hard dep of tractor, but the
    # rest of `trace.py` is trio-free (used from xontrib cli),
    # so scoping it here keeps the module importable from a
    # plain-python REPL.
    import trio

    def _snapshot() -> None:
        # single spot for the capture call shared by both
        # timeout-detection branches below.
        _do_capture_snapshot(
            label=label,
            pid=pid,
            out_dir=out_dir,
            seconds=seconds,
            timeout_kind='fail_after',
        )

    already_dumped: bool = False
    try:
        with trio.fail_after(seconds) as cs:
            try:
                yield
            except BaseException:
                # Something escaped the body. When our own
                # scope has already been cancelled the error
                # is downstream of OUR timeout, so grab the
                # diag right here: a `TooSlowError` re-raise
                # from `trio.fail_after` won't happen while a
                # different exc is in flight.
                if cs.cancel_called:
                    _snapshot()
                    already_dumped = True
                raise
    except trio.TooSlowError:
        # `TooSlowError` surfaced from the `fail_after` block;
        # capture unless the inner branch already did.
        if not already_dumped:
            _snapshot()
        raise


@contextmanager
def afk_alarm_w_trace(
    seconds: int,
    *,
    label: str,
    pid: int | None = None,
    out_dir: Path | None = None,
) -> Iterator[None]:
    '''
    Sync CM: arm `signal.alarm(seconds)`, on SIGALRM fire
    `dump_all()` then raise `AFKAlarmTimeout` so the test
    fails.

    Hard-kill backstop for cases where `trio.fail_after`
    cannot deliver cancellation — e.g. python-level GIL-
    hostage hangs, native locks held by C extensions, or a
    forkserver-launchpad parked in `os.read()`.

    Constraints
    -----------
    - Must be invoked from the MAIN thread (`signal.alarm`
      can only be armed on main thread).
    - Cannot be nested with other SIGALRM consumers — the
      previous handler is restored on exit, but two
      overlapping `afk_alarm` CMs will clobber each other.

    Parameters mirror `fail_after_w_trace`. `seconds` is
    clamped to integer (signal.alarm granularity).

    Example
    -------
    >>> with afk_alarm_w_trace(
    ...     60, label='test_sigint_closes_lifetime_stack',
    ... ):
    ...     trio.run(main)

    '''
    seconds_int: int = max(1, int(seconds))

    def _handler(signum, frame):
        raise AFKAlarmTimeout(
            f'afk_alarm fired after {seconds_int}s '
            f'(label={label!r})'
        )

    prev_handler = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds_int)
    try:
        yield
        signal.alarm(0)  # disarm on clean exit
    except AFKAlarmTimeout:
        # alarm already self-cleared; capture diag + re-raise
        _do_capture_snapshot(
            label=label,
            pid=pid,
            out_dir=out_dir,
            seconds=seconds_int,
            timeout_kind='afk_alarm',
        )
        raise
    finally:
        # belt-and-suspenders: ensure alarm is disarmed even
        # on non-alarm exception paths (e.g. test failed for a
        # different reason inside the body).
        signal.alarm(0)
        signal.signal(signal.SIGALRM, prev_handler)


# ---------------------------------------------------------------
# pytest fixture wrappers
# ---------------------------------------------------------------
# Pre-bind the snapshot `label=` to `request.node.name` so tests
# don't have to plumb `request: pytest.FixtureRequest` AND
# `label=request.node.name` through every call site.
#
# Re-exported from `tractor._testing.pytest` so they're picked up
# by pytest's plugin-discovery (per the `pytest_plugins` entry in
# `pyproject.toml`'s `[tool.pytest.ini_options]`).
# ---------------------------------------------------------------

if _pytest is not None:

    @_pytest.fixture(name='fail_after_w_trace')
    def fail_after_w_trace_fixture(
        request: _pytest.FixtureRequest,
    ) -> FailAfterWTraceFactory:
        '''
        Fixture yielding a factory for `fail_after_w_trace`
        async-CMs whose `label=` is pre-filled with the
        requesting test's node name.

        With it a test shrinks to::

            async def test_foo(
                fail_after_w_trace: FailAfterWTraceFactory,
            ):
                async with fail_after_w_trace(5.0):
                    await some_hangy_thing()

        rather than threading `request` through by hand::

            async def test_foo(request):
                async with fail_after_w_trace(
                    5.0, label=request.node.name,
                ):
                    ...

        Keyword args passed by the caller win over the
        pre-filled default (e.g. an explicit `label=` for
        hand-tuned dedup of snapshot dirs when parametrize ids
        aren't discriminating enough); everything is forwarded
        to `fail_after_w_trace`.

        '''
        @asynccontextmanager
        async def _labeled(seconds, **overrides):
            # caller-supplied kwargs come last so they
            # override the node-name default.
            opts = {'label': request.node.name, **overrides}
            async with fail_after_w_trace(seconds, **opts):
                yield

        return _labeled

    @_pytest.fixture(name='afk_alarm_w_trace')
    def afk_alarm_w_trace_fixture(
        request: _pytest.FixtureRequest,
    ) -> AfkAlarmWTraceFactory:
        '''
        Fixture yielding a factory for `afk_alarm_w_trace`
        sync-CMs whose `label=` is pre-filled with the
        requesting test's node name.

        Sync counterpart of the `fail_after_w_trace` fixture
        for the SIGALRM-based hard wall-clock backstop::

            def test_foo(
                afk_alarm_w_trace: AfkAlarmWTraceFactory,
            ):
                with afk_alarm_w_trace(10):
                    trio.run(main)

        Keyword args passed by the caller win over the
        pre-filled default and are forwarded to
        `afk_alarm_w_trace`; see that CM for its constraints
        (must run on main thread; clobbers other SIGALRM
        consumers).

        '''
        @contextmanager
        def _labeled(seconds, **overrides):
            # caller-supplied kwargs come last so they
            # override the node-name default.
            opts = {'label': request.node.name, **overrides}
            with afk_alarm_w_trace(seconds, **opts):
                yield

        return _labeled