Compare commits
No commits in common. "subint_forkserver_backend" and "main" have entirely different histories.
subint_for
...
main
|
|
@ -1,38 +0,0 @@
|
||||||
# Docs TODOs
|
|
||||||
|
|
||||||
## Auto-sync README code examples with source
|
|
||||||
|
|
||||||
The `docs/README.rst` has inline code blocks that
|
|
||||||
duplicate actual example files (e.g.
|
|
||||||
`examples/infected_asyncio_echo_server.py`). Every time
|
|
||||||
the public API changes we have to manually sync both.
|
|
||||||
|
|
||||||
Sphinx's `literalinclude` directive can pull code directly
|
|
||||||
from source files:
|
|
||||||
|
|
||||||
```rst
|
|
||||||
.. literalinclude:: ../examples/infected_asyncio_echo_server.py
|
|
||||||
:language: python
|
|
||||||
:caption: examples/infected_asyncio_echo_server.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Or to include only a specific function/section:
|
|
||||||
|
|
||||||
```rst
|
|
||||||
.. literalinclude:: ../examples/infected_asyncio_echo_server.py
|
|
||||||
:language: python
|
|
||||||
:pyobject: aio_echo_server
|
|
||||||
```
|
|
||||||
|
|
||||||
This way the docs always reflect the actual code without
|
|
||||||
manual syncing.
|
|
||||||
|
|
||||||
### Considerations
|
|
||||||
- `README.rst` is also rendered on GitHub/PyPI which do
|
|
||||||
NOT support `literalinclude` - so we'd need a build
|
|
||||||
step or a separate `_sphinx_readme.rst` (which already
|
|
||||||
exists at `docs/github_readme/_sphinx_readme.rst`).
|
|
||||||
- Could use a pre-commit hook or CI step to extract code
|
|
||||||
from examples into the README for GitHub rendering.
|
|
||||||
- Another option: a `sphinx.ext.autodoc`-style approach where
|
|
||||||
docstrings from the actual module are pulled in.
|
|
||||||
|
|
@ -1,42 +0,0 @@
|
||||||
{
|
|
||||||
"permissions": {
|
|
||||||
"allow": [
|
|
||||||
"Bash(cp .claude/*)",
|
|
||||||
"Read(.claude/**)",
|
|
||||||
"Read(.claude/skills/run-tests/**)",
|
|
||||||
"Write(.claude/**/*commit_msg*)",
|
|
||||||
"Write(.claude/git_commit_msg_LATEST.md)",
|
|
||||||
"Skill(run-tests)",
|
|
||||||
"Skill(close-wkt)",
|
|
||||||
"Skill(open-wkt)",
|
|
||||||
"Skill(prompt-io)",
|
|
||||||
"Bash(date *)",
|
|
||||||
"Bash(git diff *)",
|
|
||||||
"Bash(git log *)",
|
|
||||||
"Bash(git status)",
|
|
||||||
"Bash(git remote:*)",
|
|
||||||
"Bash(git stash:*)",
|
|
||||||
"Bash(git mv:*)",
|
|
||||||
"Bash(git rev-parse:*)",
|
|
||||||
"Bash(test:*)",
|
|
||||||
"Bash(ls:*)",
|
|
||||||
"Bash(grep:*)",
|
|
||||||
"Bash(find:*)",
|
|
||||||
"Bash(ln:*)",
|
|
||||||
"Bash(cat:*)",
|
|
||||||
"Bash(mkdir:*)",
|
|
||||||
"Bash(gh pr:*)",
|
|
||||||
"Bash(gh api:*)",
|
|
||||||
"Bash(gh issue:*)",
|
|
||||||
"Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)",
|
|
||||||
"Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)",
|
|
||||||
"Bash(echo EXIT:$?:*)",
|
|
||||||
"Bash(echo \"EXIT=$?\")",
|
|
||||||
"Read(//tmp/**)"
|
|
||||||
],
|
|
||||||
"deny": [],
|
|
||||||
"ask": []
|
|
||||||
},
|
|
||||||
"prefersReducedMotion": false,
|
|
||||||
"outputStyle": "default"
|
|
||||||
}
|
|
||||||
|
|
@ -1,225 +0,0 @@
|
||||||
# Commit Message Style Guide for `tractor`
|
|
||||||
|
|
||||||
Analysis based on 500 recent commits from the `tractor` repository.
|
|
||||||
|
|
||||||
## Core Principles
|
|
||||||
|
|
||||||
Write commit messages that are technically precise yet casual in
|
|
||||||
tone. Use abbreviations and informal language while maintaining
|
|
||||||
clarity about what changed and why.
|
|
||||||
|
|
||||||
## Subject Line Format
|
|
||||||
|
|
||||||
### Length and Structure
|
|
||||||
- Target: ~50 chars with a hard-max of 67.
|
|
||||||
- Use backticks around code elements (72.2% of commits)
|
|
||||||
- Rarely use colons (5.2%), except for file prefixes
|
|
||||||
- End with '?' for uncertain changes (rare: 0.8%)
|
|
||||||
- End with '!' for important changes (rare: 2.0%)
|
|
||||||
|
|
||||||
### Opening Verbs (Present Tense)
|
|
||||||
|
|
||||||
Most common verbs from analysis:
|
|
||||||
- `Add` (14.4%) - wholly new features/functionality
|
|
||||||
- `Use` (4.4%) - adopt new approach/tool
|
|
||||||
- `Drop` (3.6%) - remove code/feature
|
|
||||||
- `Fix` (2.4%) - bug fixes
|
|
||||||
- `Move`/`Mv` (3.6%) - relocate code
|
|
||||||
- `Adjust` (2.0%) - minor tweaks
|
|
||||||
- `Update` (1.6%) - enhance existing feature
|
|
||||||
- `Bump` (1.2%) - dependency updates
|
|
||||||
- `Rename` (1.2%) - identifier changes
|
|
||||||
- `Set` (1.2%) - configuration changes
|
|
||||||
- `Handle` (1.0%) - add handling logic
|
|
||||||
- `Raise` (1.0%) - add error raising
|
|
||||||
- `Pass` (0.8%) - pass parameters/values
|
|
||||||
- `Support` (0.8%) - add support for something
|
|
||||||
- `Hide` (1.4%) - make private/internal
|
|
||||||
- `Always` (1.4%) - enforce consistent behavior
|
|
||||||
- `Mk` (1.4%) - make/create (abbreviated)
|
|
||||||
- `Start` (1.0%) - begin implementation
|
|
||||||
|
|
||||||
Other frequent verbs: `More`, `Change`, `Extend`, `Disable`, `Log`,
|
|
||||||
`Enable`, `Ensure`, `Expose`, `Allow`
|
|
||||||
|
|
||||||
### Backtick Usage
|
|
||||||
|
|
||||||
Always use backticks for:
|
|
||||||
- Module names: `trio`, `asyncio`, `msgspec`, `greenback`, `stackscope`
|
|
||||||
- Class names: `Context`, `Actor`, `Address`, `PldRx`, `SpawnSpec`
|
|
||||||
- Method names: `.pause_from_sync()`, `._pause()`, `.cancel()`
|
|
||||||
- Function names: `breakpoint()`, `collapse_eg()`, `open_root_actor()`
|
|
||||||
- Decorators: `@acm`, `@context`
|
|
||||||
- Exceptions: `Cancelled`, `TransportClosed`, `MsgTypeError`
|
|
||||||
- Keywords: `finally`, `None`, `False`
|
|
||||||
- Variable names: `tn`, `debug_mode`
|
|
||||||
- Complex expressions: `trio.Cancelled`, `asyncio.Task`
|
|
||||||
|
|
||||||
Most backticked terms in tractor:
|
|
||||||
`trio`, `asyncio`, `Context`, `.pause_from_sync()`, `tn`,
|
|
||||||
`._pause()`, `breakpoint()`, `collapse_eg()`, `Actor`, `@acm`,
|
|
||||||
`.cancel()`, `Cancelled`, `open_root_actor()`, `greenback`
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
|
|
||||||
Good subject lines:
|
|
||||||
```
|
|
||||||
Add `uds` to `._multiaddr`, tweak typing
|
|
||||||
Drop `DebugStatus.shield` attr, add `.req_finished`
|
|
||||||
Use `stackscope` for all actor-tree rendered "views"
|
|
||||||
Fix `.to_asyncio` inter-task-cancellation!
|
|
||||||
Bump `ruff.toml` to target py313
|
|
||||||
Mv `load_module_from_path()` to new `._code_load` submod
|
|
||||||
Always use `tuple`-cast for singleton parent addrs
|
|
||||||
```
|
|
||||||
|
|
||||||
## Body Format
|
|
||||||
|
|
||||||
### General Structure
|
|
||||||
- 43.2% of commits have no body (simple changes)
|
|
||||||
- Use blank line after subject
|
|
||||||
- Max line length: 67 chars
|
|
||||||
- Use `-` bullets for lists (28.0% of commits)
|
|
||||||
- Rarely use `*` bullets (2.4%)
|
|
||||||
|
|
||||||
### Section Markers
|
|
||||||
|
|
||||||
Use these markers to organize longer commit bodies:
|
|
||||||
- `Also,` (most common: 26 occurrences)
|
|
||||||
- `Other,` (13 occurrences)
|
|
||||||
- `Deats,` (11 occurrences) - for implementation details
|
|
||||||
- `Further,` (7 occurrences)
|
|
||||||
- `TODO,` (3 occurrences)
|
|
||||||
- `Impl details,` (2 occurrences)
|
|
||||||
- `Notes,` (1 occurrence)
|
|
||||||
|
|
||||||
### Common Abbreviations
|
|
||||||
|
|
||||||
Use these freely (sorted by frequency):
|
|
||||||
- `msg` (63) - message
|
|
||||||
- `bg` (37) - background
|
|
||||||
- `ctx` (30) - context
|
|
||||||
- `impl` (27) - implementation
|
|
||||||
- `mod` (26) - module
|
|
||||||
- `obvi` (17) - obviously
|
|
||||||
- `tn` (16) - task nursery
|
|
||||||
- `fn` (15) - function
|
|
||||||
- `vs` (15) - versus
|
|
||||||
- `bc` (14) - because
|
|
||||||
- `var` (14) - variable
|
|
||||||
- `prolly` (9) - probably
|
|
||||||
- `ep` (6) - entry point
|
|
||||||
- `OW` (5) - otherwise
|
|
||||||
- `rn` (4) - right now
|
|
||||||
- `sig` (4) - signal/signature
|
|
||||||
- `deps` (3) - dependencies
|
|
||||||
- `iface` (2) - interface
|
|
||||||
- `subproc` (2) - subprocess
|
|
||||||
- `tho` (2) - though
|
|
||||||
- `ofc` (2) - of course
|
|
||||||
|
|
||||||
### Tone and Style
|
|
||||||
|
|
||||||
- Casual but technical (use `XD` for humor: 23 times)
|
|
||||||
- Use `..` for trailing thoughts (108 occurrences)
|
|
||||||
- Use `Woops,` to acknowledge mistakes (4 subject lines)
|
|
||||||
- Don't be afraid to show personality while being precise
|
|
||||||
|
|
||||||
### Example Bodies
|
|
||||||
|
|
||||||
Simple with bullets:
|
|
||||||
```
|
|
||||||
Add `multiaddr` and bump up some deps
|
|
||||||
|
|
||||||
Since we're planning to use it for (discovery)
|
|
||||||
addressing, allowing replacement of the hacky (pretend)
|
|
||||||
attempt in `tractor._multiaddr` Bp
|
|
||||||
|
|
||||||
Also pin some deps,
|
|
||||||
- make us py312+
|
|
||||||
- use `pdbp` with my frame indexing fix.
|
|
||||||
- mv to latest `xonsh` for fancy cmd/suggestion injections.
|
|
||||||
|
|
||||||
Bump lock file to match obvi!
|
|
||||||
```
|
|
||||||
|
|
||||||
With section markers:
|
|
||||||
```
|
|
||||||
Use `stackscope` for all actor-tree rendered "views"
|
|
||||||
|
|
||||||
Instead of the (much more) limited and hacky `.devx._code`
|
|
||||||
impls, move to using the new `.devx._stackscope` API which
|
|
||||||
wraps the `stackscope` project.
|
|
||||||
|
|
||||||
Deats,
|
|
||||||
- make new `stackscope.extract_stack()` wrapper
|
|
||||||
- port over frame-descing to `_stackscope.pformat_stack()`
|
|
||||||
- move `PdbREPL` to use `stackscope` render approach
|
|
||||||
- update tests for new stack output format
|
|
||||||
|
|
||||||
Also,
|
|
||||||
- tweak log formatting for consistency
|
|
||||||
- add typing hints throughout
|
|
||||||
```
|
|
||||||
|
|
||||||
## Special Patterns
|
|
||||||
|
|
||||||
### WIP Commits
|
|
||||||
Rare (0.2%) - avoid committing WIP if possible
|
|
||||||
|
|
||||||
### Merge Commits
|
|
||||||
Auto-generated (4.4%), don't worry about style
|
|
||||||
|
|
||||||
### File References
|
|
||||||
- Use `module.py` or `.submodule` style
|
|
||||||
- Avoid `file.py:line` references (0 in analysis)
|
|
||||||
|
|
||||||
### Links
|
|
||||||
- GitHub links used sparingly (3 total)
|
|
||||||
- Prefer code references over external links
|
|
||||||
|
|
||||||
## Footer
|
|
||||||
|
|
||||||
The default footer should credit `claude` (you) for helping generate
|
|
||||||
the commit msg content:
|
|
||||||
|
|
||||||
```
|
|
||||||
(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
|
|
||||||
[claude-code-gh]: https://github.com/anthropics/claude-code
|
|
||||||
```
|
|
||||||
|
|
||||||
Further, if the patch was solely or in part written
|
|
||||||
by `claude`, instead add:
|
|
||||||
|
|
||||||
```
|
|
||||||
(this patch was generated in some part by [`claude-code`][claude-code-gh])
|
|
||||||
[claude-code-gh]: https://github.com/anthropics/claude-code
|
|
||||||
```
|
|
||||||
|
|
||||||
## Summary Checklist
|
|
||||||
|
|
||||||
Before committing, verify:
|
|
||||||
- [ ] Subject line uses present tense verb
|
|
||||||
- [ ] Subject line ~50 chars (hard max 67)
|
|
||||||
- [ ] Code elements wrapped in backticks
|
|
||||||
- [ ] Body lines ≤67 chars
|
|
||||||
- [ ] Abbreviations used where natural
|
|
||||||
- [ ] Casual yet precise tone
|
|
||||||
- [ ] Section markers if body >3 paragraphs
|
|
||||||
- [ ] Technical accuracy maintained
|
|
||||||
|
|
||||||
## Analysis Metadata
|
|
||||||
|
|
||||||
```
|
|
||||||
Source: tractor repository
|
|
||||||
Commits analyzed: 500
|
|
||||||
Date range: 2019-2025
|
|
||||||
Analysis date: 2026-02-08
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
(this style guide was generated by [`claude-code`][claude-code-gh]
|
|
||||||
analyzing commit history)
|
|
||||||
|
|
||||||
[claude-code-gh]: https://github.com/anthropics/claude-code
|
|
||||||
|
|
@ -1,297 +0,0 @@
|
||||||
---
|
|
||||||
name: conc-anal
|
|
||||||
description: >
|
|
||||||
Concurrency analysis for tractor's trio-based
|
|
||||||
async primitives. Trace task scheduling across
|
|
||||||
checkpoint boundaries, identify race windows in
|
|
||||||
shared mutable state, and verify synchronization
|
|
||||||
correctness. Invoke on code segments the user
|
|
||||||
points at, OR proactively when reviewing/writing
|
|
||||||
concurrent cache, lock, or multi-task acm code.
|
|
||||||
argument-hint: "[file:line-range or function name]"
|
|
||||||
allowed-tools:
|
|
||||||
- Read
|
|
||||||
- Grep
|
|
||||||
- Glob
|
|
||||||
- Task
|
|
||||||
---
|
|
||||||
|
|
||||||
Perform a structured concurrency analysis on the
|
|
||||||
target code. This skill should be invoked:
|
|
||||||
|
|
||||||
- **On demand**: user points at a code segment
|
|
||||||
(file:lines, function name, or pastes a snippet)
|
|
||||||
- **Proactively**: when writing or reviewing code
|
|
||||||
that touches shared mutable state across trio
|
|
||||||
tasks — especially `_Cache`, locks, events, or
|
|
||||||
multi-task `@acm` lifecycle management
|
|
||||||
|
|
||||||
## 0. Identify the target
|
|
||||||
|
|
||||||
If the user provides a file:line-range or function
|
|
||||||
name, read that code. If not explicitly provided,
|
|
||||||
identify the relevant concurrent code from context
|
|
||||||
(e.g. the current diff, a failing test, or the
|
|
||||||
function under discussion).
|
|
||||||
|
|
||||||
## 1. Inventory shared mutable state
|
|
||||||
|
|
||||||
List every piece of state that is accessed by
|
|
||||||
multiple tasks. For each, note:
|
|
||||||
|
|
||||||
- **What**: the variable/dict/attr (e.g.
|
|
||||||
`_Cache.values`, `_Cache.resources`,
|
|
||||||
`_Cache.users`)
|
|
||||||
- **Scope**: class-level, module-level, or
|
|
||||||
closure-captured
|
|
||||||
- **Writers**: which tasks/code-paths mutate it
|
|
||||||
- **Readers**: which tasks/code-paths read it
|
|
||||||
- **Guarded by**: which lock/event/ordering
|
|
||||||
protects it (or "UNGUARDED" if none)
|
|
||||||
|
|
||||||
Format as a table:
|
|
||||||
|
|
||||||
```
|
|
||||||
| State | Writers | Readers | Guard |
|
|
||||||
|---------------------|-----------------|-----------------|----------------|
|
|
||||||
| _Cache.values | run_ctx, moc¹ | moc | ctx_key lock |
|
|
||||||
| _Cache.resources | run_ctx, moc | moc, run_ctx | UNGUARDED |
|
|
||||||
```
|
|
||||||
|
|
||||||
¹ `moc` = `maybe_open_context`
|
|
||||||
|
|
||||||
## 2. Map checkpoint boundaries
|
|
||||||
|
|
||||||
For each code path through the target, mark every
|
|
||||||
**checkpoint** — any `await` expression where trio
|
|
||||||
can switch to another task. Use line numbers:
|
|
||||||
|
|
||||||
```
|
|
||||||
L325: await lock.acquire() ← CHECKPOINT
|
|
||||||
L395: await service_tn.start(...) ← CHECKPOINT
|
|
||||||
L411: lock.release() ← (not a checkpoint, but changes lock state)
|
|
||||||
L414: yield (False, yielded) ← SUSPEND (caller runs)
|
|
||||||
L485: no_more_users.set() ← (wakes run_ctx, no switch yet)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key trio scheduling rules to apply:**
|
|
||||||
- `Event.set()` makes waiters *ready* but does NOT
|
|
||||||
switch immediately
|
|
||||||
- `lock.release()` is not a checkpoint
|
|
||||||
- `await sleep(0)` IS a checkpoint
|
|
||||||
- Code in `finally` blocks CAN have checkpoints
|
|
||||||
(unlike asyncio)
|
|
||||||
- `await` inside `except` blocks can be
|
|
||||||
`trio.Cancelled`-masked
|
|
||||||
|
|
||||||
## 3. Trace concurrent task schedules
|
|
||||||
|
|
||||||
Write out the **interleaved execution trace** for
|
|
||||||
the problematic scenario. Number each step and tag
|
|
||||||
which task executes it:
|
|
||||||
|
|
||||||
```
|
|
||||||
[Task A] 1. acquires lock
|
|
||||||
[Task A] 2. cache miss → allocates resources
|
|
||||||
[Task A] 3. releases lock
|
|
||||||
[Task A] 4. yields to caller
|
|
||||||
[Task A] 5. caller exits → finally runs
|
|
||||||
[Task A] 6. users-- → 0, sets no_more_users
|
|
||||||
[Task A] 7. pops lock from _Cache.locks
|
|
||||||
[run_ctx] 8. wakes from no_more_users.wait()
|
|
||||||
[run_ctx] 9. values.pop(ctx_key)
|
|
||||||
[run_ctx] 10. acm __aexit__ → CHECKPOINT
|
|
||||||
[Task B] 11. creates NEW lock (old one popped)
|
|
||||||
[Task B] 12. acquires immediately
|
|
||||||
[Task B] 13. values[ctx_key] → KeyError
|
|
||||||
[Task B] 14. resources[ctx_key] → STILL EXISTS
|
|
||||||
[Task B] 15. 💥 RuntimeError
|
|
||||||
```
|
|
||||||
|
|
||||||
Identify the **race window**: the range of steps
|
|
||||||
where state is inconsistent. In the example above,
|
|
||||||
steps 9–10 are the window (values gone, resources
|
|
||||||
still alive).
|
|
||||||
|
|
||||||
## 4. Classify the bug
|
|
||||||
|
|
||||||
Categorize what kind of concurrency issue this is:
|
|
||||||
|
|
||||||
- **TOCTOU** (time-of-check-to-time-of-use): state
|
|
||||||
changes between a check and the action based on it
|
|
||||||
- **Stale reference**: a task holds a reference to
|
|
||||||
state that another task has invalidated
|
|
||||||
- **Lifetime mismatch**: a synchronization primitive
|
|
||||||
(lock, event) has a shorter lifetime than the
|
|
||||||
state it's supposed to protect
|
|
||||||
- **Missing guard**: shared state is accessed
|
|
||||||
without any synchronization
|
|
||||||
- **Atomicity gap**: two operations that should be
|
|
||||||
atomic have a checkpoint between them
|
|
||||||
|
|
||||||
## 5. Propose fixes
|
|
||||||
|
|
||||||
For each proposed fix, provide:
|
|
||||||
|
|
||||||
- **Sketch**: pseudocode or diff showing the change
|
|
||||||
- **How it closes the window**: which step(s) from
|
|
||||||
the trace it eliminates or reorders
|
|
||||||
- **Tradeoffs**: complexity, perf, new edge cases,
|
|
||||||
impact on other code paths
|
|
||||||
- **Risk**: what could go wrong (deadlocks, new
|
|
||||||
races, cancellation issues)
|
|
||||||
|
|
||||||
Rate each fix: `[simple|moderate|complex]` impl
|
|
||||||
effort.
|
|
||||||
|
|
||||||
## 6. Output format
|
|
||||||
|
|
||||||
Structure the full analysis as:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
## Concurrency analysis: `<target>`
|
|
||||||
|
|
||||||
### Shared state
|
|
||||||
<table from step 1>
|
|
||||||
|
|
||||||
### Checkpoints
|
|
||||||
<list from step 2>
|
|
||||||
|
|
||||||
### Race trace
|
|
||||||
<interleaved trace from step 3>
|
|
||||||
|
|
||||||
### Classification
|
|
||||||
<bug type from step 4>
|
|
||||||
|
|
||||||
### Fixes
|
|
||||||
<proposals from step 5>
|
|
||||||
```
|
|
||||||
|
|
||||||
## Tractor-specific patterns to watch
|
|
||||||
|
|
||||||
These are known problem areas in tractor's
|
|
||||||
concurrency model. Flag them when encountered:
|
|
||||||
|
|
||||||
### `_Cache` lock vs `run_ctx` lifetime
|
|
||||||
|
|
||||||
The `_Cache.locks` entry is managed by
|
|
||||||
`maybe_open_context` callers, but `run_ctx` runs
|
|
||||||
in `service_tn` — a different task tree. Lock
|
|
||||||
pop/release in the caller's `finally` does NOT
|
|
||||||
wait for `run_ctx` to finish tearing down. Any
|
|
||||||
state that `run_ctx` cleans up in its `finally`
|
|
||||||
(e.g. `resources.pop()`) is vulnerable to
|
|
||||||
re-entry races after the lock is popped.
|
|
||||||
|
|
||||||
### `values.pop()` → acm `__aexit__` → `resources.pop()` gap
|
|
||||||
|
|
||||||
In `_Cache.run_ctx`, the inner `finally` pops
|
|
||||||
`values`, then the acm's `__aexit__` runs (which
|
|
||||||
has checkpoints), then the outer `finally` pops
|
|
||||||
`resources`. This creates a window where `values`
|
|
||||||
is gone but `resources` still exists — a classic
|
|
||||||
atomicity gap.
|
|
||||||
|
|
||||||
### Global vs per-key counters
|
|
||||||
|
|
||||||
`_Cache.users` as a single `int` (pre-fix) meant
|
|
||||||
that users of different `ctx_key`s inflated each
|
|
||||||
other's counts, preventing teardown when one key's
|
|
||||||
users hit zero. Always verify that per-key state
|
|
||||||
(`users`, `locks`) is actually keyed on `ctx_key`
|
|
||||||
and not on `fid` or some broader key.
|
|
||||||
|
|
||||||
### `Event.set()` wakes but doesn't switch
|
|
||||||
|
|
||||||
`trio.Event.set()` makes waiting tasks *ready* but
|
|
||||||
the current task continues executing until its next
|
|
||||||
checkpoint. Code between `.set()` and the next
|
|
||||||
`await` runs atomically from the scheduler's
|
|
||||||
perspective. Use this to your advantage (or watch
|
|
||||||
for bugs where code assumes the woken task runs
|
|
||||||
immediately).
|
|
||||||
|
|
||||||
### `except` block checkpoint masking
|
|
||||||
|
|
||||||
`await` expressions inside `except` handlers can
|
|
||||||
be masked by `trio.Cancelled`. If a `finally`
|
|
||||||
block runs from an `except` and contains
|
|
||||||
`lock.release()`, the release happens — but any
|
|
||||||
`await` after it in the same `except` may be
|
|
||||||
swallowed. This is why `maybe_open_context`'s
|
|
||||||
cache-miss path does `lock.release()` in a
|
|
||||||
`finally` inside the `except KeyError`.
|
|
||||||
|
|
||||||
### Cancellation in `finally`
|
|
||||||
|
|
||||||
Unlike asyncio, trio allows checkpoints in
|
|
||||||
`finally` blocks. This means `finally` cleanup
|
|
||||||
that does `await` can itself be cancelled (e.g.
|
|
||||||
by nursery shutdown). Watch for cleanup code that
|
|
||||||
assumes it will run to completion.
|
|
||||||
|
|
||||||
### Unbounded waits in cleanup paths
|
|
||||||
|
|
||||||
Any `await <event>.wait()` in a teardown path is
|
|
||||||
a latent deadlock unless the event's setter is
|
|
||||||
GUARANTEED to fire. If the setter depends on
|
|
||||||
external state (peer disconnects, child process
|
|
||||||
exit, subsequent task completion) that itself
|
|
||||||
depends on the current task's progress, you have
|
|
||||||
a mutual wait.
|
|
||||||
|
|
||||||
Rule: **bound every `await X.wait()` in cleanup
|
|
||||||
paths with `trio.move_on_after()`** unless you
|
|
||||||
can prove the setter is unconditionally reachable
|
|
||||||
from the state at the await site. Concrete recent
|
|
||||||
example: `ipc_server.wait_for_no_more_peers()` in
|
|
||||||
`async_main`'s finally (see
|
|
||||||
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
|
|
||||||
"probe iteration 3") — it was unbounded, and when
|
|
||||||
one peer-handler was stuck the wait-for-no-more-
|
|
||||||
peers event never fired, deadlocking the whole
|
|
||||||
actor-tree teardown cascade.
|
|
||||||
|
|
||||||
### The capture-pipe-fill hang pattern (grep this first)
|
|
||||||
|
|
||||||
When investigating any hang in the test suite
|
|
||||||
**especially under fork-based backends**, first
|
|
||||||
check whether the hang reproduces under `pytest
|
|
||||||
-s` (`--capture=no`). If `-s` makes it go away
|
|
||||||
you're not looking at a trio concurrency bug —
|
|
||||||
you're looking at a Linux pipe-buffer fill.
|
|
||||||
|
|
||||||
Mechanism: pytest replaces fds 1,2 with pipe
|
|
||||||
write-ends. Fork-child subactors inherit those
|
|
||||||
fds. High-volume error-log tracebacks (cancel
|
|
||||||
cascade spew) fill the 64KB pipe buffer. Child
|
|
||||||
`write()` blocks. Child can't exit. Parent's
|
|
||||||
`waitpid`/pidfd wait blocks. Deadlock cascades up
|
|
||||||
the tree.
|
|
||||||
|
|
||||||
Pre-existing guards in `tests/conftest.py` encode
|
|
||||||
this knowledge — grep these BEFORE blaming
|
|
||||||
concurrency:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# tests/conftest.py:258
|
|
||||||
if loglevel in ('trace', 'debug'):
|
|
||||||
# XXX: too much logging will lock up the subproc (smh)
|
|
||||||
loglevel: str = 'info'
|
|
||||||
|
|
||||||
# tests/conftest.py:316
|
|
||||||
# can lock up on the `_io.BufferedReader` and hang..
|
|
||||||
stderr: str = proc.stderr.read().decode()
|
|
||||||
```
|
|
||||||
|
|
||||||
See the full post-mortem in
|
|
||||||
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
|
|
||||||
for the canonical reproduction. Cost several
|
|
||||||
investigation sessions before catching it —
|
|
||||||
because the capture-pipe symptom was masked by
|
|
||||||
deeper cascade-deadlocks. Once the cascades were
|
|
||||||
fixed, the tree tore down enough to generate
|
|
||||||
pipe-filling log volume → capture-pipe finally
|
|
||||||
surfaced. Grep-note for future-self: **if a
|
|
||||||
multi-subproc tractor test hangs, `pytest -s`
|
|
||||||
first, conc-anal second.**
|
|
||||||
|
|
@ -1,241 +0,0 @@
|
||||||
# PR/Patch-Request Description Format Reference
|
|
||||||
|
|
||||||
Canonical structure for `tractor` patch-request
|
|
||||||
descriptions, designed to work across GitHub,
|
|
||||||
Gitea, SourceHut, and GitLab markdown renderers.
|
|
||||||
|
|
||||||
**Line length: wrap at 72 chars** for all prose
|
|
||||||
content (Summary bullets, Motivation paragraphs,
|
|
||||||
Scopes bullets, etc.). Fill lines *to* 72 — don't
|
|
||||||
stop short at 50-65. Only raw URLs in
|
|
||||||
reference-link definitions may exceed this.
|
|
||||||
|
|
||||||
## Template
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
<!-- pr-msg-meta
|
|
||||||
branch: <branch-name>
|
|
||||||
base: <base-branch>
|
|
||||||
submitted:
|
|
||||||
github: ___
|
|
||||||
gitea: ___
|
|
||||||
srht: ___
|
|
||||||
-->
|
|
||||||
|
|
||||||
## <Title: present-tense verb + backticked code>
|
|
||||||
|
|
||||||
### Summary
|
|
||||||
- [<hash>][<hash>] Description of change ending
|
|
||||||
with period.
|
|
||||||
- [<hash>][<hash>] Another change description
|
|
||||||
ending with period.
|
|
||||||
- [<hash>][<hash>] [<hash>][<hash>] Multi-commit
|
|
||||||
change description.
|
|
||||||
|
|
||||||
### Motivation
|
|
||||||
<1-2 paragraphs: problem/limitation first,
|
|
||||||
then solution. Hard-wrap at 72 chars.>
|
|
||||||
|
|
||||||
### Scopes changed
|
|
||||||
- [<hash>][<hash>] `pkg.mod.func()` — what
|
|
||||||
changed.
|
|
||||||
* [<hash>][<hash>] Also adjusts
|
|
||||||
`.related_thing()` in same module.
|
|
||||||
- [<hash>][<hash>] `tests.test_mod` — new/changed
|
|
||||||
test coverage.
|
|
||||||
|
|
||||||
<!--
|
|
||||||
### Cross-references
|
|
||||||
Also submitted as
|
|
||||||
[github-pr][] | [gitea-pr][] | [srht-patch][].
|
|
||||||
|
|
||||||
### Links
|
|
||||||
- [relevant-issue-or-discussion](url)
|
|
||||||
- [design-doc-or-screenshot](url)
|
|
||||||
-->
|
|
||||||
|
|
||||||
(this pr content was generated in some part by
|
|
||||||
[`claude-code`][claude-code-gh])
|
|
||||||
|
|
||||||
[<hash>]: https://<service>/<owner>/<repo>/commit/<hash>
|
|
||||||
[claude-code-gh]: https://github.com/anthropics/claude-code
|
|
||||||
|
|
||||||
<!-- cross-service pr refs (fill after submit):
|
|
||||||
[github-pr]: https://github.com/<owner>/<repo>/pull/___
|
|
||||||
[gitea-pr]: https://<host>/<owner>/<repo>/pulls/___
|
|
||||||
[srht-patch]: https://git.sr.ht/~<owner>/<repo>/patches/___
|
|
||||||
-->
|
|
||||||
```
|
|
||||||
|
|
||||||
## Markdown Reference-Link Strategy
|
|
||||||
|
|
||||||
Use reference-style links for ALL commit hashes
|
|
||||||
and cross-service PR refs to ensure cross-service
|
|
||||||
compatibility:
|
|
||||||
|
|
||||||
**Inline usage** (in bullets):
|
|
||||||
```markdown
|
|
||||||
- [f3726cf9][f3726cf9] Add `reg_err_types()`
|
|
||||||
for custom exc lookup.
|
|
||||||
```
|
|
||||||
|
|
||||||
**Definition** (bottom of document):
|
|
||||||
```markdown
|
|
||||||
[f3726cf9]: https://github.com/goodboy/tractor/commit/f3726cf9
|
|
||||||
```
|
|
||||||
|
|
||||||
### Why reference-style?
|
|
||||||
- Keeps prose readable without long inline URLs.
|
|
||||||
- All URLs in one place — trivially swappable
|
|
||||||
per-service.
|
|
||||||
- Most git services auto-link bare SHAs anyway,
|
|
||||||
but explicit refs guarantee it works in *any*
|
|
||||||
md renderer.
|
|
||||||
- The `[hash][hash]` form is self-documenting —
|
|
||||||
display text matches the ref ID.
|
|
||||||
- Cross-service PR refs use the same mechanism:
|
|
||||||
`[github-pr][]` resolves via a ref-link def
|
|
||||||
at the bottom, trivially fillable post-submit.
|
|
||||||
|
|
||||||
## Cross-Service PR Placeholder Mechanism
|
|
||||||
|
|
||||||
The generated description includes three layers
|
|
||||||
of cross-service support, all using native md
|
|
||||||
reference-links:
|
|
||||||
|
|
||||||
### 1. Metadata comment (top of file)
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
<!-- pr-msg-meta
|
|
||||||
branch: remote_exc_type_registry
|
|
||||||
base: main
|
|
||||||
submitted:
|
|
||||||
github: ___
|
|
||||||
gitea: ___
|
|
||||||
srht: ___
|
|
||||||
-->
|
|
||||||
```
|
|
||||||
|
|
||||||
A YAML-ish HTML comment block. The `___`
|
|
||||||
placeholders get filled with PR/patch numbers
|
|
||||||
after submission. Machine-parseable for tooling
|
|
||||||
(e.g. `gish`) but invisible in rendered md.
|
|
||||||
|
|
||||||
### 2. Cross-references section (in body)
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
<!--
|
|
||||||
### Cross-references
|
|
||||||
Also submitted as
|
|
||||||
[github-pr][] | [gitea-pr][] | [srht-patch][].
|
|
||||||
-->
|
|
||||||
```
|
|
||||||
|
|
||||||
Commented out at generation time. After submitting
|
|
||||||
to multiple services, uncomment and the ref-links
|
|
||||||
resolve via the stubs at the bottom.
|
|
||||||
|
|
||||||
### 3. Ref-link stubs (bottom of file)
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
<!-- cross-service pr refs (fill after submit):
|
|
||||||
[github-pr]: https://github.com/goodboy/tractor/pull/___
|
|
||||||
[gitea-pr]: https://pikers.dev/goodboy/tractor/pulls/___
|
|
||||||
[srht-patch]: https://git.sr.ht/~goodboy/tractor/patches/___
|
|
||||||
-->
|
|
||||||
```
|
|
||||||
|
|
||||||
Commented out with `___` number placeholders.
|
|
||||||
After submission: uncomment, replace `___` with
|
|
||||||
the actual number. Each service-specific copy
|
|
||||||
fills in all services' numbers so any copy can
|
|
||||||
cross-reference the others.
|
|
||||||
|
|
||||||
### Post-submission file layout
|
|
||||||
|
|
||||||
```
|
|
||||||
pr_msg_LATEST.md # latest draft (skill root)
|
|
||||||
msgs/
|
|
||||||
20260325T002027Z_mybranch_pr_msg.md # timestamped
|
|
||||||
github/
|
|
||||||
42_pr_msg.md # github PR #42
|
|
||||||
gitea/
|
|
||||||
17_pr_msg.md # gitea PR #17
|
|
||||||
srht/
|
|
||||||
5_pr_msg.md # srht patch #5
|
|
||||||
```
|
|
||||||
|
|
||||||
Each `<service>/<num>_pr_msg.md` is a copy with:
|
|
||||||
- metadata `submitted:` fields filled in
|
|
||||||
- cross-references section uncommented
|
|
||||||
- ref-link stubs uncommented with real numbers
|
|
||||||
- all services cross-linked in each copy
|
|
||||||
|
|
||||||
This mirrors the `gish` skill's
|
|
||||||
`<backend>/<num>.md` pattern.
|
|
||||||
|
|
||||||
## Commit-Link URL Patterns by Service
|
|
||||||
|
|
||||||
| Service | Pattern |
|
|
||||||
|-----------|-------------------------------------|
|
|
||||||
| GitHub | `https://github.com/<o>/<r>/commit/<h>` |
|
|
||||||
| Gitea | `https://<host>/<o>/<r>/commit/<h>` |
|
|
||||||
| SourceHut | `https://git.sr.ht/~<o>/<r>/commit/<h>` |
|
|
||||||
| GitLab | `https://gitlab.com/<o>/<r>/-/commit/<h>` |
|
|
||||||
|
|
||||||
## PR/Patch URL Patterns by Service
|
|
||||||
|
|
||||||
| Service | Pattern |
|
|
||||||
|-----------|-------------------------------------|
|
|
||||||
| GitHub | `https://github.com/<o>/<r>/pull/<n>` |
|
|
||||||
| Gitea | `https://<host>/<o>/<r>/pulls/<n>` |
|
|
||||||
| SourceHut | `https://git.sr.ht/~<o>/<r>/patches/<n>` |
|
|
||||||
| GitLab | `https://gitlab.com/<o>/<r>/-/merge_requests/<n>` |
|
|
||||||
|
|
||||||
## Scope Naming Convention
|
|
||||||
|
|
||||||
Use Python namespace-resolution syntax for
|
|
||||||
referencing changed code scopes:
|
|
||||||
|
|
||||||
| File path | Scope reference |
|
|
||||||
|---------------------------|-------------------------------|
|
|
||||||
| `tractor/_exceptions.py` | `tractor._exceptions` |
|
|
||||||
| `tractor/_state.py` | `tractor._state` |
|
|
||||||
| `tests/test_foo.py` | `tests.test_foo` |
|
|
||||||
| Function in module | `tractor._exceptions.func()` |
|
|
||||||
| Method on class | `.RemoteActorError.src_type` |
|
|
||||||
| Class | `tractor._exceptions.RAE` |
|
|
||||||
|
|
||||||
Prefix with the package path for top-level refs;
|
|
||||||
use leading-dot shorthand (`.ClassName.method()`)
|
|
||||||
for sub-bullets where the parent module is already
|
|
||||||
established.
|
|
||||||
|
|
||||||
## Title Conventions
|
|
||||||
|
|
||||||
Same verb vocabulary as commit messages:
|
|
||||||
- `Add` — wholly new feature/API
|
|
||||||
- `Fix` — bug fix
|
|
||||||
- `Drop` — removal
|
|
||||||
- `Use` — adopt new approach
|
|
||||||
- `Move`/`Mv` — relocate code
|
|
||||||
- `Adjust` — minor tweak
|
|
||||||
- `Update` — enhance existing feature
|
|
||||||
- `Support` — add support for something
|
|
||||||
|
|
||||||
Target 50 chars, hard max 70. Always backtick
|
|
||||||
code elements.
|
|
||||||
|
|
||||||
## Tone
|
|
||||||
|
|
||||||
Casual yet technically precise — matching the
|
|
||||||
project's commit-msg style. Terse but every bullet
|
|
||||||
carries signal. Use project abbreviations freely
|
|
||||||
(msg, bg, ctx, impl, mod, obvi, fn, bc, var,
|
|
||||||
prolly, ep, etc.).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
(this format reference was generated by
|
|
||||||
[`claude-code`][claude-code-gh])
|
|
||||||
[claude-code-gh]: https://github.com/anthropics/claude-code
|
|
||||||
|
|
@ -1,523 +0,0 @@
|
||||||
---
|
|
||||||
name: run-tests
|
|
||||||
description: >
|
|
||||||
Run tractor test suite (or subsets). Use when the user wants
|
|
||||||
to run tests, verify changes, or check for regressions.
|
|
||||||
argument-hint: "[test-path-or-pattern] [--opts]"
|
|
||||||
allowed-tools:
|
|
||||||
- Bash(python -m pytest *)
|
|
||||||
- Bash(python -c *)
|
|
||||||
- Bash(python --version *)
|
|
||||||
- Bash(UV_PROJECT_ENVIRONMENT=py* uv run python *)
|
|
||||||
- Bash(UV_PROJECT_ENVIRONMENT=py* uv run pytest *)
|
|
||||||
- Bash(UV_PROJECT_ENVIRONMENT=py* uv sync *)
|
|
||||||
- Bash(UV_PROJECT_ENVIRONMENT=py* uv pip show *)
|
|
||||||
- Bash(git rev-parse *)
|
|
||||||
- Bash(ls *)
|
|
||||||
- Bash(cat *)
|
|
||||||
- Bash(jq * .pytest_cache/*)
|
|
||||||
- Read
|
|
||||||
- Grep
|
|
||||||
- Glob
|
|
||||||
- Task
|
|
||||||
- AskUserQuestion
|
|
||||||
---
|
|
||||||
|
|
||||||
Run the `tractor` test suite using `pytest`. Follow this
|
|
||||||
process:
|
|
||||||
|
|
||||||
## 1. Parse user intent
|
|
||||||
|
|
||||||
From the user's message and any arguments, determine:
|
|
||||||
|
|
||||||
- **scope**: full suite, specific file(s), specific
|
|
||||||
test(s), or a keyword pattern (`-k`).
|
|
||||||
- **transport**: which IPC transport protocol to test
|
|
||||||
against (default: `tcp`, also: `uds`).
|
|
||||||
- **options**: any extra pytest flags the user wants
|
|
||||||
(e.g. `--ll debug`, `--tpdb`, `-x`, `-v`).
|
|
||||||
|
|
||||||
If the user provides a bare path or pattern as argument,
|
|
||||||
treat it as the test target. Examples:
|
|
||||||
|
|
||||||
- `/run-tests` → full suite
|
|
||||||
- `/run-tests test_local.py` → single file
|
|
||||||
- `/run-tests test_registrar -v` → file + verbose
|
|
||||||
- `/run-tests -k cancel` → keyword filter
|
|
||||||
- `/run-tests tests/ipc/ --tpt-proto uds` → subdir + UDS
|
|
||||||
|
|
||||||
## 2. Construct the pytest command
|
|
||||||
|
|
||||||
Base command:
|
|
||||||
```
|
|
||||||
python -m pytest
|
|
||||||
```
|
|
||||||
|
|
||||||
### Default flags (always include unless user overrides):
|
|
||||||
- `-x` (stop on first failure)
|
|
||||||
- `--tb=short` (concise tracebacks)
|
|
||||||
- `--no-header` (reduce noise)
|
|
||||||
|
|
||||||
### Path resolution:
|
|
||||||
- If the user gives a bare filename like `test_local.py`,
|
|
||||||
resolve it under `tests/`.
|
|
||||||
- If the user gives a subdirectory like `ipc/`, resolve
|
|
||||||
under `tests/ipc/`.
|
|
||||||
- Glob if needed: `tests/**/test_*<pattern>*.py`
|
|
||||||
|
|
||||||
### Key pytest options for this project:
|
|
||||||
|
|
||||||
| Flag | Purpose |
|
|
||||||
|---|---|
|
|
||||||
| `--ll <level>` | Set tractor log level (e.g. `debug`, `info`, `runtime`) |
|
|
||||||
| `--tpdb` / `--debug-mode` | Enable tractor's multi-proc debugger |
|
|
||||||
| `--tpt-proto <key>` | IPC transport: `tcp` (default) or `uds` |
|
|
||||||
| `--spawn-backend <be>` | Spawn method: `trio` (default), `mp_spawn`, `mp_forkserver` |
|
|
||||||
| `-k <expr>` | pytest keyword filter |
|
|
||||||
| `-v` / `-vv` | Verbosity |
|
|
||||||
| `-s` | No output capture (useful with `--tpdb`) |
|
|
||||||
|
|
||||||
### Common combos:
|
|
||||||
```sh
|
|
||||||
# quick smoke test of core modules
|
|
||||||
python -m pytest tests/test_local.py tests/test_rpc.py -x --tb=short --no-header
|
|
||||||
|
|
||||||
# full suite, stop on first failure
|
|
||||||
python -m pytest tests/ -x --tb=short --no-header
|
|
||||||
|
|
||||||
# specific test with debug
|
|
||||||
python -m pytest tests/discovery/test_registrar.py::test_reg_then_unreg -x -s --tpdb --ll debug
|
|
||||||
|
|
||||||
# run with UDS transport
|
|
||||||
python -m pytest tests/ -x --tb=short --no-header --tpt-proto uds
|
|
||||||
|
|
||||||
# keyword filter
|
|
||||||
python -m pytest tests/ -x --tb=short --no-header -k "cancel and not slow"
|
|
||||||
```
|
|
||||||
|
|
||||||
## 3. Pre-flight: venv detection (MANDATORY)
|
|
||||||
|
|
||||||
**Always verify a `uv` venv is active before running
|
|
||||||
`python` or `pytest`.** This project uses
|
|
||||||
`UV_PROJECT_ENVIRONMENT=py<MINOR>` naming (e.g.
|
|
||||||
`py313`) — never `.venv`.
|
|
||||||
|
|
||||||
### Step 1: detect active venv
|
|
||||||
|
|
||||||
Run this check first:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
python -c "
|
|
||||||
import sys, os
|
|
||||||
venv = os.environ.get('VIRTUAL_ENV', '')
|
|
||||||
prefix = sys.prefix
|
|
||||||
print(f'VIRTUAL_ENV={venv}')
|
|
||||||
print(f'sys.prefix={prefix}')
|
|
||||||
print(f'executable={sys.executable}')
|
|
||||||
"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 2: interpret results
|
|
||||||
|
|
||||||
**Case A — venv is active** (`VIRTUAL_ENV` is set
|
|
||||||
and points to a `py<MINOR>/` dir under the project
|
|
||||||
root or worktree):
|
|
||||||
|
|
||||||
Use bare `python` / `python -m pytest` for all
|
|
||||||
commands. This is the normal, fast path.
|
|
||||||
|
|
||||||
**Case B — no venv active** (`VIRTUAL_ENV` is empty
|
|
||||||
or `sys.prefix` points to a system Python):
|
|
||||||
|
|
||||||
Use `AskUserQuestion` to ask the user:
|
|
||||||
|
|
||||||
> "No uv venv is active. Should I create/sync one
|
|
||||||
> via `UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync`,
|
|
||||||
> or would you prefer to activate your shell venv
|
|
||||||
> first?"
|
|
||||||
|
|
||||||
Options:
|
|
||||||
1. **"Create/sync venv"** — run
|
|
||||||
`UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync` where
|
|
||||||
`<MINOR>` is detected from `python --version`
|
|
||||||
(e.g. `313` for 3.13). Then use
|
|
||||||
`py<MINOR>/bin/python` for all subsequent
|
|
||||||
commands in this session.
|
|
||||||
2. **"I'll activate it myself"** — stop and let the
|
|
||||||
user `source py<MINOR>/bin/activate` or similar.
|
|
||||||
|
|
||||||
**Case C — inside a git worktree** (`git rev-parse
|
|
||||||
--git-common-dir` differs from `--git-dir`):
|
|
||||||
|
|
||||||
Verify Python resolves from the **worktree's own
|
|
||||||
venv**, not the main repo's:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
python -c "import tractor; print(tractor.__file__)"
|
|
||||||
```
|
|
||||||
|
|
||||||
If the path points outside the worktree, create a
|
|
||||||
worktree-local venv:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync
|
|
||||||
```
|
|
||||||
|
|
||||||
Then use `py<MINOR>/bin/python` for all commands.
|
|
||||||
|
|
||||||
**Why this matters**: without the correct venv,
|
|
||||||
subprocesses spawned by tractor resolve modules
|
|
||||||
from the wrong editable install, causing spurious
|
|
||||||
`AttributeError` / `ModuleNotFoundError`.
|
|
||||||
|
|
||||||
### Fallback: `uv run`
|
|
||||||
|
|
||||||
If the user can't or won't activate a venv, all
|
|
||||||
`python` and `pytest` commands can be prefixed
|
|
||||||
with `UV_PROJECT_ENVIRONMENT=py<MINOR> uv run`:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# instead of: python -m pytest tests/ -x
|
|
||||||
UV_PROJECT_ENVIRONMENT=py313 uv run pytest tests/ -x
|
|
||||||
|
|
||||||
# instead of: python -c 'import tractor'
|
|
||||||
UV_PROJECT_ENVIRONMENT=py313 uv run python -c 'import tractor'
|
|
||||||
```
|
|
||||||
|
|
||||||
`uv run` auto-discovers the project and venv,
|
|
||||||
but is slower than a pre-activated venv due to
|
|
||||||
lock-file resolution on each invocation. Prefer
|
|
||||||
activating the venv when possible.
|
|
||||||
|
|
||||||
### Step 3: import + collection checks
|
|
||||||
|
|
||||||
After venv is confirmed, always run these
|
|
||||||
(especially after refactors or module moves):
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# 1. package import smoke check
|
|
||||||
python -c 'import tractor; print(tractor)'
|
|
||||||
|
|
||||||
# 2. verify all tests collect (no import errors)
|
|
||||||
python -m pytest tests/ -x -q --co 2>&1 | tail -5
|
|
||||||
```
|
|
||||||
|
|
||||||
If either fails, fix the import error before running
|
|
||||||
any actual tests.
|
|
||||||
|
|
||||||
### Step 4: zombie-actor / stale-registry check (MANDATORY)
|
|
||||||
|
|
||||||
The tractor runtime's default registry address is
|
|
||||||
**`127.0.0.1:1616`** (TCP) / `/tmp/registry@1616.sock`
|
|
||||||
(UDS). Whenever any prior test run — especially one
|
|
||||||
using a fork-based backend like `subint_forkserver` —
|
|
||||||
leaks a child actor process, that zombie keeps the
|
|
||||||
registry port bound and **every subsequent test
|
|
||||||
session fails to bind**, often presenting as 50+
|
|
||||||
unrelated failures ("all tests broken"!) across
|
|
||||||
backends.
|
|
||||||
|
|
||||||
**This has to be checked before the first run AND
|
|
||||||
after any cancelled/SIGINT'd run** — signal failures
|
|
||||||
in the middle of a test can leave orphan children.
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# 1. TCP registry — any listener on :1616? (primary signal)
|
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 free'
|
|
||||||
|
|
||||||
# 2. leftover actor/forkserver procs — scoped to THIS
|
|
||||||
# repo's python path, so we don't false-flag legit
|
|
||||||
# long-running tractor-using apps (e.g. `piker`,
|
|
||||||
# downstream projects that embed tractor).
|
|
||||||
pgrep -af "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" \
|
|
||||||
| grep -v 'grep\|pgrep' \
|
|
||||||
|| echo 'no leaked actor procs from this repo'
|
|
||||||
|
|
||||||
# 3. stale UDS registry sockets
|
|
||||||
ls -la /tmp/registry@*.sock 2>/dev/null \
|
|
||||||
|| echo 'no leaked UDS registry sockets'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Interpretation:**
|
|
||||||
|
|
||||||
- **TCP :1616 free AND no stale sockets** → clean,
|
|
||||||
proceed. The actor-procs probe is secondary — false
|
|
||||||
positives are common (piker, any other tractor-
|
|
||||||
embedding app); only clean up if `:1616` is bound or
|
|
||||||
sockets linger.
|
|
||||||
- **TCP :1616 bound OR stale sockets present** →
|
|
||||||
surface PIDs + cmdlines to the user, offer cleanup:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# 1. GRACEFUL FIRST (tractor is structured concurrent — it
|
|
||||||
# catches SIGINT as an OS-cancel in `_trio_main` and
|
|
||||||
# cascades Portal.cancel_actor via IPC to every descendant.
|
|
||||||
# So always try SIGINT first with a bounded timeout; only
|
|
||||||
# escalate to SIGKILL if graceful cleanup doesn't complete).
|
|
||||||
pkill -INT -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
|
||||||
|
|
||||||
# 2. bounded wait for graceful teardown (usually sub-second).
|
|
||||||
# Loop until the processes exit, or timeout. Keep the
|
|
||||||
# bound tight — hung/abrupt-killed descendants usually
|
|
||||||
# hang forever, so don't wait more than a few seconds.
|
|
||||||
for i in $(seq 1 10); do
|
|
||||||
pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null || break
|
|
||||||
sleep 0.3
|
|
||||||
done
|
|
||||||
|
|
||||||
# 3. ESCALATE TO SIGKILL only if graceful didn't finish.
|
|
||||||
if pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null; then
|
|
||||||
echo 'graceful teardown timed out — escalating to SIGKILL'
|
|
||||||
pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# 4. if a test zombie holds :1616 specifically and doesn't
|
|
||||||
# match the above pattern, find its PID the hard way:
|
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' # prints `users:(("<name>",pid=NNNN,...))`
|
|
||||||
# then (same SIGINT-first ladder):
|
|
||||||
# kill -INT <NNNN>; sleep 1; kill -9 <NNNN> 2>/dev/null
|
|
||||||
|
|
||||||
# 5. remove stale UDS sockets
|
|
||||||
rm -f /tmp/registry@*.sock
|
|
||||||
|
|
||||||
# 6. re-verify
|
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Never ignore stale registry state.** If you see the
|
|
||||||
"all tests failing" pattern — especially
|
|
||||||
`trio.TooSlowError` / connection refused / address in
|
|
||||||
use on many unrelated tests — check registry **before**
|
|
||||||
spelunking into test code. The failure signature will
|
|
||||||
be identical across backends because they're all
|
|
||||||
fighting for the same port.
|
|
||||||
|
|
||||||
**False-positive warning for step 2:** a plain
|
|
||||||
`pgrep -af '_actor_child_main'` will also match
|
|
||||||
legit long-running tractor-embedding apps (e.g.
|
|
||||||
`piker` at `~/repos/piker/py*/bin/python3 -m
|
|
||||||
tractor._child ...`). Always scope to the current
|
|
||||||
repo's python path, or only use step 1 (`:1616`) as
|
|
||||||
the authoritative signal.
|
|
||||||
|
|
||||||
## 4. Run and report
|
|
||||||
|
|
||||||
- Run the constructed command.
|
|
||||||
- Use a timeout of **600000ms** (10min) for full suite
|
|
||||||
runs, **120000ms** (2min) for single-file runs.
|
|
||||||
- If the suite is large (full `tests/`), consider running
|
|
||||||
in the background and checking output when done.
|
|
||||||
- Use `--lf` (last-failed) to re-run only previously
|
|
||||||
failing tests when iterating on a fix.
|
|
||||||
|
|
||||||
### On failure:
|
|
||||||
- Show the failing test name(s) and short traceback.
|
|
||||||
- If the failure looks related to recent changes, point
|
|
||||||
out the likely cause and suggest a fix.
|
|
||||||
- **Check the known-flaky list** (section 8) before
|
|
||||||
investigating — don't waste time on pre-existing
|
|
||||||
timeout issues.
|
|
||||||
- **NEVER auto-commit fixes.** If you apply a code fix
|
|
||||||
during test iteration, leave it unstaged. Tell the
|
|
||||||
user what changed and suggest they review the
|
|
||||||
worktree state, stage files manually, and use
|
|
||||||
`/commit-msg` (inline or in a separate session) to
|
|
||||||
generate the commit message. The human drives all
|
|
||||||
`git add` and `git commit` operations.
|
|
||||||
|
|
||||||
### On success:
|
|
||||||
- Report the pass/fail/skip counts concisely.
|
|
||||||
|
|
||||||
## 5. Test directory layout (reference)
|
|
||||||
|
|
||||||
```
|
|
||||||
tests/
|
|
||||||
├── conftest.py # root fixtures, daemon, signals
|
|
||||||
├── devx/ # debugger/tooling tests
|
|
||||||
├── ipc/ # transport protocol tests
|
|
||||||
├── msg/ # messaging layer tests
|
|
||||||
├── discovery/ # discovery subsystem tests
|
|
||||||
│ ├── test_multiaddr.py # multiaddr construction
|
|
||||||
│ └── test_registrar.py # registry/discovery protocol
|
|
||||||
├── test_local.py # registrar + local actor basics
|
|
||||||
├── test_rpc.py # RPC error handling
|
|
||||||
├── test_spawning.py # subprocess spawning
|
|
||||||
├── test_multi_program.py # multi-process tree tests
|
|
||||||
├── test_cancellation.py # cancellation semantics
|
|
||||||
├── test_context_stream_semantics.py # ctx streaming
|
|
||||||
├── test_inter_peer_cancellation.py # peer cancel
|
|
||||||
├── test_infected_asyncio.py # trio-in-asyncio
|
|
||||||
└── ...
|
|
||||||
```
|
|
||||||
|
|
||||||
## 6. Change-type → test mapping
|
|
||||||
|
|
||||||
After modifying specific modules, run the corresponding
|
|
||||||
test subset first for fast feedback:
|
|
||||||
|
|
||||||
| Changed module(s) | Run these tests first |
|
|
||||||
|---|---|
|
|
||||||
| `runtime/_runtime.py`, `runtime/_state.py` | `test_local.py test_rpc.py test_spawning.py test_root_runtime.py` |
|
|
||||||
| `discovery/` (`_registry`, `_discovery`, `_addr`) | `tests/discovery/ test_multi_program.py test_local.py` |
|
|
||||||
| `_context.py`, `_streaming.py` | `test_context_stream_semantics.py test_advanced_streaming.py` |
|
|
||||||
| `ipc/` (`_chan`, `_server`, `_transport`) | `tests/ipc/ test_2way.py` |
|
|
||||||
| `runtime/_portal.py`, `runtime/_rpc.py` | `test_rpc.py test_cancellation.py` |
|
|
||||||
| `spawn/` (`_spawn`, `_entry`) | `test_spawning.py test_multi_program.py` |
|
|
||||||
| `devx/debug/` | `tests/devx/test_debugger.py` (slow!) |
|
|
||||||
| `to_asyncio.py` | `test_infected_asyncio.py test_root_infect_asyncio.py` |
|
|
||||||
| `msg/` | `tests/msg/` |
|
|
||||||
| `_exceptions.py` | `test_remote_exc_relay.py test_inter_peer_cancellation.py` |
|
|
||||||
| `runtime/_supervise.py` | `test_cancellation.py test_spawning.py` |
|
|
||||||
|
|
||||||
## 7. Quick-check shortcuts
|
|
||||||
|
|
||||||
### After refactors (fastest first-pass):
|
|
||||||
```sh
|
|
||||||
# import + collect check
|
|
||||||
python -c 'import tractor' && python -m pytest tests/ -x -q --co 2>&1 | tail -3
|
|
||||||
|
|
||||||
# core subset (~10s)
|
|
||||||
python -m pytest tests/test_local.py tests/test_rpc.py tests/test_spawning.py tests/discovery/test_registrar.py -x --tb=short --no-header
|
|
||||||
```
|
|
||||||
|
|
||||||
### Inspect last failures (without re-running):
|
|
||||||
|
|
||||||
When the user asks "what failed?", "show failures",
|
|
||||||
or wants to check the last-failed set before
|
|
||||||
re-running — read the pytest cache directly. This
|
|
||||||
is instant and avoids test collection overhead.
|
|
||||||
|
|
||||||
```sh
|
|
||||||
python -c "
|
|
||||||
import json, pathlib, sys
|
|
||||||
p = pathlib.Path('.pytest_cache/v/cache/lastfailed')
|
|
||||||
if not p.exists():
|
|
||||||
print('No lastfailed cache found.'); sys.exit()
|
|
||||||
data = json.loads(p.read_text())
|
|
||||||
# filter to real test node IDs (ignore junk
|
|
||||||
# entries that can accumulate from system paths)
|
|
||||||
tests = sorted(k for k in data if k.startswith('tests/'))
|
|
||||||
if not tests:
|
|
||||||
print('No failures recorded.')
|
|
||||||
else:
|
|
||||||
print(f'{len(tests)} last-failed test(s):')
|
|
||||||
for t in tests:
|
|
||||||
print(f' {t}')
|
|
||||||
"
|
|
||||||
```
|
|
||||||
|
|
||||||
**Why not `--cache-show` or `--co --lf`?**
|
|
||||||
|
|
||||||
- `pytest --cache-show 'cache/lastfailed'` works
|
|
||||||
but dumps raw dict repr including junk entries
|
|
||||||
(stale system paths that leak into the cache).
|
|
||||||
- `pytest --co --lf` actually *collects* tests which
|
|
||||||
triggers import resolution and is slow (~0.5s+).
|
|
||||||
Worse, when cached node IDs don't exactly match
|
|
||||||
current parametrize IDs (e.g. param names changed
|
|
||||||
between runs), pytest falls back to collecting
|
|
||||||
the *entire file*, giving false positives.
|
|
||||||
- Reading the JSON directly is instant, filterable
|
|
||||||
to `tests/`-prefixed entries, and shows exactly
|
|
||||||
what pytest recorded — no interpretation.
|
|
||||||
|
|
||||||
**After inspecting**, re-run the failures:
|
|
||||||
```sh
|
|
||||||
python -m pytest --lf -x --tb=short --no-header
|
|
||||||
```
|
|
||||||
|
|
||||||
### Full suite in background:
|
|
||||||
When core tests pass and you want full coverage while
|
|
||||||
continuing other work, run in background:
|
|
||||||
```sh
|
|
||||||
python -m pytest tests/ -x --tb=short --no-header -q
|
|
||||||
```
|
|
||||||
(use `run_in_background=true` on the Bash tool)
|
|
||||||
|
|
||||||
## 8. Known flaky tests
|
|
||||||
|
|
||||||
These tests have **pre-existing** timing/environment
|
|
||||||
sensitivity. If they fail with `TooSlowError` or
|
|
||||||
pexpect `TIMEOUT`, they are almost certainly NOT caused
|
|
||||||
by your changes — note them and move on.
|
|
||||||
|
|
||||||
| Test | Typical error | Notes |
|
|
||||||
|---|---|---|
|
|
||||||
| `devx/test_debugger.py::test_multi_nested_subactors_error_through_nurseries` | pexpect TIMEOUT | Debugger pexpect timing |
|
|
||||||
| `test_cancellation.py::test_cancel_via_SIGINT_other_task` | TooSlowError | Signal handling race |
|
|
||||||
| `test_inter_peer_cancellation.py::test_peer_spawns_and_cancels_service_subactor` | TooSlowError | Async timing (both param variants) |
|
|
||||||
| `test_docs_examples.py::test_example[we_are_processes.py]` | `assert None == 0` | `__main__` missing `__file__` in subproc |
|
|
||||||
|
|
||||||
**Rule of thumb**: if a test fails with `TooSlowError`,
|
|
||||||
`trio.TooSlowError`, or `pexpect.TIMEOUT` and you didn't
|
|
||||||
touch the relevant code path, it's flaky — skip it.
|
|
||||||
|
|
||||||
## 9. The pytest-capture hang pattern (CHECK THIS FIRST)
|
|
||||||
|
|
||||||
**Symptom:** a tractor test hangs indefinitely under
|
|
||||||
default `pytest` but passes within seconds when you add
|
|
||||||
`-s` (`--capture=no`).
|
|
||||||
|
|
||||||
**Cause:** tractor subactors (especially under fork-
|
|
||||||
based backends) inherit pytest's stdout/stderr
|
|
||||||
capture pipes via fds 1,2. Under high-volume error
|
|
||||||
logging (e.g. multi-level cancel cascade, nested
|
|
||||||
`run_in_actor` failures, anything triggering
|
|
||||||
`RemoteActorError` + `ExceptionGroup` traceback
|
|
||||||
spew), the **64KB Linux pipe buffer fills** faster
|
|
||||||
than pytest drains it. Subactor writes block → can't
|
|
||||||
finish exit → parent's `waitpid`/pidfd wait blocks →
|
|
||||||
deadlock cascades up the tree.
|
|
||||||
|
|
||||||
**Pre-existing guards in the tractor harness** that
|
|
||||||
encode this same knowledge — grep these FIRST
|
|
||||||
before spelunking:
|
|
||||||
|
|
||||||
- `tests/conftest.py:258-260` (in the `daemon`
|
|
||||||
fixture): `# XXX: too much logging will lock up
|
|
||||||
the subproc (smh)` — downgrades `trace`/`debug`
|
|
||||||
loglevel to `info` to prevent the hang.
|
|
||||||
- `tests/conftest.py:316`: `# can lock up on the
|
|
||||||
_io.BufferedReader and hang..` — noted on the
|
|
||||||
`proc.stderr.read()` post-SIGINT.
|
|
||||||
|
|
||||||
**Debug recipe (in priority order):**
|
|
||||||
|
|
||||||
1. **Try `-s` first.** If the hang disappears with
|
|
||||||
`pytest -s`, you've confirmed it's capture-pipe
|
|
||||||
fill. Skip spelunking.
|
|
||||||
2. **Lower the loglevel.** Default `--ll=error` on
|
|
||||||
this project; if you've bumped it to `debug` /
|
|
||||||
`info`, try dropping back. Each log level
|
|
||||||
multiplies pipe-pressure under fault cascades.
|
|
||||||
3. **If you MUST use default capture + high log
|
|
||||||
volume**, redirect subactor stdout/stderr in the
|
|
||||||
child prelude (e.g.
|
|
||||||
`tractor.spawn._subint_forkserver._child_target`
|
|
||||||
post-`_close_inherited_fds`) to `/dev/null` or a
|
|
||||||
file.
|
|
||||||
|
|
||||||
**Signature tells you it's THIS bug (vs. a real
|
|
||||||
code hang):**
|
|
||||||
|
|
||||||
- Multi-actor test under fork-based backend
|
|
||||||
(`subint_forkserver`, eventually `trio_proc` too
|
|
||||||
under enough log volume).
|
|
||||||
- Multiple `RemoteActorError` / `ExceptionGroup`
|
|
||||||
tracebacks in the error path.
|
|
||||||
- Test passes with `-s` in the 5-10s range, hangs
|
|
||||||
past pytest-timeout (usually 30+ s) without `-s`.
|
|
||||||
- Subactor processes visible via `pgrep -af
|
|
||||||
subint-forkserv` or similar after the hang —
|
|
||||||
they're alive but blocked on `write()` to an
|
|
||||||
inherited stdout fd.
|
|
||||||
|
|
||||||
**Historical reference:** this deadlock cost a
|
|
||||||
multi-session investigation (4 genuine cascade
|
|
||||||
fixes landed along the way) that only surfaced the
|
|
||||||
capture-pipe issue AFTER the deeper fixes let the
|
|
||||||
tree actually tear down enough to produce pipe-
|
|
||||||
filling log volume. Full post-mortem in
|
|
||||||
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`.
|
|
||||||
Lesson codified here so future-me grep-finds the
|
|
||||||
workaround before digging.
|
|
||||||
|
|
@ -1,18 +1,10 @@
|
||||||
name: CI
|
name: CI
|
||||||
|
|
||||||
# NOTE distilled from,
|
|
||||||
# https://github.com/orgs/community/discussions/26276
|
|
||||||
on:
|
on:
|
||||||
# any time a new update to 'main'
|
# any time someone pushes a new branch to origin
|
||||||
push:
|
push:
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
|
|
||||||
# run on all (forked) PRs to the repo
|
# Allows you to run this workflow manually from the Actions tab
|
||||||
# NOTE, use a draft PR if you just want CI triggered..
|
|
||||||
pull_request:
|
|
||||||
|
|
||||||
# to run workflow manually from the "Actions" tab
|
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
@ -82,44 +74,24 @@ jobs:
|
||||||
# run: mypy tractor/ --ignore-missing-imports --show-traceback
|
# run: mypy tractor/ --ignore-missing-imports --show-traceback
|
||||||
|
|
||||||
|
|
||||||
testing:
|
testing-linux:
|
||||||
name: '${{ matrix.os }} Python${{ matrix.python-version }} spawn_backend=${{ matrix.spawn_backend }} tpt_proto=${{ matrix.tpt_proto }}'
|
name: '${{ matrix.os }} Python ${{ matrix.python }} - ${{ matrix.spawn_backend }}'
|
||||||
timeout-minutes: 16
|
timeout-minutes: 10
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
os: [
|
os: [ubuntu-latest]
|
||||||
ubuntu-latest,
|
python-version: ['3.13']
|
||||||
macos-latest,
|
|
||||||
]
|
|
||||||
python-version: [
|
|
||||||
'3.13',
|
|
||||||
# '3.14',
|
|
||||||
]
|
|
||||||
spawn_backend: [
|
spawn_backend: [
|
||||||
'trio',
|
'trio',
|
||||||
# 'mp_spawn',
|
# 'mp_spawn',
|
||||||
# 'mp_forkserver',
|
# 'mp_forkserver',
|
||||||
# ?TODO^ is it worth it to get these running again?
|
|
||||||
#
|
|
||||||
# - [ ] next-gen backends, on 3.13+
|
|
||||||
# https://github.com/goodboy/tractor/issues/379
|
|
||||||
# 'subinterpreter',
|
|
||||||
# 'subint',
|
|
||||||
]
|
]
|
||||||
tpt_proto: [
|
|
||||||
'tcp',
|
|
||||||
'uds',
|
|
||||||
]
|
|
||||||
# https://github.com/orgs/community/discussions/26253#discussioncomment-3250989
|
|
||||||
exclude:
|
|
||||||
# don't do UDS run on macOS (for now)
|
|
||||||
- os: macos-latest
|
|
||||||
tpt_proto: 'uds'
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: 'Install uv + py-${{ matrix.python-version }}'
|
- name: 'Install uv + py-${{ matrix.python-version }}'
|
||||||
|
|
@ -146,11 +118,7 @@ jobs:
|
||||||
run: uv tree
|
run: uv tree
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: >
|
run: uv run pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} -rsx
|
||||||
uv run
|
|
||||||
pytest tests/ -rsx
|
|
||||||
--spawn-backend=${{ matrix.spawn_backend }}
|
|
||||||
--tpt-proto=${{ matrix.tpt_proto }}
|
|
||||||
|
|
||||||
# XXX legacy NOTE XXX
|
# XXX legacy NOTE XXX
|
||||||
#
|
#
|
||||||
|
|
|
||||||
|
|
@ -102,69 +102,3 @@ venv.bak/
|
||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
|
||||||
# all files under
|
|
||||||
.git/
|
|
||||||
|
|
||||||
# require very explicit staging for anything we **really**
|
|
||||||
# want put/kept in repo.
|
|
||||||
notes_to_self/
|
|
||||||
snippets/
|
|
||||||
|
|
||||||
# ------- AI shiz -------
|
|
||||||
# `ai.skillz` symlinks,
|
|
||||||
# (machine-local, deploy via deploy-skill.sh)
|
|
||||||
.claude/skills/py-codestyle
|
|
||||||
.claude/skills/close-wkt
|
|
||||||
.claude/skills/plan-io
|
|
||||||
.claude/skills/prompt-io
|
|
||||||
.claude/skills/resolve-conflicts
|
|
||||||
.claude/skills/inter-skill-review
|
|
||||||
|
|
||||||
# /open-wkt specifics
|
|
||||||
.claude/skills/open-wkt
|
|
||||||
.claude/wkts/
|
|
||||||
claude_wkts
|
|
||||||
|
|
||||||
# /code-review-changes specifics
|
|
||||||
.claude/skills/code-review-changes
|
|
||||||
# review-skill ephemeral ctx (per-PR, single-use)
|
|
||||||
.claude/review_context.md
|
|
||||||
.claude/review_regression.md
|
|
||||||
|
|
||||||
# /pr-msg specifics
|
|
||||||
.claude/skills/pr-msg/*
|
|
||||||
# repo-specific
|
|
||||||
!.claude/skills/pr-msg/format-reference.md
|
|
||||||
# XXX, so u can nvim-telescope this file.
|
|
||||||
# !.claude/skills/pr-msg/pr_msg_LATEST.md
|
|
||||||
|
|
||||||
# /commit-msg specifics
|
|
||||||
# - any commit-msg gen tmp files
|
|
||||||
.claude/*_commit_*.md
|
|
||||||
.claude/*_commit*.txt
|
|
||||||
.claude/skills/commit-msg/*
|
|
||||||
!.claude/skills/commit-msg/style-guide-reference.md
|
|
||||||
|
|
||||||
# use prompt-io instead?
|
|
||||||
.claude/plans
|
|
||||||
|
|
||||||
# nix develop --profile .nixdev
|
|
||||||
.nixdev*
|
|
||||||
|
|
||||||
# :Obsession .
|
|
||||||
Session.vim
|
|
||||||
|
|
||||||
# `gish` local `.md`-files
|
|
||||||
# TODO? better all around automation!
|
|
||||||
# -[ ] it'd be handy to also commit and sync with wtv git service?
|
|
||||||
# -[ ] everything should be put under a `.gish/` no?
|
|
||||||
gitea/
|
|
||||||
gh/
|
|
||||||
|
|
||||||
# ------ macOS ------
|
|
||||||
# Finder metadata
|
|
||||||
**/.DS_Store
|
|
||||||
|
|
||||||
# LLM conversations that should remain private
|
|
||||||
docs/conversations/
|
|
||||||
|
|
|
||||||
|
|
@ -1,161 +0,0 @@
|
||||||
# `subint` backend: parent trio loop parks after subint teardown (Ctrl-C works; not a CPython-level issue)
|
|
||||||
|
|
||||||
Follow-up to the Phase B subint spawn-backend PR (see
|
|
||||||
`tractor.spawn._subint`, issue #379). Distinct from the
|
|
||||||
`subint_sigint_starvation_issue.md` (SIGINT-unresponsive
|
|
||||||
starvation hang): this one is **Ctrl-C-able**, which means
|
|
||||||
it's *not* the shared-GIL-hostage class and is ours to fix
|
|
||||||
from inside tractor rather than waiting on upstream CPython
|
|
||||||
/ msgspec progress.
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
After a stuck-subint subactor is torn down via the
|
|
||||||
hard-kill path, a parent-side trio task parks on an
|
|
||||||
*orphaned resource* (most likely a `chan.recv()` /
|
|
||||||
`process_messages` loop on the now-dead subint's IPC
|
|
||||||
channel) and waits forever for bytes that can't arrive —
|
|
||||||
because the channel was torn down without emitting a clean
|
|
||||||
EOF/`BrokenResourceError` to the waiting receiver.
|
|
||||||
|
|
||||||
Unlike `subint_sigint_starvation_issue.md`, the main trio
|
|
||||||
loop **is** iterating normally — SIGINT delivers cleanly
|
|
||||||
and the test unhangs. But absent Ctrl-C, the test suite
|
|
||||||
wedges indefinitely.
|
|
||||||
|
|
||||||
## Symptom
|
|
||||||
|
|
||||||
Running `test_subint_non_checkpointing_child` under
|
|
||||||
`--spawn-backend=subint` (in
|
|
||||||
`tests/test_subint_cancellation.py`):
|
|
||||||
|
|
||||||
1. Test spawns a subactor whose main task runs
|
|
||||||
`threading.Event.wait(1.0)` in a loop — releases the
|
|
||||||
GIL but never inserts a trio checkpoint.
|
|
||||||
2. Parent does `an.cancel_scope.cancel()`. Our
|
|
||||||
`subint_proc` cancel path fires: soft-kill sends
|
|
||||||
`Portal.cancel_actor()` over the live IPC channel →
|
|
||||||
subint's trio loop *should* process the cancel msg on
|
|
||||||
its IPC dispatcher task (since the GIL releases are
|
|
||||||
happening).
|
|
||||||
3. Expected: subint's `trio.run()` unwinds, driver thread
|
|
||||||
exits naturally, parent returns.
|
|
||||||
4. Actual: parent `trio.run()` never completes. Test
|
|
||||||
hangs past its `trio.fail_after()` deadline.
|
|
||||||
|
|
||||||
## Evidence
|
|
||||||
|
|
||||||
### `strace` on the hung pytest process during SIGINT
|
|
||||||
|
|
||||||
```
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(17, "\2", 1) = 1
|
|
||||||
```
|
|
||||||
|
|
||||||
Contrast with the SIGINT-starvation hang (see
|
|
||||||
`subint_sigint_starvation_issue.md`) where that same
|
|
||||||
`write()` returned `EAGAIN`. Here the SIGINT byte is
|
|
||||||
written successfully → Python's signal handler pipe is
|
|
||||||
being drained → main trio loop **is** iterating → SIGINT
|
|
||||||
gets turned into `trio.Cancelled` → the test unhangs (if
|
|
||||||
the operator happens to be there to hit Ctrl-C).
|
|
||||||
|
|
||||||
### Stack dump (via `tractor.devx.dump_on_hang`)
|
|
||||||
|
|
||||||
Single main thread visible, parked in
|
|
||||||
`trio._core._io_epoll.get_events` inside `trio.run` at the
|
|
||||||
test's `trio.run(...)` call site. No subint driver thread
|
|
||||||
(subint was destroyed successfully — this is *after* the
|
|
||||||
hard-kill path, not during it).
|
|
||||||
|
|
||||||
## Root cause hypothesis
|
|
||||||
|
|
||||||
Most consistent with the evidence: a parent-side trio
|
|
||||||
task is awaiting a `chan.recv()` / `process_messages` loop
|
|
||||||
on the dead subint's IPC channel. The sequence:
|
|
||||||
|
|
||||||
1. Soft-kill in `subint_proc` sends `Portal.cancel_actor()`
|
|
||||||
over the channel. The subint's trio dispatcher *may* or
|
|
||||||
may not have processed the cancel msg before the subint
|
|
||||||
was destroyed — timing-dependent.
|
|
||||||
2. Hard-kill timeout fires (because the subint's main
|
|
||||||
task was in `threading.Event.wait()` with no trio
|
|
||||||
checkpoint — cancel-msg processing couldn't race the
|
|
||||||
timeout).
|
|
||||||
3. Driver thread abandoned, `_interpreters.destroy()`
|
|
||||||
runs. Subint is gone.
|
|
||||||
4. But the parent-side trio task holding a
|
|
||||||
`chan.recv()` / `process_messages` loop against that
|
|
||||||
channel was **not** explicitly cancelled. The channel's
|
|
||||||
underlying socket got torn down, but without a clean
|
|
||||||
EOF delivered to the waiting recv, the task parks
|
|
||||||
forever on `trio.lowlevel.wait_readable` (or similar).
|
|
||||||
|
|
||||||
This matches the "main loop fine, task parked on
|
|
||||||
orphaned I/O" signature.
|
|
||||||
|
|
||||||
## Why this is ours to fix (not CPython's)
|
|
||||||
|
|
||||||
- Main trio loop iterates normally → GIL isn't starved.
|
|
||||||
- SIGINT is deliverable → not a signal-pipe-full /
|
|
||||||
wakeup-fd contention scenario.
|
|
||||||
- The hang is in *our* supervision code, specifically in
|
|
||||||
how `subint_proc` tears down its side of the IPC when
|
|
||||||
the subint is abandoned/destroyed.
|
|
||||||
|
|
||||||
## Possible fix directions
|
|
||||||
|
|
||||||
1. **Explicit parent-side channel abort on subint
|
|
||||||
abandon.** In `subint_proc`'s teardown block, after the
|
|
||||||
hard-kill timeout fires, explicitly close the parent's
|
|
||||||
end of the IPC channel to the subint. Any waiting
|
|
||||||
`chan.recv()` / `process_messages` task sees
|
|
||||||
`BrokenResourceError` (or `ClosedResourceError`) and
|
|
||||||
unwinds.
|
|
||||||
2. **Cancel parent-side RPC tasks tied to the dead
|
|
||||||
subint's channel.** The `Actor._rpc_tasks` / nursery
|
|
||||||
machinery should have a handle on any
|
|
||||||
`process_messages` loops bound to a specific peer
|
|
||||||
channel. Iterate those and cancel explicitly.
|
|
||||||
3. **Bound the top-level `await actor_nursery
|
|
||||||
._join_procs.wait()` shield in `subint_proc`** (same
|
|
||||||
pattern as the other bounded shields the hard-kill
|
|
||||||
patch added). If the nursery never sets `_join_procs`
|
|
||||||
because a child task is parked, the bound would at
|
|
||||||
least let the teardown proceed.
|
|
||||||
|
|
||||||
Of these, (1) is the most surgical and directly addresses
|
|
||||||
the root cause. (2) is a defense-in-depth companion. (3)
|
|
||||||
is a band-aid but cheap to add.
|
|
||||||
|
|
||||||
## Current workaround
|
|
||||||
|
|
||||||
None in-tree. The test's `trio.fail_after()` bound
|
|
||||||
currently fires and raises `TooSlowError`, so the test
|
|
||||||
visibly **fails** rather than hangs — which is
|
|
||||||
intentional (an unbounded cancellation-audit test would
|
|
||||||
defeat itself). But in interactive test runs the operator
|
|
||||||
has to hit Ctrl-C to move past the parked state before
|
|
||||||
pytest reports the failure.
|
|
||||||
|
|
||||||
## Reproducer
|
|
||||||
|
|
||||||
```
|
|
||||||
./py314/bin/python -m pytest \
|
|
||||||
tests/test_subint_cancellation.py::test_subint_non_checkpointing_child \
|
|
||||||
--spawn-backend=subint --tb=short --no-header -v
|
|
||||||
```
|
|
||||||
|
|
||||||
Expected: hangs until `trio.fail_after(15)` fires, or
|
|
||||||
Ctrl-C unwedges it manually.
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- `tractor.spawn._subint.subint_proc` — current subint
|
|
||||||
teardown code; see the `_HARD_KILL_TIMEOUT` bounded
|
|
||||||
shields + `daemon=True` driver-thread abandonment
|
|
||||||
(commit `b025c982`).
|
|
||||||
- `ai/conc-anal/subint_sigint_starvation_issue.md` — the
|
|
||||||
sibling CPython-level hang (GIL-starvation,
|
|
||||||
SIGINT-unresponsive) which is **not** this issue.
|
|
||||||
- Phase B tracking: issue #379.
|
|
||||||
|
|
@ -1,337 +0,0 @@
|
||||||
# `os.fork()` from a non-main sub-interpreter aborts the child (CPython refuses post-fork cleanup)
|
|
||||||
|
|
||||||
Third `subint`-class analysis in this project. Unlike its
|
|
||||||
two siblings (`subint_sigint_starvation_issue.md`,
|
|
||||||
`subint_cancel_delivery_hang_issue.md`), this one is not a
|
|
||||||
hang — it's a **hard CPython-level refusal** of an
|
|
||||||
experimental spawn strategy we wanted to try.
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
An in-process sub-interpreter cannot be used as a
|
|
||||||
"launchpad" for `os.fork()` on current CPython. The fork
|
|
||||||
syscall succeeds in the parent, but the forked CHILD
|
|
||||||
process is aborted immediately by CPython's post-fork
|
|
||||||
cleanup with:
|
|
||||||
|
|
||||||
```
|
|
||||||
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
|
||||||
```
|
|
||||||
|
|
||||||
This is enforced by a hard `_PyStatus_ERR` gate in
|
|
||||||
`Python/pystate.c`. The CPython devs acknowledge the
|
|
||||||
fragility with an in-source comment (`// Ideally we could
|
|
||||||
guarantee tstate is running main.`) but provide no
|
|
||||||
mechanism to satisfy the precondition from user code.
|
|
||||||
|
|
||||||
**Implication for tractor**: the `subint_fork` backend
|
|
||||||
sketched in `tractor.spawn._subint_fork` is structurally
|
|
||||||
dead on current CPython. The submodule is kept as
|
|
||||||
documentation of the attempt; `--spawn-backend=subint_fork`
|
|
||||||
raises `NotImplementedError` pointing here.
|
|
||||||
|
|
||||||
## Context — why we tried this
|
|
||||||
|
|
||||||
The motivation is issue #379's "Our own thoughts, ideas
|
|
||||||
for `fork()`-workaround/hacks..." section. The existing
|
|
||||||
trio-backend (`tractor.spawn._trio.trio_proc`) spawns
|
|
||||||
subactors via `trio.lowlevel.open_process()` → ultimately
|
|
||||||
`posix_spawn()` or `fork+exec`, from the parent's main
|
|
||||||
interpreter that is currently running `trio.run()`. This
|
|
||||||
brushes against a known-fragile interaction between
|
|
||||||
`trio` and `fork()` tracked in
|
|
||||||
[python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614)
|
|
||||||
and siblings — mostly mitigated in `tractor`'s case only
|
|
||||||
incidentally (we `exec()` immediately post-fork).
|
|
||||||
|
|
||||||
The idea was:
|
|
||||||
|
|
||||||
1. Create a subint that has *never* imported `trio`.
|
|
||||||
2. From a worker thread in that subint, call `os.fork()`.
|
|
||||||
3. In the child, `execv()` back into
|
|
||||||
`python -m tractor._child` — same as `trio_proc` does.
|
|
||||||
4. The fork is from a trio-free context → trio+fork
|
|
||||||
hazards avoided regardless of downstream behavior.
|
|
||||||
|
|
||||||
The parent-side orchestration (`ipc_server.wait_for_peer`,
|
|
||||||
`SpawnSpec`, `Portal` yield) would reuse
|
|
||||||
`trio_proc`'s flow verbatim, with only the subproc-spawn
|
|
||||||
mechanics swapped.
|
|
||||||
|
|
||||||
## Symptom
|
|
||||||
|
|
||||||
Running the prototype (`tractor.spawn._subint_fork.subint_fork_proc`,
|
|
||||||
see git history prior to the stub revert) on py3.14:
|
|
||||||
|
|
||||||
```
|
|
||||||
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
|
||||||
Python runtime state: initialized
|
|
||||||
|
|
||||||
Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first):
|
|
||||||
File "<script>", line 2 in <module>
|
|
||||||
<script>:2: DeprecationWarning: This process (pid=802985) is multi-threaded, use of fork() may lead to deadlocks in the child.
|
|
||||||
```
|
|
||||||
|
|
||||||
Key clues:
|
|
||||||
|
|
||||||
- The **`DeprecationWarning`** fires in the parent (before
|
|
||||||
fork completes) — fork *is* executing, we get that far.
|
|
||||||
- The **`Fatal Python error`** comes from the child — it
|
|
||||||
aborts during CPython's post-fork C initialization
|
|
||||||
before any user Python runs in the child.
|
|
||||||
- The thread name `subint-fork-lau[nchpad]` is ours —
|
|
||||||
confirms the fork is being called from the launchpad
|
|
||||||
subint's driver thread.
|
|
||||||
|
|
||||||
## CPython source walkthrough
|
|
||||||
|
|
||||||
### Call site — `Modules/posixmodule.c:728-793`
|
|
||||||
|
|
||||||
The post-fork-child hook CPython runs in the child process:
|
|
||||||
|
|
||||||
```c
|
|
||||||
void
|
|
||||||
PyOS_AfterFork_Child(void)
|
|
||||||
{
|
|
||||||
PyStatus status;
|
|
||||||
_PyRuntimeState *runtime = &_PyRuntime;
|
|
||||||
|
|
||||||
// re-creates runtime->interpreters.mutex (HEAD_UNLOCK)
|
|
||||||
status = _PyRuntimeState_ReInitThreads(runtime);
|
|
||||||
...
|
|
||||||
|
|
||||||
PyThreadState *tstate = _PyThreadState_GET();
|
|
||||||
_Py_EnsureTstateNotNULL(tstate);
|
|
||||||
|
|
||||||
...
|
|
||||||
|
|
||||||
// Ideally we could guarantee tstate is running main. ← !!!
|
|
||||||
_PyInterpreterState_ReinitRunningMain(tstate);
|
|
||||||
|
|
||||||
status = _PyEval_ReInitThreads(tstate);
|
|
||||||
...
|
|
||||||
|
|
||||||
status = _PyInterpreterState_DeleteExceptMain(runtime);
|
|
||||||
if (_PyStatus_EXCEPTION(status)) {
|
|
||||||
goto fatal_error;
|
|
||||||
}
|
|
||||||
...
|
|
||||||
|
|
||||||
fatal_error:
|
|
||||||
Py_ExitStatusException(status);
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The `// Ideally we could guarantee tstate is running
|
|
||||||
main.` comment is a flashing warning sign — the CPython
|
|
||||||
devs *know* this path is fragile when fork is called from
|
|
||||||
a non-main subint, but they've chosen to abort rather than
|
|
||||||
silently corrupt state. Arguably the right call.
|
|
||||||
|
|
||||||
### The refusal — `Python/pystate.c:1035-1075`
|
|
||||||
|
|
||||||
```c
|
|
||||||
/*
|
|
||||||
* Delete all interpreter states except the main interpreter. If there
|
|
||||||
* is a current interpreter state, it *must* be the main interpreter.
|
|
||||||
*/
|
|
||||||
PyStatus
|
|
||||||
_PyInterpreterState_DeleteExceptMain(_PyRuntimeState *runtime)
|
|
||||||
{
|
|
||||||
struct pyinterpreters *interpreters = &runtime->interpreters;
|
|
||||||
|
|
||||||
PyThreadState *tstate = _PyThreadState_Swap(runtime, NULL);
|
|
||||||
if (tstate != NULL && tstate->interp != interpreters->main) {
|
|
||||||
return _PyStatus_ERR("not main interpreter"); ← our error
|
|
||||||
}
|
|
||||||
|
|
||||||
HEAD_LOCK(runtime);
|
|
||||||
PyInterpreterState *interp = interpreters->head;
|
|
||||||
interpreters->head = NULL;
|
|
||||||
while (interp != NULL) {
|
|
||||||
if (interp == interpreters->main) {
|
|
||||||
interpreters->main->next = NULL;
|
|
||||||
interpreters->head = interp;
|
|
||||||
interp = interp->next;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// XXX Won't this fail since PyInterpreterState_Clear() requires
|
|
||||||
// the "current" tstate to be set?
|
|
||||||
PyInterpreterState_Clear(interp); // XXX must activate?
|
|
||||||
zapthreads(interp);
|
|
||||||
...
|
|
||||||
}
|
|
||||||
...
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
The comment above the function (`If there is a current
|
|
||||||
interpreter state, it *must* be the main interpreter.`) is
|
|
||||||
the formal API contract. The `XXX` comments further in
|
|
||||||
suggest the CPython team is already aware this function
|
|
||||||
has latent issues even in the happy path.
|
|
||||||
|
|
||||||
## Chain summary
|
|
||||||
|
|
||||||
1. Our launchpad subint's driver OS-thread calls
|
|
||||||
`os.fork()`.
|
|
||||||
2. `fork()` succeeds. Child wakes up with:
|
|
||||||
- The parent's full memory image (including all
|
|
||||||
subints).
|
|
||||||
- Only the *calling* thread alive (the driver thread).
|
|
||||||
- `_PyThreadState_GET()` on that thread returns the
|
|
||||||
**launchpad subint's tstate**, *not* main's.
|
|
||||||
3. CPython runs `PyOS_AfterFork_Child()`.
|
|
||||||
4. It reaches `_PyInterpreterState_DeleteExceptMain()`.
|
|
||||||
5. Gate check fails: `tstate->interp != interpreters->main`.
|
|
||||||
6. `_PyStatus_ERR("not main interpreter")` → `fatal_error`
|
|
||||||
goto → `Py_ExitStatusException()` → child aborts.
|
|
||||||
|
|
||||||
Parent-side consequence: `os.fork()` in the subint
|
|
||||||
bootstrap returned successfully with the child's PID, but
|
|
||||||
the child died before connecting back. Our parent's
|
|
||||||
`ipc_server.wait_for_peer(uid)` would hang forever — the
|
|
||||||
child never gets to `_actor_child_main`.
|
|
||||||
|
|
||||||
## Definitive answer to "Open Question 1"
|
|
||||||
|
|
||||||
From the (now-stub) `subint_fork_proc` docstring:
|
|
||||||
|
|
||||||
> Does CPython allow `os.fork()` from a non-main
|
|
||||||
> sub-interpreter under the legacy config?
|
|
||||||
|
|
||||||
**No.** Not in a usable-by-user-code sense. The fork
|
|
||||||
syscall is not blocked, but the child cannot survive
|
|
||||||
CPython's post-fork initialization. This is enforced, not
|
|
||||||
accidental, and the CPython devs have acknowledged the
|
|
||||||
fragility in-source.
|
|
||||||
|
|
||||||
## What we'd need from CPython to unblock
|
|
||||||
|
|
||||||
Any one of these, from least-to-most invasive:
|
|
||||||
|
|
||||||
1. **A pre-fork hook mechanism** that lets user code (or
|
|
||||||
tractor itself via `os.register_at_fork(before=...)`)
|
|
||||||
swap the current tstate to main before fork runs. The
|
|
||||||
swap would need to work across the subint→main
|
|
||||||
boundary, which is the actual hard part —
|
|
||||||
`_PyThreadState_Swap()` exists but is internal.
|
|
||||||
|
|
||||||
2. **A `_PyInterpreterState_DeleteExceptFor(tstate->interp)`
|
|
||||||
variant** that cleans up all *other* subints while
|
|
||||||
preserving the calling subint's state. Lets the child
|
|
||||||
continue executing in the subint after fork; a
|
|
||||||
subsequent `execv()` clears everything at the OS
|
|
||||||
level anyway.
|
|
||||||
|
|
||||||
3. **A cleaner error** than `Fatal Python error` aborting
|
|
||||||
the child. Even without fixing the underlying
|
|
||||||
capability, a raised Python-level exception in the
|
|
||||||
parent's `fork()` call (rather than a silent child
|
|
||||||
abort) would at least make the failure mode
|
|
||||||
debuggable.
|
|
||||||
|
|
||||||
## Upstream-report draft (for CPython issue tracker)
|
|
||||||
|
|
||||||
### Title
|
|
||||||
|
|
||||||
> `os.fork()` from a non-main sub-interpreter aborts the
|
|
||||||
> child with a fatal error in `PyOS_AfterFork_Child`; can
|
|
||||||
> we at least make it a clean `RuntimeError` in the
|
|
||||||
> parent?
|
|
||||||
|
|
||||||
### Body
|
|
||||||
|
|
||||||
> **Version**: Python 3.14.x
|
|
||||||
>
|
|
||||||
> **Summary**: Calling `os.fork()` from a thread currently
|
|
||||||
> executing inside a sub-interpreter causes the forked
|
|
||||||
> child process to abort during CPython's post-fork
|
|
||||||
> cleanup, with the following output in the child:
|
|
||||||
>
|
|
||||||
> ```
|
|
||||||
> Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
|
||||||
> ```
|
|
||||||
>
|
|
||||||
> From the **parent's** point of view the fork succeeded
|
|
||||||
> (returned a valid child PID). The failure is completely
|
|
||||||
> opaque to parent-side Python code — unless the parent
|
|
||||||
> does `os.waitpid()` it won't even notice the child
|
|
||||||
> died.
|
|
||||||
>
|
|
||||||
> **Root cause** (as I understand it from reading sources):
|
|
||||||
> `Modules/posixmodule.c::PyOS_AfterFork_Child()` calls
|
|
||||||
> `_PyInterpreterState_DeleteExceptMain()` with a
|
|
||||||
> precondition that `_PyThreadState_GET()->interp` be the
|
|
||||||
> main interpreter. When `fork()` is called from a thread
|
|
||||||
> executing inside a subinterpreter, the child wakes up
|
|
||||||
> with its tstate still pointing at the subint, and the
|
|
||||||
> gate in `Python/pystate.c:1044-1047` fails.
|
|
||||||
>
|
|
||||||
> A comment in the source
|
|
||||||
> (`Modules/posixmodule.c:753` — `// Ideally we could
|
|
||||||
> guarantee tstate is running main.`) suggests this is a
|
|
||||||
> known-fragile path rather than an intentional
|
|
||||||
> invariant.
|
|
||||||
>
|
|
||||||
> **Use case**: I was experimenting with using a
|
|
||||||
> sub-interpreter as a "fork launchpad" — have a subint
|
|
||||||
> that has never imported `trio`, call `os.fork()` from
|
|
||||||
> that subint's thread, and in the child `execv()` back
|
|
||||||
> into a fresh Python interpreter process. The goal was
|
|
||||||
> to sidestep known issues with `trio` + `fork()`
|
|
||||||
> interaction (see
|
|
||||||
> [python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614))
|
|
||||||
> by guaranteeing the forking context had never been
|
|
||||||
> "contaminated" by trio's imports or globals. This
|
|
||||||
> approach would allow `trio`-using applications to
|
|
||||||
> combine `fork`-based subprocess spawning with
|
|
||||||
> per-worker `trio.run()` runtimes — a fairly common
|
|
||||||
> pattern that currently requires workarounds.
|
|
||||||
>
|
|
||||||
> **Request**:
|
|
||||||
>
|
|
||||||
> Ideally: make fork-from-subint work (e.g., by swapping
|
|
||||||
> the caller's tstate to main in the pre-fork hook), or
|
|
||||||
> provide a `_PyInterpreterState_DeleteExceptFor(interp)`
|
|
||||||
> variant that permits the caller's subint to survive
|
|
||||||
> post-fork so user code can subsequently `execv()`.
|
|
||||||
>
|
|
||||||
> Minimally: convert the fatal child-side abort into a
|
|
||||||
> clean `RuntimeError` (or similar) raised in the
|
|
||||||
> parent's `fork()` call. Even if the capability isn't
|
|
||||||
> expanded, the failure mode should be debuggable by
|
|
||||||
> user-code in the parent — right now it's a silent
|
|
||||||
> child death with an error message buried in the
|
|
||||||
> child's stderr that parent code can't programmatically
|
|
||||||
> see.
|
|
||||||
>
|
|
||||||
> **Related**: PEP 684 (per-interpreter GIL), PEP 734
|
|
||||||
> (`concurrent.interpreters` public API). The private
|
|
||||||
> `_interpreters` module is what I used to create the
|
|
||||||
> launchpad — behavior is the same whether using
|
|
||||||
> `_interpreters.create('legacy')` or
|
|
||||||
> `concurrent.interpreters.create()` (the latter was not
|
|
||||||
> tested but the gate is identical).
|
|
||||||
>
|
|
||||||
> Happy to contribute a minimal reproducer + test case if
|
|
||||||
> this is something the team wants to pursue.
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- `Modules/posixmodule.c:728` —
|
|
||||||
[`PyOS_AfterFork_Child`](https://github.com/python/cpython/blob/main/Modules/posixmodule.c#L728)
|
|
||||||
- `Python/pystate.c:1040` —
|
|
||||||
[`_PyInterpreterState_DeleteExceptMain`](https://github.com/python/cpython/blob/main/Python/pystate.c#L1040)
|
|
||||||
- PEP 684 (per-interpreter GIL):
|
|
||||||
<https://peps.python.org/pep-0684/>
|
|
||||||
- PEP 734 (`concurrent.interpreters` public API):
|
|
||||||
<https://peps.python.org/pep-0734/>
|
|
||||||
- [python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614)
|
|
||||||
— the original motivation for the launchpad idea.
|
|
||||||
- tractor issue #379 — "Our own thoughts, ideas for
|
|
||||||
`fork()`-workaround/hacks..." section where this was
|
|
||||||
first sketched.
|
|
||||||
- `tractor.spawn._subint_fork` — in-tree stub preserving
|
|
||||||
the attempted impl's shape in git history.
|
|
||||||
|
|
@ -1,373 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
'''
|
|
||||||
Standalone CPython-level feasibility check for the "main-interp
|
|
||||||
worker-thread forkserver + subint-hosted trio" architecture
|
|
||||||
proposed as a workaround to the CPython-level refusal
|
|
||||||
documented in
|
|
||||||
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`.
|
|
||||||
|
|
||||||
Purpose
|
|
||||||
-------
|
|
||||||
Deliberately NOT a `tractor` runtime test; only spawn primitives are imported.
|
|
||||||
Uses `_interpreters` (private stdlib) + `os.fork()` directly so
|
|
||||||
the signal is unambiguous — pass/fail here is a property of
|
|
||||||
CPython alone, independent of our runtime.
|
|
||||||
|
|
||||||
Run each scenario in isolation; the child's fate is observable
|
|
||||||
only via `os.waitpid()` in the parent and the scenario's own
|
|
||||||
status prints.
|
|
||||||
|
|
||||||
Scenarios (pick one with `--scenario <name>`)
|
|
||||||
---------------------------------------------
|
|
||||||
|
|
||||||
- `control_subint_thread_fork` — the KNOWN-BROKEN case we
|
|
||||||
documented in `subint_fork_blocked_by_cpython_post_fork_issue.md`:
|
|
||||||
drive a subint from a thread, call `os.fork()` inside its
|
|
||||||
`_interpreters.exec()`, watch the child abort. **Included as
|
|
||||||
a control** — if this scenario DOESN'T abort the child, our
|
|
||||||
analysis is wrong and we should re-check everything.
|
|
||||||
|
|
||||||
- `main_thread_fork` — baseline sanity. Call `os.fork()` from
|
|
||||||
the process's main thread. Must always succeed; if this
|
|
||||||
fails something much bigger is broken.
|
|
||||||
|
|
||||||
- `worker_thread_fork` — the architectural assertion. Spawn a
|
|
||||||
regular `threading.Thread` (attached to main interp, NOT a
|
|
||||||
subint), have IT call `os.fork()`. Child should survive
|
|
||||||
post-fork cleanup.
|
|
||||||
|
|
||||||
- `full_architecture` — end-to-end: main-interp worker thread
|
|
||||||
forks. In the child, fork-thread (still main-interp) creates
|
|
||||||
a subint, drives a second worker thread inside it that runs
|
|
||||||
a trivial `trio.run()`. Validates the "root runtime lives in
|
|
||||||
a subint in the child" piece of the proposed arch.
|
|
||||||
|
|
||||||
All scenarios print a self-contained pass/fail banner. Exit
|
|
||||||
code 0 on expected outcome (which for `control_*` means "child
|
|
||||||
aborted", not "child succeeded"!).
|
|
||||||
|
|
||||||
Requires Python 3.14+.
|
|
||||||
|
|
||||||
Usage
|
|
||||||
-----
|
|
||||||
::
|
|
||||||
|
|
||||||
python subint_fork_from_main_thread_smoketest.py \\
|
|
||||||
--scenario main_thread_fork
|
|
||||||
|
|
||||||
python subint_fork_from_main_thread_smoketest.py \\
|
|
||||||
--scenario full_architecture
|
|
||||||
|
|
||||||
'''
|
|
||||||
from __future__ import annotations
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
# Hard-require py3.14 for the public `concurrent.interpreters`
|
|
||||||
# API (we still drop to `_interpreters` internally, same as
|
|
||||||
# `tractor.spawn._subint`).
|
|
||||||
try:
|
|
||||||
from concurrent import interpreters as _public_interpreters # noqa: F401
|
|
||||||
import _interpreters # type: ignore
|
|
||||||
except ImportError:
|
|
||||||
print(
|
|
||||||
'FAIL (setup): requires Python 3.14+ '
|
|
||||||
'(missing `concurrent.interpreters`)',
|
|
||||||
file=sys.stderr,
|
|
||||||
)
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# The actual primitives this script exercises live in
|
|
||||||
# `tractor.spawn._subint_forkserver` — we re-import them here
|
|
||||||
# rather than inlining so the module and the validation stay
|
|
||||||
# in sync. (Early versions of this file had them inline for
|
|
||||||
# the "zero tractor imports" isolation guarantee; now that
|
|
||||||
# CPython-level feasibility is confirmed, the validated
|
|
||||||
# primitives have moved into tractor proper.)
|
|
||||||
from tractor.spawn._subint_forkserver import (
|
|
||||||
fork_from_worker_thread,
|
|
||||||
run_subint_in_worker_thread,
|
|
||||||
wait_child,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# small observability helpers (test-harness only)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _banner(title: str) -> None:
|
|
||||||
line = '=' * 60
|
|
||||||
print(f'\n{line}\n{title}\n{line}', flush=True)
|
|
||||||
|
|
||||||
|
|
||||||
def _report(
|
|
||||||
label: str,
|
|
||||||
*,
|
|
||||||
ok: bool,
|
|
||||||
status_str: str,
|
|
||||||
expect_exit_ok: bool,
|
|
||||||
) -> None:
|
|
||||||
verdict: str = 'PASS' if ok else 'FAIL'
|
|
||||||
expected_str: str = (
|
|
||||||
'normal exit (rc=0)'
|
|
||||||
if expect_exit_ok
|
|
||||||
else 'abnormal death (signal or nonzero exit)'
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
f'[{verdict}] {label}: '
|
|
||||||
f'expected {expected_str}; observed {status_str}',
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# scenario: `control_subint_thread_fork` (known-broken)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def scenario_control_subint_thread_fork() -> int:
|
|
||||||
_banner(
|
|
||||||
'[control] fork from INSIDE a subint (expected: child aborts)'
|
|
||||||
)
|
|
||||||
interp_id = _interpreters.create('legacy')
|
|
||||||
print(f' created subint {interp_id}', flush=True)
|
|
||||||
|
|
||||||
# Shared flag: child writes a sentinel file we can detect from
|
|
||||||
# the parent. If the child manages to write this, CPython's
|
|
||||||
# post-fork refusal is NOT happening → analysis is wrong.
|
|
||||||
sentinel = '/tmp/subint_fork_smoketest_control_child_ran'
|
|
||||||
try:
|
|
||||||
os.unlink(sentinel)
|
|
||||||
except FileNotFoundError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
bootstrap = (
|
|
||||||
'import os\n'
|
|
||||||
'pid = os.fork()\n'
|
|
||||||
'if pid == 0:\n'
|
|
||||||
# child — if CPython's refusal fires this code never runs
|
|
||||||
f' with open({sentinel!r}, "w") as f:\n'
|
|
||||||
' f.write("ran")\n'
|
|
||||||
' os._exit(0)\n'
|
|
||||||
'else:\n'
|
|
||||||
# parent side (inside the launchpad subint) — stash the
|
|
||||||
# forked PID on a shareable dict so we can waitpid()
|
|
||||||
# from the outer main interp. We can't just return it;
|
|
||||||
# _interpreters.exec() returns nothing useful.
|
|
||||||
' import builtins\n'
|
|
||||||
' builtins._forked_child_pid = pid\n'
|
|
||||||
)
|
|
||||||
|
|
||||||
# NOTE, we can't easily pull state back from the subint.
|
|
||||||
# For the CONTROL scenario we just time-bound the fork +
|
|
||||||
# check the sentinel. If sentinel exists → child ran →
|
|
||||||
# analysis wrong. If not → child aborted → analysis
|
|
||||||
# confirmed.
|
|
||||||
done = threading.Event()
|
|
||||||
|
|
||||||
def _drive() -> None:
|
|
||||||
try:
|
|
||||||
_interpreters.exec(interp_id, bootstrap)
|
|
||||||
except Exception as err:
|
|
||||||
print(
|
|
||||||
f' subint bootstrap raised (expected on some '
|
|
||||||
f'CPython versions): {type(err).__name__}: {err}',
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
done.set()
|
|
||||||
|
|
||||||
t = threading.Thread(
|
|
||||||
target=_drive,
|
|
||||||
name='control-subint-fork-launchpad',
|
|
||||||
daemon=True,
|
|
||||||
)
|
|
||||||
t.start()
|
|
||||||
done.wait(timeout=5.0)
|
|
||||||
t.join(timeout=2.0)
|
|
||||||
|
|
||||||
# Give the (possibly-aborted) child a moment to die.
|
|
||||||
time.sleep(0.5)
|
|
||||||
|
|
||||||
sentinel_present = os.path.exists(sentinel)
|
|
||||||
verdict = (
|
|
||||||
# "PASS" for our analysis means sentinel NOT present.
|
|
||||||
'PASS' if not sentinel_present else 'FAIL (UNEXPECTED)'
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
f'[{verdict}] control: sentinel present={sentinel_present} '
|
|
||||||
f'(analysis predicts False — child should abort before '
|
|
||||||
f'writing)',
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
if sentinel_present:
|
|
||||||
os.unlink(sentinel)
|
|
||||||
|
|
||||||
try:
|
|
||||||
_interpreters.destroy(interp_id)
|
|
||||||
except _interpreters.InterpreterError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return 0 if not sentinel_present else 1
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# scenario: `main_thread_fork` (baseline sanity)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def scenario_main_thread_fork() -> int:
    '''
    BASELINE scenario: `os.fork()` straight from the process's main
    thread; the child exits immediately with rc=0.

    Returns `0` when the child is observed to exit normally, else
    `1`.

    '''
    label: str = 'main_thread_fork'
    _banner(
        '[baseline] fork from MAIN thread (expected: child exits normally)'
    )

    pid = os.fork()
    if pid == 0:
        # child: leave immediately, skipping any interpreter
        # teardown.
        os._exit(0)

    # Use the imported `wait_child()` + local `_report()` pair, the
    # same way `_run_worker_thread_fork_scenario()` does; there is
    # no `_wait_child()` helper imported or defined in the part of
    # this file that is visible here.
    ok, status_str = wait_child(pid, expect_exit_ok=True)
    _report(
        label,
        ok=ok,
        status_str=status_str,
        expect_exit_ok=True,
    )
    return 0 if ok else 1
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# scenario: `worker_thread_fork` (architectural assertion)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _run_worker_thread_fork_scenario(
    label: str,
    *,
    child_target=None,
) -> int:
    '''
    Shared driver for the worker-thread fork scenarios.

    Forks through `fork_from_worker_thread()`, waits on the
    resulting pid with `wait_child()` and prints the verdict
    via `_report()`.

    Returns 0 when the wait outcome is ok, else 1 (also 1 when
    the fork primitive raises `RuntimeError`).

    '''
    worker_name = f'worker-fork-thread[{label}]'
    try:
        child_pid: int = fork_from_worker_thread(
            child_target=child_target,
            thread_name=worker_name,
        )
    except RuntimeError as fork_err:
        print(f'[FAIL] {label}: {fork_err}', flush=True)
        return 1

    print(f' forked child pid={child_pid}', flush=True)

    child_ok, child_status = wait_child(child_pid, expect_exit_ok=True)
    _report(
        label,
        ok=child_ok,
        status_str=child_status,
        expect_exit_ok=True,
    )
    if child_ok:
        return 0
    return 1
|
|
||||||
|
|
||||||
|
|
||||||
def scenario_worker_thread_fork() -> int:
    '''
    Architectural-assertion scenario: run the shared worker-thread
    fork driver with its default (`None`) `child_target`.

    Returns whatever `_run_worker_thread_fork_scenario()` returns.

    '''
    banner_msg = (
        '[arch] fork from MAIN-INTERP WORKER thread '
        '(expected: child exits normally — this is the one '
        'that matters)'
    )
    _banner(banner_msg)
    return _run_worker_thread_fork_scenario('worker_thread_fork')
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# scenario: `full_architecture`
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# Source text handed to `run_subint_in_worker_thread()` by
# `_child_trio_in_subint()`: runs a tiny `trio.run()` and asserts on
# its result. Kept as one literal so it reads like the script it is.
_CHILD_TRIO_BOOTSTRAP: str = '''\
import trio
async def _main():
 await trio.sleep(0.05)
 return 42
result = trio.run(_main)
assert result == 42, f"trio.run returned {result}"
print(" CHILD subint: trio.run OK, result=42", flush=True)
'''
|
|
||||||
|
|
||||||
|
|
||||||
def _child_trio_in_subint() -> int:
    '''
    Child-side `child_target`: hand `_CHILD_TRIO_BOOTSTRAP` to
    `run_subint_in_worker_thread()` and translate the outcome into
    an exit code.

    Returns 0 on success, 3 when the primitive raises
    `RuntimeError`, and 4 for any other exception (reported but
    not re-raised).

    '''
    rc: int = 0
    try:
        run_subint_in_worker_thread(
            _CHILD_TRIO_BOOTSTRAP,
            thread_name='child-subint-trio-thread',
        )
    except RuntimeError as err:
        timeout_msg = (
            f' CHILD: run_subint_in_worker_thread timed out / thread '
            f'never returned: {err}'
        )
        print(timeout_msg, flush=True)
        rc = 3
    except BaseException as err:
        # Catch-all on purpose: every failure becomes exit code 4
        # instead of propagating out of the child target.
        failure_msg = (
            f' CHILD: subint bootstrap raised: '
            f'{type(err).__name__}: {err}'
        )
        print(failure_msg, flush=True)
        rc = 4
    return rc
|
|
||||||
|
|
||||||
|
|
||||||
def scenario_full_architecture() -> int:
    '''
    End-to-end scenario: run the shared worker-thread fork driver
    with `_child_trio_in_subint` as the child-side target.

    Returns whatever `_run_worker_thread_fork_scenario()` returns.

    '''
    banner_msg = (
        '[arch-full] worker-thread fork + child runs trio in a '
        'subint (end-to-end proposed arch)'
    )
    _banner(banner_msg)

    exit_code = _run_worker_thread_fork_scenario(
        'full_architecture',
        child_target=_child_trio_in_subint,
    )
    return exit_code
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# main
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# CLI scenario name -> zero-arg callable returning a process exit
# code; `main()` looks the selected `--scenario` up in this table.
SCENARIOS: dict[str, Callable[[], int]] = dict(
    control_subint_thread_fork=scenario_control_subint_thread_fork,
    main_thread_fork=scenario_main_thread_fork,
    worker_thread_fork=scenario_worker_thread_fork,
    full_architecture=scenario_full_architecture,
)
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
    '''
    CLI entry point: parse the mandatory `--scenario` option and
    run the matching `SCENARIOS` entry.

    Returns the selected scenario's integer result.

    '''
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--scenario',
        choices=sorted(SCENARIOS),
        required=True,
    )
    scenario_name: str = parser.parse_args().scenario
    run_scenario = SCENARIOS[scenario_name]
    return run_scenario()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Propagate the scenario's return code as the process exit status.
    raise SystemExit(main())
|
|
||||||
|
|
@ -1,385 +0,0 @@
|
||||||
# `subint_forkserver` backend: orphaned-subactor SIGINT wedged in `epoll_wait`
|
|
||||||
|
|
||||||
Follow-up to the Phase C `subint_forkserver` spawn-backend
|
|
||||||
PR (see `tractor.spawn._subint_forkserver`, issue #379).
|
|
||||||
Surfaced by the xfail'd
|
|
||||||
`tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`.
|
|
||||||
|
|
||||||
Related-but-distinct from
|
|
||||||
`subint_cancel_delivery_hang_issue.md` (orphaned-channel
|
|
||||||
park AFTER subint teardown) and
|
|
||||||
`subint_sigint_starvation_issue.md` (GIL-starvation,
|
|
||||||
SIGINT never delivered): here the SIGINT IS delivered,
|
|
||||||
trio's handler IS installed, but trio's event loop never
|
|
||||||
wakes — so the KBI-at-checkpoint → `_trio_main` catch path
|
|
||||||
(which is the runtime's *intentional* OS-cancel design)
|
|
||||||
never fires.
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
When a `subint_forkserver`-spawned subactor is orphaned
|
|
||||||
(parent `SIGKILL`'d, no IPC cancel path available) and then
|
|
||||||
externally `SIGINT`'d, the subactor hangs in
|
|
||||||
`trio/_core/_io_epoll.py::get_events` (epoll_wait)
|
|
||||||
indefinitely — even though:
|
|
||||||
|
|
||||||
1. `threading.current_thread() is threading.main_thread()`
|
|
||||||
post-fork (CPython 3.14 re-designates correctly).
|
|
||||||
2. Trio's SIGINT handler IS installed in the subactor
|
|
||||||
(`signal.getsignal(SIGINT)` returns
|
|
||||||
`<function KIManager.install.<locals>.handler at 0x...>`).
|
|
||||||
3. The kernel does deliver SIGINT — the signal arrives at
|
|
||||||
the only thread in the process (the fork-inherited
|
|
||||||
worker which IS now "main" per Python).
|
|
||||||
|
|
||||||
Yet `epoll_wait` does not return. Trio's wakeup-fd mechanism
|
|
||||||
— the machinery that turns SIGINT into an epoll-wake — is
|
|
||||||
somehow not firing the wakeup. Until that's fixed, the
|
|
||||||
intentional "KBI-as-OS-cancel" path in
|
|
||||||
`tractor/spawn/_entry.py::_trio_main:164` is unreachable
|
|
||||||
for forkserver-spawned subactors whose parent dies.
|
|
||||||
|
|
||||||
## Symptom
|
|
||||||
|
|
||||||
Test: `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`
|
|
||||||
(currently marked `@pytest.mark.xfail(strict=True)`).
|
|
||||||
|
|
||||||
1. Harness subprocess brings up a tractor root actor +
|
|
||||||
one `run_in_actor(_sleep_forever)` subactor via
|
|
||||||
`try_set_start_method('subint_forkserver')`.
|
|
||||||
2. Harness prints `CHILD_PID` (subactor) and
|
|
||||||
`PARENT_READY` (root actor) markers to stdout.
|
|
||||||
3. Test `os.kill(parent_pid, SIGKILL)` + `proc.wait()`
|
|
||||||
to fully reap the root-actor harness.
|
|
||||||
4. Child (now reparented to pid 1) is still alive.
|
|
||||||
5. Test `os.kill(child_pid, SIGINT)` and polls
|
|
||||||
`os.kill(child_pid, 0)` for up to 10s.
|
|
||||||
6. **Observed**: the child is still alive at deadline —
|
|
||||||
SIGINT did not unwedge the trio loop.
|
|
||||||
|
|
||||||
## What the "intentional" cancel path IS
|
|
||||||
|
|
||||||
`tractor/spawn/_entry.py::_trio_main:157-186` —
|
|
||||||
|
|
||||||
```python
|
|
||||||
try:
|
|
||||||
if infect_asyncio:
|
|
||||||
actor._infected_aio = True
|
|
||||||
run_as_asyncio_guest(trio_main)
|
|
||||||
else:
|
|
||||||
trio.run(trio_main)
|
|
||||||
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
logmeth = log.cancel
|
|
||||||
exit_status: str = (
|
|
||||||
'Actor received KBI (aka an OS-cancel)\n'
|
|
||||||
...
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
The "KBI == OS-cancel" mapping IS the runtime's
|
|
||||||
deliberate, documented design. An OS-level SIGINT should
|
|
||||||
flow as: kernel → trio handler → KBI at trio checkpoint
|
|
||||||
→ unwinds `async_main` → surfaces at `_trio_main`'s
|
|
||||||
`except KeyboardInterrupt:` → `log.cancel` + clean `rc=0`.
|
|
||||||
|
|
||||||
**So fixing this hang is not "add a new SIGINT behavior" —
|
|
||||||
it's "make the existing designed behavior actually fire in
|
|
||||||
this backend config".** That's why option (B) ("fix root
|
|
||||||
cause") is aligned with existing design intent, not a
|
|
||||||
scope expansion.
|
|
||||||
|
|
||||||
## Evidence
|
|
||||||
|
|
||||||
### Positive control: standalone fork-from-worker + `trio.run(sleep_forever)` + SIGINT WORKS
|
|
||||||
|
|
||||||
```python
|
|
||||||
import os, signal, time, trio
|
|
||||||
from tractor.spawn._subint_forkserver import (
|
|
||||||
fork_from_worker_thread, wait_child,
|
|
||||||
)
|
|
||||||
|
|
||||||
def child_target() -> int:
|
|
||||||
async def _main():
|
|
||||||
try:
|
|
||||||
await trio.sleep_forever()
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
print('CHILD: caught KBI — trio SIGINT works!')
|
|
||||||
return
|
|
||||||
trio.run(_main)
|
|
||||||
return 0
|
|
||||||
|
|
||||||
pid = fork_from_worker_thread(child_target, thread_name='trio-sigint-test')
|
|
||||||
time.sleep(1.0)
|
|
||||||
os.kill(pid, signal.SIGINT)
|
|
||||||
wait_child(pid)
|
|
||||||
```
|
|
||||||
|
|
||||||
Result: `CHILD: caught KBI — trio SIGINT works!` + clean
|
|
||||||
exit. So the fork-child + trio signal plumbing IS healthy
|
|
||||||
in isolation. The hang appears only with the full tractor
|
|
||||||
subactor runtime on top.
|
|
||||||
|
|
||||||
### Negative test: full tractor subactor + orphan-SIGINT
|
|
||||||
|
|
||||||
Equivalent to the xfail test. Traceback dump via
|
|
||||||
`faulthandler.register(SIGUSR1, all_threads=True)` at the
|
|
||||||
stuck moment:
|
|
||||||
|
|
||||||
```
|
|
||||||
Current thread 0x00007... [subint-forkserv] (most recent call first):
|
|
||||||
File ".../trio/_core/_io_epoll.py", line 245 in get_events
|
|
||||||
File ".../trio/_core/_run.py", line 2415 in run
|
|
||||||
File "tractor/spawn/_entry.py", line 162 in _trio_main
|
|
||||||
File "tractor/_child.py", line 72 in _actor_child_main
|
|
||||||
File "tractor/spawn/_subint_forkserver.py", line 650 in _child_target
|
|
||||||
File "tractor/spawn/_subint_forkserver.py", line 308 in _worker
|
|
||||||
File ".../threading.py", line 1024 in run
|
|
||||||
```
|
|
||||||
|
|
||||||
### Thread + signal-mask inventory of the stuck subactor
|
|
||||||
|
|
||||||
Single thread (`tid == pid`, comm `'subint-forkserv'`,
|
|
||||||
which IS `threading.main_thread()` post-fork):
|
|
||||||
|
|
||||||
```
|
|
||||||
SigBlk: 0000000000000000 # nothing blocked
|
|
||||||
SigIgn: 0000000001001000 # SIGPIPE etc (Python defaults)
|
|
||||||
SigCgt: 0000000108000202 # bit 1 = SIGINT caught
|
|
||||||
```
|
|
||||||
|
|
||||||
Bit 1 set in `SigCgt` → SIGINT handler IS installed. So
|
|
||||||
trio's handler IS in place at the kernel level — not a
|
|
||||||
"handler missing" situation.
|
|
||||||
|
|
||||||
### Handler identity
|
|
||||||
|
|
||||||
Inside the subactor's RPC body, `signal.getsignal(SIGINT)`
|
|
||||||
returns `<function KIManager.install.<locals>.handler at
|
|
||||||
0x...>` — trio's own `KIManager` handler. tractor's only
|
|
||||||
SIGINT touches are `signal.getsignal()` *reads* (to stash
|
|
||||||
into `debug.DebugStatus._trio_handler`); nothing writes
|
|
||||||
over trio's handler outside the debug-REPL shielding path
|
|
||||||
(`devx/debug/_tty_lock.py::shield_sigint`) which isn't
|
|
||||||
engaged here (no debug_mode).
|
|
||||||
|
|
||||||
## Ruled out
|
|
||||||
|
|
||||||
- **GIL starvation / signal-pipe-full** (class A,
|
|
||||||
`subint_sigint_starvation_issue.md`): subactor runs on
|
|
||||||
its own GIL (separate OS process), not sharing with the
|
|
||||||
parent → no cross-process GIL contention. And `strace`-
|
|
||||||
equivalent in the signal mask shows SIGINT IS caught,
|
|
||||||
not queued.
|
|
||||||
- **Orphaned channel park** (`subint_cancel_delivery_hang_issue.md`):
|
|
||||||
different failure mode — that one has trio iterating
|
|
||||||
normally and getting wedged on an orphaned
|
|
||||||
`chan.recv()` AFTER teardown. Here trio's event loop
|
|
||||||
itself never wakes.
|
|
||||||
- **Tractor explicitly catching + swallowing KBI**:
|
|
||||||
greppable — the one `except KeyboardInterrupt:` in the
|
|
||||||
runtime is the INTENTIONAL cancel-path catch at
|
|
||||||
`_trio_main:164`. `async_main` uses `except Exception`
|
|
||||||
(not BaseException), so KBI should propagate through
|
|
||||||
cleanly if it ever fires.
|
|
||||||
- **Missing `signal.set_wakeup_fd` (main-thread
|
|
||||||
restriction)**: post-fork, the fork-worker thread IS
|
|
||||||
`threading.main_thread()`, so trio's main-thread check
|
|
||||||
passes and its wakeup-fd install should succeed.
|
|
||||||
|
|
||||||
## Root cause hypothesis (unverified)
|
|
||||||
|
|
||||||
The SIGINT handler fires but trio's wakeup-fd write does
|
|
||||||
not wake `epoll_wait`. Candidate causes, ranked by
|
|
||||||
plausibility:
|
|
||||||
|
|
||||||
1. **Wakeup-fd lifecycle race around tractor IPC setup.**
|
|
||||||
`async_main` spins up an IPC server + `process_messages`
|
|
||||||
loops early. Somewhere in that path the wakeup-fd that
|
|
||||||
trio registered with its epoll instance may be
|
|
||||||
closed/replaced/clobbered, so subsequent SIGINT writes
|
|
||||||
land on an fd that's no longer in the epoll set.
|
|
||||||
Evidence needed: compare
|
|
||||||
`signal.set_wakeup_fd(-1)` return value inside a
|
|
||||||
post-tractor-bringup RPC body vs. a pre-bringup
|
|
||||||
equivalent. If they differ, that's it.
|
|
||||||
2. **Shielded cancel scope around `process_messages`.**
|
|
||||||
The RPC message loop is likely wrapped in a trio cancel
|
|
||||||
scope; if that scope is `shield=True` at any outer
|
|
||||||
layer, KBI scheduled at a checkpoint could be absorbed
|
|
||||||
by the shield and never bubble out to `_trio_main`.
|
|
||||||
3. **Pre-fork wakeup-fd inheritance.** trio in the PARENT
|
|
||||||
process registered a wakeup-fd with its own epoll. The
|
|
||||||
child inherits the fd number but not the parent's
|
|
||||||
epoll instance — if tractor/trio re-uses the parent's
|
|
||||||
stale fd number anywhere, writes would go to a no-op
|
|
||||||
fd. (This is the least likely — `trio.run()` on the
|
|
||||||
child calls `KIManager.install` which should install a
|
|
||||||
fresh wakeup-fd from scratch.)
|
|
||||||
|
|
||||||
## Cross-backend scope question
|
|
||||||
|
|
||||||
**Untested**: does the same orphan-SIGINT hang reproduce
|
|
||||||
against the `trio_proc` backend (stock subprocess + exec)?
|
|
||||||
If yes → pre-existing tractor bug, independent of
|
|
||||||
`subint_forkserver`. If no → something specific to the
|
|
||||||
fork-from-worker path (e.g. inherited fds, mid-epoll-setup
|
|
||||||
interference).
|
|
||||||
|
|
||||||
**Quick repro for trio_proc**:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# save as /tmp/trio_proc_orphan_sigint_repro.py
|
|
||||||
import os, sys, signal, time, glob
|
|
||||||
import subprocess as sp
|
|
||||||
|
|
||||||
SCRIPT = '''
|
|
||||||
import os, sys, trio, tractor
|
|
||||||
async def _sleep_forever():
|
|
||||||
print(f"CHILD_PID={os.getpid()}", flush=True)
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with (
|
|
||||||
tractor.open_root_actor(registry_addrs=[("127.0.0.1", 12350)]),
|
|
||||||
tractor.open_nursery() as an,
|
|
||||||
):
|
|
||||||
await an.run_in_actor(_sleep_forever, name="sf-child")
|
|
||||||
print(f"PARENT_READY={os.getpid()}", flush=True)
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
'''
|
|
||||||
|
|
||||||
proc = sp.Popen(
|
|
||||||
[sys.executable, '-c', SCRIPT],
|
|
||||||
stdout=sp.PIPE, stderr=sp.STDOUT,
|
|
||||||
)
|
|
||||||
# parse CHILD_PID + PARENT_READY off proc.stdout ...
|
|
||||||
# SIGKILL parent, SIGINT child, poll.
|
|
||||||
```
|
|
||||||
|
|
||||||
If that hangs too, open a broader issue; if not, this is
|
|
||||||
`subint_forkserver`-specific (likely fd-inheritance-related).
|
|
||||||
|
|
||||||
## Why this is ours to fix (not CPython's)
|
|
||||||
|
|
||||||
- Signal IS delivered (`SigCgt` bitmask confirms).
|
|
||||||
- Handler IS installed (trio's `KIManager`).
|
|
||||||
- Thread identity is correct post-fork.
|
|
||||||
- `_trio_main` already has the intentional KBI→clean-exit
|
|
||||||
path waiting to fire.
|
|
||||||
|
|
||||||
Every CPython-level precondition is met. Something in
|
|
||||||
tractor's runtime or trio's integration with it is
|
|
||||||
breaking the SIGINT→wakeup→event-loop-wake pipeline.
|
|
||||||
|
|
||||||
## Possible fix directions
|
|
||||||
|
|
||||||
1. **Audit the wakeup-fd across tractor's IPC bringup.**
|
|
||||||
Add a trio startup hook that captures
|
|
||||||
`signal.set_wakeup_fd(-1)` at `_trio_main` entry,
|
|
||||||
after `async_main` enters, and periodically — assert
|
|
||||||
it's unchanged. If it moves, track down the writer.
|
|
||||||
2. **Explicit `signal.set_wakeup_fd` reset after IPC
|
|
||||||
setup.** Brute force: re-install a fresh wakeup-fd
|
|
||||||
mid-bringup. Band-aid, but fast to try.
|
|
||||||
3. **Ensure no `shield=True` cancel scope envelopes the
|
|
||||||
RPC-message-loop / IPC-server task.** If one does,
|
|
||||||
KBI-at-checkpoint never escapes.
|
|
||||||
4. **Once fixed, the `child_sigint='trio'` mode on
|
|
||||||
`subint_forkserver_proc`** becomes effectively a no-op
|
|
||||||
or a doc-only mode — trio's natural handler already
|
|
||||||
does the right thing. Might end up removing the flag
|
|
||||||
entirely if there's no behavioral difference between
|
|
||||||
modes.
|
|
||||||
|
|
||||||
## Current workaround
|
|
||||||
|
|
||||||
None; `child_sigint` defaults to `'ipc'` (IPC cancel is
|
|
||||||
the only reliable cancel path today), and the xfail test
|
|
||||||
documents the gap. Operators hitting orphan-SIGINT get a
|
|
||||||
hung process that needs `SIGKILL`.
|
|
||||||
|
|
||||||
## Reproducer
|
|
||||||
|
|
||||||
Inline, standalone (no pytest):
|
|
||||||
|
|
||||||
```python
|
|
||||||
# save as /tmp/orphan_sigint_repro.py (py3.14+)
|
|
||||||
import os, sys, signal, time, glob, trio
|
|
||||||
import tractor
|
|
||||||
from tractor.spawn._subint_forkserver import (
|
|
||||||
fork_from_worker_thread,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _sleep_forever():
|
|
||||||
print(f'SUBACTOR[{os.getpid()}]', flush=True)
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with (
|
|
||||||
tractor.open_root_actor(
|
|
||||||
registry_addrs=[('127.0.0.1', 12349)],
|
|
||||||
),
|
|
||||||
tractor.open_nursery() as an,
|
|
||||||
):
|
|
||||||
await an.run_in_actor(_sleep_forever, name='sf-child')
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
def child_target() -> int:
|
|
||||||
from tractor.spawn._spawn import try_set_start_method
|
|
||||||
try_set_start_method('subint_forkserver')
|
|
||||||
trio.run(_main)
|
|
||||||
return 0
|
|
||||||
|
|
||||||
pid = fork_from_worker_thread(child_target, thread_name='repro')
|
|
||||||
time.sleep(3.0)
|
|
||||||
|
|
||||||
# find the subactor pid via /proc
|
|
||||||
children = []
|
|
||||||
for path in glob.glob(f'/proc/{pid}/task/*/children'):
|
|
||||||
with open(path) as f:
|
|
||||||
children.extend(int(x) for x in f.read().split() if x)
|
|
||||||
subactor_pid = children[0]
|
|
||||||
|
|
||||||
# SIGKILL root → orphan the subactor
|
|
||||||
os.kill(pid, signal.SIGKILL)
|
|
||||||
os.waitpid(pid, 0)
|
|
||||||
time.sleep(0.3)
|
|
||||||
|
|
||||||
# SIGINT the orphan — should cause clean trio exit
|
|
||||||
os.kill(subactor_pid, signal.SIGINT)
|
|
||||||
|
|
||||||
# poll for exit
|
|
||||||
for _ in range(100):
|
|
||||||
try:
|
|
||||||
os.kill(subactor_pid, 0)
|
|
||||||
time.sleep(0.1)
|
|
||||||
except ProcessLookupError:
|
|
||||||
print('HARNESS: subactor exited cleanly ✔')
|
|
||||||
sys.exit(0)
|
|
||||||
os.kill(subactor_pid, signal.SIGKILL)
|
|
||||||
print('HARNESS: subactor hung — reproduced')
|
|
||||||
sys.exit(1)
|
|
||||||
```
|
|
||||||
|
|
||||||
Expected (current): `HARNESS: subactor hung — reproduced`.
|
|
||||||
|
|
||||||
After fix: `HARNESS: subactor exited cleanly ✔`.
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- `tractor/spawn/_entry.py::_trio_main:157-186` — the
|
|
||||||
intentional KBI→clean-exit path this bug makes
|
|
||||||
unreachable.
|
|
||||||
- `tractor/spawn/_subint_forkserver.py` — the backend whose
|
|
||||||
orphan cancel-robustness this blocks.
|
|
||||||
- `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`
|
|
||||||
— the xfail'd reproducer in the test suite.
|
|
||||||
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` —
|
|
||||||
sibling "orphaned channel park" hang (different class).
|
|
||||||
- `ai/conc-anal/subint_sigint_starvation_issue.md` —
|
|
||||||
sibling "GIL starvation SIGINT drop" hang (different
|
|
||||||
class).
|
|
||||||
- tractor issue #379 — subint backend tracking.
|
|
||||||
|
|
@ -1,849 +0,0 @@
|
||||||
# `subint_forkserver` backend: `test_cancellation.py` multi-level cancel cascade hang
|
|
||||||
|
|
||||||
Follow-up tracker: surfaced while wiring the new
|
|
||||||
`subint_forkserver` spawn backend into the full tractor
|
|
||||||
test matrix (step 2 of the post-backend-lands plan).
|
|
||||||
See also
|
|
||||||
`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
|
|
||||||
— sibling tracker for a different forkserver-teardown
|
|
||||||
class which probably shares the same fundamental root
|
|
||||||
cause (fork-FD-inheritance across nested spawns).
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
`tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]`
|
|
||||||
hangs indefinitely under our new backend. The hang is
|
|
||||||
**inside the graceful IPC cancel cascade** — every actor
|
|
||||||
in the multi-level tree parks in `epoll_wait` waiting
|
|
||||||
for IPC messages that never arrive. Not a hard-kill /
|
|
||||||
tree-reap issue (we don't reach the hard-kill fallback
|
|
||||||
path at all).
|
|
||||||
|
|
||||||
Working hypothesis (unverified): **`os.fork()` from a
|
|
||||||
subactor inherits the root parent's IPC listener socket
|
|
||||||
FDs**. When a first-level subactor forkserver-spawns a
|
|
||||||
grandchild, that grandchild inherits both its direct
|
|
||||||
spawner's FDs AND the root's FDs — IPC message routing
|
|
||||||
becomes ambiguous (or silently sends to the wrong
|
|
||||||
channel), so the cancel cascade can't reach its target.
|
|
||||||
|
|
||||||
## Corrected diagnosis vs. earlier draft
|
|
||||||
|
|
||||||
An earlier version of this doc claimed the root cause
|
|
||||||
was **"forkserver teardown doesn't tree-kill
|
|
||||||
descendants"** (SIGKILL only reaches the direct child,
|
|
||||||
grandchildren survive and hold TCP `:1616`). That
|
|
||||||
diagnosis was **wrong**, caused by conflating two
|
|
||||||
observations:
|
|
||||||
|
|
||||||
1. *5-zombie leak holding :1616* — happened in my own
|
|
||||||
workflow when I aborted a bg pytest task with
|
|
||||||
`pkill` (SIGTERM/SIGKILL, not SIGINT). The abrupt
|
|
||||||
kill skipped the graceful `ActorNursery.__aexit__`
|
|
||||||
cancel cascade entirely, orphaning descendants to
|
|
||||||
init. **This was my cleanup bug, not a forkserver
|
|
||||||
teardown bug.** Codified the fix (SIGINT-first +
|
|
||||||
bounded wait before SIGKILL) in
|
|
||||||
`feedback_sc_graceful_cancel_first.md` +
|
|
||||||
`.claude/skills/run-tests/SKILL.md`.
|
|
||||||
2. *`test_nested_multierrors` hangs indefinitely* —
|
|
||||||
the real, separate, forkserver-specific bug
|
|
||||||
captured by this doc.
|
|
||||||
|
|
||||||
The two symptoms are unrelated. The tree-kill / setpgrp
|
|
||||||
fix direction proposed earlier would not help (1) (SC-
|
|
||||||
graceful-cleanup is the right answer there) and would
|
|
||||||
not help (2) (the hang is in the cancel cascade, not
|
|
||||||
in the hard-kill fallback).
|
|
||||||
|
|
||||||
## Symptom
|
|
||||||
|
|
||||||
Reproducer (py3.14, clean env):
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# preflight: ensure clean env
|
|
||||||
ss -tlnp 2>/dev/null | grep ':1616' && echo 'FOUL — cleanup first!' || echo 'clean'
|
|
||||||
|
|
||||||
./py314/bin/python -m pytest --spawn-backend=subint_forkserver \
|
|
||||||
'tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]' \
|
|
||||||
--timeout=30 --timeout-method=thread --tb=short -v
|
|
||||||
```
|
|
||||||
|
|
||||||
Expected (current): `pytest-timeout` fires at 30s with a thread-
|
|
||||||
dump banner, but the process itself **remains alive
|
|
||||||
after timeout** and doesn't unwedge on subsequent
|
|
||||||
SIGINT. Requires SIGKILL to reap.
|
|
||||||
|
|
||||||
## Evidence (tree structure at hang point)
|
|
||||||
|
|
||||||
All 5 processes are kernel-level `S` (sleeping) in
|
|
||||||
`do_epoll_wait` (trio's event loop waiting on I/O):
|
|
||||||
|
|
||||||
```
|
|
||||||
PID PPID THREADS NAME ROLE
|
|
||||||
333986 1 2 subint-forkserv pytest main (the test body)
|
|
||||||
333993 333986 3 subint-forkserv "child 1" spawner subactor
|
|
||||||
334003 333993 1 subint-forkserv grandchild errorer under child-1
|
|
||||||
334014 333993 1 subint-forkserv grandchild errorer under child-1
|
|
||||||
333999 333986 1 subint-forkserv "child 2" spawner subactor (NO grandchildren!)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Asymmetric tree depth
|
|
||||||
|
|
||||||
The test's `spawn_and_error(breadth=2, depth=3)` should
|
|
||||||
have BOTH direct children spawning 2 grandchildren
|
|
||||||
each, going 3 levels deep. Reality:
|
|
||||||
|
|
||||||
- Child 1 (333993, 3 threads) DID spawn its two
|
|
||||||
grandchildren as expected — fully booted trio
|
|
||||||
runtime.
|
|
||||||
- Child 2 (333999, 1 thread) did NOT spawn any
|
|
||||||
grandchildren — clearly never completed its
|
|
||||||
nursery's first `run_in_actor`. Its 1-thread state
|
|
||||||
suggests the runtime never fully booted (no trio
|
|
||||||
worker threads for `waitpid`/IPC).
|
|
||||||
|
|
||||||
This asymmetry is the key clue: the two direct
|
|
||||||
children started identically but diverged. Probably a
|
|
||||||
race around fork-inherited state (listener FDs,
|
|
||||||
subactor-nursery channel state) that happens to land
|
|
||||||
differently depending on spawn ordering.
|
|
||||||
|
|
||||||
### Parent-side state
|
|
||||||
|
|
||||||
Thread-dump of pytest main (333986) at the hang:
|
|
||||||
|
|
||||||
- Main trio thread — parked in
|
|
||||||
`trio._core._io_epoll.get_events` (epoll_wait on
|
|
||||||
its event loop). Waiting for IPC from children.
|
|
||||||
- Two trio-cache worker threads — each parked in
|
|
||||||
`outcome.capture(sync_fn)` calling
|
|
||||||
`os.waitpid(child_pid, 0)`. These are our
|
|
||||||
`_ForkedProc.wait()` off-loads. They're waiting for
|
|
||||||
the direct children to exit — but children are
|
|
||||||
stuck in their own epoll_wait waiting for IPC from
|
|
||||||
the parent.
|
|
||||||
|
|
||||||
**It's a deadlock, not a leak:** the parent is
|
|
||||||
correctly running `soft_kill(proc, _ForkedProc.wait,
|
|
||||||
portal)` (graceful IPC cancel via
|
|
||||||
`Portal.cancel_actor()`), but the children never
|
|
||||||
acknowledge the cancel message (or the message never
|
|
||||||
reaches them through the tangled post-fork IPC).
|
|
||||||
|
|
||||||
## What's NOT the cause (ruled out)
|
|
||||||
|
|
||||||
- **`_ForkedProc.kill()` only SIGKILLs direct pid /
|
|
||||||
missing tree-kill**: doesn't apply — we never reach
|
|
||||||
the hard-kill path. The deadlock is in the graceful
|
|
||||||
cancel cascade.
|
|
||||||
- **Port `:1616` contention**: ruled out after the
|
|
||||||
`reg_addr` fixture-wiring fix; each test session
|
|
||||||
gets a unique port now.
|
|
||||||
- **GIL starvation / SIGINT pipe filling** (class-A,
|
|
||||||
`subint_sigint_starvation_issue.md`): doesn't apply
|
|
||||||
— each subactor is its own OS process with its own
|
|
||||||
GIL (not legacy-config subint).
|
|
||||||
- **Child-side `_trio_main` absorbing KBI**: grep
|
|
||||||
confirmed; `_trio_main` only catches KBI at the
|
|
||||||
`trio.run()` callsite, which is reached only if the
|
|
||||||
trio loop exits normally. The children here never
|
|
||||||
exit trio.run() — they're wedged inside.
|
|
||||||
|
|
||||||
## Hypothesis: FD inheritance across nested forks
|
|
||||||
|
|
||||||
`subint_forkserver_proc` calls
|
|
||||||
`fork_from_worker_thread()` which ultimately does
|
|
||||||
`os.fork()` from a dedicated worker thread. Standard
|
|
||||||
Linux/POSIX fork semantics: **the child inherits ALL
|
|
||||||
open FDs from the parent**, including listener
|
|
||||||
sockets, epoll fds, trio wakeup pipes, and the
|
|
||||||
parent's IPC channel sockets.
|
|
||||||
|
|
||||||
At root-actor fork-spawn time, the root's IPC server
|
|
||||||
listener FDs are open in the parent. Those get
|
|
||||||
inherited by child 1. Child 1 then forkserver-spawns
|
|
||||||
its OWN subactor (grandchild). The grandchild
|
|
||||||
inherits FDs from child 1 — but child 1's address
|
|
||||||
space still contains **the root's IPC listener FDs
|
|
||||||
too** (inherited at first fork). So the grandchild
|
|
||||||
has THREE sets of FDs:
|
|
||||||
|
|
||||||
1. Its own (created after becoming a subactor).
|
|
||||||
2. Its direct parent child-1's.
|
|
||||||
3. The ROOT's (grandparent's) — inherited transitively.
|
|
||||||
|
|
||||||
IPC message routing may be ambiguous in this tangled
|
|
||||||
state. Or a listener socket that the root thinks it
|
|
||||||
owns is actually open in multiple processes, and
|
|
||||||
messages sent to it go to an arbitrary one. That
|
|
||||||
would exactly match the observed "graceful cancel
|
|
||||||
never propagates".
|
|
||||||
|
|
||||||
This hypothesis predicts the bug **scales with fork
|
|
||||||
depth**: single-level forkserver spawn
|
|
||||||
(`test_subint_forkserver_spawn_basic`) works
|
|
||||||
perfectly, but any test that spawns a second level
|
|
||||||
deadlocks. Matches observations so far.
|
|
||||||
|
|
||||||
## Fix directions (to validate)
|
|
||||||
|
|
||||||
### 1. `close_fds=True` equivalent in `fork_from_worker_thread()`
|
|
||||||
|
|
||||||
`subprocess.Popen` / `trio.lowlevel.open_process` have
|
|
||||||
`close_fds=True` by default on POSIX — they
|
|
||||||
enumerate open FDs in the child post-fork and close
|
|
||||||
everything except stdio + any explicitly-passed FDs.
|
|
||||||
Our raw `os.fork()` doesn't. Adding the equivalent to
|
|
||||||
our `_worker` prelude would isolate each fork
|
|
||||||
generation's FD set.
|
|
||||||
|
|
||||||
Implementation sketch in
|
|
||||||
`tractor.spawn._subint_forkserver.fork_from_worker_thread._worker`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _worker() -> None:
|
|
||||||
pid: int = os.fork()
|
|
||||||
if pid == 0:
|
|
||||||
# CHILD: close inherited FDs except stdio + the
|
|
||||||
# pid-pipe we just opened.
|
|
||||||
keep: set[int] = {0, 1, 2, rfd, wfd}
|
|
||||||
import resource
|
|
||||||
soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
|
|
||||||
os.closerange(3, soft) # blunt: ignores keep (also closes rfd/wfd); or enumerate /proc/self/fd
|
|
||||||
# ... then child_target() as before
|
|
||||||
```
|
|
||||||
|
|
||||||
Problem: overly aggressive — closes FDs the
|
|
||||||
grandchild might legitimately need (e.g. its parent's
|
|
||||||
IPC channel for the spawn-spec handshake, if we rely
|
|
||||||
on that). Needs thought about which FDs are
|
|
||||||
"inheritable and safe" vs. "inherited by accident".
|
|
||||||
|
|
||||||
### 2. Cloexec on tractor's own FDs
|
|
||||||
|
|
||||||
Set `FD_CLOEXEC` on tractor-created sockets (listener
|
|
||||||
sockets, IPC channel sockets, pipes). This flag
|
|
||||||
causes automatic close on `execve`, but since we
|
|
||||||
`fork()` without `exec()`, this alone doesn't help.
|
|
||||||
BUT — combined with a child-side explicit close-
|
|
||||||
non-cloexec loop, it gives us a way to mark "my
|
|
||||||
private FDs" vs. "safe to inherit". Most robust, but
|
|
||||||
requires tractor-wide audit.
|
|
||||||
|
|
||||||
### 3. Explicit FD cleanup in `_ForkedProc`/`_child_target`
|
|
||||||
|
|
||||||
Have `subint_forkserver_proc`'s `_child_target`
|
|
||||||
closure explicitly close the parent-side IPC listener
|
|
||||||
FDs before calling `_actor_child_main`. Requires
|
|
||||||
being able to enumerate "the parent's listener FDs
|
|
||||||
that the child shouldn't keep" — plausible via
|
|
||||||
`Actor.ipc_server`'s socket objects.
|
|
||||||
|
|
||||||
### 4. Use `os.posix_spawn` with explicit `file_actions`
|
|
||||||
|
|
||||||
Instead of raw `os.fork()`, use `os.posix_spawn()`
|
|
||||||
which supports explicit file-action specifications
|
|
||||||
(close this FD, dup2 that FD). Cleaner semantics, but
|
|
||||||
probably incompatible with our "no exec" requirement
|
|
||||||
(subint_forkserver is a fork-without-exec design).
|
|
||||||
|
|
||||||
**Likely correct answer: (3) — targeted FD cleanup
|
|
||||||
via `actor.ipc_server` handle.** (1) is too blunt,
|
|
||||||
(2) is too wide-ranging, (4) changes the spawn
|
|
||||||
mechanism.
|
|
||||||
|
|
||||||
## Reproducer (standalone, no pytest)
|
|
||||||
|
|
||||||
```python
|
|
||||||
# save as /tmp/forkserver_nested_hang_repro.py (py3.14+)
|
|
||||||
import trio, tractor
|
|
||||||
|
|
||||||
async def assert_err():
|
|
||||||
assert 0
|
|
||||||
|
|
||||||
async def spawn_and_error(breadth: int = 2, depth: int = 1):
|
|
||||||
async with tractor.open_nursery() as n:
|
|
||||||
for i in range(breadth):
|
|
||||||
if depth > 0:
|
|
||||||
await n.run_in_actor(
|
|
||||||
spawn_and_error,
|
|
||||||
breadth=breadth,
|
|
||||||
depth=depth - 1,
|
|
||||||
name=f'spawner_{i}_{depth}',
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
await n.run_in_actor(
|
|
||||||
assert_err,
|
|
||||||
name=f'errorer_{i}',
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_nursery() as n:
|
|
||||||
for i in range(2):
|
|
||||||
await n.run_in_actor(
|
|
||||||
spawn_and_error,
|
|
||||||
name=f'top_{i}',
|
|
||||||
breadth=2,
|
|
||||||
depth=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
from tractor.spawn._spawn import try_set_start_method
|
|
||||||
try_set_start_method('subint_forkserver')
|
|
||||||
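# NOTE: `trio.fail_after()` must be entered inside `trio.run()`
|
|
||||||
async def _bounded_main():
|
|
||||||
with trio.fail_after(20):
|
|
||||||
await _main()
|
|
||||||
trio.run(_bounded_main)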
|
|
||||||
```
|
|
||||||
|
|
||||||
Expected (current): hangs on `trio.fail_after(20)`
|
|
||||||
— children never ack the error-propagation cancel
|
|
||||||
cascade. Pattern: top 2 direct children, 4
|
|
||||||
grandchildren, 1 errorer deadlocks while trying to
|
|
||||||
unwind through its parent chain.
|
|
||||||
|
|
||||||
After fix: `trio.TooSlowError`-free completion; the
|
|
||||||
root's `open_nursery` receives the
|
|
||||||
`BaseExceptionGroup` containing the `AssertionError`
|
|
||||||
from the errorer and unwinds cleanly.
|
|
||||||
|
|
||||||
## Update — 2026-04-23: partial fix landed, deeper layer surfaced
|
|
||||||
|
|
||||||
Three improvements landed as separate commits in the
|
|
||||||
`subint_forkserver_backend` branch (see `git log`):
|
|
||||||
|
|
||||||
1. **`_close_inherited_fds()` in fork-child prelude**
|
|
||||||
(`tractor/spawn/_subint_forkserver.py`). POSIX
|
|
||||||
close-fds-equivalent enumeration via
|
|
||||||
`/proc/self/fd` (or `RLIMIT_NOFILE` fallback), keep
|
|
||||||
only stdio. This is fix-direction (1) from the list
|
|
||||||
above — went with the blunt form rather than the
|
|
||||||
targeted enum-via-`actor.ipc_server` form; it turns
|
|
||||||
out the aggressive close is safe because every
|
|
||||||
inheritable resource the fresh child needs
|
|
||||||
(IPC-channel socket, etc.) is opened AFTER the
|
|
||||||
fork anyway.
|
|
||||||
2. **`_ForkedProc.wait()` via `os.pidfd_open()` +
|
|
||||||
`trio.lowlevel.wait_readable()`** — matches the
|
|
||||||
`trio.Process.wait` / `mp.Process.sentinel` pattern
|
|
||||||
used by `trio_proc` and `proc_waiter`. Gives us
|
|
||||||
fully trio-cancellable child-wait (prior impl
|
|
||||||
blocked a cache thread on a sync `os.waitpid` that
|
|
||||||
was NOT trio-cancellable due to
|
|
||||||
`abandon_on_cancel=False`).
|
|
||||||
3. **`_parent_chan_cs` wiring** in
|
|
||||||
`tractor/runtime/_runtime.py`: capture the shielded
|
|
||||||
`loop_cs` for the parent-channel `process_messages`
|
|
||||||
task in `async_main`; explicitly cancel it in
|
|
||||||
`Actor.cancel()` teardown. This breaks the shield
|
|
||||||
during teardown so the parent-chan loop exits when
|
|
||||||
cancel is issued, instead of parking on a parent-
|
|
||||||
socket EOF that might never arrive under fork
|
|
||||||
semantics.
|
|
||||||
|
|
||||||
**Concrete wins from (1):** the sibling
|
|
||||||
`subint_forkserver_orphan_sigint_hang_issue.md` class
|
|
||||||
is **now fixed** — `test_orphaned_subactor_sigint_cleanup_DRAFT`
|
|
||||||
went from strict-xfail to pass. The xfail mark was
|
|
||||||
removed; the test remains as a regression guard.
|
|
||||||
|
|
||||||
**test_nested_multierrors STILL hangs** though.
|
|
||||||
|
|
||||||
### Updated diagnosis (narrowed)
|
|
||||||
|
|
||||||
DIAGDEBUG instrumentation of `process_messages` ENTER/
|
|
||||||
EXIT pairs + `_parent_chan_cs.cancel()` call sites
|
|
||||||
showed (captured during a 20s-timeout repro):
|
|
||||||
|
|
||||||
- 80 `process_messages` ENTERs, 75 EXITs → 5 stuck.
|
|
||||||
- **All 40 `shield=True` ENTERs matched EXIT** — every
|
|
||||||
shielded parent-chan loop exits cleanly. The
|
|
||||||
`_parent_chan_cs` wiring works as intended.
|
|
||||||
- **The 5 stuck loops are all `shield=False`** — peer-
|
|
||||||
channel handlers (inbound connections handled by
|
|
||||||
`handle_stream_from_peer` in stream_handler_tn).
|
|
||||||
- After our `_parent_chan_cs.cancel()` fires, NEW
|
|
||||||
shielded process_messages loops start (on the
|
|
||||||
session reg_addr port — probably discovery-layer
|
|
||||||
reconnection attempts). These don't block teardown
|
|
||||||
(they all exit) but indicate the cancel cascade has
|
|
||||||
more moving parts than expected.
|
|
||||||
|
|
||||||
### Remaining unknown
|
|
||||||
|
|
||||||
Why don't the 5 peer-channel loops exit when
|
|
||||||
`service_tn.cancel_scope.cancel()` fires? They're in
|
|
||||||
`stream_handler_tn` which IS `service_tn` in the
|
|
||||||
current configuration (`open_ipc_server(parent_tn=
|
|
||||||
service_tn, stream_handler_tn=service_tn)`). A
|
|
||||||
standard nursery-scope-cancel should propagate through
|
|
||||||
them — no shield, no special handler. Something
|
|
||||||
specific to the fork-spawned configuration keeps them
|
|
||||||
alive.
|
|
||||||
|
|
||||||
Candidate follow-up experiments:
|
|
||||||
|
|
||||||
- Dump the trio task tree at the hang point (via
|
|
||||||
`stackscope` or direct trio introspection) to see
|
|
||||||
what each stuck loop is awaiting. `chan.__anext__`
|
|
||||||
on a socket recv? An inner lock? A shielded sub-task?
|
|
||||||
- Compare peer-channel handler lifecycle under
|
|
||||||
`trio_proc` vs `subint_forkserver` with equivalent
|
|
||||||
logging to spot the divergence.
|
|
||||||
- Investigate whether the peer handler is caught in
|
|
||||||
the `except trio.Cancelled:` path at
|
|
||||||
`tractor/ipc/_server.py:448` that re-raises — but
|
|
||||||
re-raise means it should still exit. Unless
|
|
||||||
something higher up swallows it.
|
|
||||||
|
|
||||||
### Attempted fix (DID NOT work) — hypothesis (3)
|
|
||||||
|
|
||||||
Tried: in `_serve_ipc_eps` finally, after closing
|
|
||||||
listeners, also iterate `server._peers` and
|
|
||||||
sync-close each peer channel's underlying stream
|
|
||||||
socket fd:
|
|
||||||
|
|
||||||
```python
|
|
||||||
for _uid, _chans in list(server._peers.items()):
|
|
||||||
for _chan in _chans:
|
|
||||||
try:
|
|
||||||
_stream = _chan._transport.stream if _chan._transport else None
|
|
||||||
if _stream is not None:
|
|
||||||
_stream.socket.close() # sync fd close
|
|
||||||
except (AttributeError, OSError):
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
Theory: closing the socket fd from outside the stuck
|
|
||||||
recv task would make the recv see EBADF /
|
|
||||||
ClosedResourceError and unblock.
|
|
||||||
|
|
||||||
Result: `test_nested_multierrors[subint_forkserver]`
|
|
||||||
still hangs identically. Either:
|
|
||||||
- The sync `socket.close()` doesn't propagate into
|
|
||||||
trio's in-flight `recv_some()` the way I expected
|
|
||||||
(trio may hold an internal reference that keeps the
|
|
||||||
fd open even after an external close), or
|
|
||||||
- The stuck recv isn't even the root blocker and the
|
|
||||||
peer handlers never reach the finally for some
|
|
||||||
reason I haven't understood yet.
|
|
||||||
|
|
||||||
Either way, the sync-close hypothesis is **ruled
|
|
||||||
out**. Reverted the experiment, restored the skip-
|
|
||||||
mark on the test.
|
|
||||||
|
|
||||||
### Aside: `-s` flag does NOT change `test_nested_multierrors` behavior
|
|
||||||
|
|
||||||
Tested explicitly: both with and without `-s`, the
|
|
||||||
test hangs identically. So the capture-pipe-fill
|
|
||||||
hypothesis is **ruled out** for this test.
|
|
||||||
|
|
||||||
The earlier `test_context_stream_semantics.py` `-s`
|
|
||||||
observation was most likely caused by a competing
|
|
||||||
pytest run in my session (confirmed via process list
|
|
||||||
— my leftover pytest was alive at that time and
|
|
||||||
could have been holding state on the default
|
|
||||||
registry port).
|
|
||||||
|
|
||||||
## Update — 2026-04-23 (late): cancel delivery ruled in, nursery-wait ruled BLOCKER
|
|
||||||
|
|
||||||
**New diagnostic run** instrumented
|
|
||||||
`handle_stream_from_peer` at ENTER / `except
|
|
||||||
trio.Cancelled:` / finally, plus `Actor.cancel()`
|
|
||||||
just before `self._parent_chan_cs.cancel()`. Result:
|
|
||||||
|
|
||||||
- **40 `handle_stream_from_peer` ENTERs**.
|
|
||||||
- **0 `except trio.Cancelled:` hits** — cancel
|
|
||||||
never fires on any peer-handler.
|
|
||||||
- **35 finally hits** — those handlers exit via
|
|
||||||
peer-initiated EOF (normal return), NOT cancel.
|
|
||||||
- **5 handlers never reach finally** — stuck forever.
|
|
||||||
- **`Actor.cancel()` fired in 12 PIDs** — but the
|
|
||||||
PIDs with peer handlers that DIDN'T fire
|
|
||||||
Actor.cancel are exactly **root + 2 direct
|
|
||||||
spawners**. These 3 actors have peer handlers
|
|
||||||
(for their own subactors) that stay stuck because
|
|
||||||
**`Actor.cancel()` at these levels never runs**.
|
|
||||||
|
|
||||||
### The actual deadlock shape
|
|
||||||
|
|
||||||
`Actor.cancel()` lives in
|
|
||||||
`open_root_actor.__aexit__` / `async_main` teardown.
|
|
||||||
That only runs when the enclosing `async with
|
|
||||||
tractor.open_nursery()` exits. The nursery's
|
|
||||||
`__aexit__` calls the backend `*_proc` spawn target's
|
|
||||||
teardown, which does `soft_kill() →
|
|
||||||
_ForkedProc.wait()` on its child PID. That wait is
|
|
||||||
trio-cancellable via pidfd now (good) — but nothing
|
|
||||||
CANCELS it because the outer scope only cancels when
|
|
||||||
`Actor.cancel()` runs, which only runs when the
|
|
||||||
nursery completes, which waits on the child.
|
|
||||||
|
|
||||||
It's a **multi-level mutual wait**:
|
|
||||||
|
|
||||||
```
|
|
||||||
root blocks on spawner.wait()
|
|
||||||
spawner blocks on grandchild.wait()
|
|
||||||
grandchild blocks on errorer.wait()
|
|
||||||
errorer Actor.cancel() ran, but process
|
|
||||||
may not have fully exited yet
|
|
||||||
(something in root_tn holding on?)
|
|
||||||
```
|
|
||||||
|
|
||||||
Each level waits for the level below. The bottom
|
|
||||||
level (errorer) reaches Actor.cancel(), but its
|
|
||||||
process may not fully exit — meaning its pidfd
|
|
||||||
doesn't go readable, meaning the grandchild's
|
|
||||||
waitpid doesn't return, meaning the grandchild's
|
|
||||||
nursery doesn't unwind, etc. all the way up.
|
|
||||||
|
|
||||||
### Refined question
|
|
||||||
|
|
||||||
**Why does an errorer process not exit after its
|
|
||||||
`Actor.cancel()` completes?**
|
|
||||||
|
|
||||||
Possibilities:
|
|
||||||
1. `_parent_chan_cs.cancel()` fires (shielded
|
|
||||||
parent-chan loop unshielded), but the task is
|
|
||||||
stuck INSIDE the shielded loop's recv in a way
|
|
||||||
that cancel still can't break.
|
|
||||||
2. After `Actor.cancel()` returns, `async_main`
|
|
||||||
still has other tasks in `root_tn` waiting for
|
|
||||||
something that never arrives (e.g. outbound
|
|
||||||
IPC reply delivery).
|
|
||||||
3. The `os._exit(rc)` in `_worker` (at
|
|
||||||
`_subint_forkserver.py`) doesn't run because
|
|
||||||
`_child_target` never returns.
|
|
||||||
|
|
||||||
Next-session candidate probes (in priority order):
|
|
||||||
|
|
||||||
1. **Instrument `_worker`'s fork-child branch** to
|
|
||||||
confirm whether `child_target()` returns (and
|
|
||||||
thus `os._exit(rc)` is reached) for errorer
|
|
||||||
PIDs. If yes → process should die; if no →
|
|
||||||
trace back into `_actor_child_main` /
|
|
||||||
`_trio_main` / `async_main` to find the stuck
|
|
||||||
spot.
|
|
||||||
2. **Instrument `async_main`'s final unwind** to
|
|
||||||
see which await in the teardown doesn't
|
|
||||||
complete.
|
|
||||||
3. **Compare under `trio_proc` backend** at the
|
|
||||||
same `_worker`-equivalent level to see where
|
|
||||||
the flows diverge.
|
|
||||||
|
|
||||||
### Rule-out: NOT a stuck peer-chan recv
|
|
||||||
|
|
||||||
Earlier hypothesis was that the 5 stuck peer-chan
|
|
||||||
loops were blocked on a socket recv that cancel
|
|
||||||
couldn't interrupt. This pass revealed the real
|
|
||||||
cause: cancel **never reaches those tasks** because
|
|
||||||
their owning actor's `Actor.cancel()` never runs.
|
|
||||||
The recvs are fine — they're just parked because
|
|
||||||
nothing is telling them to stop.
|
|
||||||
|
|
||||||
## Update — 2026-04-23 (very late): leaves exit, middle actors stuck in `trio.run`
|
|
||||||
|
|
||||||
Yet another instrumentation pass — this time
|
|
||||||
printing at:
|
|
||||||
|
|
||||||
- `_worker` child branch: `pre child_target()` /
|
|
||||||
`child_target RETURNED rc=N` / `about to
|
|
||||||
os._exit(rc)`
|
|
||||||
- `_trio_main`: `about to trio.run` /
|
|
||||||
`trio.run RETURNED NORMALLY` / `FINALLY`
|
|
||||||
|
|
||||||
**Fresh-run results** (`test_nested_multierrors[
|
|
||||||
subint_forkserver]`, depth=1/breadth=2, 1 root + 14
|
|
||||||
forked = 15 actors total):
|
|
||||||
|
|
||||||
- **9 processes completed the full flow** —
|
|
||||||
`trio.run RETURNED NORMALLY` → `child_target
|
|
||||||
RETURNED rc=0` → `about to os._exit(0)`. These
|
|
||||||
are the LEAVES of the tree (errorer actors) plus
|
|
||||||
their direct parents (depth-0 spawners). They
|
|
||||||
actually exit their processes.
|
|
||||||
- **5 processes are stuck INSIDE `trio.run(trio_main)`**
|
|
||||||
— they hit "about to trio.run" but NEVER see
|
|
||||||
"trio.run RETURNED NORMALLY". These are root +
|
|
||||||
top-level spawners + two intermediates (see tree below).
|
|
||||||
|
|
||||||
**What this means:** `async_main` itself is the
|
|
||||||
deadlock holder, not the peer-channel loops.
|
|
||||||
Specifically, the outer `async with root_tn:` in
|
|
||||||
`async_main` never exits for the 5 stuck actors.
|
|
||||||
Their `trio.run` never returns → `_trio_main`
|
|
||||||
catch/finally never runs → `_worker` never reaches
|
|
||||||
`os._exit(rc)` → the PROCESS never dies → its
|
|
||||||
parent's `_ForkedProc.wait()` blocks → parent's
|
|
||||||
nursery hangs → parent's `async_main` hangs → ...
|
|
||||||
|
|
||||||
### The new precise question
|
|
||||||
|
|
||||||
**What task in the 5 stuck actors' `async_main`
|
|
||||||
never completes?** Candidates:
|
|
||||||
|
|
||||||
1. The shielded parent-chan `process_messages`
|
|
||||||
task in `root_tn` — but we explicitly cancel it
|
|
||||||
via `_parent_chan_cs.cancel()` in `Actor.cancel()`.
|
|
||||||
However, `Actor.cancel()` only runs during
|
|
||||||
`open_root_actor.__aexit__`, which itself runs
|
|
||||||
only after `async_main`'s outer unwind — which
|
|
||||||
doesn't happen. So the shield isn't broken.
|
|
||||||
|
|
||||||
2. `await actor_nursery._join_procs.wait()` or
|
|
||||||
similar in the inline backend `*_proc` flow.
|
|
||||||
|
|
||||||
3. `_ForkedProc.wait()` on a grandchild that
|
|
||||||
actually DID exit — but the pidfd_open watch
|
|
||||||
didn't fire for some reason (race between
|
|
||||||
pidfd_open and the child exiting?).
|
|
||||||
|
|
||||||
The most specific next probe: **add DIAG around
|
|
||||||
`_ForkedProc.wait()` enter/exit** to see whether
|
|
||||||
the pidfd-based wait returns for every grandchild
|
|
||||||
exit. If a stuck parent's `_ForkedProc.wait()`
|
|
||||||
NEVER returns despite its child exiting, the
|
|
||||||
pidfd mechanism has a race bug under nested
|
|
||||||
forkserver.
|
|
||||||
|
|
||||||
Alternative probe: instrument `async_main`'s outer
|
|
||||||
nursery exits to find which nursery's `__aexit__`
|
|
||||||
is stuck, drilling down from `trio.run` to the
|
|
||||||
specific `async with` that never completes.
|
|
||||||
|
|
||||||
### Cascade summary (updated tree view)
|
|
||||||
|
|
||||||
```
|
|
||||||
ROOT (pytest) STUCK in trio.run
|
|
||||||
├── top_0 (spawner, d=1) STUCK in trio.run
|
|
||||||
│ ├── spawner_0_d1_0 (d=0) exited (os._exit 0)
|
|
||||||
│ │ ├── errorer_0_0 exited (os._exit 0)
|
|
||||||
│ │ └── errorer_0_1 exited (os._exit 0)
|
|
||||||
│ └── spawner_0_d1_1 (d=0) exited (os._exit 0)
|
|
||||||
│ ├── errorer_0_2 exited (os._exit 0)
|
|
||||||
│ └── errorer_0_3 exited (os._exit 0)
|
|
||||||
└── top_1 (spawner, d=1) STUCK in trio.run
|
|
||||||
├── spawner_1_d1_0 (d=0) STUCK in trio.run (sibling race?)
|
|
||||||
│ ├── errorer_1_0 exited
|
|
||||||
│ └── errorer_1_1 exited
|
|
||||||
└── spawner_1_d1_1 (d=0) STUCK in trio.run
|
|
||||||
├── errorer_1_2 exited
|
|
||||||
└── errorer_1_3 exited
|
|
||||||
```
|
|
||||||
|
|
||||||
Grandchildren (d=0 spawners) exit OR stick —
|
|
||||||
asymmetric. Not purely depth-determined. Some race
|
|
||||||
condition in nursery teardown when multiple
|
|
||||||
siblings error simultaneously.
|
|
||||||
|
|
||||||
## Update — 2026-04-23 (late, probe iteration 3): hang pinpointed to `wait_for_no_more_peers()`
|
|
||||||
|
|
||||||
Further DIAGDEBUG at every milestone in `async_main`
|
|
||||||
(runtime UP / EXITED service_tn / EXITED root_tn /
|
|
||||||
FINALLY ENTER / RETURNING) plus `_ForkedProc.wait`
|
|
||||||
ENTER/RETURNED per-pidfd. Result:
|
|
||||||
|
|
||||||
**Every stuck actor reaches `async_main: FINALLY
|
|
||||||
ENTER` but NOT `async_main: RETURNING`.**
|
|
||||||
|
|
||||||
That isolates the hang to a specific await in
|
|
||||||
`async_main`'s finally block at
|
|
||||||
`tractor/runtime/_runtime.py:1837+`. The suspect:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Ensure all peers (actors connected to us as clients) are finished
|
|
||||||
if (ipc_server := actor.ipc_server) and ipc_server.has_peers(check_chans=True):
|
|
||||||
...
|
|
||||||
await ipc_server.wait_for_no_more_peers() # ← UNBOUNDED, blocks forever
|
|
||||||
```
|
|
||||||
|
|
||||||
`_no_more_peers` is an `Event` set only when
|
|
||||||
`server._peers` empties (see
|
|
||||||
`ipc/_server.py:526-530`). If ANY peer-handler is
|
|
||||||
stuck (the 5 unclosed loops from the earlier pass),
|
|
||||||
it keeps its channel in `server._peers`, so the
|
|
||||||
event never fires, so the wait hangs.
|
|
||||||
|
|
||||||
### Applied fix (partial, landed as defensive-in-depth)
|
|
||||||
|
|
||||||
`tractor/runtime/_runtime.py:1981` —
|
|
||||||
`wait_for_no_more_peers()` call now wrapped in
|
|
||||||
`trio.move_on_after(3.0)` + a warning log when the
|
|
||||||
timeout fires. Commented with the full rationale.
|
|
||||||
|
|
||||||
**Verified:** with this fix, ALL 15 actors reach
|
|
||||||
`async_main: RETURNING` cleanly (up from 10/15
|
|
||||||
reaching end before).
|
|
||||||
|
|
||||||
**Unfortunately:** the test still hangs past 45s
|
|
||||||
total — meaning there's YET ANOTHER unbounded wait
|
|
||||||
downstream of `async_main`. The bounded
|
|
||||||
`wait_for_no_more_peers` unblocks one level, but
|
|
||||||
the cascade has another level above it.
|
|
||||||
|
|
||||||
### Candidates for the remaining hang
|
|
||||||
|
|
||||||
1. `open_root_actor`'s own finally / post-
|
|
||||||
`async_main` flow in `_root.py` — specifically
|
|
||||||
`await actor.cancel(None)` which has its own
|
|
||||||
internal waits.
|
|
||||||
2. The `trio.run()` itself doesn't return even
|
|
||||||
after the root task completes because trio's
|
|
||||||
nursery still has background tasks running.
|
|
||||||
3. Maybe `_serve_ipc_eps`'s finally has an await
|
|
||||||
that blocks when peers aren't clearing.
|
|
||||||
|
|
||||||
### Current stance
|
|
||||||
|
|
||||||
- Defensive `wait_for_no_more_peers` bound landed
|
|
||||||
(good hygiene regardless); it reveals a real
|
|
||||||
deadlock-avoidance gap in tractor's cleanup.
|
|
||||||
- Test still hangs → skip-mark restored on
|
|
||||||
`test_nested_multierrors[subint_forkserver]`.
|
|
||||||
- The full chain of unbounded waits needs another
|
|
||||||
session of drilling, probably at
|
|
||||||
`open_root_actor` / `actor.cancel` level.
|
|
||||||
|
|
||||||
### Summary of this investigation's wins
|
|
||||||
|
|
||||||
1. **FD hygiene fix** (`_close_inherited_fds`) —
|
|
||||||
correct, closed orphan-SIGINT sibling issue.
|
|
||||||
2. **pidfd-based `_ForkedProc.wait`** — cancellable,
|
|
||||||
matches trio_proc pattern.
|
|
||||||
3. **`_parent_chan_cs` wiring** —
|
|
||||||
`Actor.cancel()` now breaks the shielded parent-
|
|
||||||
chan `process_messages` loop.
|
|
||||||
4. **`wait_for_no_more_peers` bounded** —
|
|
||||||
prevents the actor-level finally hang.
|
|
||||||
5. **Ruled-out hypotheses:** tree-kill missing
|
|
||||||
(wrong), stuck socket recv (wrong).
|
|
||||||
6. **Pinpointed remaining unknown:** at least one
|
|
||||||
more unbounded wait in the teardown cascade
|
|
||||||
above `async_main`. Concrete candidates
|
|
||||||
enumerated above.
|
|
||||||
|
|
||||||
## Update — 2026-04-23 (VERY late): pytest capture pipe IS the final gate
|
|
||||||
|
|
||||||
After landing fixes 1-4 and instrumenting every
|
|
||||||
layer down to `tractor_test`'s `trio.run(_main)`:
|
|
||||||
|
|
||||||
**Empirical result: with `pytest -s` the test PASSES
|
|
||||||
in 6.20s.** Without `-s` (default `--capture=fd`) it
|
|
||||||
hangs forever.
|
|
||||||
|
|
||||||
DIAG timeline for the root pytest PID (with `-s`
|
|
||||||
implied from later verification):
|
|
||||||
|
|
||||||
```
|
|
||||||
tractor_test: about to trio.run(_main)
|
|
||||||
open_root_actor: async_main task started, yielding to test body
|
|
||||||
_main: about to await wrapped test fn
|
|
||||||
_main: wrapped RETURNED cleanly ← test body completed!
|
|
||||||
open_root_actor: about to actor.cancel(None)
|
|
||||||
Actor.cancel ENTER req_chan=False
|
|
||||||
Actor.cancel RETURN
|
|
||||||
open_root_actor: actor.cancel RETURNED
|
|
||||||
open_root_actor: outer FINALLY
|
|
||||||
open_root_actor: finally END (returning from ctxmgr)
|
|
||||||
tractor_test: trio.run FINALLY (returned or raised) ← trio.run fully returned!
|
|
||||||
```
|
|
||||||
|
|
||||||
`trio.run()` fully returns. The test body itself
|
|
||||||
completes successfully (pytest.raises absorbed the
|
|
||||||
expected `BaseExceptionGroup`). What blocks is
|
|
||||||
**pytest's own stdout/stderr capture** — under
|
|
||||||
`--capture=fd` default, pytest replaces the parent
|
|
||||||
process's fd 1,2 with pipe write-ends it's reading
|
|
||||||
from. Fork children inherit those pipe fds
|
|
||||||
(because `_close_inherited_fds` correctly preserves
|
|
||||||
stdio). High-volume subactor error-log tracebacks
|
|
||||||
(7+ actors each logging multiple
|
|
||||||
`RemoteActorError`/`ExceptionGroup` tracebacks on
|
|
||||||
the error-propagation cascade) fill the 64KB Linux
|
|
||||||
pipe buffer. Subactor writes block. Subactor can't
|
|
||||||
progress. Process doesn't exit. Parent's
|
|
||||||
`_ForkedProc.wait` (now pidfd-based and
|
|
||||||
cancellable, but nothing's cancelling here since
|
|
||||||
the test body already completed) keeps the pipe
|
|
||||||
reader alive... but pytest isn't draining its end
|
|
||||||
fast enough because test-teardown/fixture-cleanup
|
|
||||||
is in progress.
|
|
||||||
|
|
||||||
**Actually** the exact mechanism is slightly
|
|
||||||
different: pytest's capture fixture MIGHT be
|
|
||||||
actively reading, but subactors writing faster than it reads
|
|
||||||
overflow its internal buffer. Or pytest might be
|
|
||||||
blocked itself on the finalization step.
|
|
||||||
|
|
||||||
Either way, `-s` conclusively fixes it.
|
|
||||||
|
|
||||||
### Why I ruled this out earlier (and shouldn't have)
|
|
||||||
|
|
||||||
Earlier in this investigation I tested
|
|
||||||
`test_nested_multierrors` with/without `-s` and
|
|
||||||
both hung. That's because AT THAT TIME, fixes 1-4
|
|
||||||
weren't all in place yet. The test was hanging at
|
|
||||||
multiple deeper levels long before reaching the
|
|
||||||
"generate lots of error-log output" phase. Once
|
|
||||||
the cascade actually tore down cleanly, enough
|
|
||||||
output was produced to hit the capture-pipe limit.
|
|
||||||
|
|
||||||
**Classic order-of-operations mistake in
|
|
||||||
debugging:** ruling something out too early based
|
|
||||||
on a test that was actually failing for a
|
|
||||||
different reason.
|
|
||||||
|
|
||||||
### Fix direction (next session)
|
|
||||||
|
|
||||||
Redirect subactor stdout/stderr to `/dev/null` (or
|
|
||||||
a session-scoped log file) in the fork-child
|
|
||||||
prelude, right after `_close_inherited_fds()`. This
|
|
||||||
severs the inherited pytest-capture pipes and lets
|
|
||||||
subactor output flow elsewhere. Under normal
|
|
||||||
production use (non-pytest), stdout/stderr would
|
|
||||||
be the TTY — we'd want to keep that. So the
|
|
||||||
redirect should be conditional or opt-in via the
|
|
||||||
`child_sigint`/proc_kwargs flag family.
|
|
||||||
|
|
||||||
Alternative: document as a gotcha and recommend
|
|
||||||
`pytest -s` for any tests using the
|
|
||||||
`subint_forkserver` backend with multi-level actor
|
|
||||||
trees. Simpler, user-visible, no code change.
|
|
||||||
|
|
||||||
### Current state
|
|
||||||
|
|
||||||
- Skip-mark on `test_nested_multierrors[subint_forkserver]`
|
|
||||||
restored with reason pointing here.
|
|
||||||
- Test confirmed passing with `-s` after all 4
|
|
||||||
cascade fixes applied.
|
|
||||||
- The 4 cascade fixes are NOT wasted — they're
|
|
||||||
correct hardening regardless of the capture-pipe
|
|
||||||
issue, AND without them we'd never reach the
|
|
||||||
"actually produces enough output to fill the
|
|
||||||
pipe" state.
|
|
||||||
|
|
||||||
## Stopgap (landed)
|
|
||||||
|
|
||||||
`test_nested_multierrors` skip-marked under
|
|
||||||
`subint_forkserver` via
|
|
||||||
`@pytest.mark.skipon_spawn_backend('subint_forkserver',
|
|
||||||
reason='...')`, cross-referenced to this doc. Mark
|
|
||||||
should be dropped once the pytest capture-pipe hang
|
|
||||||
issue is fixed.
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- `tractor/spawn/_subint_forkserver.py::fork_from_worker_thread`
|
|
||||||
— the primitive whose post-fork FD hygiene is
|
|
||||||
probably the culprit.
|
|
||||||
- `tractor/spawn/_subint_forkserver.py::subint_forkserver_proc`
|
|
||||||
— the backend function that orchestrates the
|
|
||||||
graceful cancel path hitting this bug.
|
|
||||||
- `tractor/spawn/_subint_forkserver.py::_ForkedProc`
|
|
||||||
— the `trio.Process`-compatible shim; NOT the
|
|
||||||
failing component (confirmed via thread-dump).
|
|
||||||
- `tests/test_cancellation.py::test_nested_multierrors`
|
|
||||||
— the test that surfaced the hang.
|
|
||||||
- `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
|
|
||||||
— sibling hang class; probably same underlying
|
|
||||||
fork-FD-inheritance root cause.
|
|
||||||
- tractor issue #379 — subint backend tracking.
|
|
||||||
|
|
@ -1,184 +0,0 @@
|
||||||
# Revisit `subint_forkserver` thread-cache constraints once msgspec PEP 684 support lands
|
|
||||||
|
|
||||||
Follow-up tracker for cleanup work gated on the msgspec
|
|
||||||
PEP 684 adoption upstream ([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
|
|
||||||
|
|
||||||
Context — why this exists
|
|
||||||
-------------------------
|
|
||||||
|
|
||||||
The `tractor.spawn._subint_forkserver` submodule currently
|
|
||||||
carries two "non-trio" thread-hygiene constraints whose
|
|
||||||
necessity is tangled with issues that *should* dissolve
|
|
||||||
under PEP 684 isolated-mode subinterpreters:
|
|
||||||
|
|
||||||
1. `fork_from_worker_thread()` / `run_subint_in_worker_thread()`
|
|
||||||
internally allocate a **dedicated `threading.Thread`**
|
|
||||||
rather than using `trio.to_thread.run_sync()`.
|
|
||||||
2. The test helper is named
|
|
||||||
`run_fork_in_non_trio_thread()` — the
|
|
||||||
`non_trio` qualifier is load-bearing today.
|
|
||||||
|
|
||||||
This doc catalogs *why* those constraints exist, which of
|
|
||||||
them isolated-mode would fix, and what the
|
|
||||||
audit-and-cleanup path looks like once msgspec #563 is
|
|
||||||
resolved.
|
|
||||||
|
|
||||||
The three reasons the constraints exist
|
|
||||||
---------------------------------------
|
|
||||||
|
|
||||||
### 1. GIL-starvation class → fixed by PEP 684 isolated mode
|
|
||||||
|
|
||||||
The class-A hang documented in
|
|
||||||
`subint_sigint_starvation_issue.md` is entirely about
|
|
||||||
legacy-config subints **sharing the main GIL**. Once
|
|
||||||
msgspec #563 lands and tractor flips
|
|
||||||
`tractor.spawn._subint` to
|
|
||||||
`concurrent.interpreters.create()` (isolated config), each
|
|
||||||
subint gets its own GIL. Abandoned subint threads can't
|
|
||||||
contend for main's GIL → can't starve the main trio loop
|
|
||||||
→ signal-wakeup-pipe drains normally → no SIGINT-drop.
|
|
||||||
|
|
||||||
This class of hazard **dissolves entirely**. The
|
|
||||||
non-trio-thread requirement for *this reason* disappears.
|
|
||||||
|
|
||||||
### 2. Destroy race / tstate-recycling → orthogonal; unclear
|
|
||||||
|
|
||||||
The `subint_proc` dedicated-thread fix (commit `26fb8206`)
|
|
||||||
addressed a different issue: `_interpreters.destroy(interp_id)`
|
|
||||||
was blocking on a trio-cache worker that had run an
|
|
||||||
earlier `interp.exec()` for that subint. Working
|
|
||||||
hypothesis at the time was "the cached thread retains the
|
|
||||||
subint's tstate".
|
|
||||||
|
|
||||||
But tstate-handling is **not specific to GIL mode** —
|
|
||||||
`_PyXI_Enter` / `_PyXI_Exit` (the C-level machinery both
|
|
||||||
configs use to enter/leave a subint from a thread) should
|
|
||||||
restore the caller's tstate regardless of GIL config. So
|
|
||||||
isolated mode **doesn't obviously fix this**. It might be:
|
|
||||||
|
|
||||||
- A py3.13 bug fixed in later versions — we saw the race
|
|
||||||
first on 3.13 and never re-tested on 3.14 after moving
|
|
||||||
to dedicated threads.
|
|
||||||
- A genuine CPython quirk around cached threads that
|
|
||||||
exec'd into a subint, persisting across GIL modes.
|
|
||||||
- Something else we misdiagnosed — the empirical fix
|
|
||||||
(dedicated thread) worked but the analysis may have
|
|
||||||
been incomplete.
|
|
||||||
|
|
||||||
Only way to know: once we're on isolated mode, empirically
|
|
||||||
retry `trio.to_thread.run_sync(interp.exec, ...)` and see
|
|
||||||
if `destroy()` still blocks. If it does, keep the
|
|
||||||
dedicated thread; if not, one constraint relaxed.
|
|
||||||
|
|
||||||
### 3. Fork-from-main-interp-tstate (the constraint in this module's helper names)
|
|
||||||
|
|
||||||
The fork-from-main-interp-tstate invariant — CPython's
|
|
||||||
`PyOS_AfterFork_Child` →
|
|
||||||
`_PyInterpreterState_DeleteExceptMain` gate documented in
|
|
||||||
`subint_fork_blocked_by_cpython_post_fork_issue.md` — is
|
|
||||||
about the calling thread's **current** tstate at the
|
|
||||||
moment `os.fork()` runs. If trio's cache threads never
|
|
||||||
enter subints at all, their tstate is plain main-interp,
|
|
||||||
and fork from them would be fine.
|
|
||||||
|
|
||||||
The reason the smoke test +
|
|
||||||
`run_fork_in_non_trio_thread` test helper
|
|
||||||
currently use a dedicated `threading.Thread` is narrow:
|
|
||||||
**we don't want to risk a trio cache thread that has
|
|
||||||
previously been used as a subint driver being the one that
|
|
||||||
picks up the fork job**. If cached tstate doesn't get
|
|
||||||
cleared (back to reason #2), the fork's child-side
|
|
||||||
post-init would see the wrong interp and abort.
|
|
||||||
|
|
||||||
In an isolated-mode world where msgspec works:
|
|
||||||
|
|
||||||
- `subint_proc` would use the public
|
|
||||||
`concurrent.interpreters.create()` + `Interpreter.exec()`
|
|
||||||
/ `Interpreter.close()` — which *should* handle tstate
|
|
||||||
cleanly (they're the "blessed" API).
|
|
||||||
- If so, trio's cache threads are safe to fork from
|
|
||||||
regardless of whether they've previously driven subints.
|
|
||||||
- → the `non_trio` qualifier in
|
|
||||||
`run_fork_in_non_trio_thread` becomes
|
|
||||||
*overcautious* rather than load-bearing, and the
|
|
||||||
dedicated-thread primitives in `_subint_forkserver.py`
|
|
||||||
can likely be replaced with straight
|
|
||||||
`trio.to_thread.run_sync()` wrappers.
|
|
||||||
|
|
||||||
TL;DR
|
|
||||||
-----
|
|
||||||
|
|
||||||
| constraint | fixed by isolated mode? |
|
|
||||||
|---|---|
|
|
||||||
| GIL-starvation (class A) | **yes** |
|
|
||||||
| destroy race on cached worker | unclear — empirical test on py3.14 + isolated API required |
|
|
||||||
| fork-from-main-tstate requirement on worker | **probably yes, conditional on the destroy-race question above** |
|
|
||||||
|
|
||||||
If #2 also resolves on py3.14+ with isolated mode,
|
|
||||||
tractor could drop the `non_trio` qualifier from the fork
|
|
||||||
helper's name and just use `trio.to_thread.run_sync(...)`
|
|
||||||
for everything. But **we shouldn't do that preemptively**
|
|
||||||
— the current cautious design is cheap (one dedicated
|
|
||||||
thread per fork / per subint-exec) and correct.
|
|
||||||
|
|
||||||
Audit plan when msgspec #563 lands
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
Assuming msgspec grows `Py_mod_multiple_interpreters`
|
|
||||||
support:
|
|
||||||
|
|
||||||
1. **Flip `tractor.spawn._subint` to isolated mode.** Drop
|
|
||||||
the `_interpreters.create('legacy')` call in favor of
|
|
||||||
the public API (`concurrent.interpreters.create()` +
|
|
||||||
`Interpreter.exec()` / `Interpreter.close()`). Run the
|
|
||||||
three `ai/conc-anal/subint_*_issue.md` reproducers —
|
|
||||||
class-A (`test_stale_entry_is_deleted` etc.) should
|
|
||||||
pass without the `skipon_spawn_backend('subint')` marks
|
|
||||||
(revisit the marker inventory).
|
|
||||||
|
|
||||||
2. **Empirical destroy-race retest.** In `subint_proc`,
|
|
||||||
swap the dedicated `threading.Thread` back to
|
|
||||||
`trio.to_thread.run_sync(Interpreter.exec, ...,
|
|
||||||
abandon_on_cancel=False)` and run the full subint test
|
|
||||||
suite. If `Interpreter.close()` (or the backing
|
|
||||||
destroy) blocks the same way as the legacy version
|
|
||||||
did, revert and keep the dedicated thread.
|
|
||||||
|
|
||||||
3. **If step 2 is clean**, audit `_subint_forkserver.py`:
|
|
||||||
- Rename `run_fork_in_non_trio_thread` → drop the
|
|
||||||
`_non_trio` qualifier (e.g. `run_fork_in_thread`) or
|
|
||||||
inline the two-line `trio.to_thread.run_sync` call at
|
|
||||||
the call sites and drop the helper entirely.
|
|
||||||
- Consider whether `fork_from_worker_thread` +
|
|
||||||
`run_subint_in_worker_thread` still warrant being
|
|
||||||
separate module-level primitives or whether they
|
|
||||||
collapse into a compound
|
|
||||||
`trio.to_thread.run_sync`-driven pattern inside the
|
|
||||||
(future) `subint_forkserver_proc` backend.
|
|
||||||
|
|
||||||
4. **Doc fallout.** `subint_sigint_starvation_issue.md`
|
|
||||||
and `subint_cancel_delivery_hang_issue.md` both cite
|
|
||||||
the legacy-GIL-sharing architecture as the root cause.
|
|
||||||
Close them with commit-refs to the isolated-mode
|
|
||||||
migration. This doc itself should get a closing
|
|
||||||
post-mortem section noting which of #1/#2/#3 actually
|
|
||||||
resolved vs persisted.
|
|
||||||
|
|
||||||
References
|
|
||||||
----------
|
|
||||||
|
|
||||||
- `tractor.spawn._subint_forkserver` — the in-tree module
|
|
||||||
whose constraints this doc catalogs.
|
|
||||||
- `ai/conc-anal/subint_sigint_starvation_issue.md` — the
|
|
||||||
GIL-starvation class.
|
|
||||||
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` —
|
|
||||||
sibling Ctrl-C-able hang class.
|
|
||||||
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
|
||||||
— why fork-from-subint is blocked (this drives the
|
|
||||||
forkserver-via-non-subint-thread workaround).
|
|
||||||
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
|
|
||||||
— empirical validation for the workaround.
|
|
||||||
- [PEP 684 — per-interpreter GIL](https://peps.python.org/pep-0684/)
|
|
||||||
- [PEP 734 — `concurrent.interpreters` public API](https://peps.python.org/pep-0734/)
|
|
||||||
- [jcrist/msgspec#563 — PEP 684 support tracker](https://github.com/jcrist/msgspec/issues/563)
|
|
||||||
- tractor issue #379 — subint backend tracking.
|
|
||||||
|
|
@ -1,350 +0,0 @@
|
||||||
# `subint` backend: abandoned-subint thread can wedge main trio event loop (Ctrl-C unresponsive)
|
|
||||||
|
|
||||||
Follow-up to the Phase B subint spawn-backend PR (see
|
|
||||||
`tractor.spawn._subint`, issue #379). The hard-kill escape
|
|
||||||
hatch we landed (`_HARD_KILL_TIMEOUT`, bounded shields,
|
|
||||||
`daemon=True` driver-thread abandonment) handles *most*
|
|
||||||
stuck-subint scenarios cleanly, but there's one class of
|
|
||||||
hang that can't be fully escaped from within tractor: a
|
|
||||||
still-running abandoned sub-interpreter can starve the
|
|
||||||
**parent's** trio event loop to the point where **SIGINT is
|
|
||||||
effectively dropped by the kernel ↔ Python boundary** —
|
|
||||||
making the pytest process un-Ctrl-C-able.
|
|
||||||
|
|
||||||
## Symptom
|
|
||||||
|
|
||||||
Running `test_stale_entry_is_deleted[subint]` under
|
|
||||||
`--spawn-backend=subint`:
|
|
||||||
|
|
||||||
1. Test spawns a subactor (`transport_fails_actor`) which
|
|
||||||
kills its own IPC server and then
|
|
||||||
`trio.sleep_forever()`.
|
|
||||||
2. Parent tries `Portal.cancel_actor()` → channel
|
|
||||||
disconnected → fast return.
|
|
||||||
3. Nursery teardown triggers our `subint_proc` cancel path.
|
|
||||||
Portal-cancel fails (dead channel),
|
|
||||||
`_HARD_KILL_TIMEOUT` fires, driver thread is abandoned
|
|
||||||
(`daemon=True`), `_interpreters.destroy(interp_id)`
|
|
||||||
raises `InterpreterError` (because the subint is still
|
|
||||||
running).
|
|
||||||
4. Test appears to hang indefinitely at the *outer*
|
|
||||||
`async with tractor.open_nursery() as an:` exit.
|
|
||||||
5. `Ctrl-C` at the terminal does nothing. The pytest
|
|
||||||
process is un-interruptable.
|
|
||||||
|
|
||||||
## Evidence
|
|
||||||
|
|
||||||
### `strace` on the hung pytest process
|
|
||||||
|
|
||||||
```
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(37, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140585542325792
|
|
||||||
```
|
|
||||||
|
|
||||||
Translated:
|
|
||||||
|
|
||||||
- Kernel delivers `SIGINT` to pytest.
|
|
||||||
- CPython's C-level signal handler fires and tries to
|
|
||||||
write the signal number byte (`0x02` = SIGINT) to fd 37
|
|
||||||
— the **Python signal-wakeup fd** (set via
|
|
||||||
`signal.set_wakeup_fd()`, which trio uses to wake its
|
|
||||||
event loop on signals).
|
|
||||||
- Write returns `EAGAIN` — **the pipe is full**. Nothing
|
|
||||||
is draining it.
|
|
||||||
- `rt_sigreturn` with the signal masked off — signal is
|
|
||||||
"handled" from the kernel's perspective but the actual
|
|
||||||
Python-level handler (and therefore trio's
|
|
||||||
`KeyboardInterrupt` delivery) never runs.
|
|
||||||
|
|
||||||
### Stack dump (via `tractor.devx.dump_on_hang`)
|
|
||||||
|
|
||||||
At 20s into the hang, only the **main thread** is visible:
|
|
||||||
|
|
||||||
```
|
|
||||||
Thread 0x...7fdca0191780 [python] (most recent call first):
|
|
||||||
File ".../trio/_core/_io_epoll.py", line 245 in get_events
|
|
||||||
File ".../trio/_core/_run.py", line 2415 in run
|
|
||||||
File ".../tests/discovery/test_registrar.py", line 575 in test_stale_entry_is_deleted
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
No driver thread shows up. The abandoned-legacy-subint
|
|
||||||
thread still exists from the OS's POV (it's still running
|
|
||||||
inside `_interpreters.exec()` driving the subint's
|
|
||||||
`trio.run()` on `trio.sleep_forever()`) but the **main
|
|
||||||
interp's faulthandler can't see threads currently executing
|
|
||||||
inside a sub-interpreter's tstate**. Concretely: the thread
|
|
||||||
is alive, holding state we can't introspect from here.
|
|
||||||
|
|
||||||
## Root cause analysis
|
|
||||||
|
|
||||||
The most consistent explanation for both observations:
|
|
||||||
|
|
||||||
1. **Legacy-config subinterpreters share the main GIL.**
|
|
||||||
PEP 734's public `concurrent.interpreters.create()`
|
|
||||||
defaults to `'isolated'` (per-interp GIL), but tractor
|
|
||||||
uses `_interpreters.create('legacy')` as a workaround
|
|
||||||
for C extensions that don't yet support PEP 684
|
|
||||||
(notably `msgspec`, see
|
|
||||||
[jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
|
|
||||||
Legacy-mode subints share process-global state
|
|
||||||
including the GIL.
|
|
||||||
|
|
||||||
2. **Our abandoned subint thread never exits.** After our
|
|
||||||
hard-kill timeout, `driver_thread.join()` is abandoned
|
|
||||||
via `abandon_on_cancel=True` and the thread is
|
|
||||||
`daemon=True` so proc-exit won't block on it — but the
|
|
||||||
thread *itself* is still alive inside
|
|
||||||
`_interpreters.exec()`, driving a `trio.run()` that
|
|
||||||
will never return (the subint actor is in
|
|
||||||
`trio.sleep_forever()`).
|
|
||||||
|
|
||||||
3. **`_interpreters.destroy()` cannot force-stop a running
|
|
||||||
subint.** It raises `InterpreterError` on any
|
|
||||||
still-running subinterpreter; there is no public
|
|
||||||
CPython API to force-destroy one.
|
|
||||||
|
|
||||||
4. **Shared-GIL + non-terminating subint thread → main
|
|
||||||
trio loop starvation.** Under enough load (the subint's
|
|
||||||
trio event loop iterating in the background, IPC-layer
|
|
||||||
tasks still in the subint, etc.) the main trio event
|
|
||||||
loop can fail to iterate frequently enough to drain its
|
|
||||||
wakeup pipe. Once that pipe fills, `SIGINT` writes from
|
|
||||||
the C signal handler return `EAGAIN` and signals are
|
|
||||||
silently dropped — exactly what `strace` shows.
|
|
||||||
|
|
||||||
The shielded
|
|
||||||
`await actor_nursery._join_procs.wait()` at the top of
|
|
||||||
`subint_proc` (inherited unchanged from the `trio_proc`
|
|
||||||
pattern) is structurally involved too: if main trio *does*
|
|
||||||
get a schedule slice, it'd find the `subint_proc` task
|
|
||||||
parked on `_join_procs` under shield — which traps whatever
|
|
||||||
`Cancelled` arrives. But that's a second-order effect; the
|
|
||||||
signal-pipe-full condition is the primary "Ctrl-C doesn't
|
|
||||||
work" cause.
|
|
||||||
|
|
||||||
## Why we can't fix this from inside tractor
|
|
||||||
|
|
||||||
- **No force-destroy API.** CPython provides neither a
|
|
||||||
`_interpreters.force_destroy()` nor a thread-
|
|
||||||
cancellation primitive (`pthread_cancel` is actively
|
|
||||||
discouraged and unavailable on Windows). A subint stuck
|
|
||||||
in pure-Python loops (or worse, C code that doesn't poll
|
|
||||||
for signals) is structurally unreachable from outside.
|
|
||||||
- **Shared GIL is the root scheduling issue.** As long as
|
|
||||||
we're forced into legacy-mode subints for `msgspec`
|
|
||||||
compatibility, the abandoned-thread scenario is
|
|
||||||
fundamentally a process-global GIL-starvation window.
|
|
||||||
- **`signal.set_wakeup_fd()` is process-global.** Even if
|
|
||||||
we wanted to put our own drainer on the wakeup pipe,
|
|
||||||
only one party owns it at a time.
|
|
||||||
|
|
||||||
## Current workaround
|
|
||||||
|
|
||||||
- **Fixture-side SIGINT loop on the `daemon` subproc** (in
|
|
||||||
this test's `daemon: subprocess.Popen` fixture in
|
|
||||||
`tests/conftest.py`). The daemon dying closes its end of
|
|
||||||
the registry IPC, which unblocks a pending recv in main
|
|
||||||
trio's IPC-server task, which lets the event loop
|
|
||||||
iterate, which drains the wakeup pipe, which finally
|
|
||||||
delivers the test-harness SIGINT.
|
|
||||||
- **Module-level skip on py3.13**
|
|
||||||
(`pytest.importorskip('concurrent.interpreters')`) — the
|
|
||||||
private `_interpreters` C module exists on 3.13 but the
|
|
||||||
multi-trio-task interaction hangs silently there
|
|
||||||
independently of this issue.
|
|
||||||
|
|
||||||
## Path forward
|
|
||||||
|
|
||||||
1. **Primary**: upstream `msgspec` PEP 684 adoption
|
|
||||||
([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
|
|
||||||
Unlocks `concurrent.interpreters.create()` isolated
|
|
||||||
mode → per-interp GIL → abandoned subint threads no
|
|
||||||
longer starve the parent's main trio loop. At that
|
|
||||||
point we can flip `_subint.py` back to the public API
|
|
||||||
(`create()` / `Interpreter.exec()` / `Interpreter.close()`)
|
|
||||||
and drop the private `_interpreters` path.
|
|
||||||
|
|
||||||
2. **Secondary**: watch CPython for a public
|
|
||||||
force-destroy primitive. If something like
|
|
||||||
`Interpreter.close(force=True)` lands, we can use it as
|
|
||||||
a hard-kill final stage and actually tear down
|
|
||||||
abandoned subints.
|
|
||||||
|
|
||||||
3. **Harness-level**: document the fixture-side SIGINT
|
|
||||||
loop pattern as the "known workaround" for subint-
|
|
||||||
backend tests that can leave background state holding
|
|
||||||
the main event loop hostage.
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- PEP 734 (`concurrent.interpreters`):
|
|
||||||
<https://peps.python.org/pep-0734/>
|
|
||||||
- PEP 684 (per-interpreter GIL):
|
|
||||||
<https://peps.python.org/pep-0684/>
|
|
||||||
- `msgspec` PEP 684 tracker:
|
|
||||||
<https://github.com/jcrist/msgspec/issues/563>
|
|
||||||
- CPython `_interpretersmodule.c` source:
|
|
||||||
<https://github.com/python/cpython/blob/main/Modules/_interpretersmodule.c>
|
|
||||||
- `tractor.spawn._subint` module docstring (in-tree
|
|
||||||
explanation of the legacy-mode choice and its
|
|
||||||
tradeoffs).
|
|
||||||
|
|
||||||
## Reproducer
|
|
||||||
|
|
||||||
```
|
|
||||||
./py314/bin/python -m pytest \
|
|
||||||
tests/discovery/test_registrar.py::test_stale_entry_is_deleted \
|
|
||||||
--spawn-backend=subint \
|
|
||||||
--tb=short --no-header -v
|
|
||||||
```
|
|
||||||
|
|
||||||
Hangs indefinitely without the fixture-side SIGINT loop;
|
|
||||||
with the loop, the test completes (albeit with the
|
|
||||||
abandoned-thread warning in logs).
|
|
||||||
|
|
||||||
## Additional known-hanging tests (same class)
|
|
||||||
|
|
||||||
All three tests below exhibit the same
|
|
||||||
signal-wakeup-fd-starvation fingerprint (`write() → EAGAIN`
|
|
||||||
on the wakeup pipe after enough SIGINT attempts) and
|
|
||||||
share the same structural cause — abandoned legacy-subint
|
|
||||||
driver threads contending with the main interpreter for
|
|
||||||
the shared GIL until the main trio loop can no longer
|
|
||||||
drain its wakeup pipe fast enough to deliver signals.
|
|
||||||
|
|
||||||
They're listed separately because each exposes the class
|
|
||||||
under a different load pattern worth documenting.
|
|
||||||
|
|
||||||
### `tests/discovery/test_registrar.py::test_stale_entry_is_deleted[subint]`
|
|
||||||
|
|
||||||
Original exemplar — see the **Symptom** and **Evidence**
|
|
||||||
sections above. One abandoned subint
|
|
||||||
(`transport_fails_actor`, stuck in `trio.sleep_forever()`
|
|
||||||
after self-cancelling its IPC server) is sufficient to
|
|
||||||
tip main into starvation as long as the harness's `daemon`
|
|
||||||
fixture subproc keeps its half of the registry IPC alive.
|
|
||||||
|
|
||||||
### `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep[subint-False]`
|
|
||||||
|
|
||||||
Cancel a grandchild that's in sync Python sleep from 2
|
|
||||||
nurseries up. The test's own docstring declares the
|
|
||||||
dependency: "its parent should issue a 'zombie reaper' to
|
|
||||||
hard kill it after sufficient timeout" — which for
|
|
||||||
`trio`/`mp_*` is an OS-level `SIGKILL` of the grandchild
|
|
||||||
subproc. **Under `subint` there's no equivalent** (no
|
|
||||||
public CPython API to force-destroy a running
|
|
||||||
sub-interpreter), so the grandchild's sync-sleeping
|
|
||||||
`trio.run()` persists inside its abandoned driver thread
|
|
||||||
indefinitely. The nested actor-tree (parent → child →
|
|
||||||
grandchild, all subints) means a single cancel triggers
|
|
||||||
multiple concurrent hard-kill abandonments, each leaving
|
|
||||||
a live driver thread.
|
|
||||||
|
|
||||||
This test often only manifests the starvation under
|
|
||||||
**full-suite runs** rather than solo execution —
|
|
||||||
earlier-in-session subint tests also leave abandoned
|
|
||||||
driver threads behind, and the combined population is
|
|
||||||
what actually tips main trio into starvation. Solo runs
|
|
||||||
may stay Ctrl-C-able with fewer abandoned threads in the
|
|
||||||
mix.
|
|
||||||
|
|
||||||
### `tests/test_cancellation.py::test_multierror_fast_nursery[subint-25-0.5]`
|
|
||||||
|
|
||||||
Nursery-error-path throughput stress-test parametrized
|
|
||||||
for **25 concurrent subactors**. When the multierror
|
|
||||||
fires and the nursery cancels, every subactor goes
|
|
||||||
through our `subint_proc` teardown. The bounded
|
|
||||||
hard-kills run in parallel (all `subint_proc` tasks are
|
|
||||||
sibling trio tasks), so the timeout budget is ~3s total
|
|
||||||
rather than 3s × 25. After that, **25 abandoned
|
|
||||||
`daemon=True` driver threads are simultaneously alive** —
|
|
||||||
an extreme pressure multiplier on the same mechanism.
|
|
||||||
|
|
||||||
The `strace` fingerprint is striking under this load: six
|
|
||||||
or more **successful** `write(16, "\2", 1) = 1` calls
|
|
||||||
(main trio getting brief GIL slices, each long enough to
|
|
||||||
drain exactly one wakeup-pipe byte) before finally
|
|
||||||
saturating with `EAGAIN`:
|
|
||||||
|
|
||||||
```
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = 1
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = 1
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = 1
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = 1
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = 1
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = 1
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
|
||||||
```
|
|
||||||
|
|
||||||
Those successful writes indicate CPython's
|
|
||||||
`sys.getswitchinterval()`-based GIL round-robin *is*
|
|
||||||
giving main brief slices — just never long enough to run
|
|
||||||
the Python-level signal handler through to the point
|
|
||||||
where trio converts the delivered SIGINT into a
|
|
||||||
`Cancelled` on the appropriate scope. Once the
|
|
||||||
accumulated write rate outpaces main's drain rate, the
|
|
||||||
pipe saturates and subsequent signals are silently
|
|
||||||
dropped.
|
|
||||||
|
|
||||||
The `pstree` below (pid `530060` = hung `pytest`) shows
|
|
||||||
the subint-driver thread population at the moment of
|
|
||||||
capture. Even with fewer than the full 25 shown (pstree
|
|
||||||
truncates thread names to `subint-driver[<interp_id>` —
|
|
||||||
interpreters `3` and `4` visible across 16 thread
|
|
||||||
entries), the GIL-contender count is more than enough to
|
|
||||||
explain the starvation:
|
|
||||||
|
|
||||||
```
|
|
||||||
>>> pstree -snapt 530060
|
|
||||||
systemd,1 --switched-root --system --deserialize=40
|
|
||||||
└─login,1545 --
|
|
||||||
└─bash,1872
|
|
||||||
└─sway,2012
|
|
||||||
└─alacritty,70471 -e xonsh
|
|
||||||
└─xonsh,70487 .../bin/xonsh
|
|
||||||
└─uv,70955 run xonsh
|
|
||||||
└─xonsh,70959 .../py314/bin/xonsh
|
|
||||||
└─python,530060 .../py314/bin/pytest -v tests/test_cancellation.py --spawn-backend=subint
|
|
||||||
├─{subint-driver[3},531857
|
|
||||||
├─{subint-driver[3},531860
|
|
||||||
├─{subint-driver[3},531862
|
|
||||||
├─{subint-driver[3},531866
|
|
||||||
├─{subint-driver[3},531877
|
|
||||||
├─{subint-driver[3},531882
|
|
||||||
├─{subint-driver[3},531884
|
|
||||||
├─{subint-driver[3},531945
|
|
||||||
├─{subint-driver[3},531950
|
|
||||||
├─{subint-driver[3},531952
|
|
||||||
├─{subint-driver[4},531956
|
|
||||||
├─{subint-driver[4},531959
|
|
||||||
├─{subint-driver[4},531961
|
|
||||||
├─{subint-driver[4},531965
|
|
||||||
├─{subint-driver[4},531968
|
|
||||||
└─{subint-driver[4},531979
|
|
||||||
```
|
|
||||||
|
|
||||||
(`pstree` uses `{...}` to denote threads rather than
|
|
||||||
processes — these are all the **driver OS-threads** our
|
|
||||||
`subint_proc` creates with name
|
|
||||||
`f'subint-driver[{interp_id}]'`. Every one of them is
|
|
||||||
still alive, executing `_interpreters.exec()` inside a
|
|
||||||
sub-interpreter our hard-kill has abandoned. At 16+
|
|
||||||
abandoned driver threads competing for the main GIL, the
|
|
||||||
main-interpreter trio loop gets starved and signal
|
|
||||||
delivery stalls.)
|
|
||||||
|
|
@ -1,54 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
session: (ad-hoc, not tracked via conf.toml)
|
|
||||||
timestamp: 2026-04-06T17:28:48Z
|
|
||||||
git_ref: 02b2ef1
|
|
||||||
scope: tests
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260406T172848Z_02b2ef1_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
User asked to extend `tests/test_resource_cache.py` with a test
|
|
||||||
that reproduces the edge case fixed in commit `02b2ef18` (per-key
|
|
||||||
locking+user tracking in `maybe_open_context()`). The bug was
|
|
||||||
originally triggered in piker's `brokerd.kraken` backend where the
|
|
||||||
same `acm_func` was called with different kwargs, and the old
|
|
||||||
global `_Cache.users` counter caused:
|
|
||||||
|
|
||||||
- teardown skipped for one `ctx_key` because another key's users kept
|
|
||||||
the global count > 0
|
|
||||||
- re-entry hitting `assert not resources.get(ctx_key)` during the
|
|
||||||
teardown window
|
|
||||||
|
|
||||||
User requested a test that would fail under the old code and pass
|
|
||||||
with the fix.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Designed and implemented `test_per_ctx_key_resource_lifecycle`
|
|
||||||
which verifies per-`ctx_key` resource isolation by:
|
|
||||||
|
|
||||||
1. Holding resource `'a'` open in a bg task
|
|
||||||
2. Opening+closing resource `'b'` (same `acm_func`, different
|
|
||||||
kwargs) while `'a'` is still alive
|
|
||||||
3. Re-opening `'b'` and asserting cache MISS — proving `'b'` was
|
|
||||||
torn down independently despite `'a'` keeping its own user
|
|
||||||
count > 0
|
|
||||||
|
|
||||||
With the old global counter, phase 3 would produce a stale cache
|
|
||||||
HIT (leaked resource) or crash on the assert.
|
|
||||||
|
|
||||||
Also added a trivial `acm_with_resource(resource_id)` ACM helper
|
|
||||||
at module level.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
- `tests/test_resource_cache.py` — add `acm_with_resource` ACM +
|
|
||||||
`test_per_ctx_key_resource_lifecycle` test fn
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated (pending user review).
|
|
||||||
|
|
@ -1,57 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
session: (ad-hoc, not tracked via conf.toml)
|
|
||||||
timestamp: 2026-04-06T19:31:25Z
|
|
||||||
git_ref: 85f9c5d
|
|
||||||
scope: tests
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260406T193125Z_85f9c5d_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
User asked to reproduce the exact `assert not resources.get(ctx_key)`
|
|
||||||
crash originally seen in piker's `brokerd.kraken` backend via
|
|
||||||
`open_cached_client('kraken')`. Key constraints from user:
|
|
||||||
|
|
||||||
- In piker, kwargs were the **same** (empty) so all callers
|
|
||||||
share one `ctx_key = (fid, ())`
|
|
||||||
- The root issue is `_Cache.locks` being indexed by `fid`
|
|
||||||
rather than `ctx_key`, plus the race window between
|
|
||||||
`values.pop()` and `resources.pop()` in `_Cache.run_ctx`
|
|
||||||
(the acm `__aexit__` has checkpoints in between)
|
|
||||||
- Mark the test `@pytest.mark.xfail` so it can be committed
|
|
||||||
against the unpatched code and later adjusted once the fix
|
|
||||||
from `9e49eddd` is applied
|
|
||||||
|
|
||||||
User pointed to piker's `open_cached_client('kraken')` usage
|
|
||||||
as the real-world pattern to model.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Added `test_moc_reentry_during_teardown` which deterministically
|
|
||||||
reproduces the teardown race:
|
|
||||||
|
|
||||||
1. A `cached_client()` acm (no params, like
|
|
||||||
`kraken.api.get_client()`) signals via `in_aexit` event
|
|
||||||
when its `__aexit__` starts, then sleeps to hold the
|
|
||||||
window open
|
|
||||||
2. `use_and_exit` task enters+exits the cached resource
|
|
||||||
3. `reenter_during_teardown` waits on `in_aexit` then
|
|
||||||
re-enters — landing exactly in the window where
|
|
||||||
`values` is popped but `resources` still exists
|
|
||||||
4. The old `assert not resources.get(ctx_key)` fires
|
|
||||||
5. `@pytest.mark.xfail(raises=AssertionError)` catches it
|
|
||||||
|
|
||||||
Test confirmed: `XFAIL` with the exact traceback at
|
|
||||||
`_mngrs.py:327`.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
- `tests/test_resource_cache.py` — add
|
|
||||||
`test_moc_reentry_during_teardown` xfail test fn
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated (pending user review).
|
|
||||||
|
|
@ -1,52 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
session: (ad-hoc, not tracked via conf.toml)
|
|
||||||
timestamp: 2026-04-06T22:26:48Z
|
|
||||||
git_ref: f086222
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260406T222648Z_f086222_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
User applied the per-key locking patch (`f086222d`),
|
|
||||||
removed the `@pytest.mark.xfail` from
|
|
||||||
`test_moc_reentry_during_teardown`, and found the
|
|
||||||
test STILL fails — now with `RuntimeError` instead
|
|
||||||
of `AssertionError`. Asked to diagnose why the fix
|
|
||||||
doesn't actually resolve this race.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Traced the exact scheduling sequence across three
|
|
||||||
concurrent tasks to show that the per-`ctx_key`
|
|
||||||
locking change doesn't close the fundamental gap:
|
|
||||||
|
|
||||||
- `maybe_open_context`'s finally pops the lock and
|
|
||||||
sets `no_more_users` in the **exiting caller's
|
|
||||||
task**
|
|
||||||
- But the actual resource cleanup (`values.pop`,
|
|
||||||
acm `__aexit__`, `resources.pop`) runs in the
|
|
||||||
**`run_ctx` task** inside `service_tn`
|
|
||||||
- A re-entering task creates a fresh lock (old one
|
|
||||||
popped), acquires immediately, and hits stale
|
|
||||||
`resources[ctx_key]` while `run_ctx` is still in
|
|
||||||
`__aexit__`
|
|
||||||
|
|
||||||
Identified three possible fix strategies:
|
|
||||||
1. Let `run_ctx` own lock release/pop after
|
|
||||||
`resources.pop()`
|
|
||||||
2. Add a "teardown complete" event that new callers
|
|
||||||
wait on
|
|
||||||
3. Move `resources.pop()` into `run_ctx`'s inner
|
|
||||||
finally (before acm `__aexit__`)
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
No code changes — analysis only.
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
N/A — diagnostic output, no code generated.
|
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-06T22:26:48Z
|
|
||||||
git_ref: f086222
|
|
||||||
diff_cmd: git diff f086222~1..f086222
|
|
||||||
---
|
|
||||||
|
|
||||||
Here's the problem. The per-key locking doesn't fix this race because the **lock lifetime is decoupled from `run_ctx`'s lifetime**. Trace through what happens:
|
|
||||||
|
|
||||||
**`maybe_open_context` finally** (lines 468-495):
|
|
||||||
1. `users[ctx_key] -= 1` → 0
|
|
||||||
2. `no_more_users.set()` → wakes `run_ctx` (but it doesn't run yet, just becomes ready)
|
|
||||||
3. `_Cache.locks.pop(ctx_key)` → **lock is gone**
|
|
||||||
4. `maybe_open_context` exits
|
|
||||||
|
|
||||||
**`run_ctx`** now gets scheduled (lines 237-241):
|
|
||||||
5. Inner finally: `values.pop(ctx_key)` → values gone
|
|
||||||
6. `mng.__aexit__()` runs → `in_aexit.set()`, `await trio.sleep(10)` → **CHECKPOINT**
|
|
||||||
|
|
||||||
**`reenter_during_teardown`** wakes:
|
|
||||||
7. Enters `maybe_open_context` → no lock exists for `ctx_key` → creates **fresh** lock, acquires immediately
|
|
||||||
8. `values[ctx_key]` → `KeyError`
|
|
||||||
9. `resources.get(ctx_key)` → **still exists!** (`run_ctx` is still suspended in step 6 and hasn't reached `resources.pop()` in its outer finally yet)
|
|
||||||
10. `raise RuntimeError('Caching resources ALREADY exist?!')`
|
|
||||||
|
|
||||||
The core issue: `no_more_users.set()` (step 2) and `locks.pop()` (step 3) happen in the **exiting caller's task**, but the actual resource cleanup (`values.pop`, acm `__aexit__`, `resources.pop`) happens in the **`run_ctx` task** inside `service_tn`. There's no synchronization between "lock removed, new callers welcome" and "run_ctx has actually finished tearing down". The per-`ctx_key` lock change just made the lock more granular — it didn't close this gap.
|
|
||||||
|
|
||||||
The fix needs to ensure a new caller can't acquire a lock for `ctx_key` until `run_ctx` has fully exited (including `resources.pop()`). Either:
|
|
||||||
- **don't pop the lock in `maybe_open_context`** — let `run_ctx` release/pop it after `resources.pop()`
|
|
||||||
- **wait on a "teardown complete" event** in the cache-miss path when stale `resources` exist
|
|
||||||
- **pop `resources` before the acm `__aexit__`** (move `resources.pop()` into the inner finally alongside `values.pop()`)
|
|
||||||
|
|
@ -1,74 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
session: (ad-hoc, not tracked via conf.toml)
|
|
||||||
timestamp: 2026-04-13T19:21:16Z
|
|
||||||
git_ref: f851f28
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
User provided a detailed implementation plan for
|
|
||||||
a new `test_tpt_bind_addrs.py` test suite covering
|
|
||||||
`open_root_actor(tpt_bind_addrs=...)` — the three
|
|
||||||
runtime code paths in `_root.py:385-450`:
|
|
||||||
|
|
||||||
1. Non-registrar, no explicit bind -> random addrs
|
|
||||||
2. Registrar, no explicit bind -> registry_addrs
|
|
||||||
3. Explicit bind given -> `wrap_address()` + merge
|
|
||||||
|
|
||||||
Plan specified 6 test functions (~10 parametrized
|
|
||||||
variants), predicted a type-mixing bug at line 446,
|
|
||||||
and asked for an in-flight fix if confirmed.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Created `tests/discovery/test_tpt_bind_addrs.py`
|
|
||||||
with 9 collected test variants across 6 functions:
|
|
||||||
|
|
||||||
- `test_registrar_root_tpt_bind_addrs` (3 variants:
|
|
||||||
`bind-eq-reg`, `bind-subset-reg`,
|
|
||||||
`bind-disjoint-reg`)
|
|
||||||
- `test_non_registrar_root_tpt_bind_addrs`
|
|
||||||
(2 variants: `bind-same-bindspace`,
|
|
||||||
`bind-disjoint`)
|
|
||||||
- `test_non_registrar_default_random_bind`
|
|
||||||
(baseline, no explicit bind)
|
|
||||||
- `test_tpt_bind_addrs_as_maddr_str`
|
|
||||||
(multiaddr string input)
|
|
||||||
- `test_registrar_merge_binds_union`
|
|
||||||
(registrar + disjoint bind -> union)
|
|
||||||
- `test_open_nursery_forwards_tpt_bind_addrs`
|
|
||||||
(`open_nursery(**kwargs)` forwarding)
|
|
||||||
|
|
||||||
Confirmed and fixed the predicted bug at
|
|
||||||
`_root.py:446`: the registrar merge path mixed
|
|
||||||
`Address` objects (`tpt_bind_addrs`) with raw tuples
|
|
||||||
(`uw_reg_addrs`) inside `set()`, preventing
|
|
||||||
deduplication and causing double-bind `OSError`.
|
|
||||||
|
|
||||||
Fix: wrap `uw_reg_addrs` before the set union:
|
|
||||||
```python
|
|
||||||
# before (broken)
|
|
||||||
tpt_bind_addrs = list(set(
|
|
||||||
tpt_bind_addrs + uw_reg_addrs
|
|
||||||
))
|
|
||||||
# after (fixed)
|
|
||||||
tpt_bind_addrs = list(set(
|
|
||||||
tpt_bind_addrs
|
|
||||||
+ [wrap_address(a) for a in uw_reg_addrs]
|
|
||||||
))
|
|
||||||
```
|
|
||||||
|
|
||||||
All 9 tests pass after the fix.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
- `tests/discovery/test_tpt_bind_addrs.py` (new)
|
|
||||||
- `tractor/_root.py:446` (bug fix, 1 line)
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
N/A — pending review.
|
|
||||||
|
|
@ -1,50 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
session: 76154e65-d8e1-4b5f-9275-0ea45ba7e98a
|
|
||||||
timestamp: 2026-04-13T20:50:48Z
|
|
||||||
git_ref: 269d939c
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260413T205048Z_269d939c_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Implement a `parse_endpoints()` API in
|
|
||||||
`tractor.discovery._multiaddr` that lets downstream
|
|
||||||
projects declare per-actor transport bind addresses
|
|
||||||
as a service table of actor-name -> multiaddr
|
|
||||||
strings (loaded from config, e.g. TOML `[network]`
|
|
||||||
section). Add type aliases `EndpointsTable` and
|
|
||||||
`ParsedEndpoints`, plus 7 unit tests covering TCP,
|
|
||||||
UDS, tuples, mixed input, unsupported protocols, and
|
|
||||||
edge cases.
|
|
||||||
|
|
||||||
A detailed implementation plan was provided
|
|
||||||
specifying insertion points, type signatures, test
|
|
||||||
names, and verification commands.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Added a `parse_endpoints()` function with two supporting
|
|
||||||
type aliases to `_multiaddr.py`. The function
|
|
||||||
iterates a `dict[str, list[str|tuple]]` service
|
|
||||||
table and delegates each entry to the existing
|
|
||||||
`wrap_address()` helper, which already handles maddr
|
|
||||||
strings, raw tuples, and pre-wrapped `Address`
|
|
||||||
objects. Seven pure unit tests (no trio runtime)
|
|
||||||
validate all input variants and edge cases. All 22
|
|
||||||
multiaddr tests pass.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
- `tractor/discovery/_multiaddr.py` — add
|
|
||||||
`EndpointsTable`, `ParsedEndpoints` type aliases
|
|
||||||
and `parse_endpoints()` function after line 133
|
|
||||||
- `tests/discovery/test_multiaddr.py` — add
|
|
||||||
`parse_endpoints` import and 7 new test functions
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated.
|
|
||||||
|
|
@ -1,32 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-13T20:50:48Z
|
|
||||||
git_ref: 269d939c
|
|
||||||
diff_cmd: git diff 269d939c~1..269d939c
|
|
||||||
---
|
|
||||||
|
|
||||||
## Implementation output
|
|
||||||
|
|
||||||
### `tractor/discovery/_multiaddr.py`
|
|
||||||
|
|
||||||
> `git diff 269d939c~1..269d939c -- tractor/discovery/_multiaddr.py`
|
|
||||||
|
|
||||||
Type aliases `EndpointsTable` and `ParsedEndpoints`
|
|
||||||
plus `parse_endpoints()` helper that wraps each
|
|
||||||
entry via `wrap_address()`.
|
|
||||||
|
|
||||||
### `tests/discovery/test_multiaddr.py`
|
|
||||||
|
|
||||||
> `git diff 269d939c~1..269d939c -- tests/discovery/test_multiaddr.py`
|
|
||||||
|
|
||||||
Add import of `parse_endpoints` to existing import
|
|
||||||
block, then 7 new test functions.
|
|
||||||
|
|
||||||
### Test run output
|
|
||||||
|
|
||||||
```
|
|
||||||
22 passed, 1 warning in 0.05s
|
|
||||||
```
|
|
||||||
|
|
||||||
All 22 tests pass (15 existing + 7 new).
|
|
||||||
|
|
@ -1,38 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
session: multiaddr-support-rename-prefer
|
|
||||||
timestamp: 2026-04-14T16:33:00Z
|
|
||||||
git_ref: befedc49
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260414T163300Z_befedc49_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Create a helper function that determines the best transport given
|
|
||||||
actor locality (distributed vs same host). Use PID/hostname
|
|
||||||
comparison for locality detection, apply at registry addr selection
|
|
||||||
only (not spawn-time).
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
New `prefer_addr()` + `_is_local_addr()` helpers
|
|
||||||
in `_api.py` using `socket.getaddrinfo()` and
|
|
||||||
`ipaddress` for PID/hostname locality detection.
|
|
||||||
Preference: UDS > local TCP > remote TCP.
|
|
||||||
Integrated into `query_actor()` and
|
|
||||||
`wait_for_actor()`. Also changed
|
|
||||||
`Registrar.find_actor()` to return full addr list
|
|
||||||
so callers can apply preference.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
- `tractor/discovery/_discovery.py` → `_api.py`
|
|
||||||
— renamed + added `prefer_addr()`,
|
|
||||||
`_is_local_addr()`; updated `query_actor()` and
|
|
||||||
`wait_for_actor()` call sites
|
|
||||||
- `tractor/discovery/_registry.py`
|
|
||||||
— `Registrar.find_actor()` returns
|
|
||||||
`list[UnwrappedAddress]|None`
|
|
||||||
|
|
@ -1,62 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-6
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-14T16:33:00Z
|
|
||||||
git_ref: befedc49
|
|
||||||
diff_cmd: git diff befedc49~1..befedc49
|
|
||||||
---
|
|
||||||
|
|
||||||
### `tractor/discovery/_api.py`
|
|
||||||
|
|
||||||
> `git diff befedc49~1..befedc49 -- tractor/discovery/_api.py`
|
|
||||||
|
|
||||||
Add `_is_local_addr()` and `prefer_addr()` transport
|
|
||||||
preference helpers.
|
|
||||||
|
|
||||||
#### `_is_local_addr(addr: Address) -> bool`
|
|
||||||
|
|
||||||
Determines whether an `Address` is reachable on the
|
|
||||||
local host:
|
|
||||||
|
|
||||||
- `UDSAddress`: always returns `True`
|
|
||||||
(filesystem-bound, inherently local)
|
|
||||||
- `TCPAddress`: checks if `._host` is a loopback IP
|
|
||||||
via `ipaddress.ip_address().is_loopback`, then
|
|
||||||
falls back to comparing against the machine's own
|
|
||||||
interface IPs via
|
|
||||||
`socket.getaddrinfo(socket.gethostname(), None)`
|
|
||||||
|
|
||||||
#### `prefer_addr(addrs: list[UnwrappedAddress]) -> UnwrappedAddress`
|
|
||||||
|
|
||||||
Selects the "best" transport address from a
|
|
||||||
multihomed actor's address list. Wraps each
|
|
||||||
candidate via `wrap_address()` to get typed
|
|
||||||
`Address` objects, then classifies into three tiers:
|
|
||||||
|
|
||||||
1. **UDS** (same-host guaranteed, lowest overhead)
|
|
||||||
2. **TCP loopback / same-host IP** (local network)
|
|
||||||
3. **TCP remote** (only option for distributed)
|
|
||||||
|
|
||||||
Within each tier, the last-registered (latest) entry
|
|
||||||
is preferred. Falls back to `addrs[-1]` if no
|
|
||||||
heuristic matches.
|
|
||||||
|
|
||||||
### `tractor/discovery/_registry.py`
|
|
||||||
|
|
||||||
> `git diff befedc49~1..befedc49 -- tractor/discovery/_registry.py`
|
|
||||||
|
|
||||||
`Registrar.find_actor()` return type broadened from
|
|
||||||
single addr to `list[UnwrappedAddress]|None` — full
|
|
||||||
addr list lets callers apply transport preference.
|
|
||||||
|
|
||||||
#### Integration
|
|
||||||
|
|
||||||
`query_actor()` and `wait_for_actor()` now call
|
|
||||||
`prefer_addr(addrs)` instead of `addrs[-1]`.
|
|
||||||
|
|
||||||
### Verification
|
|
||||||
|
|
||||||
All discovery tests pass (13/13 non-daemon).
|
|
||||||
`test_local.py` and `test_multi_program.py` also
|
|
||||||
pass (daemon fixture teardown failures are
|
|
||||||
pre-existing and unrelated).
|
|
||||||
|
|
@ -1,101 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
session: subints-spawner-design-kickoff
|
|
||||||
timestamp: 2026-04-17T03:49:18Z
|
|
||||||
git_ref: 9703210
|
|
||||||
scope: docs
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260417T034918Z_9703210_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Drive the "first big boi, from GH issue" task seeded by
|
|
||||||
`ai/prompt-io/prompts/subints_spawner.md`: design, plan
|
|
||||||
and implement sub-interpreter (subint) spawn-backend
|
|
||||||
support per issue #379, including (1) modularizing
|
|
||||||
`tractor.spawn._spawn` into per-backend submods, (2) a new
|
|
||||||
`._subint` backend, and (3) harness parametrization via the
|
|
||||||
existing `--spawn-backend` / `start_method` pytest fixture
|
|
||||||
in `tractor._testing.pytest`.
|
|
||||||
|
|
||||||
Follow-up clarifications from the user (this turn):
|
|
||||||
1. Pin `<3.15` on this dev branch and feature-gate subint
|
|
||||||
tests — chose option (a).
|
|
||||||
2. Split Phase A (modularization) into its own PR first.
|
|
||||||
3. Defer the `fork()`-via-subint hack to a follow-up.
|
|
||||||
4. Harness flag is `pytest --spawn-backend <key>` CLI →
|
|
||||||
`start_method` fixture (prompt file updated to match).
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Produced a three-phase plan and a concrete Phase A (pure
|
|
||||||
modularization) file-split plan for user review; no code
|
|
||||||
written yet — the green-light to start Phase A was given
|
|
||||||
in this same turn conditional on logging this prompt-io
|
|
||||||
entry first.
|
|
||||||
|
|
||||||
Phases:
|
|
||||||
- **A — modularize** `tractor/spawn/_spawn.py` (847 LOC):
|
|
||||||
keep generic machinery in `_spawn.py`, extract
|
|
||||||
`trio_proc` → `spawn/_trio.py`, `mp_proc` →
|
|
||||||
`spawn/_mp.py`. No pin bump.
|
|
||||||
- **B — `_subint` backend**: bump `pyproject.toml`
|
|
||||||
`requires-python` upper to `<3.15`; add `'subint'` to
|
|
||||||
`SpawnMethodKey`; reuse existing UDS transport; shm
|
|
||||||
escape-hatch deferred.
|
|
||||||
- **C — harness**: drive the valid-backend tuple in
|
|
||||||
`tractor/_testing/pytest.py:345-349` from
|
|
||||||
`typing.get_args(SpawnMethodKey)`; skip subint tests on
|
|
||||||
Python < 3.14.
|
|
||||||
|
|
||||||
Key findings surfaced to the user:
|
|
||||||
- `pyproject.toml:12` currently pins `<3.14`; PEP 734
|
|
||||||
`concurrent.interpreters` only ships in 3.14 — the
|
|
||||||
load-bearing constraint.
|
|
||||||
- `_testing/pytest.py:345-349` hardcodes valid backends
|
|
||||||
as a string tuple (`'mp_spawn'`, `'mp_forkserver'`,
|
|
||||||
`'trio'`) — should be `get_args(SpawnMethodKey)`.
|
|
||||||
- `_testing/pytest.py:228` already imports
|
|
||||||
`try_set_start_method` from `tractor.spawn._spawn` —
|
|
||||||
keeping the `_spawn.py` path as the "core" module
|
|
||||||
avoids breaking external refs during Phase A.
|
|
||||||
|
|
||||||
Phase A commit breakdown (each green via `/run-tests`):
|
|
||||||
1. Drive `_testing/pytest.py` backend-tuple from
|
|
||||||
`get_args(SpawnMethodKey)` — prep.
|
|
||||||
2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
|
|
||||||
registry imports it.
|
|
||||||
3. Extract `mp_proc` → `spawn/_mp.py`; `_methods` imports
|
|
||||||
it.
|
|
||||||
4. Tidy `spawn/__init__.py` docstring for the new layout.
|
|
||||||
|
|
||||||
Workflow: Phase A runs in a new worktree via `/open-wkt
|
|
||||||
spawn_modularize` (branch `wkt/spawn_modularize`) and
|
|
||||||
lands as its own PR before Phase B/C returns to the
|
|
||||||
current `claude_bun_opus4.7_subint_spawner_backend`
|
|
||||||
branch.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
None this turn — design/planning only. The user updated
|
|
||||||
`ai/prompt-io/prompts/subints_spawner.md` themselves to
|
|
||||||
complete the truncated sentence on line 32 (harness-flag
|
|
||||||
reference).
|
|
||||||
|
|
||||||
Upcoming (Phase A, next turn):
|
|
||||||
- `tractor/spawn/_spawn.py` — trimmed to "core"
|
|
||||||
- `tractor/spawn/_trio.py` — new; holds `trio_proc()`
|
|
||||||
- `tractor/spawn/_mp.py` — new; holds `mp_proc()`
|
|
||||||
- `tractor/spawn/__init__.py` — docstring refresh
|
|
||||||
- `tractor/_testing/pytest.py` — backend tuple from
|
|
||||||
`get_args(SpawnMethodKey)`
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated. User completed the
|
|
||||||
truncated sentence on line 32 of
|
|
||||||
`ai/prompt-io/prompts/subints_spawner.md` themselves
|
|
||||||
*before* this prompt-io log was written; that edit was
|
|
||||||
the user's authoring, not an edit to AI output.
|
|
||||||
|
|
@ -1,117 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-17T03:49:18Z
|
|
||||||
git_ref: 9703210
|
|
||||||
diff_cmd: git diff HEAD~1..HEAD
|
|
||||||
---
|
|
||||||
|
|
||||||
No code was produced in this turn — this is a design/planning
|
|
||||||
exchange for subint spawner support per issue #379. Captured
|
|
||||||
verbatim below are the two substantive AI responses, which
|
|
||||||
together fix the phased plan (A: modularize, B: subint, C:
|
|
||||||
harness) and the concrete Phase A file split before any
|
|
||||||
implementation begins.
|
|
||||||
|
|
||||||
## Turn 1 — initial triage + gating questions
|
|
||||||
|
|
||||||
> Read: issue #379, current `spawn/_spawn.py` (847 lines),
|
|
||||||
> and pyproject pins.
|
|
||||||
>
|
|
||||||
> ## Task summary
|
|
||||||
> Design/plan/implement subint backend per #379, plus
|
|
||||||
> modularize `spawn/_spawn.py` into per-backend submods, plus
|
|
||||||
> harness flag for test parametrization.
|
|
||||||
>
|
|
||||||
> ## Key finding
|
|
||||||
> `pyproject.toml:12` pins `requires-python = ">=3.12, <3.14"`
|
|
||||||
> — but `concurrent.interpreters` / PEP 734 **only lands in
|
|
||||||
> 3.14**. This is the load-bearing design question before
|
|
||||||
> coding.
|
|
||||||
>
|
|
||||||
> ## Proposed phasing (incremental commits)
|
|
||||||
> - **Phase A — modularize** `_spawn.py`: extract `trio_proc`
|
|
||||||
> → `spawn/_trio.py`, `mp_proc` → `spawn/_mp.py`, keep
|
|
||||||
> generic helpers (`hard_kill`/`soft_kill`/`new_proc`
|
|
||||||
> dispatcher/`SpawnMethodKey`) in a slim core. Tests green
|
|
||||||
> at each step.
|
|
||||||
> - **Phase B — `_subint` backend**: extend `SpawnMethodKey`
|
|
||||||
> with `'subint'`; reuse UDS transport by default; shm is
|
|
||||||
> escape-hatch follow-up.
|
|
||||||
> - **Phase C — harness**: add backend-parametrize flag, skip
|
|
||||||
> subint when Python < 3.14.
|
|
||||||
>
|
|
||||||
> ## Gating questions before I start
|
|
||||||
> 1. **Python pin**: bump `<3.15` and feature-gate subint
|
|
||||||
> tests (a), require 3.14 outright (b), or backport shim
|
|
||||||
> (c)? I'd recommend (a).
|
|
||||||
> 2. **PR strategy**: one branch, commits per phase (current
|
|
||||||
> `claude_bun_opus4.7_subint_spawner_backend`)? Or split
|
|
||||||
> Phase A as its own PR first?
|
|
||||||
> 3. **`fork()`-via-subint hack** from the issue's "Our own
|
|
||||||
> thoughts" section — v1 or follow-up? I'd defer to
|
|
||||||
> follow-up.
|
|
||||||
> 4. **Prompt file line 32 is truncated** — what's the
|
|
||||||
> existing harness flag name you had in mind ("using the
|
|
||||||
> existing ___")?
|
|
||||||
>
|
|
||||||
> Which of these are already decided vs. open?
|
|
||||||
|
|
||||||
## User answers (for the raw record)
|
|
||||||
|
|
||||||
1. bump `<3.15`, feature-gate — option (a).
|
|
||||||
2. split Phase A as its own PR first.
|
|
||||||
3. defer `fork()`-via-subint hack.
|
|
||||||
4. harness flag is `--spawn-backend` CLI → `start_method`
|
|
||||||
fixture, per `tractor._testing.pytest`. Prompt file
|
|
||||||
updated accordingly.
|
|
||||||
|
|
||||||
## Turn 2 — concrete Phase A plan
|
|
||||||
|
|
||||||
> Good — prompt file now ends cleanly; harness flag
|
|
||||||
> confirmed as `--spawn-backend` CLI → `start_method`
|
|
||||||
> fixture (via `_testing/pytest.py:193,355`). Also noted
|
|
||||||
> `pytest.py:345-349` hardcodes valid backends as a string
|
|
||||||
> tuple — `typing.get_args(SpawnMethodKey)` should drive
|
|
||||||
> that list.
|
|
||||||
>
|
|
||||||
> ## Phase A — concrete plan (separate PR)
|
|
||||||
>
|
|
||||||
> **File split** in `tractor/spawn/`:
|
|
||||||
> - `_spawn.py` stays as "core": keeps `SpawnMethodKey`,
|
|
||||||
> `_spawn_method`, `proc_waiter`, `try_set_start_method`,
|
|
||||||
> `exhaust_portal`, `cancel_on_completion`, `hard_kill`,
|
|
||||||
> `soft_kill`, `new_proc` dispatcher, `_methods`
|
|
||||||
> registry. Kept at this path so `_testing/pytest.py:228`
|
|
||||||
> and other external imports don't break.
|
|
||||||
> - `_trio.py` ← `trio_proc()` (lines 448–671, ~224 lines)
|
|
||||||
> - `_mp.py` ← `mp_proc()` (lines 672–842, ~171 lines)
|
|
||||||
> - `_methods` dict imports the two backend fns from the
|
|
||||||
> new modules.
|
|
||||||
>
|
|
||||||
> **Commits** (each green via `/run-tests`):
|
|
||||||
> 1. Drive `_testing/pytest.py` backend-tuple from
|
|
||||||
> `get_args(SpawnMethodKey)` — tiny prep.
|
|
||||||
> 2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
|
|
||||||
> imports it.
|
|
||||||
> 3. Extract `mp_proc` → `spawn/_mp.py`; `_methods`
|
|
||||||
> imports it.
|
|
||||||
> 4. Tidy `spawn/__init__.py` docstring to describe new
|
|
||||||
> layout.
|
|
||||||
>
|
|
||||||
> No pyproject pin bump in Phase A (that's Phase B).
|
|
||||||
>
|
|
||||||
> **Workflow**: I'd run this in a new worktree via
|
|
||||||
> `/open-wkt spawn_modularize` (branch
|
|
||||||
> `wkt/spawn_modularize`), then open a PR before returning
|
|
||||||
> to the current `claude_bun_opus4.7_subint_spawner_backend`
|
|
||||||
> branch for Phase B/C.
|
|
||||||
>
|
|
||||||
> Green-light to `/open-wkt` and start Phase A?
|
|
||||||
|
|
||||||
## User final response
|
|
||||||
|
|
||||||
> all this first then yes, green light.
|
|
||||||
|
|
||||||
— log this `/prompt-io` entry first, then proceed with
|
|
||||||
Phase A in a worktree.
|
|
||||||
|
|
@ -1,86 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
session: subints-spawner-phase-a-impl
|
|
||||||
timestamp: 2026-04-17T03:58:05Z
|
|
||||||
git_ref: 61a73ba
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260417T035800Z_61a73ba_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Follow-up from the design session logged in
|
|
||||||
`20260417T034918Z_9703210_prompt_io.md`: the user
|
|
||||||
greenlit Phase A after `/prompt-io` tracking, so
|
|
||||||
execute the Phase A "modularize `spawn._spawn`" plan
|
|
||||||
end-to-end in a new worktree — four logical steps
|
|
||||||
(pytest harness prep, `trio_proc` extraction,
|
|
||||||
`mp_proc` extraction, docstring refresh), each green
|
|
||||||
via `/run-tests`.
|
|
||||||
|
|
||||||
User directives during execution:
|
|
||||||
- **Option 3** for the test cadence ("continue A.2–A.4
|
|
||||||
first, then run the full suite once at the end of
|
|
||||||
Phase A").
|
|
||||||
- **One commit** for the whole phase ("can't we just
|
|
||||||
commit the whole patch in one commit?") instead of
|
|
||||||
the 3/4-commit split I initially proposed.
|
|
||||||
- **Don't pre-draft** commit messages — wait for the
|
|
||||||
user to invoke `/commit-msg` (captured as feedback
|
|
||||||
memory `feedback_no_auto_draft_commit_msgs.md`).
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Produced the cohesive Phase A modularization patch,
|
|
||||||
landed as commit `61a73bae` (subject: "Mv
|
|
||||||
`trio_proc`/`mp_proc` to per-backend submods"). Five
|
|
||||||
files changed, +565 / -418 lines.
|
|
||||||
|
|
||||||
Key pieces of the patch (generated by claude,
|
|
||||||
reviewed by the human before commit):
|
|
||||||
- `tractor/spawn/_trio.py` — **new**; receives
|
|
||||||
`trio_proc()` verbatim from `_spawn.py`; imports
|
|
||||||
cross-backend helpers back from `._spawn`.
|
|
||||||
- `tractor/spawn/_mp.py` — **new**; receives
|
|
||||||
`mp_proc()` verbatim; uses `from . import _spawn`
|
|
||||||
for late-binding access to the mutable `_ctx` /
|
|
||||||
`_spawn_method` globals (design decision made
|
|
||||||
during impl, not the original plan).
|
|
||||||
- `tractor/spawn/_spawn.py` — shrunk 847 → 448 LOC;
|
|
||||||
import pruning; bottom-of-module late imports for
|
|
||||||
`trio_proc` / `mp_proc` with a one-line comment
|
|
||||||
explaining the circular-dep reason.
|
|
||||||
- `tractor/spawn/__init__.py` — docstring refresh
|
|
||||||
describing the new layout.
|
|
||||||
- `tractor/_testing/pytest.py` — the valid-backend
|
|
||||||
set now comes from `typing.get_args(SpawnMethodKey)`
|
|
||||||
so future additions (`'subint'`) don't need harness
|
|
||||||
edits.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
See `git diff 61a73ba~1..61a73ba --stat`:
|
|
||||||
|
|
||||||
```
|
|
||||||
tractor/_testing/pytest.py | 12 +-
|
|
||||||
tractor/spawn/__init__.py | 31 +++-
|
|
||||||
tractor/spawn/_mp.py | 235 ++++++++++++++++++++++++
|
|
||||||
tractor/spawn/_spawn.py | 413 +-------------------------------
|
|
||||||
tractor/spawn/_trio.py | 292 ++++++++++++++++++++++++++++
|
|
||||||
5 files changed, 565 insertions(+), 418 deletions(-)
|
|
||||||
```
|
|
||||||
|
|
||||||
Validation:
|
|
||||||
- import probe + `_methods` wiring check — OK
|
|
||||||
- spawn-relevant test subset — 37 passed, 1 skipped
|
|
||||||
- full suite — 350 passed, 14 skipped, 7 xfailed, 1
|
|
||||||
xpassed
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated by claude (no diff
|
|
||||||
between `.claude/git_commit_msg_LATEST.md` and the
|
|
||||||
committed body, as far as the assistant could
|
|
||||||
observe).
|
|
||||||
|
|
@ -1,138 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-17T03:58:05Z
|
|
||||||
git_ref: 61a73ba
|
|
||||||
diff_cmd: git diff 61a73ba~1..61a73ba
|
|
||||||
---
|
|
||||||
|
|
||||||
Code generated in this turn was committed verbatim as
|
|
||||||
`61a73bae` ("Mv `trio_proc`/`mp_proc` to per-backend
|
|
||||||
submods"). Per diff-ref mode, per-file code is captured
|
|
||||||
via the pointers below, each followed by a prose
|
|
||||||
summary of what the AI generated. Non-code output
|
|
||||||
(sanity-check results, design rationale) is included
|
|
||||||
verbatim.
|
|
||||||
|
|
||||||
## Per-file generated content
|
|
||||||
|
|
||||||
### `tractor/spawn/_trio.py` (new, 292 lines)
|
|
||||||
|
|
||||||
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_trio.py`
|
|
||||||
|
|
||||||
Pure lift-and-shift of `trio_proc()` out of
|
|
||||||
`tractor/spawn/_spawn.py` (previously lines 448–670).
|
|
||||||
Added AGPL header + module docstring describing the
|
|
||||||
backend; imports include local `from ._spawn import
|
|
||||||
cancel_on_completion, hard_kill, soft_kill` which
|
|
||||||
necessitates the bottom-of-module late-import pattern in
|
|
||||||
the core file to avoid a cycle. All call sites,
|
|
||||||
log-format strings, and body logic are byte-identical
|
|
||||||
to the originals — no semantic change.
|
|
||||||
|
|
||||||
### `tractor/spawn/_mp.py` (new, 235 lines)
|
|
||||||
|
|
||||||
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_mp.py`
|
|
||||||
|
|
||||||
Pure lift-and-shift of `mp_proc()` out of
|
|
||||||
`tractor/spawn/_spawn.py` (previously lines 672–842).
|
|
||||||
Same AGPL header convention. Key difference from
|
|
||||||
`_trio.py`: uses `from . import _spawn` (module
|
|
||||||
import, not from-import) for `_ctx` and
|
|
||||||
`_spawn_method` references — these are mutated at
|
|
||||||
runtime by `try_set_start_method()`, so late binding
|
|
||||||
via `_spawn._ctx` / `_spawn._spawn_method` is required
|
|
||||||
for correctness. Also imports `cancel_on_completion`,
|
|
||||||
`soft_kill`, `proc_waiter` from `._spawn`.
|
|
||||||
|
|
||||||
### `tractor/spawn/_spawn.py` (modified, 847 → 448 LOC)
|
|
||||||
|
|
||||||
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_spawn.py`
|
|
||||||
|
|
||||||
- removed `trio_proc()` body (moved to `_trio.py`)
|
|
||||||
- removed `mp_proc()` body (moved to `_mp.py`)
|
|
||||||
- pruned imports now unused in core: `sys`,
|
|
||||||
`is_root_process`, `current_actor`,
|
|
||||||
`is_main_process`, `_mp_main`, `ActorFailure`,
|
|
||||||
`pretty_struct`, `_pformat`
|
|
||||||
- added bottom-of-file late imports
|
|
||||||
`from ._trio import trio_proc` and
|
|
||||||
`from ._mp import mp_proc` with a one-line
|
|
||||||
comment explaining why (circular dep)
|
|
||||||
- `_methods` dict unchanged structurally; still binds
|
|
||||||
`'trio' → trio_proc`, `'mp_spawn' → mp_proc`,
|
|
||||||
`'mp_forkserver' → mp_proc`
|
|
||||||
|
|
||||||
### `tractor/spawn/__init__.py` (modified)
|
|
||||||
|
|
||||||
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/__init__.py`
|
|
||||||
|
|
||||||
Docstring rewrite only — no code. New text describes
|
|
||||||
the per-backend `Layout` with `- ._spawn` (core),
|
|
||||||
`- ._trio` (default), `- ._mp` (two mp variants), and
|
|
||||||
reiterates the "no eager import" NOTE.
|
|
||||||
|
|
||||||
### `tractor/_testing/pytest.py` (modified)
|
|
||||||
|
|
||||||
> `git diff 61a73ba~1..61a73ba -- tractor/_testing/pytest.py`
|
|
||||||
|
|
||||||
Two-line logical change (plus a comment):
|
|
||||||
- `from typing import` grows `get_args`
|
|
||||||
- `pytest_generate_tests()` replaces the hardcoded
|
|
||||||
`('mp_spawn','mp_forkserver','trio')` tuple with
|
|
||||||
`get_args(SpawnMethodKey)` via lazy import
|
|
||||||
`from tractor.spawn._spawn import SpawnMethodKey`
|
|
||||||
inside the function body (matching the project's
|
|
||||||
existing lazy-import convention for the spawn
|
|
||||||
module)
|
|
||||||
|
|
||||||
## Non-code output (verbatim)
|
|
||||||
|
|
||||||
### Design rationale for the file split
|
|
||||||
|
|
||||||
From my Phase A plan turn (already captured in
|
|
||||||
`20260417T034918Z_9703210_prompt_io.md`):
|
|
||||||
- `_spawn.py` stays as the "core" rather than renamed
|
|
||||||
to `_core.py` so external imports
|
|
||||||
(`_testing/pytest.py:228` imports
|
|
||||||
`try_set_start_method` from `tractor.spawn._spawn`)
|
|
||||||
keep working without churn.
|
|
||||||
- Per-backend extraction chosen over alternatives
|
|
||||||
(e.g. splitting generic helpers further) because
|
|
||||||
the immediate motivation is hosting a 3rd
|
|
||||||
`_subint.py` sibling cleanly in Phase B.
|
|
||||||
|
|
||||||
### Sanity-check output (verbatim terminal excerpts)
|
|
||||||
|
|
||||||
Post-extraction import probe:
|
|
||||||
```
|
|
||||||
extraction OK
|
|
||||||
_methods: {'trio': 'tractor.spawn._trio.trio_proc',
|
|
||||||
'mp_spawn': 'tractor.spawn._mp.mp_proc',
|
|
||||||
'mp_forkserver': 'tractor.spawn._mp.mp_proc'}
|
|
||||||
```
|
|
||||||
|
|
||||||
Spawn-relevant test subset (`tests/test_local.py
|
|
||||||
test_rpc.py test_spawning.py test_multi_program.py
|
|
||||||
test_discovery.py`):
|
|
||||||
```
|
|
||||||
37 passed, 1 skipped, 14 warnings in 55.37s
|
|
||||||
```
|
|
||||||
|
|
||||||
Full suite:
|
|
||||||
```
|
|
||||||
350 passed, 14 skipped, 7 xfailed, 1 xpassed,
|
|
||||||
151 warnings in 437.73s (0:07:17)
|
|
||||||
```
|
|
||||||
|
|
||||||
No regressions vs. `main`. One transient `-x`
|
|
||||||
early-stop `ERROR` on
|
|
||||||
`test_close_channel_explicit_remote_registrar[trio-True]`
|
|
||||||
was flaky (passed solo, passed without `-x`), not
|
|
||||||
caused by this refactor.
|
|
||||||
|
|
||||||
### Commit message
|
|
||||||
|
|
||||||
Also AI-drafted (via `/commit-msg`) — the 40-line
|
|
||||||
message on commit `61a73bae` itself. Not reproduced
|
|
||||||
here; see `git log -1 61a73bae`.
|
|
||||||
|
|
@ -1,97 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
session: subints-spawner-phase-b1-impl
|
|
||||||
timestamp: 2026-04-17T12:44:37Z
|
|
||||||
git_ref: 5cd6df5
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260417T124437Z_5cd6df5_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Continuing the Phase B work from the design sessions
|
|
||||||
logged in `20260417T034918Z_9703210_prompt_io.md` and
|
|
||||||
the Phase A impl in `20260417T035800Z_61a73ba_...`.
|
|
||||||
|
|
||||||
User direction at this step:
|
|
||||||
- "ok now let's continue with the subints part of the
|
|
||||||
plan ya?" → kick off Phase B.
|
|
||||||
- After I proposed a 4-sub-phase breakdown (B.1
|
|
||||||
scaffolding, B.2 min-viable spawn, B.3 lifecycle,
|
|
||||||
B.4 test enablement), user: "ok can we continue
|
|
||||||
with B" → approving the phasing and design
|
|
||||||
defaults (thread-per-subint, UDS default,
|
|
||||||
code-string bootstrap via `interp.exec()`).
|
|
||||||
- User switched the worktree to
|
|
||||||
`subint_spawner_backend` (instead of opening a
|
|
||||||
fresh worktree as I'd proposed): "i already
|
|
||||||
switched branches in the wkt, you should work off
|
|
||||||
`subint_spawner_backend` now".
|
|
||||||
|
|
||||||
Scope of this turn: **B.1 scaffolding only** —
|
|
||||||
wire the `'subint'` name through the spawn-method
|
|
||||||
registry + harness, add a feature-detected stub
|
|
||||||
module, bump pyproject. No actual spawn flow.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Produced the Phase B.1 scaffolding patch, landed as
|
|
||||||
commit `5cd6df58` (subject: "Add `'subint'` spawn
|
|
||||||
backend scaffold (#379)"). Four files changed,
|
|
||||||
+124 / -2 lines.
|
|
||||||
|
|
||||||
Key pieces (all generated by claude, reviewed by
|
|
||||||
human before commit):
|
|
||||||
- `tractor/spawn/_subint.py` — **new**; feature-
|
|
||||||
detects `concurrent.interpreters`; `subint_proc()`
|
|
||||||
stub raises `RuntimeError` on py<3.14 or
|
|
||||||
`NotImplementedError` with issue-#379 URL on
|
|
||||||
py≥3.14. Signature mirrors `trio_proc`/`mp_proc`
|
|
||||||
so B.2 can drop the impl in without touching
|
|
||||||
`_methods`.
|
|
||||||
- `tractor/spawn/_spawn.py` — adds `'subint'` to
|
|
||||||
`SpawnMethodKey`, grows a `case 'subint'` arm in
|
|
||||||
`try_set_start_method()` with feature-gate, re-
|
|
||||||
imports `sys` for the gate-error msg, adds late
|
|
||||||
`from ._subint import subint_proc` import and
|
|
||||||
`_methods` entry.
|
|
||||||
- `tractor/_testing/pytest.py` — converts the
|
|
||||||
gate-error into `pytest.UsageError` via a
|
|
||||||
`try/except` around `try_set_start_method()` so
|
|
||||||
`--spawn-backend=subint` on py<3.14 prints a
|
|
||||||
clean banner instead of a traceback.
|
|
||||||
- `pyproject.toml` — pin `requires-python` `<3.14`
|
|
||||||
→ `<3.15`, add `3.14` trove classifier.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
See `git diff 5cd6df5~1..5cd6df5 --stat`:
|
|
||||||
|
|
||||||
```
|
|
||||||
pyproject.toml | 3 +-
|
|
||||||
tractor/_testing/pytest.py | 8 +++-
|
|
||||||
tractor/spawn/_spawn.py | 15 ++++++
|
|
||||||
tractor/spawn/_subint.py | 100 +++++++++++++++++++++++++
|
|
||||||
4 files changed, 124 insertions(+), 2 deletions(-)
|
|
||||||
```
|
|
||||||
|
|
||||||
Validation on py3.13:
|
|
||||||
- registry/gate probe — OK; `_has_subints = False`,
|
|
||||||
`subint_proc()` raises `RuntimeError` as expected.
|
|
||||||
- `try_set_start_method('subint')` — raises cleanly.
|
|
||||||
- `pytest --spawn-backend=subint` — exits with
|
|
||||||
`pytest.UsageError` banner.
|
|
||||||
- Spawn-relevant test subset — 69 passed, 1 skipped.
|
|
||||||
- Full suite on py3.13 deferred (scaffolding is
|
|
||||||
no-op on <3.14).
|
|
||||||
- Full-suite on py3.14 pending user setting up the
|
|
||||||
venv.
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated by claude. `uv.lock`
|
|
||||||
was intentionally left unstaged by the user (pin-
|
|
||||||
bump regenerated cp314 wheel entries but they were
|
|
||||||
deferred to a follow-up).
|
|
||||||
|
|
@ -1,168 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-17T12:44:37Z
|
|
||||||
git_ref: 5cd6df5
|
|
||||||
diff_cmd: git diff 5cd6df5~1..5cd6df5
|
|
||||||
---
|
|
||||||
|
|
||||||
Code generated in this turn was committed verbatim as
|
|
||||||
`5cd6df58` ("Add `'subint'` spawn backend scaffold
|
|
||||||
(#379)"). Per diff-ref mode, per-file code is captured
|
|
||||||
via the pointers below, each followed by a prose
|
|
||||||
summary. Non-code output (sanity-check results,
|
|
||||||
design rationale) is included verbatim.
|
|
||||||
|
|
||||||
## Per-file generated content
|
|
||||||
|
|
||||||
### `tractor/spawn/_subint.py` (new, 100 lines)
|
|
||||||
|
|
||||||
> `git diff 5cd6df5~1..5cd6df5 -- tractor/spawn/_subint.py`
|
|
||||||
|
|
||||||
New scaffolding module for the PEP 734 subinterpreter
|
|
||||||
backend. Contents:
|
|
||||||
- AGPL header + module docstring (describes backend
|
|
||||||
intent, 3.14+ availability gate, and explicit
|
|
||||||
"SCAFFOLDING STUB" status pointing at issue #379).
|
|
||||||
- Top-level `try/except ImportError` wrapping
|
|
||||||
`from concurrent import interpreters as
|
|
||||||
_interpreters` → sets module-global
|
|
||||||
`_has_subints: bool`. This lets the registry stay
|
|
||||||
introspectable on py<3.14 while spawn-time still
|
|
||||||
fails cleanly.
|
|
||||||
- `subint_proc()` coroutine with signature matching
|
|
||||||
`trio_proc`/`mp_proc` exactly (same param names,
|
|
||||||
defaults, and `TaskStatus[Portal]` typing) —
|
|
||||||
intentional so Phase B.2 can drop the impl in
|
|
||||||
without touching `_methods` or changing call-site
|
|
||||||
binding.
|
|
||||||
- Body raises `RuntimeError` on py<3.14 (with
|
|
||||||
`sys.version` printed) or `NotImplementedError`
|
|
||||||
with issue-#379 URL on py≥3.14.
|
|
||||||
|
|
||||||
### `tractor/spawn/_spawn.py` (modified, +15 LOC)
|
|
||||||
|
|
||||||
> `git diff 5cd6df5~1..5cd6df5 -- tractor/spawn/_spawn.py`
|
|
||||||
|
|
||||||
- `import sys` re-added (pruned during Phase A, now
|
|
||||||
needed again for the py-version string in the
|
|
||||||
`'subint'` gate-error).
|
|
||||||
- `SpawnMethodKey = Literal[...]` grows `'subint'` as
|
|
||||||
the 4th member, with inline comment "# py3.14+ via
|
|
||||||
`concurrent.interpreters` (PEP 734)".
|
|
||||||
- `try_set_start_method()` match-block grows a new
|
|
||||||
`case 'subint':` arm that imports
|
|
||||||
`from ._subint import _has_subints` lazily and
|
|
||||||
raises `RuntimeError` with a multi-line gate msg
|
|
||||||
if unavailable.
|
|
||||||
- Bottom-of-module late-import section grows
|
|
||||||
`from ._subint import subint_proc` alongside the
|
|
||||||
existing `_trio` / `_mp` imports.
|
|
||||||
- `_methods` dict grows `'subint': subint_proc`.
|
|
||||||
|
|
||||||
### `tractor/_testing/pytest.py` (modified, +8 LOC)
|
|
||||||
|
|
||||||
> `git diff 5cd6df5~1..5cd6df5 -- tractor/_testing/pytest.py`
|
|
||||||
|
|
||||||
`pytest_configure()` wraps the
|
|
||||||
`try_set_start_method(backend)` call in a
|
|
||||||
`try/except RuntimeError` that re-raises as
|
|
||||||
`pytest.UsageError(str(err))`. Rationale: the gate
|
|
||||||
error on py<3.14 is legitimately a configuration
|
|
||||||
problem, not a test failure, so pytest's UsageError
|
|
||||||
path (exit code 4) gives a clean single-line banner
|
|
||||||
instead of a traceback.
|
|
||||||
|
|
||||||
### `pyproject.toml` (modified, +2 / -1)
|
|
||||||
|
|
||||||
> `git diff 5cd6df5~1..5cd6df5 -- pyproject.toml`
|
|
||||||
|
|
||||||
- `requires-python` pin relaxed `>=3.12, <3.14` →
|
|
||||||
`>=3.12, <3.15` to admit 3.14 as a valid target.
|
|
||||||
- Added `"Programming Language :: Python :: 3.14"`
|
|
||||||
to the trove classifiers.
|
|
||||||
|
|
||||||
## Non-code output (verbatim)
|
|
||||||
|
|
||||||
### Design choices captured in the prior turn
|
|
||||||
|
|
||||||
(Previously confirmed by the user on April 17 in this
|
|
||||||
session — relevant excerpts captured here for
|
|
||||||
provenance since they drove the shape of the impl):
|
|
||||||
|
|
||||||
1. **Python pin**: `<3.15` + feature-gate the subint
|
|
||||||
backend at spawn time — user answered option (a)
|
|
||||||
from the design triage.
|
|
||||||
2. **Deferred `fork()`-via-subint hack** from issue
|
|
||||||
#379's "Our own thoughts" section.
|
|
||||||
3. **Phase B phasing**: user approved the B.1 / B.2 /
|
|
||||||
B.3 / B.4 breakdown — this commit is strictly B.1
|
|
||||||
(scaffolding only, no spawn-flow impl).
|
|
||||||
4. **Option (B) worktree strategy**: new worktree
|
|
||||||
branched from `wkt/spawn_modularize`. *(Amended by
|
|
||||||
user at runtime: user switched the existing
|
|
||||||
`spawn_modularize` worktree to the
|
|
||||||
`subint_spawner_backend` branch instead.)*
|
|
||||||
|
|
||||||
### Sanity-check output (verbatim terminal excerpts)
|
|
||||||
|
|
||||||
Registry / feature-gate verification on py3.13:
|
|
||||||
```
|
|
||||||
SpawnMethodKey values: ('trio', 'mp_spawn',
|
|
||||||
'mp_forkserver', 'subint')
|
|
||||||
_methods keys: ['trio', 'mp_spawn',
|
|
||||||
'mp_forkserver', 'subint']
|
|
||||||
_has_subints: False (py version: (3, 13) )
|
|
||||||
[expected] RuntimeError: The 'subint' spawn backend
|
|
||||||
requires Python 3.14+ (stdlib
|
|
||||||
`concurrent.interpreters`, PEP 734).
|
|
||||||
```
|
|
||||||
|
|
||||||
`try_set_start_method('subint')` gate on py3.13:
|
|
||||||
```
|
|
||||||
[expected] RuntimeError: Spawn method 'subint'
|
|
||||||
requires Python 3.14+ (stdlib
|
|
||||||
`concurrent.interpreters`, PEP 734).
|
|
||||||
```
|
|
||||||
|
|
||||||
Pytest `--spawn-backend=subint` on py3.13 (the new
|
|
||||||
UsageError wrapper kicking in):
|
|
||||||
```
|
|
||||||
ERROR: Spawn method 'subint' requires Python 3.14+
|
|
||||||
(stdlib `concurrent.interpreters`, PEP 734).
|
|
||||||
Current runtime: 3.13.11 (main, Dec 5 2025,
|
|
||||||
16:06:33) [GCC 15.2.0]
|
|
||||||
```
|
|
||||||
|
|
||||||
Collection probe: `404 tests collected in 0.18s`
|
|
||||||
(no import errors from the new module).
|
|
||||||
|
|
||||||
Spawn-relevant test subset (`tests/test_local.py
|
|
||||||
test_rpc.py test_spawning.py test_multi_program.py
|
|
||||||
tests/discovery/`):
|
|
||||||
```
|
|
||||||
69 passed, 1 skipped, 10 warnings in 61.38s
|
|
||||||
```
|
|
||||||
|
|
||||||
Full suite was **not** run on py3.13 for this commit
|
|
||||||
— the scaffolding is no-op on <3.14 and full-suite
|
|
||||||
validation under py3.14 is pending that venv being
|
|
||||||
set up by the user.
|
|
||||||
|
|
||||||
### Commit message
|
|
||||||
|
|
||||||
Also AI-drafted (via `/commit-msg`, with the prose
|
|
||||||
rewrapped through `/home/goodboy/.claude/skills/pr-msg/
|
|
||||||
scripts/rewrap.py --width 67`) — the 33-line message
|
|
||||||
on commit `5cd6df58` itself. Not reproduced here; see
|
|
||||||
`git log -1 5cd6df58`.
|
|
||||||
|
|
||||||
### Known follow-ups flagged to user
|
|
||||||
|
|
||||||
- **`uv.lock` deferred**: pin-bump regenerated cp314
|
|
||||||
wheel entries in `uv.lock`, but the user chose to
|
|
||||||
not stage `uv.lock` for this commit. Warned
|
|
||||||
explicitly.
|
|
||||||
- **Phase B.2 needs py3.14 venv** — running the
|
|
||||||
actual subint impl requires it; user said they'd
|
|
||||||
set it up separately.
|
|
||||||
|
|
@ -1,117 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
session: subints-phase-b2-destroy-race-fix
|
|
||||||
timestamp: 2026-04-18T04:25:26Z
|
|
||||||
git_ref: 26fb820
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260418T042526Z_26fb820_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Follow-up to Phase B.2 (atop B.1 scaffold `5cd6df58`) after the user
|
|
||||||
observed intermittent mid-suite hangs when running
|
|
||||||
the tractor test suite under `--spawn-backend=subint`
|
|
||||||
on py3.14. The specific sequence of prompts over
|
|
||||||
several turns:
|
|
||||||
|
|
||||||
1. User pointed at the `test_context_stream_semantics.py`
|
|
||||||
suite as the first thing to make run clean under
|
|
||||||
`--spawn-backend=subint`.
|
|
||||||
2. After a series of `timeout`-terminated runs that
|
|
||||||
gave no diagnostic info, user nudged me to stop
|
|
||||||
relying on `timeout` and get actual runtime
|
|
||||||
diagnostics ("the suite hangs indefinitely, so i
|
|
||||||
don't think this `timeout 30` is helping you at
|
|
||||||
all.."). Switched to
|
|
||||||
`faulthandler.dump_traceback_later(...)` and a
|
|
||||||
resource-tracker fixture to rule out leaks.
|
|
||||||
3. Captured a stack pinning the hang on
|
|
||||||
`_interpreters.destroy(interp_id)` in the subint
|
|
||||||
teardown finally block.
|
|
||||||
4. Proposed dedicated-OS-thread fix. User greenlit.
|
|
||||||
5. Implemented + verified on-worktree; user needed
|
|
||||||
to be pointed at the *worktree*'s `./py313` venv
|
|
||||||
because bare `pytest` was picking up the main
|
|
||||||
repo's venv (running un-patched `_subint.py`) and
|
|
||||||
still hanging.
|
|
||||||
|
|
||||||
Running theme over the whole exchange: this patch
|
|
||||||
only closes the *destroy race*. The user and I also
|
|
||||||
traced through the deeper cancellation story — SIGINT
|
|
||||||
can't reach subints, legacy-mode shares the GIL,
|
|
||||||
portal-cancel dies when the IPC channel is already
|
|
||||||
broken — and agreed the next step is a bounded
|
|
||||||
hard-kill in `subint_proc`'s teardown plus a
|
|
||||||
dedicated cancellation test suite. Those land as
|
|
||||||
separate commits.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Produced the `tractor/spawn/_subint.py` patch landed
|
|
||||||
as commit `26fb8206` ("Fix subint destroy race via
|
|
||||||
dedicated OS thread"). One file, +110/-84 LOC.
|
|
||||||
|
|
||||||
Mechanism: swap
|
|
||||||
`trio.to_thread.run_sync(_interpreters.exec, ...)` for a plain
|
|
||||||
`threading.Thread(target=..., daemon=False)`. The trio thread
|
|
||||||
cache recycles workers — so the OS thread that ran
|
|
||||||
`_interpreters.exec()` remained alive in the cache holding a
|
|
||||||
stale subint tstate, blocking
|
|
||||||
`_interpreters.destroy()` in the finally indefinitely.
|
|
||||||
A dedicated one-shot thread exits naturally after
|
|
||||||
the sync target returns, releasing tstate and
|
|
||||||
unblocking destroy.
|
|
||||||
|
|
||||||
Coordination across the trio↔thread boundary:
|
|
||||||
- `trio.lowlevel.current_trio_token()` captured at
|
|
||||||
`subint_proc` entry
|
|
||||||
- driver thread signals `subint_exited.set()` back
|
|
||||||
to parent trio via `trio.from_thread.run_sync(...,
|
|
||||||
trio_token=token)` (synchronous from the thread's
|
|
||||||
POV; the call returns after trio has run `.set()`)
|
|
||||||
- `trio.RunFinishedError` swallowed in that path for
|
|
||||||
the process-teardown case where parent trio already
|
|
||||||
exited
|
|
||||||
- teardown `finally` off-loads the sync
|
|
||||||
`driver_thread.join()` via `to_thread.run_sync` (a
|
|
||||||
cache thread carries no subint tstate — safe)
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
See `git diff 26fb820~1..26fb820 --stat`:
|
|
||||||
|
|
||||||
```
|
|
||||||
tractor/spawn/_subint.py | 194 +++++++++++++++++++------------
|
|
||||||
1 file changed, 110 insertions(+), 84 deletions(-)
|
|
||||||
```
|
|
||||||
|
|
||||||
Validation:
|
|
||||||
- `test_parent_cancels[chk_ctx_result_before_exit=True-
|
|
||||||
cancel_method=ctx-child_returns_early=False]`
|
|
||||||
(the specific test that was hanging for the user)
|
|
||||||
— passed in 1.06s.
|
|
||||||
- Full `tests/test_context_stream_semantics.py` under
|
|
||||||
subint — 61 passed in 100.35s (clean-cache re-run:
|
|
||||||
100.82s).
|
|
||||||
- Trio backend regression subset — 69 passed / 1
|
|
||||||
skipped / 89.19s — no regressions from this change.
|
|
||||||
|
|
||||||
## Additional context (non-code)
|
|
||||||
|
|
||||||
Beyond the `_subint.py` patch, the raw log also
|
|
||||||
records the cancellation-semantics research that
|
|
||||||
spanned this conversation but did not ship as code
|
|
||||||
in *this* commit. Preserving it inline under "Non-
|
|
||||||
code output" because it directly informs the
|
|
||||||
Phase B.3 hard-kill impl that will follow (and any
|
|
||||||
upstream CPython bug reports we end up filing).
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
None — committed as generated. The commit message
|
|
||||||
itself was also AI-drafted via `/commit-msg` and
|
|
||||||
rewrapped via the project's `rewrap.py --width 67`
|
|
||||||
tooling; user landed it without edits.
|
|
||||||
|
|
@ -1,220 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-18T04:25:26Z
|
|
||||||
git_ref: 26fb820
|
|
||||||
diff_cmd: git diff 26fb820~1..26fb820
|
|
||||||
---
|
|
||||||
|
|
||||||
Code generated in this turn was committed as `26fb8206`
|
|
||||||
("Fix subint destroy race via dedicated OS thread").
|
|
||||||
Per diff-ref mode, per-file code is captured via the
|
|
||||||
pointer below; the rest is verbatim non-code output
|
|
||||||
— test-session diagnostics, CPython-internals
|
|
||||||
reasoning, and the design synthesis that's worth
|
|
||||||
keeping alongside the code since it informs the
|
|
||||||
still-pending Phase B.3 hard-kill work.
|
|
||||||
|
|
||||||
## Per-file generated content
|
|
||||||
|
|
||||||
### `tractor/spawn/_subint.py` (modified, +110/-84 LOC)
|
|
||||||
|
|
||||||
> `git diff 26fb820~1..26fb820 -- tractor/spawn/_subint.py`
|
|
||||||
|
|
||||||
Rewrites the subint driver-thread strategy:
|
|
||||||
- replaces `trio.to_thread.run_sync(_interpreters.exec, ...)`
|
|
||||||
with a plain `threading.Thread(target=_subint_target,
|
|
||||||
daemon=False)` so the OS thread truly exits after
|
|
||||||
`_interpreters.exec()` returns
|
|
||||||
- captures a `trio.lowlevel.current_trio_token()` at
|
|
||||||
`subint_proc` entry; the driver thread signals
|
|
||||||
completion back via `trio.from_thread.run_sync(
|
|
||||||
subint_exited.set, trio_token=...)`
|
|
||||||
- swallows `trio.RunFinishedError` in the signal path
|
|
||||||
for the case where the parent trio loop has already
|
|
||||||
exited (process teardown)
|
|
||||||
- in the teardown `finally` off-loads the sync
|
|
||||||
`driver_thread.join()` call to `trio.to_thread.run_sync`
|
|
||||||
(a cache thread w/ no subint tstate — so no cache
|
|
||||||
conflict) to wait for the driver thread to fully
|
|
||||||
exit before calling `_interpreters.destroy()`
|
|
||||||
|
|
||||||
## Non-code output (verbatim) — the CPython-internals research
|
|
||||||
|
|
||||||
### What went wrong before this commit
|
|
||||||
|
|
||||||
Under `--spawn-backend=subint` on py3.14, most single
|
|
||||||
tests passed but longer runs hung intermittently. The
|
|
||||||
position of the hang moved between runs (test #22 on
|
|
||||||
one run, test #53 on another) suggesting a timing-
|
|
||||||
dependent race rather than a deterministic bug.
|
|
||||||
|
|
||||||
`faulthandler.dump_traceback_later()` eventually
|
|
||||||
caught a stack with the main thread blocked in
|
|
||||||
`_interpreters.destroy(interp_id)` at `_subint.py:293`.
|
|
||||||
Only 2 threads were alive:
|
|
||||||
- main thread waiting in `_interpreters.destroy()`
|
|
||||||
- one idle trio thread-cache worker in
|
|
||||||
`trio._core._thread_cache._work`
|
|
||||||
|
|
||||||
No subint was still running (`_interpreters.list_all()`
|
|
||||||
showed only the main interp). A resource-tracker
|
|
||||||
pytest fixture confirmed threads/subints did NOT
|
|
||||||
accumulate across tests — this was not a leak but a
|
|
||||||
specific "destroy blocks on cached thread w/ stale
|
|
||||||
tstate" race.
|
|
||||||
|
|
||||||
### Why the race exists
|
|
||||||
|
|
||||||
`trio.to_thread.run_sync` uses a thread *cache* to
|
|
||||||
avoid OS-thread creation overhead. When the sync
|
|
||||||
callable returns, the OS thread is NOT terminated —
|
|
||||||
it's parked in `_thread_cache._work` waiting for the
|
|
||||||
next job. CPython's subinterpreter implementation
|
|
||||||
attaches a **tstate** (thread-state object) to each
|
|
||||||
OS thread that ever entered a subint via
|
|
||||||
`_interpreters.exec()`. That tstate is released
|
|
||||||
lazily — either when the thread picks up a new job
|
|
||||||
(which re-attaches a new tstate, evicting the old
|
|
||||||
one) or when the thread truly exits.
|
|
||||||
|
|
||||||
`_interpreters.destroy(interp_id)` waits for *all*
|
|
||||||
tstates associated w/ that subint to be released
|
|
||||||
before it can proceed. If the cached worker is idle
|
|
||||||
holding the stale tstate, destroy blocks indefinitely.
|
|
||||||
Whether the race manifests depends on timing — if
|
|
||||||
the cached thread happens to pick up another job
|
|
||||||
quickly, destroy unblocks; if it sits idle, we hang.
|
|
||||||
|
|
||||||
### Why a dedicated `threading.Thread` fixes it
|
|
||||||
|
|
||||||
A plain `threading.Thread(target=_subint_target,
|
|
||||||
daemon=False)` runs its target once and exits. When
|
|
||||||
the target returns, OS-thread teardown (`_bootstrap_inner`
|
|
||||||
→ `_bootstrap`) fires and CPython releases the
|
|
||||||
tstate for that thread. `_interpreters.destroy()`
|
|
||||||
then has no blocker.
|
|
||||||
|
|
||||||
### Diagnostic tactics that actually helped
|
|
||||||
|
|
||||||
1. `faulthandler.dump_traceback_later(n, repeat=False,
|
|
||||||
file=open(path, 'w'))` for captured stack dumps on
|
|
||||||
hang. Critically, pipe to a `file=` not stderr —
|
|
||||||
pytest captures stderr weirdly and the dump is
|
|
||||||
easy to miss.
|
|
||||||
2. A resource-tracker autouse fixture printing
|
|
||||||
per-test `threading.active_count()` +
|
|
||||||
`len(_interpreters.list_all())` deltas → ruled out
|
|
||||||
leak-accumulation theories quickly.
|
|
||||||
3. Running the hanging test *solo* vs in-suite —
|
|
||||||
when solo passes but in-suite hangs, you know
|
|
||||||
it's a cross-test state-transfer bug rather than
|
|
||||||
a test-internal bug.
|
|
||||||
|
|
||||||
### Design synthesis — SIGINT + subints + SC
|
|
||||||
|
|
||||||
The user and I walked through the cancellation
|
|
||||||
semantics of PEP 684/734 subinterpreters in detail.
|
|
||||||
Key findings we want to preserve:
|
|
||||||
|
|
||||||
**Signal delivery in subints (stdlib limitation).**
|
|
||||||
CPython's signal machinery only delivers signals
|
|
||||||
(SIGINT included) to the *main thread of the main
|
|
||||||
interpreter*. Subints cannot install signal handlers
|
|
||||||
that will ever fire. This is an intentional design
|
|
||||||
choice in PEP 684 and not expected to change. For
|
|
||||||
tractor's subint actors, this means:
|
|
||||||
|
|
||||||
- Ctrl-C never reaches a subint directly.
|
|
||||||
- `trio.run()` running on a worker thread (as we do
|
|
||||||
for subints) already skips SIGINT handler install
|
|
||||||
because `signal.signal()` raises on non-main
|
|
||||||
threads.
|
|
||||||
- The only cancellation surface into a subint is
|
|
||||||
our IPC `Portal.cancel_actor()`.
|
|
||||||
|
|
||||||
**Legacy-mode subints share the main GIL** (which
|
|
||||||
our impl uses since `msgspec` lacks PEP 684 support
|
|
||||||
per `jcrist/msgspec#563`). This means a stuck subint
|
|
||||||
thread can starve the parent's trio loop during
|
|
||||||
cancellation — the parent can't even *start* its
|
|
||||||
teardown handling until the subint yields the GIL.
|
|
||||||
|
|
||||||
**Failure modes identified for Phase B.3 audit:**
|
|
||||||
|
|
||||||
1. Portal cancel lands cleanly → subint unwinds →
|
|
||||||
thread exits → destroy succeeds. (Happy path.)
|
|
||||||
2. IPC channel is already broken when we try to
|
|
||||||
send cancel (e.g., `test_ipc_channel_break_*`)
|
|
||||||
→ cancel raises `BrokenResourceError` → subint
|
|
||||||
keeps running unaware → parent hangs waiting for
|
|
||||||
`subint_exited`. This is what breaks
|
|
||||||
`test_advanced_faults.py` under subint.
|
|
||||||
3. Subint is stuck in non-checkpointing Python code
|
|
||||||
→ portal-cancel msg queued but never processed.
|
|
||||||
4. Subint is in a shielded cancel scope when cancel
|
|
||||||
arrives → delay until shield exits.
|
|
||||||
|
|
||||||
**Current teardown has a shield-bug too:**
|
|
||||||
`trio.CancelScope(shield=True)` wrapping the `finally`
|
|
||||||
block absorbs Ctrl-C, so even when the user tries
|
|
||||||
to break out they can't. This is the reason
|
|
||||||
`test_ipc_channel_break_during_stream[break_parent-...
|
|
||||||
no_msgstream_aclose]` locks up unkillable.
|
|
||||||
|
|
||||||
**B.3 hard-kill fix plan (next commit):**
|
|
||||||
|
|
||||||
1. Bound `driver_thread.join()` with
|
|
||||||
`trio.move_on_after(HARD_KILL_TIMEOUT)`.
|
|
||||||
2. If it times out, log a warning naming the
|
|
||||||
`interp_id` and switch the driver thread to
|
|
||||||
`daemon=True` mode (not actually possible after
|
|
||||||
start — so instead create as daemon=True upfront
|
|
||||||
and accept the tradeoff of proc-exit not waiting
|
|
||||||
for a stuck subint).
|
|
||||||
3. Best-effort `_interpreters.destroy()`; catch the
|
|
||||||
`InterpreterError` if the subint is still running.
|
|
||||||
4. Document that the leak is real and the only
|
|
||||||
escape hatch we have without upstream cooperation.
|
|
||||||
|
|
||||||
**Test plan for Phase B.3:**
|
|
||||||
|
|
||||||
New `tests/test_subint_cancellation.py` covering:
|
|
||||||
- SIGINT at spawn
|
|
||||||
- SIGINT mid-portal-RPC
|
|
||||||
- SIGINT during shielded section in subint
|
|
||||||
- Dead-channel cancel (minimized mirror of
|
|
||||||
`test_ipc_channel_break_during_stream`)
|
|
||||||
- Non-checkpointing subint (tight `while True` in
|
|
||||||
user code)
|
|
||||||
- Per-test `pytest-timeout`-style bounds so the
|
|
||||||
tests visibly fail instead of wedging the runner
|
|
||||||
|
|
||||||
### Sanity-check output (verbatim terminal excerpts)
|
|
||||||
|
|
||||||
Post-fix single-test validation:
|
|
||||||
```
|
|
||||||
1 passed, 1 warning in 1.06s
|
|
||||||
```
|
|
||||||
(same test that was hanging pre-fix:
|
|
||||||
`test_parent_cancels[...cancel_method=ctx-...False]`)
|
|
||||||
|
|
||||||
Full `tests/test_context_stream_semantics.py`
|
|
||||||
under subint:
|
|
||||||
```
|
|
||||||
61 passed, 1 warning in 100.35s (0:01:40)
|
|
||||||
```
|
|
||||||
and a clean-cache re-run:
|
|
||||||
```
|
|
||||||
61 passed, 1 warning in 100.82s (0:01:40)
|
|
||||||
```
|
|
||||||
|
|
||||||
No regressions on trio backend (same subset):
|
|
||||||
```
|
|
||||||
69 passed, 1 skipped, 3 warnings in 89.19s
|
|
||||||
```
|
|
||||||
|
|
||||||
### Commit msg
|
|
||||||
|
|
||||||
Also AI-drafted via `/commit-msg` + `rewrap.py
|
|
||||||
--width 67`. See `git log -1 26fb820`.
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
session: subint-phase-b-hang-classification
|
|
||||||
timestamp: 2026-04-20T19:27:39Z
|
|
||||||
git_ref: HEAD (pre-commit; on branch `subint_spawner_backend`)
|
|
||||||
scope: docs
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260420T192739Z_5e8cd8b2_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Collab between user (`goodboy`) and `claude` to
|
|
||||||
disambiguate two distinct hang modes hit during
|
|
||||||
Phase B subint-spawn-backend bringup (issue #379).
|
|
||||||
The user ran the failing suites, captured `strace`
|
|
||||||
evidence on hung pytest pids, and set the framing:
|
|
||||||
|
|
||||||
> "These cannot be the same bug. Different fix
|
|
||||||
> paths. Write them up separately or we'll keep
|
|
||||||
> conflating them."
|
|
||||||
|
|
||||||
Follow-on asks:
|
|
||||||
- Cross-link each doc from its reproducer test so
|
|
||||||
a future triager lands on the analysis without
|
|
||||||
needing to grep `ai/conc-anal/`.
|
|
||||||
- On `test_stale_entry_is_deleted`: arm
|
|
||||||
`dump_on_hang(seconds=20, path=...)` so a
|
|
||||||
regression captures a stack dump (pytest stderr
|
|
||||||
capture otherwise eats `faulthandler` output).
|
|
||||||
- Keep `test_stale_entry_is_deleted` un-`skip`ped
|
|
||||||
so the dump file remains inspectable.
|
|
||||||
|
|
||||||
Defer the actual fix for the class-2 hang
|
|
||||||
(Ctrl-C-able, tractor-side bug) to a later commit
|
|
||||||
per the incremental-commits preference.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
Wrote two new `ai/conc-anal/` classification docs
|
|
||||||
and cross-linked them from the reproducer tests:
|
|
||||||
|
|
||||||
1. **`subint_sigint_starvation_issue.md`** —
|
|
||||||
class 1: abandoned-legacy-subint thread +
|
|
||||||
shared GIL starves main trio loop →
|
|
||||||
signal-wakeup-fd pipe fills → SIGINT silently
|
|
||||||
dropped (`write() = EAGAIN`). Pytest process
|
|
||||||
un-Ctrl-C-able. Structurally a CPython limit;
|
|
||||||
blocked on `msgspec` PEP 684 support
|
|
||||||
(jcrist/msgspec#563). Reproducer:
|
|
||||||
`test_stale_entry_is_deleted[subint]`.
|
|
||||||
|
|
||||||
2. **`subint_cancel_delivery_hang_issue.md`** —
|
|
||||||
class 2: parent-side trio task parks on an
|
|
||||||
orphaned IPC channel after subint teardown;
|
|
||||||
no clean EOF delivered to waiting receiver.
|
|
||||||
Ctrl-C-able (main trio loop iterating fine).
|
|
||||||
OUR bug to fix. Candidate fix: explicit
|
|
||||||
parent-side channel abort in `subint_proc`'s
|
|
||||||
hard-kill teardown. Reproducer:
|
|
||||||
`test_subint_non_checkpointing_child`.
|
|
||||||
|
|
||||||
Test-side cross-links:
|
|
||||||
- `tests/discovery/test_registrar.py`:
|
|
||||||
`test_stale_entry_is_deleted` → `trio.run(main)`
|
|
||||||
wrapped in `dump_on_hang(seconds=20,
|
|
||||||
path=<per-method-tmp>)`; long inline comment
|
|
||||||
summarizes `strace` evidence + root-cause chain
|
|
||||||
and points at both docs.
|
|
||||||
- `tests/test_subint_cancellation.py`:
|
|
||||||
`test_subint_non_checkpointing_child` docstring
|
|
||||||
extended with "KNOWN ISSUE (Ctrl-C-able hang)"
|
|
||||||
section pointing at the class-2 doc + noting
|
|
||||||
the class-1 doc is NOT what this test hits.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
- `ai/conc-anal/subint_sigint_starvation_issue.md`
|
|
||||||
— new, 205 LOC
|
|
||||||
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
|
||||||
— new, 161 LOC
|
|
||||||
- `tests/discovery/test_registrar.py` — +52/-1
|
|
||||||
(arm `dump_on_hang`, inline-comment cross-link)
|
|
||||||
- `tests/test_subint_cancellation.py` — +26
|
|
||||||
(docstring "KNOWN ISSUE" block)
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
Substantive collab — prose was jointly iterated:
|
|
||||||
|
|
||||||
- User framed the two-doc split, set the
|
|
||||||
classification criteria (Ctrl-C-able vs not),
|
|
||||||
and provided the `strace` evidence.
|
|
||||||
- User decided to keep `test_stale_entry_is_deleted`
|
|
||||||
un-`skip`ped (my initial suggestion was
|
|
||||||
`pytest.mark.skipif(spawn_backend=='subint')`).
|
|
||||||
- User chose the candidate fix ordering for
|
|
||||||
class 2 and marked "explicit parent-side channel
|
|
||||||
abort" as the surgical preferred fix.
|
|
||||||
- User picked the file naming convention
|
|
||||||
(`subint_<hang-shape>_issue.md`) over my initial
|
|
||||||
`hang_class_{1,2}.md`.
|
|
||||||
- Assistant drafted the prose, aggregated prior-
|
|
||||||
session root-cause findings from Phase B.2/B.3
|
|
||||||
bringup, and wrote the test-side cross-linking
|
|
||||||
comments.
|
|
||||||
|
|
||||||
No further mechanical edits expected before
|
|
||||||
commit; user may still rewrap via
|
|
||||||
`scripts/rewrap.py` if preferred.
|
|
||||||
|
|
@ -1,198 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-20T19:27:39Z
|
|
||||||
git_ref: HEAD (pre-commit; will land on branch `subint_spawner_backend`)
|
|
||||||
diff_cmd: git diff HEAD~1..HEAD
|
|
||||||
---
|
|
||||||
|
|
||||||
Collab between `goodboy` (user) and `claude` (this
|
|
||||||
assistant) spanning multiple test-run iterations on
|
|
||||||
branch `subint_spawner_backend`. The user ran the
|
|
||||||
failing suites, captured `strace` evidence on the
|
|
||||||
hung pytest pids, and set the direction ("these are
|
|
||||||
two different hangs — write them up separately so
|
|
||||||
we don't re-confuse ourselves later"). The assistant
|
|
||||||
aggregated prior-session findings (Phase B.2/B.3
|
|
||||||
bringup) into two classification docs + test-side
|
|
||||||
cross-links. All prose was jointly iterated; the
|
|
||||||
user had final say on framing and decided which
|
|
||||||
candidate fix directions to list.
|
|
||||||
|
|
||||||
## Per-file generated content
|
|
||||||
|
|
||||||
### `ai/conc-anal/subint_sigint_starvation_issue.md` (new, 205 LOC)
|
|
||||||
|
|
||||||
> `git diff HEAD~1..HEAD -- ai/conc-anal/subint_sigint_starvation_issue.md`
|
|
||||||
|
|
||||||
Writes up the "abandoned-legacy-subint thread wedges
|
|
||||||
the parent trio loop" class. Key sections:
|
|
||||||
|
|
||||||
- **Symptom** — `test_stale_entry_is_deleted[subint]`
|
|
||||||
hangs indefinitely AND is un-Ctrl-C-able.
|
|
||||||
- **Evidence** — annotated `strace` excerpt showing
|
|
||||||
SIGINT delivered to pytest, C-level signal handler
|
|
||||||
tries to write to the signal-wakeup-fd pipe, gets
|
|
||||||
`write() = -1 EAGAIN (Resource temporarily
|
|
||||||
unavailable)`. Pipe is full because main trio loop
|
|
||||||
isn't iterating often enough to drain it.
|
|
||||||
- **Root-cause chain** — our hard-kill abandons the
|
|
||||||
`daemon=True` driver OS thread after
|
|
||||||
`_HARD_KILL_TIMEOUT`; the subint *inside* that
|
|
||||||
thread is still running `trio.run()`;
|
|
||||||
`_interpreters.destroy()` cannot force-stop a
|
|
||||||
running subint (raises `InterpreterError`); legacy
|
|
||||||
subints share the main GIL → abandoned subint
|
|
||||||
starves main trio loop → wakeup-fd fills → SIGINT
|
|
||||||
silently dropped.
|
|
||||||
- **Why it's structurally a CPython limit** — no
|
|
||||||
public force-destroy primitive for a running
|
|
||||||
subint; the only escape is per-interpreter GIL
|
|
||||||
isolation, gated on msgspec PEP 684 adoption
|
|
||||||
(jcrist/msgspec#563).
|
|
||||||
- **Current escape hatch** — harness-side SIGINT
|
|
||||||
loop in the `daemon` fixture teardown that kills
|
|
||||||
the bg registrar subproc, eventually unblocking
|
|
||||||
a parent-side recv enough for the main loop to
|
|
||||||
drain the wakeup pipe.
|
|
||||||
|
|
||||||
### `ai/conc-anal/subint_cancel_delivery_hang_issue.md` (new, 161 LOC)
|
|
||||||
|
|
||||||
> `git diff HEAD~1..HEAD -- ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
|
||||||
|
|
||||||
Writes up the *sibling* hang class — same subint
|
|
||||||
backend, distinct root cause:
|
|
||||||
|
|
||||||
- **TL;DR** — Ctrl-C-able, so NOT the SIGINT-
|
|
||||||
starvation class; main trio loop iterates fine;
|
|
||||||
ours to fix.
|
|
||||||
- **Symptom** — `test_subint_non_checkpointing_child`
|
|
||||||
hangs past the expected `_HARD_KILL_TIMEOUT`
|
|
||||||
budget even after the subint is torn down.
|
|
||||||
- **Diagnosis** — a parent-side trio task (likely
|
|
||||||
a `chan.recv()` in `process_messages`) parks on
|
|
||||||
an orphaned IPC channel; channel was torn down
|
|
||||||
without emitting a clean EOF /
|
|
||||||
`BrokenResourceError` to the waiting receiver.
|
|
||||||
- **Candidate fix directions** — listed in rough
|
|
||||||
order of preference:
|
|
||||||
1. Explicit parent-side channel abort in
|
|
||||||
`subint_proc`'s hard-kill teardown (surgical;
|
|
||||||
most likely).
|
|
||||||
2. Audit `process_messages` to add a timeout or
|
|
||||||
cancel-scope protection that catches the
|
|
||||||
orphaned-recv state.
|
|
||||||
3. Wrap subint IPC channel construction in a
|
|
||||||
sentinel that can force-close from the parent
|
|
||||||
side regardless of subint liveness.
|
|
||||||
|
|
||||||
### `tests/discovery/test_registrar.py` (modified, +52/-1 LOC)
|
|
||||||
|
|
||||||
> `git diff HEAD~1..HEAD -- tests/discovery/test_registrar.py`
|
|
||||||
|
|
||||||
Wraps the `trio.run(main)` call at the bottom of
|
|
||||||
`test_stale_entry_is_deleted` in
|
|
||||||
`dump_on_hang(seconds=20, path=<per-method-tmp>)`.
|
|
||||||
Adds a long inline comment that:
|
|
||||||
- Enumerates variant-by-variant status
|
|
||||||
(`[trio]`/`[mp_*]` = clean; `[subint]` = hangs
|
|
||||||
+ un-Ctrl-C-able)
|
|
||||||
- Summarizes the `strace` evidence and root-cause
|
|
||||||
chain inline (so a future reader hitting this
|
|
||||||
test doesn't need to cross-ref the doc to
|
|
||||||
understand the hang shape)
|
|
||||||
- Points at
|
|
||||||
`ai/conc-anal/subint_sigint_starvation_issue.md`
|
|
||||||
for full analysis
|
|
||||||
- Cross-links to the *sibling*
|
|
||||||
`subint_cancel_delivery_hang_issue.md` so
|
|
||||||
readers can tell the two classes apart
|
|
||||||
- Explains why it's kept un-`skip`ped: the dump
|
|
||||||
file is useful if the hang ever returns after
|
|
||||||
a refactor. pytest stderr capture would
|
|
||||||
otherwise eat `faulthandler` output, hence the
|
|
||||||
file path.
|
|
||||||
|
|
||||||
### `tests/test_subint_cancellation.py` (modified, +26 LOC)
|
|
||||||
|
|
||||||
> `git diff HEAD~1..HEAD -- tests/test_subint_cancellation.py`
|
|
||||||
|
|
||||||
Extends the docstring of
|
|
||||||
`test_subint_non_checkpointing_child` with a
|
|
||||||
"KNOWN ISSUE (Ctrl-C-able hang)" block:
|
|
||||||
- Describes the current hang: parent-side orphaned
|
|
||||||
IPC recv after hard-kill; distinct from the
|
|
||||||
SIGINT-starvation sibling class.
|
|
||||||
- Cites `strace` distinguishing signal: wakeup-fd
|
|
||||||
`write() = 1` (not `EAGAIN`) — i.e. main loop
|
|
||||||
iterating.
|
|
||||||
- Points at
|
|
||||||
`ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
|
||||||
for full analysis + candidate fix directions.
|
|
||||||
- Clarifies that the *other* sibling doc
|
|
||||||
(SIGINT-starvation) is NOT what this test hits.
|
|
||||||
|
|
||||||
## Non-code output
|
|
||||||
|
|
||||||
### Classification reasoning (why two docs, not one)
|
|
||||||
|
|
||||||
The user and I converged on the two-doc split after
|
|
||||||
running the suites and noticing two *qualitatively
|
|
||||||
different* hang symptoms:
|
|
||||||
|
|
||||||
1. `test_stale_entry_is_deleted[subint]` — pytest
|
|
||||||
process un-Ctrl-C-able. Ctrl-C at the terminal
|
|
||||||
does nothing. Must kill-9 from another shell.
|
|
||||||
2. `test_subint_non_checkpointing_child` — pytest
|
|
||||||
process Ctrl-C-able. One Ctrl-C at the prompt
|
|
||||||
unblocks cleanly and the test reports a hang
|
|
||||||
via pytest-timeout.
|
|
||||||
|
|
||||||
From the user: "These cannot be the same bug.
|
|
||||||
Different fix paths. Write them up separately or
|
|
||||||
we'll keep conflating them."
|
|
||||||
|
|
||||||
`strace` on the `[subint]` hang gave the decisive
|
|
||||||
signal for the first class:
|
|
||||||
|
|
||||||
```
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(5, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
|
||||||
```
|
|
||||||
|
|
||||||
fd 5 is Python's signal-wakeup-fd pipe. `EAGAIN`
|
|
||||||
on a `write()` of 1 byte to a pipe means the pipe
|
|
||||||
buffer is full → reader side (main Python thread
|
|
||||||
inside `trio.run()`) isn't consuming. That's the
|
|
||||||
GIL-hostage signature.
|
|
||||||
|
|
||||||
The second class's `strace` showed `write(5, "\2",
|
|
||||||
1) = 1` — clean drain — so the main trio loop was
|
|
||||||
iterating and the hang had to be on the application
|
|
||||||
side of things, not the kernel-↔-Python signal
|
|
||||||
boundary.
|
|
||||||
|
|
||||||
### Why the candidate fix for class 2 is "explicit parent-side channel abort"
|
|
||||||
|
|
||||||
The second hang class has the trio loop alive. A
|
|
||||||
parked `chan.recv()` that will never get bytes is
|
|
||||||
fundamentally a tractor-side resource-lifetime bug
|
|
||||||
— the IPC channel was torn down (subint destroyed)
|
|
||||||
but no one explicitly raised
|
|
||||||
`BrokenResourceError` at the parent-side receiver.
|
|
||||||
The `subint_proc` hard-kill path is the natural
|
|
||||||
place to add that notification, because it already
|
|
||||||
knows the subint is unreachable at that point.
|
|
||||||
|
|
||||||
Alternative fix paths (blanket timeouts on
|
|
||||||
`process_messages`, sentinel-wrapped channels) are
|
|
||||||
less surgical and risk masking unrelated bugs —
|
|
||||||
hence the preference ordering in the doc.
|
|
||||||
|
|
||||||
### Why we're not just patching the code now
|
|
||||||
|
|
||||||
The user explicitly deferred the fix to a later
|
|
||||||
commit: "Document both classes now, land the fix
|
|
||||||
for class 2 separately so the diff reviews clean."
|
|
||||||
This matches the incremental-commits preference
|
|
||||||
from memory.
|
|
||||||
|
|
@ -1,155 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
session: subints-phase-b-hardening-and-fork-block
|
|
||||||
timestamp: 2026-04-22T20:07:23Z
|
|
||||||
git_ref: 797f57c
|
|
||||||
scope: code
|
|
||||||
substantive: true
|
|
||||||
raw_file: 20260422T200723Z_797f57c_prompt_io.raw.md
|
|
||||||
---
|
|
||||||
|
|
||||||
## Prompt
|
|
||||||
|
|
||||||
Session-spanning work on the Phase B `subint` spawn-backend.
|
|
||||||
Three distinct sub-phases in one log:
|
|
||||||
|
|
||||||
1. **Py3.13 gate tightening** — diagnose a reproducible hang
|
|
||||||
of subint spawn flow under py3.13 (works on py3.14), trace
|
|
||||||
to a private `_interpreters` module vintage issue, tighten
|
|
||||||
our feature gate from "`_interpreters` present" to "public
|
|
||||||
`concurrent.interpreters` present" (i.e. py3.14+).
|
|
||||||
2. **Test-harness hardening** — add `pytest-timeout` dep, put
|
|
||||||
`@pytest.mark.timeout(30, method='thread')` on the
|
|
||||||
three known-hanging subint tests cataloged in
|
|
||||||
`ai/conc-anal/subint_sigint_starvation_issue.md`. Separately,
|
|
||||||
code-review the user's in-flight `skipon_spawn_backend`
|
|
||||||
marker implementation; find four bugs; refactor to use
|
|
||||||
`item.iter_markers()`.
|
|
||||||
3. **`subint_fork` prototype → CPython-block finding** — draft
|
|
||||||
a WIP `subint_fork_proc` backend using a sub-interpreter as
|
|
||||||
a launchpad for `os.fork()` (to sidestep trio#1614). User
|
|
||||||
tests on py3.14, hits
|
|
||||||
`Fatal Python error: _PyInterpreterState_DeleteExceptMain:
|
|
||||||
not main interpreter`. Walk CPython sources (local clone at
|
|
||||||
`~/repos/cpython/`) to pinpoint the refusal
|
|
||||||
(`Modules/posixmodule.c:728` → `Python/pystate.c:1040`).
|
|
||||||
Revert implementation to a `NotImplementedError` stub in a
|
|
||||||
new `_subint_fork.py` submodule, document the finding in a
|
|
||||||
third `conc-anal/` doc with an upstream-report draft for
|
|
||||||
the CPython issue tracker. Finally, discuss user's proposed
|
|
||||||
workaround architecture (main-interp worker-thread
|
|
||||||
forkserver) and draft a standalone smoke-test script for
|
|
||||||
feasibility validation.
|
|
||||||
|
|
||||||
## Response summary
|
|
||||||
|
|
||||||
All three sub-phases landed concrete artifacts:
|
|
||||||
|
|
||||||
**Sub-phase 1** — `_subint.py` + `_spawn.py` gates + error
|
|
||||||
messages updated to require py3.14+ via the public
|
|
||||||
`concurrent.interpreters` module presence check. Module
|
|
||||||
docstring revised to explain the empirical reason
|
|
||||||
(py3.13's private `_interpreters` vintage wedges under
|
|
||||||
multi-trio-task usage even though minimal standalone
|
|
||||||
reproducers work fine there). Test-module
|
|
||||||
`pytest.importorskip` likewise switched.
|
|
||||||
|
|
||||||
**Sub-phase 2** — `pytest-timeout>=2.3` added to `testing`
|
|
||||||
dep group. `@pytest.mark.timeout(30, method='thread')`
|
|
||||||
applied on:
|
|
||||||
- `tests/discovery/test_registrar.py::test_stale_entry_is_deleted`
|
|
||||||
- `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep`
|
|
||||||
- `tests/test_cancellation.py::test_multierror_fast_nursery`
|
|
||||||
- `tests/test_subint_cancellation.py::test_subint_non_checkpointing_child`
|
|
||||||
|
|
||||||
`method='thread'` documented inline as load-bearing — the
|
|
||||||
GIL-starvation path that drops `SIGINT` would equally drop
|
|
||||||
`SIGALRM`, so only a watchdog-thread timeout can reliably
|
|
||||||
escape.
|
|
||||||
|
|
||||||
`skipon_spawn_backend` plugin refactored into a single
|
|
||||||
`iter_markers`-driven loop in `pytest_collection_modifyitems`
|
|
||||||
(~30 LOC replacing ~30 LOC of nested conditionals). Four
|
|
||||||
bugs dissolved: wrong `.get()` key, module-level `pytestmark`
|
|
||||||
suppressing per-test marks, unhandled `pytestmark = [list]`
|
|
||||||
form, `pytest.Makr` typo. Marker help text updated to
|
|
||||||
document the variadic backend-list + `reason=` kwarg
|
|
||||||
surface.
|
|
||||||
|
|
||||||
**Sub-phase 3** — Prototype drafted (then reverted):
|
|
||||||
|
|
||||||
- `tractor/spawn/_subint_fork.py` — new dedicated submodule
|
|
||||||
housing the `subint_fork_proc` stub. Module docstring +
|
|
||||||
fn docstring explain the attempt, the CPython-level
|
|
||||||
block, and the reason for keeping the stub in-tree
|
|
||||||
(documentation of the attempt + starting point if CPython
|
|
||||||
ever lifts the restriction).
|
|
||||||
- `tractor/spawn/_spawn.py` — `'subint_fork'` registered as a
|
|
||||||
`SpawnMethodKey` literal + in `_methods`, so
|
|
||||||
`--spawn-backend=subint_fork` routes to a clean
|
|
||||||
`NotImplementedError` pointing at the analysis doc rather
|
|
||||||
than an "invalid backend" error.
|
|
||||||
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` —
|
|
||||||
third sibling conc-anal doc. Full annotated CPython
|
|
||||||
source walkthrough from user-visible
|
|
||||||
`Fatal Python error` → `Modules/posixmodule.c:728
|
|
||||||
PyOS_AfterFork_Child()` → `Python/pystate.c:1040
|
|
||||||
_PyInterpreterState_DeleteExceptMain()` gate. Includes a
|
|
||||||
copy-paste-ready upstream-report draft for the CPython
|
|
||||||
issue tracker with a two-tier ask (ideally "make it work",
|
|
||||||
minimally "cleaner error than `Fatal Python error`
|
|
||||||
aborting the child").
|
|
||||||
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` —
|
|
||||||
standalone zero-tractor-import CPython-level smoke test
|
|
||||||
for the user's proposed workaround architecture
|
|
||||||
(forkserver on a main-interp worker thread). Four
|
|
||||||
argparse-driven scenarios: `control_subint_thread_fork`
|
|
||||||
(reproduces the known-broken case as a test-harness
|
|
||||||
sanity), `main_thread_fork` (baseline), `worker_thread_fork`
|
|
||||||
(architectural assertion), `full_architecture`
|
|
||||||
(end-to-end trio-in-subint in forked child). User will
|
|
||||||
run on py3.14 next.
|
|
||||||
|
|
||||||
## Files changed
|
|
||||||
|
|
||||||
See `git log 26fb820..HEAD --stat` for the canonical list.
|
|
||||||
New files this session:
|
|
||||||
- `tractor/spawn/_subint_fork.py`
|
|
||||||
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
|
||||||
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
|
|
||||||
|
|
||||||
Modified (diff pointers in raw log):
|
|
||||||
- `tractor/spawn/_subint.py` (py3.14 gate)
|
|
||||||
- `tractor/spawn/_spawn.py` (`subint_fork` registration)
|
|
||||||
- `tractor/_testing/pytest.py` (`skipon_spawn_backend` refactor)
|
|
||||||
- `pyproject.toml` (`pytest-timeout` dep)
|
|
||||||
- `tests/discovery/test_registrar.py`,
|
|
||||||
`tests/test_cancellation.py`,
|
|
||||||
`tests/test_subint_cancellation.py` (timeout marks,
|
|
||||||
cross-refs to conc-anal docs)
|
|
||||||
|
|
||||||
## Human edits
|
|
||||||
|
|
||||||
Several back-and-forth iterations with user-driven
|
|
||||||
adjustments during the session:
|
|
||||||
|
|
||||||
- User corrected my initial mis-classification of
|
|
||||||
`test_cancel_while_childs_child_in_sync_sleep[subint-False]`
|
|
||||||
as Ctrl-C-able — second strace showed `EAGAIN`, putting
|
|
||||||
it squarely in class A (GIL-starvation). Re-analysis
|
|
||||||
preserved in the raw log.
|
|
||||||
- User independently fixed the `.get(reason)` → `.get('reason', reason)`
|
|
||||||
bug in the marker plugin before my review; preserved their
|
|
||||||
fix.
|
|
||||||
- User suggested moving the `subint_fork_proc` stub from
|
|
||||||
the bottom of `_subint.py` into its own
|
|
||||||
`_subint_fork.py` submodule — applied.
|
|
||||||
- User asked to keep the forkserver-architecture
|
|
||||||
discussion as background for the smoke-test rather than
|
|
||||||
committing to a tractor-side refactor until the smoke
|
|
||||||
test validates the CPython-level assumptions.
|
|
||||||
|
|
||||||
Commit messages in this range (b025c982 … 797f57c) were
|
|
||||||
drafted via `/commit-msg` + `rewrap.py --width 67`; user
|
|
||||||
landed them with the usual review.
|
|
||||||
|
|
@ -1,343 +0,0 @@
|
||||||
---
|
|
||||||
model: claude-opus-4-7[1m]
|
|
||||||
service: claude
|
|
||||||
timestamp: 2026-04-22T20:07:23Z
|
|
||||||
git_ref: 797f57c
|
|
||||||
diff_cmd: git log 26fb820..HEAD # all session commits since the destroy-race fix log
|
|
||||||
---
|
|
||||||
|
|
||||||
Session-spanning conversation covering the Phase B hardening
|
|
||||||
of the `subint` spawn-backend and an investigation into a
|
|
||||||
proposed `subint_fork` follow-up which turned out to be
|
|
||||||
blocked at the CPython level. This log is a narrative capture
|
|
||||||
of the substantive turns (not every message) and references
|
|
||||||
the concrete code + docs the session produced. Per diff-ref
|
|
||||||
mode the actual code diffs are pointed at via `git log` on
|
|
||||||
each ref rather than duplicated inline.
|
|
||||||
|
|
||||||
## Narrative of the substantive turns
|
|
||||||
|
|
||||||
### Py3.13 hang / gate tightening
|
|
||||||
|
|
||||||
Diagnosed a reproducible hang of the `subint` backend under
|
|
||||||
py3.13 (test_spawning tests wedge after root-actor bringup).
|
|
||||||
Root cause: py3.13's vintage of the private `_interpreters` C
|
|
||||||
module has a latent thread/subint-interaction issue where
|
|
||||||
`_interpreters.exec()` silently fails to progress under
|
|
||||||
tractor's multi-trio usage pattern — even though a minimal
|
|
||||||
standalone `threading.Thread` + `_interpreters.exec()`
|
|
||||||
reproducer works fine on the same Python. Empirically
|
|
||||||
py3.14 fixes it.
|
|
||||||
|
|
||||||
Fix (from this session): tighten the `_has_subints` gate in
|
|
||||||
`tractor.spawn._subint` from "private module importable" to
|
|
||||||
"public `concurrent.interpreters` present" — which is 3.14+
|
|
||||||
only. This leaves `subint_proc()` unchanged in behavior (we
|
|
||||||
still call the *private* `_interpreters.create('legacy')`
|
|
||||||
etc. under the hood) but refuses to engage on 3.13.
|
|
||||||
|
|
||||||
Also tightened the matching gate in
|
|
||||||
`tractor.spawn._spawn.try_set_start_method('subint')` and
|
|
||||||
rev'd the corresponding error messages from "3.13+" to
|
|
||||||
"3.14+" with a sentence explaining why. Test-module
|
|
||||||
`pytest.importorskip` switched from `_interpreters` →
|
|
||||||
`concurrent.interpreters` to match.
|
|
||||||
|
|
||||||
### `pytest-timeout` dep + `skipon_spawn_backend` marker plumbing
|
|
||||||
|
|
||||||
Added `pytest-timeout>=2.3` to the `testing` dep group with
|
|
||||||
an inline comment pointing at the `ai/conc-anal/*.md` docs.
|
|
||||||
Applied `@pytest.mark.timeout(30, method='thread')` (the
|
|
||||||
`method='thread'` is load-bearing — `signal`-method
|
|
||||||
`SIGALRM` suffers the same GIL-starvation path that drops
|
|
||||||
`SIGINT` in the class-A hang pattern) to the three known-
|
|
||||||
hanging subint tests cataloged in
|
|
||||||
`subint_sigint_starvation_issue.md`.
|
|
||||||
|
|
||||||
Separately code-reviewed the user's newly-staged
|
|
||||||
`skipon_spawn_backend` pytest marker implementation in
|
|
||||||
`tractor/_testing/pytest.py`. Found four bugs:
|
|
||||||
|
|
||||||
1. `modmark.kwargs.get(reason)` called `.get()` with the
|
|
||||||
*variable* `reason` as the dict key instead of the string
|
|
||||||
`'reason'` — user-supplied `reason=` was never picked up.
|
|
||||||
(User had already fixed this locally via `.get('reason',
|
|
||||||
reason)` by the time my review happened — preserved that
|
|
||||||
fix.)
|
|
||||||
2. The module-level `pytestmark` branch suppressed per-test
|
|
||||||
marker handling (per-test marks sat in an `else:` rather than an
|
|
||||||
independent iteration).
|
|
||||||
3. `mod_pytestmark.mark` assumed a single
|
|
||||||
`MarkDecorator` — broke on the valid-pytest `pytestmark =
|
|
||||||
[mark, mark]` list form.
|
|
||||||
4. Typo: `pytest.Makr` → `pytest.Mark`.
|
|
||||||
|
|
||||||
Refactored the hook to use `item.iter_markers(name=...)`
|
|
||||||
which walks function + class + module scopes uniformly and
|
|
||||||
handles both `pytestmark` forms natively. ~30 LOC replaced
|
|
||||||
the original ~30 LOC of nested conditionals, all four bugs
|
|
||||||
dissolved. Also updated the marker help string to reflect
|
|
||||||
the variadic `*start_methods` + `reason=` surface.
|
|
||||||
|
|
||||||
### `subint_fork_proc` prototype attempt
|
|
||||||
|
|
||||||
User's hypothesis: the known trio+`fork()` issues
|
|
||||||
(python-trio/trio#1614) could be sidestepped by using a
|
|
||||||
sub-interpreter purely as a launchpad — `os.fork()` from a
|
|
||||||
subint that has never imported trio → child is in a
|
|
||||||
trio-free context. The child then `execv()`s back into
|
|
||||||
`python -m tractor._child` and the downstream handshake
|
|
||||||
matches `trio_proc()` identically.
|
|
||||||
|
|
||||||
Drafted the prototype at `tractor/spawn/_subint.py`'s bottom
|
|
||||||
(originally — later moved to its own submod, see below):
|
|
||||||
launchpad-subint creation, bootstrap code-string with
|
|
||||||
`os.fork()` + `execv()`, driver-thread orchestration,
|
|
||||||
parent-side `ipc_server.wait_for_peer()` dance. Registered
|
|
||||||
`'subint_fork'` as a new `SpawnMethodKey` literal, added
|
|
||||||
`case 'subint' | 'subint_fork':` feature-gate arm in
|
|
||||||
`try_set_start_method()`, added entry in `_methods` dict.
|
|
||||||
|
|
||||||
### CPython-level block discovered
|
|
||||||
|
|
||||||
User tested on py3.14 and saw:
|
|
||||||
|
|
||||||
```
|
|
||||||
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
|
||||||
Python runtime state: initialized
|
|
||||||
|
|
||||||
Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first):
|
|
||||||
File "<script>", line 2 in <module>
|
|
||||||
<script>:2: DeprecationWarning: This process (pid=802985) is multi-threaded, use of fork() may lead to deadlocks in the child.
|
|
||||||
```
|
|
||||||
|
|
||||||
Walked CPython sources (local clone at `~/repos/cpython/`):
|
|
||||||
|
|
||||||
- **`Modules/posixmodule.c:728` `PyOS_AfterFork_Child()`** —
|
|
||||||
post-fork child-side cleanup. Calls
|
|
||||||
`_PyInterpreterState_DeleteExceptMain(runtime)` with
|
|
||||||
`goto fatal_error` on non-zero status. Has the
|
|
||||||
`// Ideally we could guarantee tstate is running main.`
|
|
||||||
self-acknowledging-fragile comment directly above.
|
|
||||||
|
|
||||||
- **`Python/pystate.c:1040`
|
|
||||||
`_PyInterpreterState_DeleteExceptMain()`** — the
|
|
||||||
refusal. Hard `PyStatus_ERR("not main interpreter")` gate
|
|
||||||
when `tstate->interp != interpreters->main`. Docstring
|
|
||||||
formally declares the precondition ("If there is a
|
|
||||||
current interpreter state, it *must* be the main
|
|
||||||
interpreter"). `XXX` comments acknowledge further latent
|
|
||||||
issues within.
|
|
||||||
|
|
||||||
Definitive answer to "Open Question 1" of the prototype
|
|
||||||
docstring: **no, CPython does not support `os.fork()` from
|
|
||||||
a non-main sub-interpreter**. Not because the fork syscall
|
|
||||||
is blocked (it isn't — the parent returns a valid pid),
|
|
||||||
but because the child cannot survive CPython's post-fork
|
|
||||||
initialization. This is an enforced invariant, not an
|
|
||||||
incidental limitation.
|
|
||||||
|
|
||||||
### Revert: move to stub submod + doc the finding
|
|
||||||
|
|
||||||
Per user request:
|
|
||||||
|
|
||||||
1. Reverted the working `subint_fork_proc` body to a
|
|
||||||
`NotImplementedError` stub, MOVED to its own submod
|
|
||||||
`tractor/spawn/_subint_fork.py` (keeps `_subint.py`
|
|
||||||
focused on the working `subint_proc` backend).
|
|
||||||
2. Updated `_spawn.py` to import the stub from the new
|
|
||||||
submod path; kept `'subint_fork'` in `SpawnMethodKey` +
|
|
||||||
`_methods` so `--spawn-backend=subint_fork` routes to a
|
|
||||||
clean `NotImplementedError` with pointer to the analysis
|
|
||||||
doc rather than an "invalid backend" error.
|
|
||||||
3. Wrote
|
|
||||||
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
|
||||||
with the full annotated CPython walkthrough + an
|
|
||||||
upstream-report draft for the CPython issue tracker.
|
|
||||||
Draft has a two-tier ask: ideally "make it work"
|
|
||||||
(pre-fork tstate-swap hook or `DeleteExceptFor(interp)`
|
|
||||||
variant), minimally "give us a clean `RuntimeError` in
|
|
||||||
the parent instead of a `Fatal Python error` aborting
|
|
||||||
the child silently".
|
|
||||||
|
|
||||||
### Design discussion — main-interp-thread forkserver workaround
|
|
||||||
|
|
||||||
User proposed: set up a "subint forking server" that fork()s
|
|
||||||
on behalf of subint callers. Core insight: the CPython gate
|
|
||||||
is on `tstate->interp`, not thread identity, so **any thread
|
|
||||||
whose tstate is main-interp** can fork cleanly. A worker
|
|
||||||
thread attached to main-interp (never entering a subint)
|
|
||||||
satisfies the precondition.
|
|
||||||
|
|
||||||
Structurally this is `mp.forkserver` (which tractor already
|
|
||||||
has as `mp_forkserver`) but **in-process**: instead of a
|
|
||||||
separate Python subproc as the fork server, we'd put the
|
|
||||||
forkserver on a thread in the tractor parent process. Pros:
|
|
||||||
faster spawn (no IPC marshalling to external server + no
|
|
||||||
separate Python startup), inherits already-imported modules
|
|
||||||
for free. Cons: less crash isolation (forkserver failure
|
|
||||||
takes down the whole process).
|
|
||||||
|
|
||||||
Required tractor-side refactor: move the root actor's
|
|
||||||
`trio.run()` off main-interp-main-thread (so main-thread can
|
|
||||||
run the forkserver loop). Nontrivial; approximately the same
|
|
||||||
magnitude as "Phase C".
|
|
||||||
|
|
||||||
The design would also not fully resolve the class-A
|
|
||||||
GIL-starvation issue because child actors' trio still runs
|
|
||||||
inside subints (legacy config, msgspec PEP 684 pending).
|
|
||||||
Would mitigate SIGINT-starvation specifically if signal
|
|
||||||
handling moves to the forkserver thread.
|
|
||||||
|
|
||||||
Recommended pre-commitment: a standalone CPython-only smoke
|
|
||||||
test validating the four assumptions the arch rests on,
|
|
||||||
before any tractor-side work.
|
|
||||||
|
|
||||||
### Smoke-test script drafted
|
|
||||||
|
|
||||||
Wrote `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`:
|
|
||||||
argparse-driven, four scenarios (`control_subint_thread_fork`
|
|
||||||
reproducing the known-broken case, `main_thread_fork`
|
|
||||||
baseline, `worker_thread_fork` the architectural assertion,
|
|
||||||
`full_architecture` end-to-end with trio in a subint in the
|
|
||||||
forked child). No `tractor` imports; pure CPython + `_interpreters`
|
|
||||||
+ `trio`. Bails cleanly on py<3.14. Pass/fail banners per
|
|
||||||
scenario.
|
|
||||||
|
|
||||||
User will validate on their py3.14 env next.
|
|
||||||
|
|
||||||
## Per-code-artifact provenance
|
|
||||||
|
|
||||||
### `tractor/spawn/_subint_fork.py` (new submod)
|
|
||||||
|
|
||||||
> `git show 797f57c -- tractor/spawn/_subint_fork.py`
|
|
||||||
|
|
||||||
NotImplementedError stub for the subint-fork backend. Module
|
|
||||||
docstring + fn docstring explain the attempt, the CPython
|
|
||||||
block, and why the stub is kept in-tree. No runtime behavior
|
|
||||||
beyond raising with a pointer at the conc-anal doc.
|
|
||||||
|
|
||||||
### `tractor/spawn/_spawn.py` (modified)
|
|
||||||
|
|
||||||
> `git log 26fb820..HEAD -- tractor/spawn/_spawn.py`
|
|
||||||
|
|
||||||
- Added `'subint_fork'` to `SpawnMethodKey` literal with a
|
|
||||||
block comment explaining the CPython-level block.
|
|
||||||
- Generalized the `case 'subint':` arm to `case 'subint' |
|
|
||||||
'subint_fork':` since both use the same py3.14+ gate.
|
|
||||||
- Registered `subint_fork_proc` in `_methods` with a
|
|
||||||
pointer-comment at the analysis doc.
|
|
||||||
|
|
||||||
### `tractor/spawn/_subint.py` (modified across session)
|
|
||||||
|
|
||||||
> `git log 26fb820..HEAD -- tractor/spawn/_subint.py`
|
|
||||||
|
|
||||||
- Tightened `_has_subints` gate: dual-requires public
|
|
||||||
`concurrent.interpreters` + private `_interpreters`
|
|
||||||
(tests for py3.14-or-newer on the public-API presence,
|
|
||||||
then uses the private one for legacy-config subints
|
|
||||||
because `msgspec` still blocks the public isolated mode
|
|
||||||
per jcrist/msgspec#563).
|
|
||||||
- Updated module docstring, `subint_proc()` docstring, and
|
|
||||||
gate-error messages to reflect the 3.14+ requirement and
|
|
||||||
the reason (py3.13 wedges under multi-trio usage even
|
|
||||||
though the private module exists there).
|
|
||||||
|
|
||||||
### `tractor/_testing/pytest.py` (modified)
|
|
||||||
|
|
||||||
> `git log 26fb820..HEAD -- tractor/_testing/pytest.py`
|
|
||||||
|
|
||||||
- New `skipon_spawn_backend(*start_methods, reason=...)`
|
|
||||||
pytest marker expanded into `pytest.mark.skip(reason=...)`
|
|
||||||
at collection time via
|
|
||||||
`pytest_collection_modifyitems()`.
|
|
||||||
- Implementation uses `item.iter_markers(name=...)` which
|
|
||||||
walks function + class + module scopes uniformly and
|
|
||||||
handles both `pytestmark = <single Mark>` and
|
|
||||||
`pytestmark = [mark, ...]` forms natively. ~30-LOC
|
|
||||||
single-loop refactor replacing a prior nested
|
|
||||||
conditional that had four bugs (see the code-review narrative
|
|
||||||
above).
|
|
||||||
- Added `pytest.Config` / `pytest.Function` /
|
|
||||||
`pytest.FixtureRequest` type annotations on fixture
|
|
||||||
signatures while touching the file.
|
|
||||||
|
|
||||||
### `pyproject.toml` (modified)
|
|
||||||
|
|
||||||
> `git log 26fb820..HEAD -- pyproject.toml`
|
|
||||||
|
|
||||||
Added `pytest-timeout>=2.3` to `testing` dep group with
|
|
||||||
comment pointing at the `ai/conc-anal/` docs.
|
|
||||||
|
|
||||||
### `tests/discovery/test_registrar.py`,
|
|
||||||
`tests/test_subint_cancellation.py`,
|
|
||||||
`tests/test_cancellation.py` (modified)
|
|
||||||
|
|
||||||
> `git log 26fb820..HEAD -- tests/`
|
|
||||||
|
|
||||||
Applied `@pytest.mark.timeout(30, method='thread')` on
|
|
||||||
known-hanging subint tests. Extended comments to cross-
|
|
||||||
reference the `ai/conc-anal/*.md` docs. `method='thread'`
|
|
||||||
is documented inline as load-bearing (`signal`-method
|
|
||||||
SIGALRM suffers the same GIL-starvation path that drops
|
|
||||||
SIGINT).
|
|
||||||
|
|
||||||
### `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` (new)
|
|
||||||
|
|
||||||
> `git show 797f57c -- ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
|
||||||
|
|
||||||
Third sibling doc under `conc-anal/`. Structure: TL;DR,
|
|
||||||
context ("what we tried"), symptom (the user's exact
|
|
||||||
`Fatal Python error` output), CPython source walkthrough
|
|
||||||
with excerpted snippets from `posixmodule.c` +
|
|
||||||
`pystate.c`, chain summary, definitive answer to Open
|
|
||||||
Question 1, `## Upstream-report draft (for CPython issue
|
|
||||||
tracker)` section with a two-tier ask, references.
|
|
||||||
|
|
||||||
### `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` (new, THIS turn)
|
|
||||||
|
|
||||||
Zero-tractor-import smoke test for the proposed workaround
|
|
||||||
architecture. Four argparse-driven scenarios covering the
|
|
||||||
control case + baseline + arch-critical case + end-to-end.
|
|
||||||
Pass/fail banners per scenario; clean `--help` output;
|
|
||||||
py<3.14 early-exit.
|
|
||||||
|
|
||||||
## Non-code output (verbatim)
|
|
||||||
|
|
||||||
### The `strace` signature that kicked off the CPython
|
|
||||||
walkthrough
|
|
||||||
|
|
||||||
```
|
|
||||||
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
|
||||||
write(16, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
|
||||||
rt_sigreturn({mask=[WINCH]}) = 139801964688928
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key user quotes framing the direction
|
|
||||||
|
|
||||||
> ok actually we get this [fatal error] ... see if you can
|
|
||||||
> take a look at what's going on, in particular wrt to
|
|
||||||
> cpython's sources. pretty sure there's a local copy at
|
|
||||||
> ~/repos/cpython/
|
|
||||||
|
|
||||||
(Drove the CPython walkthrough that produced the
|
|
||||||
definitive refusal chain.)
|
|
||||||
|
|
||||||
> is there any reason we can't just sidestep this "must fork
|
|
||||||
> from main thread in main subint" issue by simply ensuring
|
|
||||||
> a "subint forking server" is always setup prior to
|
|
||||||
> invoking trio in a non-main-thread subint ...
|
|
||||||
|
|
||||||
(Drove the main-interp-thread-forkserver architectural
|
|
||||||
discussion + smoke-test script design.)
|
|
||||||
|
|
||||||
### CPython source tags for quick jump-back
|
|
||||||
|
|
||||||
```
|
|
||||||
Modules/posixmodule.c:728 PyOS_AfterFork_Child()
|
|
||||||
Modules/posixmodule.c:753 // Ideally we could guarantee tstate is running main.
|
|
||||||
Modules/posixmodule.c:778 status = _PyInterpreterState_DeleteExceptMain(runtime);
|
|
||||||
|
|
||||||
Python/pystate.c:1040 _PyInterpreterState_DeleteExceptMain()
|
|
||||||
Python/pystate.c:1044-1047 tstate->interp != main → PyStatus_ERR("not main interpreter")
|
|
||||||
```
|
|
||||||
|
|
@ -1,27 +0,0 @@
|
||||||
# AI Prompt I/O Log — claude
|
|
||||||
|
|
||||||
This directory tracks prompt inputs and model
|
|
||||||
outputs for AI-assisted development using
|
|
||||||
`claude` (Claude Code).
|
|
||||||
|
|
||||||
## Policy
|
|
||||||
|
|
||||||
Prompt logging follows the
|
|
||||||
[NLNet generative AI policy][nlnet-ai].
|
|
||||||
All substantive AI contributions are logged
|
|
||||||
with:
|
|
||||||
- Model name and version
|
|
||||||
- Timestamps
|
|
||||||
- The prompts that produced the output
|
|
||||||
- Unedited model output (`.raw.md` files)
|
|
||||||
|
|
||||||
[nlnet-ai]: https://nlnet.nl/foundation/policies/generativeAI/
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
Entries are created by the `/prompt-io` skill
|
|
||||||
or automatically via `/commit-msg` integration.
|
|
||||||
|
|
||||||
Human contributors remain accountable for all
|
|
||||||
code decisions. AI-generated content is never
|
|
||||||
presented as human-authored work.
|
|
||||||
|
|
@ -1,76 +0,0 @@
|
||||||
ok now i want you to take a look at the most recent commit adding
|
|
||||||
a `tpt_bind_addrs` to `open_root_actor()` and extend the existing
|
|
||||||
tests/discovery/test_multiaddr* and friends to use this new param in
|
|
||||||
at least one suite with parametrizations over,
|
|
||||||
|
|
||||||
- `registry_addrs == tpt_bind_addrs`, as in both inputs are the same.
|
|
||||||
- `set(registry_addrs) >= set(tpt_bind_addrs)`, as in the registry
|
|
||||||
addrs include the bind set.
|
|
||||||
- `registry_addrs != tpt_bind_addrs`, where the reg set is disjoint from
|
|
||||||
the bind set in all possible combos you can imagine.
|
|
||||||
|
|
||||||
All of the ^above cases should further be parametrized over,
|
|
||||||
- the root being the registrar,
|
|
||||||
- a non-registrar root using our bg `daemon` fixture.
|
|
||||||
|
|
||||||
once we have a fairly thorough test suite and have flushed out all
|
|
||||||
bugs and edge cases we want to design a wrapping API which allows
|
|
||||||
declaring full trees of actors' tpt endpoints using multiaddrs such
|
|
||||||
that a `dict[str, list[str]]` of actor-name -> multiaddr can be used
|
|
||||||
to configure a tree of actors-as-services given such an input
|
|
||||||
"endpoints-table" can be matched with the number of appropriately
|
|
||||||
named subactor spawns in a `tractor` user-app.
|
|
||||||
|
|
||||||
Here is a small example from piker,
|
|
||||||
|
|
||||||
- in piker's root conf.toml we define a `[network]` section which can
|
|
||||||
define various actor-service-daemon names set to a maddr
|
|
||||||
(multiaddress str).
|
|
||||||
|
|
||||||
- each actor whether part of the `pikerd` tree (as a sub) or spawned
|
|
||||||
in other non-registrar rooted trees (such as `piker chart`) should
|
|
||||||
be configurable in terms of its `tractor` tpt bind addresses via
|
|
||||||
a simple service lookup table,
|
|
||||||
|
|
||||||
```toml
|
|
||||||
[network]
|
|
||||||
pikerd = [
|
|
||||||
'/ip4/127.0.0.1/tcp/6116', # std localhost daemon-actor tree
|
|
||||||
'/uds/run/user/1000/piker/pikerd@6116.sock', # same but serving UDS
|
|
||||||
]
|
|
||||||
chart = [
|
|
||||||
'/ip4/127.0.0.1/tcp/3333', # std localhost daemon-actor tree
|
|
||||||
'/uds/run/user/1000/piker/chart@3333.sock',
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
We should take whatever common API is needed to support this and
|
|
||||||
distill it into a
|
|
||||||
```python
|
|
||||||
tractor.discovery.parse_endpoints(
|
|
||||||
) -> dict[
|
|
||||||
str,
|
|
||||||
list[Address]
|
|
||||||
|dict[str, list[Address]]
|
|
||||||
# ^recursive case, see below
|
|
||||||
]:
|
|
||||||
```
|
|
||||||
|
|
||||||
style API which can,
|
|
||||||
|
|
||||||
- be re-used easily across dependent projects.
|
|
||||||
- correctly raise tpt-backend support errors when a maddr specifying
|
|
||||||
an unsupported proto is passed.
|
|
||||||
- be used to handle "tunnelled" maddrs per
|
|
||||||
https://github.com/multiformats/py-multiaddr/#tunneling such that
|
|
||||||
for any such tunneled maddr-`str`-entry we deliver a data-structure
|
|
||||||
which can easily be passed to nested `@acm`s which consecutively
|
|
||||||
setup nested net bindspaces for binding the endpoint addrs using
|
|
||||||
a combo of our `.ipc.*` machinery and, say for example something like
|
|
||||||
https://github.com/svinota/pyroute2, more precisely say for
|
|
||||||
managing tunnelled wireguard eps within network-namespaces,
|
|
||||||
* https://docs.pyroute2.org/
|
|
||||||
* https://docs.pyroute2.org/netns.html
|
|
||||||
|
|
||||||
remember to include use of all default `.claude/skills` throughout
|
|
||||||
this work!
|
|
||||||
|
|
@ -1,34 +0,0 @@
|
||||||
This is your first big boi, "from GH issue" design, plan and
|
|
||||||
implement task.
|
|
||||||
|
|
||||||
We need to try and add sub-interpreter (aka subint) support per the
|
|
||||||
issue,
|
|
||||||
|
|
||||||
https://github.com/goodboy/tractor/issues/379
|
|
||||||
|
|
||||||
Part of this work should include,
|
|
||||||
|
|
||||||
- modularizing and thus better organizing the `.spawn.*` subpkg by
|
|
||||||
breaking up various backends currently in `spawn._spawn` into
|
|
||||||
separate submods where it makes sense.
|
|
||||||
|
|
||||||
- add a new `._subint` backend which tries to keep as much of the
|
|
||||||
inter-process-isolation machinery in use as possible but with plans
|
|
||||||
to optimize for localhost only benefits as offered by python's
|
|
||||||
subints where possible.
|
|
||||||
|
|
||||||
* utilizing localhost-only tpts like UDS, shm-buffers for
|
|
||||||
performant IPC between subactors but also leveraging the benefits from
|
|
||||||
the traditional OS subprocs mem/storage-domain isolation, linux
|
|
||||||
namespaces where possible and as available/permitted by whatever
|
|
||||||
is happening under the hood with how cpython implements subints.
|
|
||||||
|
|
||||||
* default configuration should encourage state isolation as with
|
|
||||||
subprocs, but explicit public escape hatches to enable rigorously
|
|
||||||
managed shm channels for high performance apps.
|
|
||||||
|
|
||||||
- all tests should be (able to be) parameterized to use the new
|
|
||||||
`subints` backend and enabled by flag in the harness using the
|
|
||||||
existing `pytest --spawn-backend <spawn-backend>` support offered in
|
|
||||||
the `open_root_actor()` and `.testing._pytest` harness override
|
|
||||||
fixture.
|
|
||||||
|
|
@ -420,17 +420,20 @@ Check out our experimental system for `guest`_-mode controlled
|
||||||
|
|
||||||
|
|
||||||
async def aio_echo_server(
|
async def aio_echo_server(
|
||||||
chan: tractor.to_asyncio.LinkedTaskChannel,
|
to_trio: trio.MemorySendChannel,
|
||||||
|
from_trio: asyncio.Queue,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# a first message must be sent **from** this ``asyncio``
|
# a first message must be sent **from** this ``asyncio``
|
||||||
# task or the ``trio`` side will never unblock from
|
# task or the ``trio`` side will never unblock from
|
||||||
# ``tractor.to_asyncio.open_channel_from():``
|
# ``tractor.to_asyncio.open_channel_from():``
|
||||||
chan.started_nowait('start')
|
to_trio.send_nowait('start')
|
||||||
|
|
||||||
|
# XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
|
||||||
|
# should probably offer something better.
|
||||||
while True:
|
while True:
|
||||||
# echo the msg back
|
# echo the msg back
|
||||||
chan.send_nowait(await chan.get())
|
to_trio.send_nowait(await from_trio.get())
|
||||||
await asyncio.sleep(0)
|
await asyncio.sleep(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -442,7 +445,7 @@ Check out our experimental system for `guest`_-mode controlled
|
||||||
# message.
|
# message.
|
||||||
async with tractor.to_asyncio.open_channel_from(
|
async with tractor.to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (chan, first):
|
) as (first, chan):
|
||||||
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
await ctx.started(first)
|
await ctx.started(first)
|
||||||
|
|
@ -501,10 +504,8 @@ Yes, we spawn a python process, run ``asyncio``, start ``trio`` on the
|
||||||
``asyncio`` loop, then send commands to the ``trio`` scheduled tasks to
|
``asyncio`` loop, then send commands to the ``trio`` scheduled tasks to
|
||||||
tell ``asyncio`` tasks what to do XD
|
tell ``asyncio`` tasks what to do XD
|
||||||
|
|
||||||
The ``asyncio``-side task receives a single
|
We need help refining the `asyncio`-side channel API to be more
|
||||||
``chan: LinkedTaskChannel`` handle providing a ``trio``-like
|
`trio`-like. Feel free to sling your opinion in `#273`_!
|
||||||
API: ``.started_nowait()``, ``.send_nowait()``, ``.get()``
|
|
||||||
and more. Feel free to sling your opinion in `#273`_!
|
|
||||||
|
|
||||||
|
|
||||||
.. _#273: https://github.com/goodboy/tractor/issues/273
|
.. _#273: https://github.com/goodboy/tractor/issues/273
|
||||||
|
|
@ -640,15 +641,13 @@ Help us push toward the future of distributed `Python`.
|
||||||
- Typed capability-based (dialog) protocols ( see `#196
|
- Typed capability-based (dialog) protocols ( see `#196
|
||||||
<https://github.com/goodboy/tractor/issues/196>`_ with draft work
|
<https://github.com/goodboy/tractor/issues/196>`_ with draft work
|
||||||
started in `#311 <https://github.com/goodboy/tractor/pull/311>`_)
|
started in `#311 <https://github.com/goodboy/tractor/pull/311>`_)
|
||||||
- **macOS is now officially supported** and tested in CI
|
- We **recently disabled CI-testing on windows** and need help getting
|
||||||
alongside Linux!
|
it running again! (see `#327
|
||||||
- We **recently disabled CI-testing on windows** and need
|
<https://github.com/goodboy/tractor/pull/327>`_). **We do have windows
|
||||||
help getting it running again! (see `#327
|
support** (and have for quite a while) but since no active hacker
|
||||||
<https://github.com/goodboy/tractor/pull/327>`_). **We do
|
exists in the user-base to help test on that OS, for now we're not
|
||||||
have windows support** (and have for quite a while) but
|
actively maintaining testing due to the added hassle and general
|
||||||
since no active hacker exists in the user-base to help
|
latency..
|
||||||
test on that OS, for now we're not actively maintaining
|
|
||||||
testing due to the added hassle and general latency..
|
|
||||||
|
|
||||||
|
|
||||||
Feel like saying hi?
|
Feel like saying hi?
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,6 @@ from tractor import (
|
||||||
MsgStream,
|
MsgStream,
|
||||||
_testing,
|
_testing,
|
||||||
trionics,
|
trionics,
|
||||||
TransportClosed,
|
|
||||||
)
|
)
|
||||||
import trio
|
import trio
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -209,16 +208,12 @@ async def main(
|
||||||
# TODO: is this needed or no?
|
# TODO: is this needed or no?
|
||||||
raise
|
raise
|
||||||
|
|
||||||
except (
|
except trio.ClosedResourceError:
|
||||||
trio.ClosedResourceError,
|
|
||||||
TransportClosed,
|
|
||||||
) as _tpt_err:
|
|
||||||
# NOTE: don't send if we already broke the
|
# NOTE: don't send if we already broke the
|
||||||
# connection to avoid raising a closed-error
|
# connection to avoid raising a closed-error
|
||||||
# such that we drop through to the ctl-c
|
# such that we drop through to the ctl-c
|
||||||
# mashing by user.
|
# mashing by user.
|
||||||
with trio.CancelScope(shield=True):
|
await trio.sleep(0.01)
|
||||||
await trio.sleep(0.01)
|
|
||||||
|
|
||||||
# timeout: int = 1
|
# timeout: int = 1
|
||||||
# with trio.move_on_after(timeout) as cs:
|
# with trio.move_on_after(timeout) as cs:
|
||||||
|
|
@ -252,7 +247,6 @@ async def main(
|
||||||
await stream.send(i)
|
await stream.send(i)
|
||||||
pytest.fail('stream not closed?')
|
pytest.fail('stream not closed?')
|
||||||
except (
|
except (
|
||||||
TransportClosed,
|
|
||||||
trio.ClosedResourceError,
|
trio.ClosedResourceError,
|
||||||
trio.EndOfChannel,
|
trio.EndOfChannel,
|
||||||
) as send_err:
|
) as send_err:
|
||||||
|
|
|
||||||
|
|
@ -18,14 +18,15 @@ async def aio_sleep_forever():
|
||||||
|
|
||||||
|
|
||||||
async def bp_then_error(
|
async def bp_then_error(
|
||||||
chan: to_asyncio.LinkedTaskChannel,
|
to_trio: trio.MemorySendChannel,
|
||||||
|
from_trio: asyncio.Queue,
|
||||||
|
|
||||||
raise_after_bp: bool = True,
|
raise_after_bp: bool = True,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# sync with `trio`-side (caller) task
|
# sync with `trio`-side (caller) task
|
||||||
chan.started_nowait('start')
|
to_trio.send_nowait('start')
|
||||||
|
|
||||||
# NOTE: what happens here inside the hook needs some refinement..
|
# NOTE: what happens here inside the hook needs some refinement..
|
||||||
# => seems like it's still `.debug._set_trace()` but
|
# => seems like it's still `.debug._set_trace()` but
|
||||||
|
|
@ -59,7 +60,7 @@ async def trio_ctx(
|
||||||
to_asyncio.open_channel_from(
|
to_asyncio.open_channel_from(
|
||||||
bp_then_error,
|
bp_then_error,
|
||||||
# raise_after_bp=not bp_before_started,
|
# raise_after_bp=not bp_before_started,
|
||||||
) as (chan, first),
|
) as (first, chan),
|
||||||
|
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
):
|
):
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ async def sleep(
|
||||||
|
|
||||||
|
|
||||||
async def open_ctx(
|
async def open_ctx(
|
||||||
n: tractor.runtime._supervise.ActorNursery
|
n: tractor._supervise.ActorNursery
|
||||||
):
|
):
|
||||||
|
|
||||||
# spawn both actors
|
# spawn both actors
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,6 @@ Verify we can dump a `stackscope` tree on a hang.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
import os
|
import os
|
||||||
import platform
|
|
||||||
import signal
|
import signal
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
|
|
@ -32,26 +31,13 @@ async def main(
|
||||||
from_test: bool = False,
|
from_test: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
if platform.system() != 'Darwin':
|
|
||||||
tpt = 'uds'
|
|
||||||
else:
|
|
||||||
# XXX, precisely we can't use pytest's tmp-path generation
|
|
||||||
# for tests.. apparently because:
|
|
||||||
#
|
|
||||||
# > The OSError: AF_UNIX path too long in macOS Python occurs
|
|
||||||
# > because the path to the Unix domain socket exceeds the
|
|
||||||
# > operating system's maximum path length limit (around 104
|
|
||||||
#
|
|
||||||
# WHICH IS just, wtf hillarious XD
|
|
||||||
tpt = 'tcp'
|
|
||||||
|
|
||||||
async with (
|
async with (
|
||||||
tractor.open_nursery(
|
tractor.open_nursery(
|
||||||
debug_mode=True,
|
debug_mode=True,
|
||||||
enable_stack_on_sig=True,
|
enable_stack_on_sig=True,
|
||||||
# maybe_enable_greenback=False,
|
# maybe_enable_greenback=False,
|
||||||
loglevel='devx',
|
loglevel='devx',
|
||||||
enable_transports=[tpt],
|
enable_transports=['uds'],
|
||||||
) as an,
|
) as an,
|
||||||
):
|
):
|
||||||
ptl: tractor.Portal = await an.start_actor(
|
ptl: tractor.Portal = await an.start_actor(
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,3 @@
|
||||||
import platform
|
|
||||||
|
|
||||||
import tractor
|
import tractor
|
||||||
import trio
|
import trio
|
||||||
|
|
||||||
|
|
@ -36,22 +34,9 @@ async def just_bp(
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
if platform.system() != 'Darwin':
|
|
||||||
tpt = 'uds'
|
|
||||||
else:
|
|
||||||
# XXX, precisely we can't use pytest's tmp-path generation
|
|
||||||
# for tests.. apparently because:
|
|
||||||
#
|
|
||||||
# > The OSError: AF_UNIX path too long in macOS Python occurs
|
|
||||||
# > because the path to the Unix domain socket exceeds the
|
|
||||||
# > operating system's maximum path length limit (around 104
|
|
||||||
#
|
|
||||||
# WHICH IS just, wtf hillarious XD
|
|
||||||
tpt = 'tcp'
|
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=True,
|
debug_mode=True,
|
||||||
enable_transports=[tpt],
|
enable_transports=['uds'],
|
||||||
loglevel='devx',
|
loglevel='devx',
|
||||||
) as n:
|
) as n:
|
||||||
p = await n.start_actor(
|
p = await n.start_actor(
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ async def main() -> list[int]:
|
||||||
# yes, a nursery which spawns `trio`-"actors" B)
|
# yes, a nursery which spawns `trio`-"actors" B)
|
||||||
an: ActorNursery
|
an: ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
loglevel='error',
|
loglevel='cancel',
|
||||||
# debug_mode=True,
|
# debug_mode=True,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
||||||
|
|
@ -118,10 +118,8 @@ async def main() -> list[int]:
|
||||||
cancelled: bool = await portal.cancel_actor()
|
cancelled: bool = await portal.cancel_actor()
|
||||||
assert cancelled
|
assert cancelled
|
||||||
|
|
||||||
print(
|
print(f"STREAM TIME = {time.time() - start}")
|
||||||
f"STREAM TIME = {time.time() - start}\n"
|
print(f"STREAM + SPAWN TIME = {time.time() - pre_start}")
|
||||||
f"STREAM + SPAWN TIME = {time.time() - pre_start}\n"
|
|
||||||
)
|
|
||||||
assert result_stream == list(range(seed))
|
assert result_stream == list(range(seed))
|
||||||
return result_stream
|
return result_stream
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,17 +11,21 @@ import tractor
|
||||||
|
|
||||||
|
|
||||||
async def aio_echo_server(
|
async def aio_echo_server(
|
||||||
chan: tractor.to_asyncio.LinkedTaskChannel,
|
to_trio: trio.MemorySendChannel,
|
||||||
|
from_trio: asyncio.Queue,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# a first message must be sent **from** this ``asyncio``
|
# a first message must be sent **from** this ``asyncio``
|
||||||
# task or the ``trio`` side will never unblock from
|
# task or the ``trio`` side will never unblock from
|
||||||
# ``tractor.to_asyncio.open_channel_from():``
|
# ``tractor.to_asyncio.open_channel_from():``
|
||||||
chan.started_nowait('start')
|
to_trio.send_nowait('start')
|
||||||
|
|
||||||
|
# XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
|
||||||
|
# should probably offer something better.
|
||||||
while True:
|
while True:
|
||||||
# echo the msg back
|
# echo the msg back
|
||||||
chan.send_nowait(await chan.get())
|
to_trio.send_nowait(await from_trio.get())
|
||||||
await asyncio.sleep(0)
|
await asyncio.sleep(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -33,7 +37,7 @@ async def trio_to_aio_echo_server(
|
||||||
# message.
|
# message.
|
||||||
async with tractor.to_asyncio.open_channel_from(
|
async with tractor.to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (chan, first):
|
) as (first, chan):
|
||||||
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
await ctx.started(first)
|
await ctx.started(first)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
async def child_fn() -> str:
|
|
||||||
return f"child OK pid={os.getpid()}"
|
|
||||||
|
|
@ -1,50 +0,0 @@
|
||||||
"""
|
|
||||||
Integration test: spawning tractor actors from an MPI process.
|
|
||||||
|
|
||||||
When a parent is launched via ``mpirun``, Open MPI sets ``OMPI_*`` env
|
|
||||||
vars that bind ``MPI_Init`` to the ``orted`` daemon. Tractor children
|
|
||||||
inherit those env vars, so if ``inherit_parent_main=True`` (the default)
|
|
||||||
the child re-executes ``__main__``, re-imports ``mpi4py``, and
|
|
||||||
``MPI_Init_thread`` fails because the child was never spawned by
|
|
||||||
``orted``::
|
|
||||||
|
|
||||||
getting local rank failed
|
|
||||||
--> Returned value No permission (-17) instead of ORTE_SUCCESS
|
|
||||||
|
|
||||||
Passing ``inherit_parent_main=False`` and placing RPC functions in a
|
|
||||||
separate importable module (``_child``) avoids the re-import entirely.
|
|
||||||
|
|
||||||
Usage::
|
|
||||||
|
|
||||||
mpirun --allow-run-as-root -np 1 python -m \
|
|
||||||
examples.integration.mpi4py.inherit_parent_main
|
|
||||||
"""
|
|
||||||
|
|
||||||
from mpi4py import MPI
|
|
||||||
|
|
||||||
import os
|
|
||||||
import trio
|
|
||||||
import tractor
|
|
||||||
|
|
||||||
from ._child import child_fn
|
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
|
||||||
rank = MPI.COMM_WORLD.Get_rank()
|
|
||||||
print(f"[parent] rank={rank} pid={os.getpid()}", flush=True)
|
|
||||||
|
|
||||||
async with tractor.open_nursery(start_method='trio') as an:
|
|
||||||
portal = await an.start_actor(
|
|
||||||
'mpi-child',
|
|
||||||
enable_modules=[child_fn.__module__],
|
|
||||||
# Without this the child replays __main__, which
|
|
||||||
# re-imports mpi4py and crashes on MPI_Init.
|
|
||||||
inherit_parent_main=False,
|
|
||||||
)
|
|
||||||
result = await portal.run(child_fn)
|
|
||||||
print(f"[parent] got: {result}", flush=True)
|
|
||||||
await portal.cancel_actor()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
trio.run(main)
|
|
||||||
|
|
@ -10,7 +10,7 @@ async def main(service_name):
|
||||||
await an.start_actor(service_name)
|
await an.start_actor(service_name)
|
||||||
|
|
||||||
async with tractor.get_registry() as portal:
|
async with tractor.get_registry() as portal:
|
||||||
print(f"Registrar is listening on {portal.channel}")
|
print(f"Arbiter is listening on {portal.channel}")
|
||||||
|
|
||||||
async with tractor.wait_for_actor(service_name) as sockaddr:
|
async with tractor.wait_for_actor(service_name) as sockaddr:
|
||||||
print(f"my_service is found at {sockaddr}")
|
print(f"my_service is found at {sockaddr}")
|
||||||
|
|
|
||||||
27
flake.lock
27
flake.lock
|
|
@ -1,27 +0,0 @@
|
||||||
{
|
|
||||||
"nodes": {
|
|
||||||
"nixpkgs": {
|
|
||||||
"locked": {
|
|
||||||
"lastModified": 1769018530,
|
|
||||||
"narHash": "sha256-MJ27Cy2NtBEV5tsK+YraYr2g851f3Fl1LpNHDzDX15c=",
|
|
||||||
"owner": "nixos",
|
|
||||||
"repo": "nixpkgs",
|
|
||||||
"rev": "88d3861acdd3d2f0e361767018218e51810df8a1",
|
|
||||||
"type": "github"
|
|
||||||
},
|
|
||||||
"original": {
|
|
||||||
"owner": "nixos",
|
|
||||||
"ref": "nixos-unstable",
|
|
||||||
"repo": "nixpkgs",
|
|
||||||
"type": "github"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"root": {
|
|
||||||
"inputs": {
|
|
||||||
"nixpkgs": "nixpkgs"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"root": "root",
|
|
||||||
"version": 7
|
|
||||||
}
|
|
||||||
70
flake.nix
70
flake.nix
|
|
@ -1,70 +0,0 @@
|
||||||
# An "impure" template thx to `pyproject.nix`,
|
|
||||||
# https://pyproject-nix.github.io/pyproject.nix/templates.html#impure
|
|
||||||
# https://github.com/pyproject-nix/pyproject.nix/blob/master/templates/impure/flake.nix
|
|
||||||
{
|
|
||||||
description = "An impure overlay (w dev-shell) using `uv`";
|
|
||||||
|
|
||||||
inputs = {
|
|
||||||
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
|
||||||
};
|
|
||||||
|
|
||||||
outputs =
|
|
||||||
{ nixpkgs, ... }:
|
|
||||||
let
|
|
||||||
inherit (nixpkgs) lib;
|
|
||||||
forAllSystems = lib.genAttrs lib.systems.flakeExposed;
|
|
||||||
in
|
|
||||||
{
|
|
||||||
devShells = forAllSystems (
|
|
||||||
system:
|
|
||||||
let
|
|
||||||
pkgs = nixpkgs.legacyPackages.${system};
|
|
||||||
|
|
||||||
# XXX NOTE XXX, for now we overlay specific pkgs via
|
|
||||||
# a major-version-pinned-`cpython`
|
|
||||||
cpython = "python313";
|
|
||||||
venv_dir = "py313";
|
|
||||||
pypkgs = pkgs."${cpython}Packages";
|
|
||||||
in
|
|
||||||
{
|
|
||||||
default = pkgs.mkShell {
|
|
||||||
|
|
||||||
packages = [
|
|
||||||
# XXX, ensure sh completions activate!
|
|
||||||
pkgs.bashInteractive
|
|
||||||
pkgs.bash-completion
|
|
||||||
|
|
||||||
# XXX, on nix(os), use pkgs version to avoid
|
|
||||||
# build/sys-sh-integration issues
|
|
||||||
pkgs.ruff
|
|
||||||
|
|
||||||
pkgs.uv
|
|
||||||
pkgs.${cpython}# ?TODO^ how to set from `cpython` above?
|
|
||||||
];
|
|
||||||
|
|
||||||
shellHook = ''
|
|
||||||
# unmask to debug **this** dev-shell-hook
|
|
||||||
# set -e
|
|
||||||
|
|
||||||
# link-in c++ stdlib for various AOT-ext-pkgs (numpy, etc.)
|
|
||||||
LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH"
|
|
||||||
|
|
||||||
export LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# RUNTIME-SETTINGS
|
|
||||||
# ------ uv ------
|
|
||||||
# - always use the ./py313/ venv-subdir
|
|
||||||
# - sync env with all extras
|
|
||||||
export UV_PROJECT_ENVIRONMENT=${venv_dir}
|
|
||||||
uv sync --dev --all-extras
|
|
||||||
|
|
||||||
# ------ TIPS ------
|
|
||||||
# NOTE, to launch the py-venv installed `xonsh` (like @goodboy)
|
|
||||||
# run the `nix develop` cmd with,
|
|
||||||
# >> nix develop -c uv run xonsh
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
}
|
|
||||||
);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
115
pyproject.toml
115
pyproject.toml
|
|
@ -9,7 +9,7 @@ name = "tractor"
|
||||||
version = "0.1.0a6dev0"
|
version = "0.1.0a6dev0"
|
||||||
description = 'structured concurrent `trio`-"actors"'
|
description = 'structured concurrent `trio`-"actors"'
|
||||||
authors = [{ name = "Tyler Goodlet", email = "goodboy_foss@protonmail.com" }]
|
authors = [{ name = "Tyler Goodlet", email = "goodboy_foss@protonmail.com" }]
|
||||||
requires-python = ">=3.13, <3.15"
|
requires-python = ">= 3.11"
|
||||||
readme = "docs/README.rst"
|
readme = "docs/README.rst"
|
||||||
license = "AGPL-3.0-or-later"
|
license = "AGPL-3.0-or-later"
|
||||||
keywords = [
|
keywords = [
|
||||||
|
|
@ -24,14 +24,11 @@ keywords = [
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Development Status :: 3 - Alpha",
|
"Development Status :: 3 - Alpha",
|
||||||
"Operating System :: POSIX :: Linux",
|
"Operating System :: POSIX :: Linux",
|
||||||
"Operating System :: MacOS",
|
|
||||||
"Framework :: Trio",
|
"Framework :: Trio",
|
||||||
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
|
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
|
||||||
"Programming Language :: Python :: Implementation :: CPython",
|
"Programming Language :: Python :: Implementation :: CPython",
|
||||||
"Programming Language :: Python :: 3 :: Only",
|
"Programming Language :: Python :: 3 :: Only",
|
||||||
"Programming Language :: Python :: 3.12",
|
"Programming Language :: Python :: 3.11",
|
||||||
"Programming Language :: Python :: 3.13",
|
|
||||||
"Programming Language :: Python :: 3.14",
|
|
||||||
"Topic :: System :: Distributed Computing",
|
"Topic :: System :: Distributed Computing",
|
||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|
@ -44,99 +41,49 @@ dependencies = [
|
||||||
"tricycle>=0.4.1,<0.5",
|
"tricycle>=0.4.1,<0.5",
|
||||||
"wrapt>=1.16.0,<2",
|
"wrapt>=1.16.0,<2",
|
||||||
"colorlog>=6.8.2,<7",
|
"colorlog>=6.8.2,<7",
|
||||||
|
|
||||||
# built-in multi-actor `pdb` REPL
|
# built-in multi-actor `pdb` REPL
|
||||||
"pdbp>=1.8.2,<2", # windows only (from `pdbp`)
|
"pdbp>=1.6,<2", # windows only (from `pdbp`)
|
||||||
|
|
||||||
# typed IPC msging
|
# typed IPC msging
|
||||||
"msgspec>=0.20.0",
|
"msgspec>=0.19.0",
|
||||||
|
"cffi>=1.17.1",
|
||||||
"bidict>=0.23.1",
|
"bidict>=0.23.1",
|
||||||
"multiaddr>=0.2.0",
|
|
||||||
"platformdirs>=4.4.0",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# ------ project ------
|
# ------ project ------
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
dev = [
|
dev = [
|
||||||
{include-group = 'devx'},
|
|
||||||
{include-group = 'testing'},
|
|
||||||
{include-group = 'repl'},
|
|
||||||
]
|
|
||||||
devx = [
|
|
||||||
# `tractor.devx` tooling
|
|
||||||
"stackscope>=0.2.2,<0.3",
|
|
||||||
# ^ requires this?
|
|
||||||
"typing-extensions>=4.14.1",
|
|
||||||
# {include-group = 'sync_pause'}, # XXX, no 3.14 yet!
|
|
||||||
]
|
|
||||||
sync_pause = [
|
|
||||||
"greenback>=1.2.1,<2", # TODO? 3.14 greenlet on nix?
|
|
||||||
]
|
|
||||||
testing = [
|
|
||||||
# test suite
|
# test suite
|
||||||
# TODO: maybe some of these layout choices?
|
# TODO: maybe some of these layout choices?
|
||||||
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
||||||
"pytest>=8.3.5",
|
"pytest>=8.3.5",
|
||||||
"pexpect>=4.9.0,<5",
|
"pexpect>=4.9.0,<5",
|
||||||
# per-test wall-clock bound (used via
|
# `tractor.devx` tooling
|
||||||
# `@pytest.mark.timeout(..., method='thread')` on the
|
"greenback>=1.2.1,<2",
|
||||||
# known-hanging `subint`-backend audit tests; see
|
"stackscope>=0.2.2,<0.3",
|
||||||
# `ai/conc-anal/subint_*_issue.md`).
|
# ^ requires this?
|
||||||
"pytest-timeout>=2.3",
|
"typing-extensions>=4.14.1",
|
||||||
]
|
|
||||||
repl = [
|
|
||||||
"pyperclip>=1.9.0",
|
"pyperclip>=1.9.0",
|
||||||
"prompt-toolkit>=3.0.50",
|
"prompt-toolkit>=3.0.50",
|
||||||
"xonsh>=0.22.8",
|
"xonsh>=0.19.2",
|
||||||
"psutil>=7.0.0",
|
"psutil>=7.0.0",
|
||||||
]
|
]
|
||||||
lint = [
|
|
||||||
"ruff>=0.9.6"
|
|
||||||
]
|
|
||||||
# XXX, used for linux-only hi perf eventfd+shm channels
|
|
||||||
# now mostly moved over to `hotbaud`.
|
|
||||||
eventfd = [
|
|
||||||
"cffi>=1.17.1",
|
|
||||||
]
|
|
||||||
subints = [
|
|
||||||
"msgspec>=0.21.0",
|
|
||||||
]
|
|
||||||
# TODO, add these with sane versions; were originally in
|
# TODO, add these with sane versions; were originally in
|
||||||
# `requirements-docs.txt`..
|
# `requirements-docs.txt`..
|
||||||
# docs = [
|
# docs = [
|
||||||
# "sphinx>="
|
# "sphinx>="
|
||||||
# "sphinx_book_theme>="
|
# "sphinx_book_theme>="
|
||||||
# ]
|
# ]
|
||||||
|
|
||||||
# ------ dependency-groups ------
|
# ------ dependency-groups ------
|
||||||
|
|
||||||
[tool.uv.dependency-groups]
|
# ------ dependency-groups ------
|
||||||
# for subints, we require 3.14+ due to 2 issues,
|
|
||||||
# - hanging behaviour for various multi-task teardown cases (see
|
|
||||||
# "Availability" section in the `tractor.spawn._subints` doc string).
|
|
||||||
# - `msgspec` support which is outstanding per PEP 684 upstream tracker:
|
|
||||||
# https://github.com/jcrist/msgspec/issues/563
|
|
||||||
#
|
|
||||||
# https://docs.astral.sh/uv/concepts/projects/dependencies/#group-requires-python
|
|
||||||
subints = {requires-python = ">=3.14"}
|
|
||||||
eventfd = {requires-python = ">=3.13, <3.14"}
|
|
||||||
sync_pause = {requires-python = ">=3.13, <3.14"}
|
|
||||||
|
|
||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
# XXX NOTE, only for @goodboy's hacking on `pprint(sort_dicts=False)`
|
# XXX NOTE, only for @goodboy's hacking on `pprint(sort_dicts=False)`
|
||||||
# for the `pp` alias..
|
# for the `pp` alias..
|
||||||
# ------ gh upstream ------
|
# pdbp = { path = "../pdbp", editable = true }
|
||||||
# xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
|
|
||||||
# ^ https://github.com/xonsh/xonsh/pull/6048
|
|
||||||
# xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
|
|
||||||
xonsh = { path = "../xonsh", editable = true }
|
|
||||||
|
|
||||||
# [tool.uv.sources.pdbp]
|
|
||||||
# XXX, in case we need to tmp patch again.
|
|
||||||
# git = "https://github.com/goodboy/pdbp.git"
|
|
||||||
# branch ="repair_stack_trace_frame_indexing"
|
|
||||||
# path = "../pdbp"
|
|
||||||
# editable = true
|
|
||||||
|
|
||||||
# ------ tool.uv.sources ------
|
# ------ tool.uv.sources ------
|
||||||
# TODO, distributed (multi-host) extensions
|
# TODO, distributed (multi-host) extensions
|
||||||
|
|
@ -198,8 +145,6 @@ all_bullets = true
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
minversion = '6.0'
|
minversion = '6.0'
|
||||||
timeout = 200 # per-test hard limit
|
|
||||||
# https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
|
|
||||||
testpaths = [
|
testpaths = [
|
||||||
'tests'
|
'tests'
|
||||||
]
|
]
|
||||||
|
|
@ -210,40 +155,10 @@ addopts = [
|
||||||
'--import-mode=importlib',
|
'--import-mode=importlib',
|
||||||
# don't show frickin captured logs AGAIN in the report..
|
# don't show frickin captured logs AGAIN in the report..
|
||||||
'--show-capture=no',
|
'--show-capture=no',
|
||||||
|
|
||||||
# sys-level capture. REQUIRED for fork-based spawn
|
|
||||||
# backends (e.g. `subint_forkserver`): default
|
|
||||||
# `--capture=fd` redirects fd 1,2 to temp files, and fork
|
|
||||||
# children inherit those fds — opaque deadlocks happen in
|
|
||||||
# the pytest-capture-machinery ↔ fork-child stdio
|
|
||||||
# interaction. `--capture=sys` only redirects Python-level
|
|
||||||
# `sys.stdout`/`sys.stderr`, leaving fd 1,2 alone.
|
|
||||||
#
|
|
||||||
# Trade-off (vs. `--capture=fd`):
|
|
||||||
# - LOST: per-test attribution of subactor *raw-fd* output
|
|
||||||
# (C-ext writes, `os.write(2, ...)`, subproc stdout). Not
|
|
||||||
# zero — those go to the terminal, captured by CI's
|
|
||||||
# terminal-level capture, just not per-test-scoped in the
|
|
||||||
# pytest failure report.
|
|
||||||
# - KEPT: Python-level `print()` + `logging` capture per-
|
|
||||||
# test (tractor's logger uses `sys.stderr`, so tractor
|
|
||||||
# log output IS still attributed per-test).
|
|
||||||
# - KEPT: user `pytest -s` for debugging (unaffected).
|
|
||||||
#
|
|
||||||
# Full post-mortem in
|
|
||||||
# `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`.
|
|
||||||
'--capture=sys',
|
|
||||||
|
|
||||||
# disable `xonsh` plugin
|
|
||||||
# https://docs.pytest.org/en/stable/how-to/plugins.html#disabling-plugins-from-autoloading
|
|
||||||
# https://docs.pytest.org/en/stable/how-to/plugins.html#deactivating-unregistering-a-plugin-by-name
|
|
||||||
'-p no:xonsh'
|
|
||||||
]
|
]
|
||||||
log_cli = false
|
log_cli = false
|
||||||
# TODO: maybe some of these layout choices?
|
# TODO: maybe some of these layout choices?
|
||||||
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
||||||
# pythonpath = "src"
|
# pythonpath = "src"
|
||||||
|
|
||||||
# https://docs.pytest.org/en/stable/reference/reference.html#confval-console_output_style
|
|
||||||
console_output_style = 'progress'
|
|
||||||
# ------ tool.pytest ------
|
# ------ tool.pytest ------
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
# vim: ft=ini
|
||||||
|
# pytest.ini for tractor
|
||||||
|
|
||||||
|
[pytest]
|
||||||
|
# don't show frickin captured logs AGAIN in the report..
|
||||||
|
addopts = --show-capture='no'
|
||||||
|
log_cli = false
|
||||||
|
; minversion = 6.0
|
||||||
|
|
@ -35,8 +35,8 @@ exclude = [
|
||||||
line-length = 88
|
line-length = 88
|
||||||
indent-width = 4
|
indent-width = 4
|
||||||
|
|
||||||
# assume latest minor cpython
|
# Assume Python 3.11
|
||||||
target-version = "py313"
|
target-version = "py311"
|
||||||
|
|
||||||
[lint]
|
[lint]
|
||||||
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
|
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
|
||||||
|
|
|
||||||
|
|
@ -9,11 +9,8 @@ import os
|
||||||
import signal
|
import signal
|
||||||
import platform
|
import platform
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
|
||||||
from typing import Literal
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import tractor
|
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
examples_dir as examples_dir,
|
examples_dir as examples_dir,
|
||||||
tractor_test as tractor_test,
|
tractor_test as tractor_test,
|
||||||
|
|
@ -25,8 +22,6 @@ pytest_plugins: list[str] = [
|
||||||
'tractor._testing.pytest',
|
'tractor._testing.pytest',
|
||||||
]
|
]
|
||||||
|
|
||||||
_ci_env: bool = os.environ.get('CI', False)
|
|
||||||
_non_linux: bool = platform.system() != 'Linux'
|
|
||||||
|
|
||||||
# Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
|
# Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
|
||||||
if platform.system() == 'Windows':
|
if platform.system() == 'Windows':
|
||||||
|
|
@ -39,8 +34,9 @@ else:
|
||||||
_INT_SIGNAL = signal.SIGINT
|
_INT_SIGNAL = signal.SIGINT
|
||||||
_INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
|
_INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
|
||||||
_PROC_SPAWN_WAIT = (
|
_PROC_SPAWN_WAIT = (
|
||||||
2 if _ci_env
|
0.6
|
||||||
else 1
|
if sys.version_info < (3, 7)
|
||||||
|
else 0.4
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -48,80 +44,6 @@ no_windows = pytest.mark.skipif(
|
||||||
platform.system() == "Windows",
|
platform.system() == "Windows",
|
||||||
reason="Test is unsupported on windows",
|
reason="Test is unsupported on windows",
|
||||||
)
|
)
|
||||||
no_macos = pytest.mark.skipif(
|
|
||||||
platform.system() == "Darwin",
|
|
||||||
reason="Test is unsupported on MacOS",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_cpu_state(
|
|
||||||
icpu: int = 0,
|
|
||||||
setting: Literal[
|
|
||||||
'scaling_governor',
|
|
||||||
'*_pstate_max_freq',
|
|
||||||
'scaling_max_freq',
|
|
||||||
# 'scaling_cur_freq',
|
|
||||||
] = '*_pstate_max_freq',
|
|
||||||
) -> tuple[
|
|
||||||
Path,
|
|
||||||
str|int,
|
|
||||||
]|None:
|
|
||||||
'''
|
|
||||||
Attempt to read the (first) CPU's setting according
|
|
||||||
to the set `setting` from under the file-sys,
|
|
||||||
|
|
||||||
/sys/devices/system/cpu/cpu0/cpufreq/{setting}
|
|
||||||
|
|
||||||
Useful to determine latency headroom for various perf affected
|
|
||||||
test suites.
|
|
||||||
|
|
||||||
'''
|
|
||||||
try:
|
|
||||||
# Read the requested `setting` for core `icpu` (usually same for all cores)
|
|
||||||
setting_path: Path = list(
|
|
||||||
Path(f'/sys/devices/system/cpu/cpu{icpu}/cpufreq/')
|
|
||||||
.glob(f'{setting}')
|
|
||||||
)[0] # <- XXX must be single match!
|
|
||||||
with open(
|
|
||||||
setting_path,
|
|
||||||
'r',
|
|
||||||
) as f:
|
|
||||||
return (
|
|
||||||
setting_path,
|
|
||||||
f.read().strip(),
|
|
||||||
)
|
|
||||||
except (FileNotFoundError, IndexError):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def cpu_scaling_factor() -> float:
|
|
||||||
'''
|
|
||||||
Return a latency-headroom multiplier (>= 1.0) reflecting how
|
|
||||||
much to inflate time-limits when CPU-freq scaling is active on
|
|
||||||
linux.
|
|
||||||
|
|
||||||
When no scaling info is available (non-linux, missing sysfs),
|
|
||||||
returns 1.0 (i.e. no headroom adjustment needed).
|
|
||||||
|
|
||||||
'''
|
|
||||||
if _non_linux:
|
|
||||||
return 1.
|
|
||||||
|
|
||||||
mx = get_cpu_state()
|
|
||||||
cur = get_cpu_state(setting='scaling_max_freq')
|
|
||||||
if mx is None or cur is None:
|
|
||||||
return 1.
|
|
||||||
|
|
||||||
_mx_pth, max_freq = mx
|
|
||||||
_cur_pth, cur_freq = cur
|
|
||||||
cpu_scaled: float = int(cur_freq) / int(max_freq)
|
|
||||||
|
|
||||||
if cpu_scaled != 1.:
|
|
||||||
return 1. / (
|
|
||||||
cpu_scaled * 2 # <- bc likely "dual threaded"
|
|
||||||
)
|
|
||||||
|
|
||||||
return 1.
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(
|
def pytest_addoption(
|
||||||
|
|
@ -139,54 +61,16 @@ def pytest_addoption(
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='session', autouse=True)
|
@pytest.fixture(scope='session', autouse=True)
|
||||||
def loglevel(
|
def loglevel(request):
|
||||||
request: pytest.FixtureRequest,
|
|
||||||
) -> str:
|
|
||||||
import tractor
|
import tractor
|
||||||
orig = tractor.log._default_loglevel
|
orig = tractor.log._default_loglevel
|
||||||
level = tractor.log._default_loglevel = request.config.option.loglevel
|
level = tractor.log._default_loglevel = request.config.option.loglevel
|
||||||
log = tractor.log.get_console_log(
|
tractor.log.get_console_log(level)
|
||||||
level=level,
|
|
||||||
name='tractor', # <- enable root logger
|
|
||||||
)
|
|
||||||
log.info(
|
|
||||||
f'Test-harness set runtime loglevel: {level!r}\n'
|
|
||||||
)
|
|
||||||
yield level
|
yield level
|
||||||
tractor.log._default_loglevel = orig
|
tractor.log._default_loglevel = orig
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='function')
|
_ci_env: bool = os.environ.get('CI', False)
|
||||||
def test_log(
|
|
||||||
request: pytest.FixtureRequest,
|
|
||||||
loglevel: str,
|
|
||||||
) -> tractor.log.StackLevelAdapter:
|
|
||||||
'''
|
|
||||||
Deliver a per test-module-fn logger instance for reporting from
|
|
||||||
within actual test bodies/fixtures.
|
|
||||||
|
|
||||||
For example this can be handy to report certain error cases from
|
|
||||||
exception handlers using `test_log.exception()`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
modname: str = request.function.__module__
|
|
||||||
log = tractor.log.get_logger(
|
|
||||||
name=modname, # <- enable root logger
|
|
||||||
# pkg_name='tests',
|
|
||||||
)
|
|
||||||
_log = tractor.log.get_console_log(
|
|
||||||
level=loglevel,
|
|
||||||
logger=log,
|
|
||||||
name=modname,
|
|
||||||
# pkg_name='tests',
|
|
||||||
)
|
|
||||||
_log.debug(
|
|
||||||
f'In-test-logging requested\n'
|
|
||||||
f'test_log.name: {log.name!r}\n'
|
|
||||||
f'level: {loglevel!r}\n'
|
|
||||||
|
|
||||||
)
|
|
||||||
yield _log
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='session')
|
@pytest.fixture(scope='session')
|
||||||
|
|
@ -201,39 +85,15 @@ def ci_env() -> bool:
|
||||||
def sig_prog(
|
def sig_prog(
|
||||||
proc: subprocess.Popen,
|
proc: subprocess.Popen,
|
||||||
sig: int,
|
sig: int,
|
||||||
canc_timeout: float = 0.2,
|
canc_timeout: float = 0.1,
|
||||||
tries: int = 3,
|
|
||||||
) -> int:
|
) -> int:
|
||||||
'''
|
"Kill the actor-process with ``sig``."
|
||||||
Kill the actor-process with `sig`.
|
proc.send_signal(sig)
|
||||||
|
time.sleep(canc_timeout)
|
||||||
Prefer to kill with the provided signal and
|
if not proc.poll():
|
||||||
failing a `canc_timeout`, send a `SIGKILL`-like
|
|
||||||
to ensure termination.
|
|
||||||
|
|
||||||
'''
|
|
||||||
for i in range(tries):
|
|
||||||
proc.send_signal(sig)
|
|
||||||
if proc.poll() is None:
|
|
||||||
print(
|
|
||||||
f'WARNING, proc still alive after,\n'
|
|
||||||
f'canc_timeout={canc_timeout!r}\n'
|
|
||||||
f'sig={sig!r}\n'
|
|
||||||
f'\n'
|
|
||||||
f'{proc.args!r}\n'
|
|
||||||
)
|
|
||||||
time.sleep(canc_timeout)
|
|
||||||
else:
|
|
||||||
# TODO: why sometimes does SIGINT not work on teardown?
|
# TODO: why sometimes does SIGINT not work on teardown?
|
||||||
# seems to happen only when trace logging enabled?
|
# seems to happen only when trace logging enabled?
|
||||||
if proc.poll() is None:
|
proc.send_signal(_KILL_SIGNAL)
|
||||||
print(
|
|
||||||
f'XXX WARNING KILLING PROG WITH SIGINT XXX\n'
|
|
||||||
f'canc_timeout={canc_timeout!r}\n'
|
|
||||||
f'{proc.args!r}\n'
|
|
||||||
)
|
|
||||||
proc.send_signal(_KILL_SIGNAL)
|
|
||||||
|
|
||||||
ret: int = proc.wait()
|
ret: int = proc.wait()
|
||||||
assert ret
|
assert ret
|
||||||
|
|
||||||
|
|
@ -246,8 +106,6 @@ def daemon(
|
||||||
testdir: pytest.Pytester,
|
testdir: pytest.Pytester,
|
||||||
reg_addr: tuple[str, int],
|
reg_addr: tuple[str, int],
|
||||||
tpt_proto: str,
|
tpt_proto: str,
|
||||||
ci_env: bool,
|
|
||||||
test_log: tractor.log.StackLevelAdapter,
|
|
||||||
|
|
||||||
) -> subprocess.Popen:
|
) -> subprocess.Popen:
|
||||||
'''
|
'''
|
||||||
|
|
@ -263,12 +121,10 @@ def daemon(
|
||||||
"import tractor; "
|
"import tractor; "
|
||||||
"tractor.run_daemon([], "
|
"tractor.run_daemon([], "
|
||||||
"registry_addrs={reg_addrs}, "
|
"registry_addrs={reg_addrs}, "
|
||||||
"enable_transports={enable_tpts}, "
|
|
||||||
"debug_mode={debug_mode}, "
|
"debug_mode={debug_mode}, "
|
||||||
"loglevel={ll})"
|
"loglevel={ll})"
|
||||||
).format(
|
).format(
|
||||||
reg_addrs=str([reg_addr]),
|
reg_addrs=str([reg_addr]),
|
||||||
enable_tpts=str([tpt_proto]),
|
|
||||||
ll="'{}'".format(loglevel) if loglevel else None,
|
ll="'{}'".format(loglevel) if loglevel else None,
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
)
|
)
|
||||||
|
|
@ -287,25 +143,13 @@ def daemon(
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO! we should poll for the registry socket-bind to take place
|
|
||||||
# and only once that's done yield to the requester!
|
|
||||||
# -[ ] TCP: use the `._root.open_root_actor()`::`ping_tpt_socket()`
|
|
||||||
# closure!
|
|
||||||
# -[ ] UDS: can we do something similar for "pinging" the
|
|
||||||
# file-socket?
|
|
||||||
#
|
|
||||||
global _PROC_SPAWN_WAIT
|
|
||||||
# UDS sockets are **really** fast to bind()/listen()/connect()
|
# UDS sockets are **really** fast to bind()/listen()/connect()
|
||||||
# so it's often required that we delay a bit more starting
|
# so it's often required that we delay a bit more starting
|
||||||
# the first actor-tree..
|
# the first actor-tree..
|
||||||
if tpt_proto == 'uds':
|
if tpt_proto == 'uds':
|
||||||
_PROC_SPAWN_WAIT += 1.6
|
global _PROC_SPAWN_WAIT
|
||||||
|
_PROC_SPAWN_WAIT = 0.6
|
||||||
|
|
||||||
if _non_linux and ci_env:
|
|
||||||
_PROC_SPAWN_WAIT += 1
|
|
||||||
|
|
||||||
# XXX, allow time for the sub-py-proc to boot up.
|
|
||||||
# !TODO, see ping-polling ideas above!
|
|
||||||
time.sleep(_PROC_SPAWN_WAIT)
|
time.sleep(_PROC_SPAWN_WAIT)
|
||||||
|
|
||||||
assert not proc.returncode
|
assert not proc.returncode
|
||||||
|
|
@ -315,30 +159,18 @@ def daemon(
|
||||||
# XXX! yeah.. just be reaaal careful with this bc sometimes it
|
# XXX! yeah.. just be reaaal careful with this bc sometimes it
|
||||||
# can lock up on the `_io.BufferedReader` and hang..
|
# can lock up on the `_io.BufferedReader` and hang..
|
||||||
stderr: str = proc.stderr.read().decode()
|
stderr: str = proc.stderr.read().decode()
|
||||||
stdout: str = proc.stdout.read().decode()
|
if stderr:
|
||||||
if (
|
|
||||||
stderr
|
|
||||||
or
|
|
||||||
stdout
|
|
||||||
):
|
|
||||||
print(
|
print(
|
||||||
f'Daemon actor tree produced output:\n'
|
f'Daemon actor tree produced STDERR:\n'
|
||||||
f'{proc.args}\n'
|
f'{proc.args}\n'
|
||||||
f'\n'
|
f'\n'
|
||||||
f'stderr: {stderr!r}\n'
|
f'{stderr}\n'
|
||||||
f'stdout: {stdout!r}\n'
|
|
||||||
)
|
)
|
||||||
|
if proc.returncode != -2:
|
||||||
if (rc := proc.returncode) != -2:
|
raise RuntimeError(
|
||||||
msg: str = (
|
'Daemon actor tree failed !?\n'
|
||||||
f'Daemon actor tree was not cancelled !?\n'
|
f'{proc.args}\n'
|
||||||
f'proc.args: {proc.args!r}\n'
|
|
||||||
f'proc.returncode: {rc!r}\n'
|
|
||||||
)
|
)
|
||||||
if rc < 0:
|
|
||||||
raise RuntimeError(msg)
|
|
||||||
|
|
||||||
test_log.error(msg)
|
|
||||||
|
|
||||||
|
|
||||||
# @pytest.fixture(autouse=True)
|
# @pytest.fixture(autouse=True)
|
||||||
|
|
|
||||||
|
|
@ -3,8 +3,6 @@
|
||||||
|
|
||||||
'''
|
'''
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import platform
|
|
||||||
import signal
|
|
||||||
import time
|
import time
|
||||||
from typing import (
|
from typing import (
|
||||||
Callable,
|
Callable,
|
||||||
|
|
@ -34,23 +32,9 @@ if TYPE_CHECKING:
|
||||||
from pexpect import pty_spawn
|
from pexpect import pty_spawn
|
||||||
|
|
||||||
|
|
||||||
_non_linux: bool = platform.system() != 'Linux'
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_configure(config):
|
|
||||||
# register custom marks to avoid warnings see,
|
|
||||||
# https://docs.pytest.org/en/stable/how-to/writing_plugins.html#registering-custom-markers
|
|
||||||
config.addinivalue_line(
|
|
||||||
'markers',
|
|
||||||
'ctlcs_bish: test will (likely) not behave under SIGINT..'
|
|
||||||
)
|
|
||||||
|
|
||||||
# a fn that sub-instantiates a `pexpect.spawn()`
|
# a fn that sub-instantiates a `pexpect.spawn()`
|
||||||
# and returns it.
|
# and returns it.
|
||||||
type PexpectSpawner = Callable[
|
type PexpectSpawner = Callable[[str], pty_spawn.spawn]
|
||||||
[str],
|
|
||||||
pty_spawn.spawn,
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
@ -80,65 +64,26 @@ def spawn(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
import os
|
import os
|
||||||
# disable colored tbs
|
|
||||||
os.environ['PYTHON_COLORS'] = '0'
|
os.environ['PYTHON_COLORS'] = '0'
|
||||||
# disable all ANSI color output
|
|
||||||
# os.environ['NO_COLOR'] = '1'
|
|
||||||
|
|
||||||
spawned: PexpectSpawner|None = None
|
|
||||||
|
|
||||||
def _spawn(
|
def _spawn(
|
||||||
cmd: str,
|
cmd: str,
|
||||||
expect_timeout: float = 4,
|
|
||||||
**mkcmd_kwargs,
|
**mkcmd_kwargs,
|
||||||
) -> pty_spawn.spawn:
|
) -> pty_spawn.spawn:
|
||||||
nonlocal spawned
|
|
||||||
unset_colors()
|
unset_colors()
|
||||||
spawned = testdir.spawn(
|
return testdir.spawn(
|
||||||
cmd=mk_cmd(
|
cmd=mk_cmd(
|
||||||
cmd,
|
cmd,
|
||||||
**mkcmd_kwargs,
|
**mkcmd_kwargs,
|
||||||
),
|
),
|
||||||
expect_timeout=(timeout:=(
|
expect_timeout=3,
|
||||||
expect_timeout + 6
|
|
||||||
if _non_linux and _ci_env
|
|
||||||
else expect_timeout
|
|
||||||
)),
|
|
||||||
# preexec_fn=unset_colors,
|
# preexec_fn=unset_colors,
|
||||||
# ^TODO? get `pytest` core to expose underlying
|
# ^TODO? get `pytest` core to expose underlying
|
||||||
# `pexpect.spawn()` stuff?
|
# `pexpect.spawn()` stuff?
|
||||||
)
|
)
|
||||||
# sanity
|
|
||||||
assert spawned.timeout == timeout
|
|
||||||
return spawned
|
|
||||||
|
|
||||||
# such that test-dep can pass input script name.
|
# such that test-dep can pass input script name.
|
||||||
yield _spawn # the `PexpectSpawner`, type alias.
|
return _spawn # the `PexpectSpawner`, type alias.
|
||||||
|
|
||||||
if (
|
|
||||||
spawned
|
|
||||||
and
|
|
||||||
(ptyproc := spawned.ptyproc)
|
|
||||||
):
|
|
||||||
start: float = time.time()
|
|
||||||
timeout: float = 5
|
|
||||||
while (
|
|
||||||
ptyproc.isalive()
|
|
||||||
and
|
|
||||||
(
|
|
||||||
(_time_took := (time.time() - start))
|
|
||||||
<
|
|
||||||
timeout
|
|
||||||
)
|
|
||||||
):
|
|
||||||
ptyproc.kill(signal.SIGINT)
|
|
||||||
time.sleep(0.01)
|
|
||||||
|
|
||||||
if ptyproc.isalive():
|
|
||||||
ptyproc.kill(signal.SIGKILL)
|
|
||||||
|
|
||||||
# TODO? ensure we've cleaned up any UDS-paths?
|
|
||||||
# breakpoint()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(
|
@pytest.fixture(
|
||||||
|
|
@ -146,12 +91,13 @@ def spawn(
|
||||||
ids='ctl-c={}'.format,
|
ids='ctl-c={}'.format,
|
||||||
)
|
)
|
||||||
def ctlc(
|
def ctlc(
|
||||||
request: pytest.FixtureRequest,
|
request,
|
||||||
ci_env: bool,
|
ci_env: bool,
|
||||||
|
|
||||||
) -> bool:
|
) -> bool:
|
||||||
|
|
||||||
use_ctlc: bool = request.param
|
use_ctlc = request.param
|
||||||
|
|
||||||
node = request.node
|
node = request.node
|
||||||
markers = node.own_markers
|
markers = node.own_markers
|
||||||
for mark in markers:
|
for mark in markers:
|
||||||
|
|
@ -163,13 +109,7 @@ def ctlc(
|
||||||
'https://github.com/goodboy/tractor/issues/320'
|
'https://github.com/goodboy/tractor/issues/320'
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if mark.name == 'ctlcs_bish':
|
||||||
mark.name == 'ctlcs_bish'
|
|
||||||
and
|
|
||||||
use_ctlc
|
|
||||||
and
|
|
||||||
all(mark.args)
|
|
||||||
):
|
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
f'Test {node} prolly uses something from the stdlib (namely `asyncio`..)\n'
|
f'Test {node} prolly uses something from the stdlib (namely `asyncio`..)\n'
|
||||||
f'The test and/or underlying example script can *sometimes* run fine '
|
f'The test and/or underlying example script can *sometimes* run fine '
|
||||||
|
|
@ -274,13 +214,12 @@ def assert_before(
|
||||||
err_on_false=True,
|
err_on_false=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
return str(child.before.decode())
|
|
||||||
|
|
||||||
|
|
||||||
def do_ctlc(
|
def do_ctlc(
|
||||||
child,
|
child,
|
||||||
count: int = 3,
|
count: int = 3,
|
||||||
delay: float|None = None,
|
delay: float = 0.1,
|
||||||
patt: str|None = None,
|
patt: str|None = None,
|
||||||
|
|
||||||
# expect repl UX to reprint the prompt after every
|
# expect repl UX to reprint the prompt after every
|
||||||
|
|
@ -292,7 +231,6 @@ def do_ctlc(
|
||||||
) -> str|None:
|
) -> str|None:
|
||||||
|
|
||||||
before: str|None = None
|
before: str|None = None
|
||||||
delay = delay or 0.1
|
|
||||||
|
|
||||||
# make sure ctl-c sends don't do anything but repeat output
|
# make sure ctl-c sends don't do anything but repeat output
|
||||||
for _ in range(count):
|
for _ in range(count):
|
||||||
|
|
@ -303,10 +241,7 @@ def do_ctlc(
|
||||||
# if you run this test manually it works just fine..
|
# if you run this test manually it works just fine..
|
||||||
if expect_prompt:
|
if expect_prompt:
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
child.expect(
|
child.expect(PROMPT)
|
||||||
PROMPT,
|
|
||||||
timeout=(child.timeout * 2) if _ci_env else child.timeout,
|
|
||||||
)
|
|
||||||
before = str(child.before.decode())
|
before = str(child.before.decode())
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,9 +37,6 @@ from .conftest import (
|
||||||
in_prompt_msg,
|
in_prompt_msg,
|
||||||
assert_before,
|
assert_before,
|
||||||
)
|
)
|
||||||
from ..conftest import (
|
|
||||||
_ci_env,
|
|
||||||
)
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from ..conftest import PexpectSpawner
|
from ..conftest import PexpectSpawner
|
||||||
|
|
@ -54,14 +51,13 @@ if TYPE_CHECKING:
|
||||||
# - recurrent root errors
|
# - recurrent root errors
|
||||||
|
|
||||||
|
|
||||||
_non_linux: bool = platform.system() != 'Linux'
|
|
||||||
|
|
||||||
if platform.system() == 'Windows':
|
if platform.system() == 'Windows':
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
'Debugger tests have no windows support (yet)',
|
'Debugger tests have no windows support (yet)',
|
||||||
allow_module_level=True,
|
allow_module_level=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# TODO: was trying to do this xfail style but some weird bug i see in CI
|
# TODO: was trying to do this xfail style but some weird bug i see in CI
|
||||||
# that's happening at collect time.. pretty soon gonna dump actions i'm
|
# that's happening at collect time.. pretty soon gonna dump actions i'm
|
||||||
# thinkin...
|
# thinkin...
|
||||||
|
|
@ -197,11 +193,6 @@ def test_root_actor_bp_forever(
|
||||||
child.expect(EOF)
|
child.expect(EOF)
|
||||||
|
|
||||||
|
|
||||||
# skip on non-Linux CI
|
|
||||||
@pytest.mark.ctlcs_bish(
|
|
||||||
_non_linux,
|
|
||||||
_ci_env,
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'do_next',
|
'do_next',
|
||||||
(True, False),
|
(True, False),
|
||||||
|
|
@ -267,11 +258,6 @@ def test_subactor_error(
|
||||||
child.expect(EOF)
|
child.expect(EOF)
|
||||||
|
|
||||||
|
|
||||||
# skip on non-Linux CI
|
|
||||||
@pytest.mark.ctlcs_bish(
|
|
||||||
_non_linux,
|
|
||||||
_ci_env,
|
|
||||||
)
|
|
||||||
def test_subactor_breakpoint(
|
def test_subactor_breakpoint(
|
||||||
spawn,
|
spawn,
|
||||||
ctlc: bool,
|
ctlc: bool,
|
||||||
|
|
@ -494,24 +480,8 @@ def test_multi_daemon_subactors(
|
||||||
stream.
|
stream.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
non_linux = _non_linux
|
|
||||||
if non_linux and ctlc:
|
|
||||||
pytest.skip(
|
|
||||||
'Ctl-c + MacOS is too unreliable/racy for this test..\n'
|
|
||||||
)
|
|
||||||
# !TODO, if someone with more patience than i wants to muck
|
|
||||||
# with the timings on this please feel free to see all the
|
|
||||||
# `non_linux` branching logic i added on my first attempt
|
|
||||||
# below!
|
|
||||||
#
|
|
||||||
# my conclusion was that if i were to run the script
|
|
||||||
# manually, and thus as slowly as a human would, the test
|
|
||||||
# would and should pass as described in this test fn, however
|
|
||||||
# after fighting with it for >= 1hr. i decided more than
|
|
||||||
# likely the more extensive `linux` testing should cover most
|
|
||||||
# regressions.
|
|
||||||
|
|
||||||
child = spawn('multi_daemon_subactors')
|
child = spawn('multi_daemon_subactors')
|
||||||
|
|
||||||
child.expect(PROMPT)
|
child.expect(PROMPT)
|
||||||
|
|
||||||
# there can be a race for which subactor will acquire
|
# there can be a race for which subactor will acquire
|
||||||
|
|
@ -541,19 +511,8 @@ def test_multi_daemon_subactors(
|
||||||
else:
|
else:
|
||||||
raise ValueError('Neither log msg was found !?')
|
raise ValueError('Neither log msg was found !?')
|
||||||
|
|
||||||
non_linux_delay: float = 0.3
|
|
||||||
if ctlc:
|
if ctlc:
|
||||||
do_ctlc(
|
do_ctlc(child)
|
||||||
child,
|
|
||||||
delay=(
|
|
||||||
non_linux_delay
|
|
||||||
if non_linux
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
if non_linux:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
# NOTE: previously since we did not have clobber prevention
|
# NOTE: previously since we did not have clobber prevention
|
||||||
# in the root actor this final resume could result in the debugger
|
# in the root actor this final resume could result in the debugger
|
||||||
|
|
@ -584,66 +543,33 @@ def test_multi_daemon_subactors(
|
||||||
# assert "in use by child ('bp_forever'," in before
|
# assert "in use by child ('bp_forever'," in before
|
||||||
|
|
||||||
if ctlc:
|
if ctlc:
|
||||||
do_ctlc(
|
do_ctlc(child)
|
||||||
child,
|
|
||||||
delay=(
|
|
||||||
non_linux_delay
|
|
||||||
if non_linux
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
if non_linux:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
# expect another breakpoint actor entry
|
# expect another breakpoint actor entry
|
||||||
child.sendline('c')
|
child.sendline('c')
|
||||||
child.expect(PROMPT)
|
child.expect(PROMPT)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
before: str = assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
bp_forev_parts,
|
bp_forev_parts,
|
||||||
)
|
)
|
||||||
except AssertionError:
|
except AssertionError:
|
||||||
before: str = assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
name_error_parts,
|
name_error_parts,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if ctlc:
|
if ctlc:
|
||||||
before: str = do_ctlc(
|
do_ctlc(child)
|
||||||
child,
|
|
||||||
delay=(
|
|
||||||
non_linux_delay
|
|
||||||
if non_linux
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
if non_linux:
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
# should crash with the 2nd name error (simulates
|
# should crash with the 2nd name error (simulates
|
||||||
# a retry) and then the root eventually (boxed) errors
|
# a retry) and then the root eventually (boxed) errors
|
||||||
# after 1 or more further bp actor entries.
|
# after 1 or more further bp actor entries.
|
||||||
|
|
||||||
child.sendline('c')
|
child.sendline('c')
|
||||||
try:
|
child.expect(PROMPT)
|
||||||
child.expect(
|
|
||||||
PROMPT,
|
|
||||||
timeout=3,
|
|
||||||
)
|
|
||||||
except EOF:
|
|
||||||
before: str = child.before.decode()
|
|
||||||
print(
|
|
||||||
f'\n'
|
|
||||||
f'??? NEVER RXED `pdb` PROMPT ???\n'
|
|
||||||
f'\n'
|
|
||||||
f'{before}\n'
|
|
||||||
)
|
|
||||||
raise
|
|
||||||
|
|
||||||
assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
name_error_parts,
|
name_error_parts,
|
||||||
|
|
@ -763,8 +689,7 @@ def test_multi_subactors_root_errors(
|
||||||
|
|
||||||
@has_nested_actors
|
@has_nested_actors
|
||||||
def test_multi_nested_subactors_error_through_nurseries(
|
def test_multi_nested_subactors_error_through_nurseries(
|
||||||
ci_env: bool,
|
spawn,
|
||||||
spawn: PexpectSpawner,
|
|
||||||
|
|
||||||
# TODO: address debugger issue for nested tree:
|
# TODO: address debugger issue for nested tree:
|
||||||
# https://github.com/goodboy/tractor/issues/320
|
# https://github.com/goodboy/tractor/issues/320
|
||||||
|
|
@ -785,29 +710,9 @@ def test_multi_nested_subactors_error_through_nurseries(
|
||||||
|
|
||||||
# timed_out_early: bool = False
|
# timed_out_early: bool = False
|
||||||
|
|
||||||
for (
|
for send_char in itertools.cycle(['c', 'q']):
|
||||||
i,
|
|
||||||
send_char,
|
|
||||||
) in enumerate(itertools.cycle(['c', 'q'])):
|
|
||||||
|
|
||||||
timeout: float = -1
|
|
||||||
if (
|
|
||||||
_non_linux
|
|
||||||
and
|
|
||||||
ci_env
|
|
||||||
):
|
|
||||||
timeout: float = 6
|
|
||||||
|
|
||||||
# XXX linux but the first crash sequence
|
|
||||||
# can take longer to arrive at a prompt.
|
|
||||||
elif i == 0:
|
|
||||||
timeout = 5
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
child.expect(
|
child.expect(PROMPT)
|
||||||
PROMPT,
|
|
||||||
timeout=timeout,
|
|
||||||
)
|
|
||||||
child.sendline(send_char)
|
child.sendline(send_char)
|
||||||
time.sleep(0.01)
|
time.sleep(0.01)
|
||||||
|
|
||||||
|
|
@ -984,11 +889,6 @@ def test_different_debug_mode_per_actor(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# skip on non-Linux CI
|
|
||||||
@pytest.mark.ctlcs_bish(
|
|
||||||
_non_linux,
|
|
||||||
_ci_env,
|
|
||||||
)
|
|
||||||
def test_post_mortem_api(
|
def test_post_mortem_api(
|
||||||
spawn,
|
spawn,
|
||||||
ctlc: bool,
|
ctlc: bool,
|
||||||
|
|
@ -1233,21 +1133,12 @@ def test_ctxep_pauses_n_maybe_ipc_breaks(
|
||||||
# closed so verify we see error reporting as well as
|
# closed so verify we see error reporting as well as
|
||||||
# a failed crash-REPL request msg and can CTL-c our way
|
# a failed crash-REPL request msg and can CTL-c our way
|
||||||
# out.
|
# out.
|
||||||
|
|
||||||
# ?TODO, match depending on `tpt_proto(s)`?
|
|
||||||
# - [ ] how can we pass it into the script tho?
|
|
||||||
tpt: str = 'UDS'
|
|
||||||
if _non_linux:
|
|
||||||
tpt: str = 'TCP'
|
|
||||||
|
|
||||||
assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
['peer IPC channel closed abruptly?',
|
['peer IPC channel closed abruptly?',
|
||||||
'another task closed this fd',
|
'another task closed this fd',
|
||||||
'Debug lock request was CANCELLED?',
|
'Debug lock request was CANCELLED?',
|
||||||
f"'Msgpack{tpt}Stream' was already closed locally?",
|
"TransportClosed: 'MsgpackUDSStream' was already closed locally ?",]
|
||||||
f"TransportClosed: 'Msgpack{tpt}Stream' was already closed 'by peer'?",
|
|
||||||
]
|
|
||||||
|
|
||||||
# XXX races on whether these show/hit?
|
# XXX races on whether these show/hit?
|
||||||
# 'Failed to REPl via `_pause()` You called `tractor.pause()` from an already cancelled scope!',
|
# 'Failed to REPl via `_pause()` You called `tractor.pause()` from an already cancelled scope!',
|
||||||
|
|
|
||||||
|
|
@ -63,9 +63,6 @@ def test_pause_from_sync(
|
||||||
`examples/debugging/sync_bp.py`
|
`examples/debugging/sync_bp.py`
|
||||||
|
|
||||||
'''
|
'''
|
||||||
# XXX required for `breakpoint()` overload and
|
|
||||||
# thus `tractor.devx.pause_from_sync()`.
|
|
||||||
pytest.importorskip('greenback')
|
|
||||||
child = spawn('sync_bp')
|
child = spawn('sync_bp')
|
||||||
|
|
||||||
# first `sync_pause()` after nurseries open
|
# first `sync_pause()` after nurseries open
|
||||||
|
|
@ -263,9 +260,6 @@ def test_sync_pause_from_aio_task(
|
||||||
`examples/debugging/asyncio_bp.py`
|
`examples/debugging/asyncio_bp.py`
|
||||||
|
|
||||||
'''
|
'''
|
||||||
# XXX required for `breakpoint()` overload and
|
|
||||||
# thus `tractor.devx.pause_from_sync()`.
|
|
||||||
pytest.importorskip('greenback')
|
|
||||||
child = spawn('asyncio_bp')
|
child = spawn('asyncio_bp')
|
||||||
|
|
||||||
# RACE on whether trio/asyncio task bps first
|
# RACE on whether trio/asyncio task bps first
|
||||||
|
|
|
||||||
|
|
@ -31,9 +31,6 @@ from .conftest import (
|
||||||
PROMPT,
|
PROMPT,
|
||||||
_pause_msg,
|
_pause_msg,
|
||||||
)
|
)
|
||||||
from ..conftest import (
|
|
||||||
no_macos,
|
|
||||||
)
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from pexpect.exceptions import (
|
from pexpect.exceptions import (
|
||||||
|
|
@ -45,7 +42,6 @@ if TYPE_CHECKING:
|
||||||
from ..conftest import PexpectSpawner
|
from ..conftest import PexpectSpawner
|
||||||
|
|
||||||
|
|
||||||
@no_macos
|
|
||||||
def test_shield_pause(
|
def test_shield_pause(
|
||||||
spawn: PexpectSpawner,
|
spawn: PexpectSpawner,
|
||||||
):
|
):
|
||||||
|
|
@ -61,7 +57,6 @@ def test_shield_pause(
|
||||||
expect(
|
expect(
|
||||||
child,
|
child,
|
||||||
'Yo my child hanging..?',
|
'Yo my child hanging..?',
|
||||||
timeout=3,
|
|
||||||
)
|
)
|
||||||
assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
|
|
@ -126,7 +121,7 @@ def test_shield_pause(
|
||||||
child.pid,
|
child.pid,
|
||||||
signal.SIGINT,
|
signal.SIGINT,
|
||||||
)
|
)
|
||||||
from tractor.runtime._supervise import _shutdown_msg
|
from tractor._supervise import _shutdown_msg
|
||||||
expect(
|
expect(
|
||||||
child,
|
child,
|
||||||
# 'Shutting down actor runtime',
|
# 'Shutting down actor runtime',
|
||||||
|
|
@ -156,10 +151,8 @@ def test_breakpoint_hook_restored(
|
||||||
calls used.
|
calls used.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
# XXX required for `breakpoint()` overload and
|
|
||||||
# thus`tractor.devx.pause_from_sync()`.
|
|
||||||
pytest.importorskip('greenback')
|
|
||||||
child = spawn('restore_builtin_breakpoint')
|
child = spawn('restore_builtin_breakpoint')
|
||||||
|
|
||||||
child.expect(PROMPT)
|
child.expect(PROMPT)
|
||||||
try:
|
try:
|
||||||
assert_before(
|
assert_before(
|
||||||
|
|
|
||||||
|
|
@ -1,376 +0,0 @@
|
||||||
'''
|
|
||||||
Multiaddr construction, parsing, and round-trip tests for
|
|
||||||
`tractor.discovery._multiaddr.mk_maddr()` and
|
|
||||||
`tractor.discovery._multiaddr.parse_maddr()`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
from pathlib import Path
|
|
||||||
from types import SimpleNamespace
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from multiaddr import Multiaddr
|
|
||||||
|
|
||||||
from tractor.ipc._tcp import TCPAddress
|
|
||||||
from tractor.ipc._uds import UDSAddress
|
|
||||||
from tractor.discovery._multiaddr import (
|
|
||||||
mk_maddr,
|
|
||||||
parse_maddr,
|
|
||||||
parse_endpoints,
|
|
||||||
_tpt_proto_to_maddr,
|
|
||||||
_maddr_to_tpt_proto,
|
|
||||||
)
|
|
||||||
from tractor.discovery._addr import wrap_address
|
|
||||||
|
|
||||||
|
|
||||||
def test_tpt_proto_to_maddr_mapping():
|
|
||||||
'''
|
|
||||||
`_tpt_proto_to_maddr` maps all supported `proto_key`
|
|
||||||
values to their correct multiaddr protocol names.
|
|
||||||
|
|
||||||
'''
|
|
||||||
assert _tpt_proto_to_maddr['tcp'] == 'tcp'
|
|
||||||
assert _tpt_proto_to_maddr['uds'] == 'unix'
|
|
||||||
assert len(_tpt_proto_to_maddr) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_mk_maddr_tcp_ipv4():
|
|
||||||
'''
|
|
||||||
`mk_maddr()` on a `TCPAddress` with an IPv4 host
|
|
||||||
produces the correct `/ip4/<host>/tcp/<port>` multiaddr.
|
|
||||||
|
|
||||||
'''
|
|
||||||
addr = TCPAddress('127.0.0.1', 1234)
|
|
||||||
result: Multiaddr = mk_maddr(addr)
|
|
||||||
|
|
||||||
assert isinstance(result, Multiaddr)
|
|
||||||
assert str(result) == '/ip4/127.0.0.1/tcp/1234'
|
|
||||||
|
|
||||||
protos = result.protocols()
|
|
||||||
assert protos[0].name == 'ip4'
|
|
||||||
assert protos[1].name == 'tcp'
|
|
||||||
|
|
||||||
assert result.value_for_protocol('ip4') == '127.0.0.1'
|
|
||||||
assert result.value_for_protocol('tcp') == '1234'
|
|
||||||
|
|
||||||
|
|
||||||
def test_mk_maddr_tcp_ipv6():
|
|
||||||
'''
|
|
||||||
`mk_maddr()` on a `TCPAddress` with an IPv6 host
|
|
||||||
produces the correct `/ip6/<host>/tcp/<port>` multiaddr.
|
|
||||||
|
|
||||||
'''
|
|
||||||
addr = TCPAddress('::1', 5678)
|
|
||||||
result: Multiaddr = mk_maddr(addr)
|
|
||||||
|
|
||||||
assert str(result) == '/ip6/::1/tcp/5678'
|
|
||||||
|
|
||||||
protos = result.protocols()
|
|
||||||
assert protos[0].name == 'ip6'
|
|
||||||
assert protos[1].name == 'tcp'
|
|
||||||
|
|
||||||
|
|
||||||
def test_mk_maddr_uds():
|
|
||||||
'''
|
|
||||||
`mk_maddr()` on a `UDSAddress` produces a `/unix/<path>`
|
|
||||||
multiaddr containing the full socket path.
|
|
||||||
|
|
||||||
'''
|
|
||||||
# NOTE, use an absolute `filedir` to match real runtime
|
|
||||||
# UDS paths; `mk_maddr()` strips the leading `/` to avoid
|
|
||||||
# the double-slash `/unix//run/..` that py-multiaddr
|
|
||||||
# rejects as "empty protocol path".
|
|
||||||
filedir = '/tmp/tractor_test'
|
|
||||||
filename = 'test_sock.sock'
|
|
||||||
addr = UDSAddress(
|
|
||||||
filedir=filedir,
|
|
||||||
filename=filename,
|
|
||||||
)
|
|
||||||
result: Multiaddr = mk_maddr(addr)
|
|
||||||
|
|
||||||
assert isinstance(result, Multiaddr)
|
|
||||||
|
|
||||||
result_str: str = str(result)
|
|
||||||
assert result_str.startswith('/unix/')
|
|
||||||
# verify the leading `/` was stripped to avoid double-slash
|
|
||||||
assert '/unix/tmp/tractor_test/' in result_str
|
|
||||||
|
|
||||||
sockpath_rel: str = str(
|
|
||||||
Path(filedir) / filename
|
|
||||||
).lstrip('/')
|
|
||||||
unix_val: str = result.value_for_protocol('unix')
|
|
||||||
assert unix_val.endswith(sockpath_rel)
|
|
||||||
|
|
||||||
|
|
||||||
def test_mk_maddr_unsupported_proto_key():
|
|
||||||
'''
|
|
||||||
`mk_maddr()` raises `ValueError` for an unsupported
|
|
||||||
`proto_key`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
fake_addr = SimpleNamespace(proto_key='quic')
|
|
||||||
with pytest.raises(
|
|
||||||
ValueError,
|
|
||||||
match='Unsupported proto_key',
|
|
||||||
):
|
|
||||||
mk_maddr(fake_addr)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'addr',
|
|
||||||
[
|
|
||||||
pytest.param(
|
|
||||||
TCPAddress('127.0.0.1', 9999),
|
|
||||||
id='tcp-ipv4',
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
UDSAddress(
|
|
||||||
filedir='/tmp/tractor_rt',
|
|
||||||
filename='roundtrip.sock',
|
|
||||||
),
|
|
||||||
id='uds',
|
|
||||||
),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_mk_maddr_roundtrip(addr):
|
|
||||||
'''
|
|
||||||
`mk_maddr()` output is valid multiaddr syntax that the
|
|
||||||
library can re-parse back into an equivalent `Multiaddr`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
maddr: Multiaddr = mk_maddr(addr)
|
|
||||||
reparsed = Multiaddr(str(maddr))
|
|
||||||
|
|
||||||
assert reparsed == maddr
|
|
||||||
assert str(reparsed) == str(maddr)
|
|
||||||
|
|
||||||
|
|
||||||
# ------ parse_maddr() tests ------
|
|
||||||
|
|
||||||
def test_maddr_to_tpt_proto_mapping():
|
|
||||||
'''
|
|
||||||
`_maddr_to_tpt_proto` is the exact inverse of
|
|
||||||
`_tpt_proto_to_maddr`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
assert _maddr_to_tpt_proto == {
|
|
||||||
'tcp': 'tcp',
|
|
||||||
'unix': 'uds',
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_maddr_tcp_ipv4():
|
|
||||||
'''
|
|
||||||
`parse_maddr()` on an IPv4 TCP multiaddr string
|
|
||||||
produce a `TCPAddress` with the correct host and port.
|
|
||||||
|
|
||||||
'''
|
|
||||||
result = parse_maddr('/ip4/127.0.0.1/tcp/1234')
|
|
||||||
|
|
||||||
assert isinstance(result, TCPAddress)
|
|
||||||
assert result.unwrap() == ('127.0.0.1', 1234)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_maddr_tcp_ipv6():
|
|
||||||
'''
|
|
||||||
`parse_maddr()` on an IPv6 TCP multiaddr string
|
|
||||||
produce a `TCPAddress` with the correct host and port.
|
|
||||||
|
|
||||||
'''
|
|
||||||
result = parse_maddr('/ip6/::1/tcp/5678')
|
|
||||||
|
|
||||||
assert isinstance(result, TCPAddress)
|
|
||||||
assert result.unwrap() == ('::1', 5678)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_maddr_uds():
|
|
||||||
'''
|
|
||||||
`parse_maddr()` on a `/unix/...` multiaddr string
|
|
||||||
produce a `UDSAddress` with the correct dir and filename,
|
|
||||||
preserving absolute path semantics.
|
|
||||||
|
|
||||||
'''
|
|
||||||
result = parse_maddr('/unix/tmp/tractor_test/test.sock')
|
|
||||||
|
|
||||||
assert isinstance(result, UDSAddress)
|
|
||||||
filedir, filename = result.unwrap()
|
|
||||||
assert filename == 'test.sock'
|
|
||||||
assert str(filedir) == '/tmp/tractor_test'
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_maddr_unsupported():
|
|
||||||
'''
|
|
||||||
`parse_maddr()` raise `ValueError` for an unsupported
|
|
||||||
protocol combination like UDP.
|
|
||||||
|
|
||||||
'''
|
|
||||||
with pytest.raises(
|
|
||||||
ValueError,
|
|
||||||
match='Unsupported multiaddr protocol combo',
|
|
||||||
):
|
|
||||||
parse_maddr('/ip4/127.0.0.1/udp/1234')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'addr',
|
|
||||||
[
|
|
||||||
pytest.param(
|
|
||||||
TCPAddress('127.0.0.1', 9999),
|
|
||||||
id='tcp-ipv4',
|
|
||||||
),
|
|
||||||
pytest.param(
|
|
||||||
UDSAddress(
|
|
||||||
filedir='/tmp/tractor_rt',
|
|
||||||
filename='roundtrip.sock',
|
|
||||||
),
|
|
||||||
id='uds',
|
|
||||||
),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_parse_maddr_roundtrip(addr):
|
|
||||||
'''
|
|
||||||
Full round-trip: `addr -> mk_maddr -> str -> parse_maddr`
|
|
||||||
produce an `Address` whose `.unwrap()` matches the original.
|
|
||||||
|
|
||||||
'''
|
|
||||||
maddr: Multiaddr = mk_maddr(addr)
|
|
||||||
maddr_str: str = str(maddr)
|
|
||||||
parsed = parse_maddr(maddr_str)
|
|
||||||
|
|
||||||
assert type(parsed) is type(addr)
|
|
||||||
assert parsed.unwrap() == addr.unwrap()
|
|
||||||
|
|
||||||
|
|
||||||
def test_wrap_address_maddr_str():
|
|
||||||
'''
|
|
||||||
`wrap_address()` accept a multiaddr-format string and
|
|
||||||
return the correct `Address` type.
|
|
||||||
|
|
||||||
'''
|
|
||||||
result = wrap_address('/ip4/127.0.0.1/tcp/9999')
|
|
||||||
|
|
||||||
assert isinstance(result, TCPAddress)
|
|
||||||
assert result.unwrap() == ('127.0.0.1', 9999)
|
|
||||||
|
|
||||||
|
|
||||||
# ------ parse_endpoints() tests ------
|
|
||||||
|
|
||||||
def test_parse_endpoints_tcp_only():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` with a single TCP maddr per actor
|
|
||||||
produce the correct `TCPAddress` instances.
|
|
||||||
|
|
||||||
'''
|
|
||||||
table = {
|
|
||||||
'registry': ['/ip4/127.0.0.1/tcp/1616'],
|
|
||||||
'data_feed': ['/ip4/0.0.0.0/tcp/5555'],
|
|
||||||
}
|
|
||||||
result = parse_endpoints(table)
|
|
||||||
|
|
||||||
assert set(result.keys()) == {'registry', 'data_feed'}
|
|
||||||
|
|
||||||
reg_addr = result['registry'][0]
|
|
||||||
assert isinstance(reg_addr, TCPAddress)
|
|
||||||
assert reg_addr.unwrap() == ('127.0.0.1', 1616)
|
|
||||||
|
|
||||||
feed_addr = result['data_feed'][0]
|
|
||||||
assert isinstance(feed_addr, TCPAddress)
|
|
||||||
assert feed_addr.unwrap() == ('0.0.0.0', 5555)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_endpoints_mixed_tpts():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` with both TCP and UDS maddrs for
|
|
||||||
the same actor produce the correct mixed `Address` list.
|
|
||||||
|
|
||||||
'''
|
|
||||||
table = {
|
|
||||||
'broker': [
|
|
||||||
'/ip4/127.0.0.1/tcp/4040',
|
|
||||||
'/unix/tmp/tractor/broker.sock',
|
|
||||||
],
|
|
||||||
}
|
|
||||||
result = parse_endpoints(table)
|
|
||||||
addrs = result['broker']
|
|
||||||
|
|
||||||
assert len(addrs) == 2
|
|
||||||
assert isinstance(addrs[0], TCPAddress)
|
|
||||||
assert addrs[0].unwrap() == ('127.0.0.1', 4040)
|
|
||||||
|
|
||||||
assert isinstance(addrs[1], UDSAddress)
|
|
||||||
filedir, filename = addrs[1].unwrap()
|
|
||||||
assert filename == 'broker.sock'
|
|
||||||
assert str(filedir) == '/tmp/tractor'
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_endpoints_unwrapped_tuples():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` accept raw `(host, port)` tuples
|
|
||||||
and wrap them as `TCPAddress`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
table = {
|
|
||||||
'ems': [('127.0.0.1', 6666)],
|
|
||||||
}
|
|
||||||
result = parse_endpoints(table)
|
|
||||||
|
|
||||||
addr = result['ems'][0]
|
|
||||||
assert isinstance(addr, TCPAddress)
|
|
||||||
assert addr.unwrap() == ('127.0.0.1', 6666)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_endpoints_mixed_str_and_tuple():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` accept a mix of maddr strings and
|
|
||||||
raw tuples in the same actor entry list.
|
|
||||||
|
|
||||||
'''
|
|
||||||
table = {
|
|
||||||
'quoter': [
|
|
||||||
'/ip4/127.0.0.1/tcp/7777',
|
|
||||||
('127.0.0.1', 8888),
|
|
||||||
],
|
|
||||||
}
|
|
||||||
result = parse_endpoints(table)
|
|
||||||
addrs = result['quoter']
|
|
||||||
|
|
||||||
assert len(addrs) == 2
|
|
||||||
assert isinstance(addrs[0], TCPAddress)
|
|
||||||
assert addrs[0].unwrap() == ('127.0.0.1', 7777)
|
|
||||||
|
|
||||||
assert isinstance(addrs[1], TCPAddress)
|
|
||||||
assert addrs[1].unwrap() == ('127.0.0.1', 8888)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_endpoints_unsupported_proto():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` raise `ValueError` when a maddr
|
|
||||||
string uses an unsupported protocol like `/udp/`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
table = {
|
|
||||||
'bad_actor': ['/ip4/127.0.0.1/udp/9999'],
|
|
||||||
}
|
|
||||||
with pytest.raises(
|
|
||||||
ValueError,
|
|
||||||
match='Unsupported multiaddr protocol combo',
|
|
||||||
):
|
|
||||||
parse_endpoints(table)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_endpoints_empty_table():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` on an empty table return an empty
|
|
||||||
dict.
|
|
||||||
|
|
||||||
'''
|
|
||||||
assert parse_endpoints({}) == {}
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_endpoints_empty_actor_list():
|
|
||||||
'''
|
|
||||||
`parse_endpoints()` with an actor mapped to an empty
|
|
||||||
list preserve the key with an empty list value.
|
|
||||||
|
|
||||||
'''
|
|
||||||
result = parse_endpoints({'x': []})
|
|
||||||
assert result == {'x': []}
|
|
||||||
|
|
@ -1,345 +0,0 @@
|
||||||
'''
|
|
||||||
`open_root_actor(tpt_bind_addrs=...)` test suite.
|
|
||||||
|
|
||||||
Verify all three runtime code paths for explicit IPC-server
|
|
||||||
bind-address selection in `_root.py`:
|
|
||||||
|
|
||||||
1. Non-registrar, no explicit bind -> random addrs from registry proto
|
|
||||||
2. Registrar, no explicit bind -> binds to registry_addrs
|
|
||||||
3. Explicit bind given -> wraps via `wrap_address()` and uses them
|
|
||||||
|
|
||||||
'''
|
|
||||||
import pytest
|
|
||||||
import trio
|
|
||||||
import tractor
|
|
||||||
from tractor.discovery._addr import (
|
|
||||||
wrap_address,
|
|
||||||
)
|
|
||||||
from tractor.discovery._multiaddr import mk_maddr
|
|
||||||
from tractor._testing.addr import get_rando_addr
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# helpers
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
def _bound_bindspaces(
|
|
||||||
actor: tractor.Actor,
|
|
||||||
) -> set[str]:
|
|
||||||
'''
|
|
||||||
Collect the set of bindspace strings from the actor's
|
|
||||||
currently bound IPC-server accept addresses.
|
|
||||||
|
|
||||||
'''
|
|
||||||
return {
|
|
||||||
wrap_address(a).bindspace
|
|
||||||
for a in actor.accept_addrs
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _bound_wrapped(
|
|
||||||
actor: tractor.Actor,
|
|
||||||
) -> list:
|
|
||||||
'''
|
|
||||||
Return the actor's accept addrs as wrapped `Address` objects.
|
|
||||||
|
|
||||||
'''
|
|
||||||
return [
|
|
||||||
wrap_address(a)
|
|
||||||
for a in actor.accept_addrs
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# 1) Registrar + explicit tpt_bind_addrs
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'addr_combo',
|
|
||||||
[
|
|
||||||
'bind-eq-reg',
|
|
||||||
'bind-subset-reg',
|
|
||||||
'bind-disjoint-reg',
|
|
||||||
],
|
|
||||||
ids=lambda v: v,
|
|
||||||
)
|
|
||||||
def test_registrar_root_tpt_bind_addrs(
|
|
||||||
reg_addr: tuple,
|
|
||||||
tpt_proto: str,
|
|
||||||
debug_mode: bool,
|
|
||||||
addr_combo: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Registrar root-actor with explicit `tpt_bind_addrs`:
|
|
||||||
bound set must include all registry + all bind addr bindspaces
|
|
||||||
(merge behavior).
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_wrapped = wrap_address(reg_addr)
|
|
||||||
|
|
||||||
if addr_combo == 'bind-eq-reg':
|
|
||||||
bind_addrs = [reg_addr]
|
|
||||||
# extra secondary reg addr for subset test
|
|
||||||
extra_reg = []
|
|
||||||
|
|
||||||
elif addr_combo == 'bind-subset-reg':
|
|
||||||
second_reg = get_rando_addr(tpt_proto)
|
|
||||||
bind_addrs = [reg_addr]
|
|
||||||
extra_reg = [second_reg]
|
|
||||||
|
|
||||||
elif addr_combo == 'bind-disjoint-reg':
|
|
||||||
# port=0 on same host -> completely different addr
|
|
||||||
rando = wrap_address(reg_addr).get_random(
|
|
||||||
bindspace=reg_wrapped.bindspace,
|
|
||||||
)
|
|
||||||
bind_addrs = [rando.unwrap()]
|
|
||||||
extra_reg = []
|
|
||||||
|
|
||||||
all_reg = [reg_addr] + extra_reg
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_root_actor(
|
|
||||||
registry_addrs=all_reg,
|
|
||||||
tpt_bind_addrs=bind_addrs,
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
assert actor.is_registrar
|
|
||||||
|
|
||||||
bound = actor.accept_addrs
|
|
||||||
bound_bs = _bound_bindspaces(actor)
|
|
||||||
|
|
||||||
# all registry bindspaces must appear in bound set
|
|
||||||
for ra in all_reg:
|
|
||||||
assert wrap_address(ra).bindspace in bound_bs
|
|
||||||
|
|
||||||
# all bind-addr bindspaces must appear
|
|
||||||
for ba in bind_addrs:
|
|
||||||
assert wrap_address(ba).bindspace in bound_bs
|
|
||||||
|
|
||||||
# registry addr must appear verbatim in bound
|
|
||||||
# (after wrapping both sides for comparison)
|
|
||||||
bound_w = _bound_wrapped(actor)
|
|
||||||
assert reg_wrapped in bound_w
|
|
||||||
|
|
||||||
if addr_combo == 'bind-disjoint-reg':
|
|
||||||
assert len(bound) >= 2
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'addr_combo',
|
|
||||||
[
|
|
||||||
'bind-same-bindspace',
|
|
||||||
'bind-disjoint',
|
|
||||||
],
|
|
||||||
ids=lambda v: v,
|
|
||||||
)
|
|
||||||
def test_non_registrar_root_tpt_bind_addrs(
|
|
||||||
daemon,
|
|
||||||
reg_addr: tuple,
|
|
||||||
tpt_proto: str,
|
|
||||||
debug_mode: bool,
|
|
||||||
addr_combo: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Non-registrar root with explicit `tpt_bind_addrs`:
|
|
||||||
bound set must exactly match the requested bind addrs
|
|
||||||
(no merge with registry).
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_wrapped = wrap_address(reg_addr)
|
|
||||||
|
|
||||||
if addr_combo == 'bind-same-bindspace':
|
|
||||||
# same bindspace as reg but port=0 so we get a random port
|
|
||||||
rando = reg_wrapped.get_random(
|
|
||||||
bindspace=reg_wrapped.bindspace,
|
|
||||||
)
|
|
||||||
bind_addrs = [rando.unwrap()]
|
|
||||||
|
|
||||||
elif addr_combo == 'bind-disjoint':
|
|
||||||
rando = reg_wrapped.get_random(
|
|
||||||
bindspace=reg_wrapped.bindspace,
|
|
||||||
)
|
|
||||||
bind_addrs = [rando.unwrap()]
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_root_actor(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
tpt_bind_addrs=bind_addrs,
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
assert not actor.is_registrar
|
|
||||||
|
|
||||||
bound = actor.accept_addrs
|
|
||||||
assert len(bound) == len(bind_addrs)
|
|
||||||
|
|
||||||
# bindspaces must match
|
|
||||||
bound_bs = _bound_bindspaces(actor)
|
|
||||||
for ba in bind_addrs:
|
|
||||||
assert wrap_address(ba).bindspace in bound_bs
|
|
||||||
|
|
||||||
# TCP port=0 should resolve to a real port
|
|
||||||
for uw_addr in bound:
|
|
||||||
w = wrap_address(uw_addr)
|
|
||||||
if w.proto_key == 'tcp':
|
|
||||||
_host, port = uw_addr
|
|
||||||
assert port > 0
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# 3) Non-registrar, default random bind (baseline)
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
def test_non_registrar_default_random_bind(
|
|
||||||
daemon,
|
|
||||||
reg_addr: tuple,
|
|
||||||
debug_mode: bool,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Baseline: no `tpt_bind_addrs`, daemon running.
|
|
||||||
Bound bindspace matches registry bindspace,
|
|
||||||
but bound addr differs from reg_addr (random).
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_wrapped = wrap_address(reg_addr)
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_root_actor(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
assert not actor.is_registrar
|
|
||||||
|
|
||||||
bound_bs = _bound_bindspaces(actor)
|
|
||||||
assert reg_wrapped.bindspace in bound_bs
|
|
||||||
|
|
||||||
# bound addr should differ from the registry addr
|
|
||||||
# (the runtime picks a random port/path)
|
|
||||||
bound_w = _bound_wrapped(actor)
|
|
||||||
assert reg_wrapped not in bound_w
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# 4) Multiaddr string input
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
def test_tpt_bind_addrs_as_maddr_str(
|
|
||||||
reg_addr: tuple,
|
|
||||||
debug_mode: bool,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Pass multiaddr strings as `tpt_bind_addrs`.
|
|
||||||
Runtime should parse and bind successfully.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_wrapped = wrap_address(reg_addr)
|
|
||||||
# build a port-0 / random maddr string for binding
|
|
||||||
rando = reg_wrapped.get_random(
|
|
||||||
bindspace=reg_wrapped.bindspace,
|
|
||||||
)
|
|
||||||
maddr_str: str = str(mk_maddr(rando))
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_root_actor(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
tpt_bind_addrs=[maddr_str],
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
assert actor.is_registrar
|
|
||||||
|
|
||||||
for uw_addr in actor.accept_addrs:
|
|
||||||
w = wrap_address(uw_addr)
|
|
||||||
if w.proto_key == 'tcp':
|
|
||||||
_host, port = uw_addr
|
|
||||||
assert port > 0
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# 5) Registrar merge produces union of binds
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
def test_registrar_merge_binds_union(
|
|
||||||
tpt_proto: str,
|
|
||||||
debug_mode: bool,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Registrar + disjoint bind addr: bound set must include
|
|
||||||
both registry and explicit bind addresses.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_addr = get_rando_addr(tpt_proto)
|
|
||||||
reg_wrapped = wrap_address(reg_addr)
|
|
||||||
|
|
||||||
rando = reg_wrapped.get_random(
|
|
||||||
bindspace=reg_wrapped.bindspace,
|
|
||||||
)
|
|
||||||
bind_addrs = [rando.unwrap()]
|
|
||||||
|
|
||||||
# NOTE: for UDS, `get_random()` produces the same
|
|
||||||
# filename for the same pid+actor-state, so the
|
|
||||||
# "disjoint" premise only holds when the addrs
|
|
||||||
# actually differ (always true for TCP, may
|
|
||||||
# collide for UDS).
|
|
||||||
expect_disjoint: bool = (
|
|
||||||
tuple(reg_addr) != rando.unwrap()
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_root_actor(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
tpt_bind_addrs=bind_addrs,
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
assert actor.is_registrar
|
|
||||||
|
|
||||||
bound = actor.accept_addrs
|
|
||||||
bound_w = _bound_wrapped(actor)
|
|
||||||
|
|
||||||
if expect_disjoint:
|
|
||||||
# must have at least 2 (registry + bind)
|
|
||||||
assert len(bound) >= 2
|
|
||||||
|
|
||||||
# registry addr must appear in bound set
|
|
||||||
assert reg_wrapped in bound_w
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
# 6) open_nursery forwards tpt_bind_addrs
|
|
||||||
# ------------------------------------------------------------------
|
|
||||||
def test_open_nursery_forwards_tpt_bind_addrs(
|
|
||||||
reg_addr: tuple,
|
|
||||||
debug_mode: bool,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
`open_nursery(tpt_bind_addrs=...)` forwards through
|
|
||||||
`**kwargs` to `open_root_actor()`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_wrapped = wrap_address(reg_addr)
|
|
||||||
rando = reg_wrapped.get_random(
|
|
||||||
bindspace=reg_wrapped.bindspace,
|
|
||||||
)
|
|
||||||
bind_addrs = [rando.unwrap()]
|
|
||||||
|
|
||||||
async def _main():
|
|
||||||
async with tractor.open_nursery(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
tpt_bind_addrs=bind_addrs,
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
bound_bs = _bound_bindspaces(actor)
|
|
||||||
|
|
||||||
for ba in bind_addrs:
|
|
||||||
assert wrap_address(ba).bindspace in bound_bs
|
|
||||||
|
|
||||||
trio.run(_main)
|
|
||||||
|
|
@ -8,16 +8,17 @@ from pathlib import Path
|
||||||
import pytest
|
import pytest
|
||||||
import trio
|
import trio
|
||||||
import tractor
|
import tractor
|
||||||
from tractor import Actor
|
from tractor import (
|
||||||
from tractor.runtime import _state
|
Actor,
|
||||||
from tractor.discovery import _addr
|
_state,
|
||||||
|
_addr,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def bindspace_dir_str() -> str:
|
def bindspace_dir_str() -> str:
|
||||||
|
|
||||||
from tractor.runtime._state import get_rt_dir
|
rt_dir: Path = tractor._state.get_rt_dir()
|
||||||
rt_dir: Path = get_rt_dir()
|
|
||||||
bs_dir: Path = rt_dir / 'doggy'
|
bs_dir: Path = rt_dir / 'doggy'
|
||||||
bs_dir_str: str = str(bs_dir)
|
bs_dir_str: str = str(bs_dir)
|
||||||
assert not bs_dir.is_dir()
|
assert not bs_dir.is_dir()
|
||||||
|
|
|
||||||
|
|
@ -13,9 +13,9 @@ from tractor import (
|
||||||
Portal,
|
Portal,
|
||||||
ipc,
|
ipc,
|
||||||
msg,
|
msg,
|
||||||
|
_state,
|
||||||
|
_addr,
|
||||||
)
|
)
|
||||||
from tractor.runtime import _state
|
|
||||||
from tractor.discovery import _addr
|
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def chk_tpts(
|
async def chk_tpts(
|
||||||
|
|
@ -62,13 +62,6 @@ def test_root_passes_tpt_to_sub(
|
||||||
reg_addr: tuple,
|
reg_addr: tuple,
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
):
|
):
|
||||||
# XXX NOTE, the `reg_addr` addr won't be the same type as the
|
|
||||||
# `tpt_proto_key` would deliver here unless you pass `--tpt-proto
|
|
||||||
# <tpt_proto_key>` on the CLI.
|
|
||||||
#
|
|
||||||
# if tpt_proto_key == 'uds':
|
|
||||||
# breakpoint()
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
enable_transports=[tpt_proto_key],
|
enable_transports=[tpt_proto_key],
|
||||||
|
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
'''
|
|
||||||
`tractor.msg.*` sub-sys test suite.
|
|
||||||
|
|
||||||
'''
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
'''
|
|
||||||
`tractor.msg.*` test sub-pkg conf.
|
|
||||||
|
|
||||||
'''
|
|
||||||
|
|
@ -1,240 +0,0 @@
|
||||||
'''
|
|
||||||
Unit tests for `tractor.msg.pretty_struct`
|
|
||||||
private-field filtering in `pformat()`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from tractor.msg.pretty_struct import (
|
|
||||||
Struct,
|
|
||||||
pformat,
|
|
||||||
iter_struct_ppfmt_lines,
|
|
||||||
)
|
|
||||||
from tractor.msg._codec import (
|
|
||||||
MsgDec,
|
|
||||||
mk_dec,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ------ test struct definitions ------ #
|
|
||||||
|
|
||||||
class PublicOnly(Struct):
|
|
||||||
'''
|
|
||||||
All-public fields for baseline testing.
|
|
||||||
|
|
||||||
'''
|
|
||||||
name: str = 'alice'
|
|
||||||
age: int = 30
|
|
||||||
|
|
||||||
|
|
||||||
class PrivateOnly(Struct):
|
|
||||||
'''
|
|
||||||
Only underscore-prefixed (private) fields.
|
|
||||||
|
|
||||||
'''
|
|
||||||
_secret: str = 'hidden'
|
|
||||||
_internal: int = 99
|
|
||||||
|
|
||||||
|
|
||||||
class MixedFields(Struct):
|
|
||||||
'''
|
|
||||||
Mix of public and private fields.
|
|
||||||
|
|
||||||
'''
|
|
||||||
name: str = 'bob'
|
|
||||||
_hidden: int = 42
|
|
||||||
value: float = 3.14
|
|
||||||
_meta: str = 'internal'
|
|
||||||
|
|
||||||
|
|
||||||
class Inner(
|
|
||||||
Struct,
|
|
||||||
frozen=True,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Frozen inner struct with a private field,
|
|
||||||
for nesting tests.
|
|
||||||
|
|
||||||
'''
|
|
||||||
x: int = 1
|
|
||||||
_secret: str = 'nope'
|
|
||||||
|
|
||||||
|
|
||||||
class Outer(Struct):
|
|
||||||
'''
|
|
||||||
Outer struct nesting an `Inner`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
label: str = 'outer'
|
|
||||||
inner: Inner = Inner()
|
|
||||||
|
|
||||||
|
|
||||||
class EmptyStruct(Struct):
|
|
||||||
'''
|
|
||||||
Struct with zero fields.
|
|
||||||
|
|
||||||
'''
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# ------ tests ------ #
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'struct_and_expected',
|
|
||||||
[
|
|
||||||
(
|
|
||||||
PublicOnly(),
|
|
||||||
{
|
|
||||||
'shown': ['name', 'age'],
|
|
||||||
'hidden': [],
|
|
||||||
},
|
|
||||||
),
|
|
||||||
(
|
|
||||||
MixedFields(),
|
|
||||||
{
|
|
||||||
'shown': ['name', 'value'],
|
|
||||||
'hidden': ['_hidden', '_meta'],
|
|
||||||
},
|
|
||||||
),
|
|
||||||
(
|
|
||||||
PrivateOnly(),
|
|
||||||
{
|
|
||||||
'shown': [],
|
|
||||||
'hidden': ['_secret', '_internal'],
|
|
||||||
},
|
|
||||||
),
|
|
||||||
],
|
|
||||||
ids=[
|
|
||||||
'all-public',
|
|
||||||
'mixed-pub-priv',
|
|
||||||
'all-private',
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_field_visibility_in_pformat(
|
|
||||||
struct_and_expected: tuple[
|
|
||||||
Struct,
|
|
||||||
dict[str, list[str]],
|
|
||||||
],
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify `pformat()` shows public fields
|
|
||||||
and hides `_`-prefixed private fields.
|
|
||||||
|
|
||||||
'''
|
|
||||||
(
|
|
||||||
struct,
|
|
||||||
expected,
|
|
||||||
) = struct_and_expected
|
|
||||||
output: str = pformat(struct)
|
|
||||||
|
|
||||||
for field_name in expected['shown']:
|
|
||||||
assert field_name in output, (
|
|
||||||
f'{field_name!r} should appear in:\n'
|
|
||||||
f'{output}'
|
|
||||||
)
|
|
||||||
|
|
||||||
for field_name in expected['hidden']:
|
|
||||||
assert field_name not in output, (
|
|
||||||
f'{field_name!r} should NOT appear in:\n'
|
|
||||||
f'{output}'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_iter_ppfmt_lines_skips_private():
|
|
||||||
'''
|
|
||||||
Directly verify `iter_struct_ppfmt_lines()`
|
|
||||||
never yields tuples with `_`-prefixed field
|
|
||||||
names.
|
|
||||||
|
|
||||||
'''
|
|
||||||
struct = MixedFields()
|
|
||||||
lines: list[tuple[str, str]] = list(
|
|
||||||
iter_struct_ppfmt_lines(
|
|
||||||
struct,
|
|
||||||
field_indent=2,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
# should have lines for public fields only
|
|
||||||
assert len(lines) == 2
|
|
||||||
|
|
||||||
for _prefix, line_content in lines:
|
|
||||||
field_name: str = (
|
|
||||||
line_content.split(':')[0].strip()
|
|
||||||
)
|
|
||||||
assert not field_name.startswith('_'), (
|
|
||||||
f'private field leaked: {field_name!r}'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_nested_struct_filters_inner_private():
    '''
    Private fields of a struct nested inside another
    struct must stay hidden when `pformat()` recurses.

    '''
    rendered: str = pformat(Outer())

    # outer's public field, then the inner struct's
    # public field (reached via recursion)
    for visible in ('label', 'x'):
        assert visible in rendered

    # inner's private field must be hidden
    assert '_secret' not in rendered
|
|
||||||
|
|
||||||
|
|
||||||
def test_empty_struct_pformat():
    '''
    A field-less struct still renders as a well-formed
    `pformat()` string, and the line iterator yields
    nothing for it.

    '''
    rendered: str = pformat(EmptyStruct())
    assert 'EmptyStruct(' in rendered
    assert rendered.rstrip().endswith(')')

    # no field lines => only struct header+footer
    field_lines: list[tuple[str, str]] = [
        *iter_struct_ppfmt_lines(
            EmptyStruct(),
            field_indent=2,
        )
    ]
    assert field_lines == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_real_msgdec_pformat_hides_private():
    '''
    `pformat()` applied to a real `MsgDec` must not
    expose its internal `_dec` field.

    NOTE: `MsgDec.__repr__` is custom and does NOT go
    through `pformat()`, hence the direct call here.

    '''
    rendered: str = pformat(mk_dec(spec=int))

    # the private `_dec` field should be filtered
    assert '_dec' not in rendered

    # but the struct type name should be present
    assert 'MsgDec(' in rendered
|
|
||||||
|
|
||||||
|
|
||||||
def test_pformat_repr_integration():
    '''
    `Struct.__repr__()` (which calls `pformat()`) must
    hide private fields too, for custom structs that do
    NOT override `__repr__`.

    '''
    rendered: str = repr(MixedFields())

    for public in ('name', 'value'):
        assert public in rendered

    for private in ('_hidden', '_meta'):
        assert private not in rendered
|
|
||||||
|
|
@ -1,245 +0,0 @@
|
||||||
'''
|
|
||||||
Cancellation + hard-kill semantics audit for the `subint` spawn
|
|
||||||
backend.
|
|
||||||
|
|
||||||
Exercises the escape-hatch machinery added to
|
|
||||||
`tractor.spawn._subint` (module-level `_HARD_KILL_TIMEOUT`,
|
|
||||||
bounded shields around the soft-kill / thread-join sites, daemon
|
|
||||||
driver-thread abandonment) so that future stdlib regressions or
|
|
||||||
our own refactors don't silently re-introduce the hangs first
|
|
||||||
diagnosed during the Phase B.2/B.3 bringup (issue #379).
|
|
||||||
|
|
||||||
Every test in this module:
|
|
||||||
- is wrapped in `trio.fail_after()` for a deterministic per-test
|
|
||||||
wall-clock ceiling (the whole point of these tests is to fail
|
|
||||||
fast when our escape hatches regress; an unbounded test would
|
|
||||||
defeat itself),
|
|
||||||
- arms `tractor.devx.dump_on_hang()` to capture a stack dump on
|
|
||||||
failure — without it, a hang here is opaque because pytest's
|
|
||||||
stderr capture swallows `faulthandler` output by default
|
|
||||||
(hard-won lesson from the original diagnosis),
|
|
||||||
- skips on py<3.14 (no `concurrent.interpreters`) and on any
|
|
||||||
`--spawn-backend` other than `'subint'` (these tests are
|
|
||||||
subint-specific by design — they'd be nonsense under `trio` or
|
|
||||||
`mp_*`).
|
|
||||||
|
|
||||||
'''
|
|
||||||
from __future__ import annotations
|
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import trio
|
|
||||||
import tractor
|
|
||||||
from tractor.devx import dump_on_hang
|
|
||||||
|
|
||||||
|
|
||||||
# Gate: the `subint` backend requires py3.14+. Check the
|
|
||||||
# public stdlib wrapper's presence (added in 3.14) rather than
|
|
||||||
# the private `_interpreters` module (which exists on 3.13 but
|
|
||||||
# wedges under tractor's usage — see `tractor.spawn._subint`).
|
|
||||||
pytest.importorskip('concurrent.interpreters')
|
|
||||||
|
|
||||||
# Subint-only: read the spawn method that `pytest_configure`
|
|
||||||
# committed via `try_set_start_method()`. By the time this module
|
|
||||||
# imports, the CLI backend choice has been applied.
|
|
||||||
from tractor.spawn._spawn import _spawn_method # noqa: E402
|
|
||||||
|
|
||||||
if _spawn_method != 'subint':
|
|
||||||
pytestmark = pytest.mark.skip(
|
|
||||||
reason=(
|
|
||||||
"subint-specific cancellation audit — "
|
|
||||||
"pass `--spawn-backend=subint` to run."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# child-side task bodies (run inside the spawned subint)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def _trivial_rpc() -> str:
    '''
    Smallest possible RPC body, used by the baseline
    happy-teardown test: hand back a fixed greeting.

    '''
    greeting: str = 'hello from subint'
    return greeting
|
|
||||||
|
|
||||||
|
|
||||||
async def _spin_without_trio_checkpoints() -> None:
    '''
    Park the main task in a loop that never hits a trio
    checkpoint, so a `Portal.cancel_actor()` arriving
    over IPC has nothing to hand off to.

    `threading.Event.wait(timeout)` releases the GIL (other
    threads, including trio's IO/RPC tasks, can progress)
    but does NOT insert a trio checkpoint, so the
    subactor's main task never notices cancellation.

    This is the exact "stuck subint" scenario the
    hard-kill shields exist to survive.

    '''
    from threading import Event

    gate = Event()
    while True:
        # 1s re-check granularity; low enough not to waste
        # CPU, high enough that even a pathologically slow
        # `_HARD_KILL_TIMEOUT` won't accidentally align
        # with a wake. Nothing ever sets the event, so
        # `.wait()` always times out and we keep looping.
        if gate.wait(timeout=1.0):
            break
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# parent-side harnesses (driven inside `trio.run(...)`)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def _happy_path(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    '''
    Parent-side harness for the baseline test: bring up a
    root actor registered at `reg_addr`, spawn one
    subactor running `_trivial_rpc` and assert on its
    result, all bounded by `deadline` seconds.

    '''
    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as an:
                portal: tractor.Portal = await an.run_in_actor(
                    _trivial_rpc,
                    name='subint-happy',
                )
                reply: str = await portal.wait_for_result()
                assert reply == 'hello from subint'
|
|
||||||
|
|
||||||
|
|
||||||
async def _spawn_stuck_then_cancel(
    reg_addr: tuple[str, int|str],
    deadline: float,
) -> None:
    '''
    Parent-side harness for the stuck-child test: spawn a
    subactor running `_spin_without_trio_checkpoints`,
    wait briefly, then cancel the actor nursery. The whole
    sequence is bounded by `deadline` seconds.

    '''
    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as an:
                await an.run_in_actor(
                    _spin_without_trio_checkpoints,
                    name='subint-stuck',
                )
                # Settle time so the child reaches its
                # non-checkpointing loop before the cancel;
                # the exact value is unimportant as long
                # as it spans a handful of trio schedule
                # ticks.
                await trio.sleep(0.5)
                an.cancel_scope.cancel()
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# tests
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def test_subint_happy_teardown(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Baseline: spawn one subactor, perform a single portal
    RPC and close the nursery cleanly; no cancel and no
    faults involved.

    A regression here points at the spawn/teardown layer
    itself rather than at the hard-kill escape hatches.

    '''
    budget: float = 10.0
    harness = partial(_happy_path, reg_addr, budget)
    with dump_on_hang(
        seconds=budget,
        path='/tmp/subint_cancellation_happy.dump',
    ):
        trio.run(harness)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipon_spawn_backend(
    'subint',
    reason=(
        'XXX SUBINT HANGING TEST XXX\n'
        'See oustanding issue(s)\n'
        # TODO, put issue link!
    )
)
# `pytest-timeout` (`method='thread'`) gives a wall-clock
# bound as defense-in-depth over the inner
# `trio.fail_after(15)`. Under the orphaned-channel hang
# class described in
# `ai/conc-anal/subint_cancel_delivery_hang_issue.md`, SIGINT
# is still deliverable and this test *should* be unwedgeable
# by the inner trio timeout, but sibling subint-backend
# tests in this repo have also exhibited the
# `subint_sigint_starvation_issue.md` GIL-starvation flavor,
# so `method='thread'` keeps us safe in case ordering or
# load shifts the failure mode.
@pytest.mark.timeout(
    3,  # NOTE never passes pre-3.14+ subints support.
    method='thread',
)
def test_subint_non_checkpointing_child(
    reg_addr: tuple[str, int|str],
) -> None:
    '''
    Cancel a subactor whose main task sits in a Python loop
    that never checkpoints.

    `Portal.cancel_actor()` may well arrive over IPC, but
    since the main task never checkpoints it cannot observe
    the Cancelled, so the subint's `trio.run()` can't exit
    gracefully.

    The parent `subint_proc` bounded-shield +
    daemon-driver-thread combo should abandon the thread
    after `_HARD_KILL_TIMEOUT` and let the parent return
    cleanly.

    Wall-clock budget:
    - ~0.5s: settle time for child to enter the stuck loop
    - ~3s: `_HARD_KILL_TIMEOUT` (soft-kill wait)
    - ~3s: `_HARD_KILL_TIMEOUT` (thread-join wait)
    - margin

    KNOWN ISSUE (Ctrl-C-able hang):
    -------------------------------
    The test currently hangs past the hard-kill timeout for
    reasons unrelated to the subint teardown itself: once
    the subint is destroyed, a parent-side trio task appears
    to park on an orphaned IPC channel (no clean EOF
    delivered to a waiting receive). Unlike the
    SIGINT-starvation sibling case in
    `test_stale_entry_is_deleted`, this hang IS Ctrl-C-able
    (`strace` shows SIGINT wakeup-fd `write() = 1`, not
    `EAGAIN`), i.e. the main trio loop is still iterating
    normally. That makes this *our* bug to fix, not a
    CPython-level limitation.

    See `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
    for the full analysis + candidate fix directions
    (explicit parent-side channel abort in `subint_proc`
    teardown being the most likely surgical fix).

    The sibling `ai/conc-anal/subint_sigint_starvation_issue.md`
    documents the *other* hang class (abandoned-legacy-subint
    thread + shared-GIL starvation → signal-wakeup-fd pipe
    fills → SIGINT silently dropped); that one is
    structurally blocked on msgspec PEP 684 adoption and is
    NOT what this test is hitting.

    '''
    budget: float = 15.0
    harness = partial(
        _spawn_stuck_then_cancel,
        reg_addr,
        budget,
    )
    with dump_on_hang(
        seconds=budget,
        path='/tmp/subint_cancellation_stuck.dump',
    ):
        trio.run(harness)
|
|
||||||
|
|
@ -1,603 +0,0 @@
|
||||||
'''
|
|
||||||
Integration exercises for the `tractor.spawn._subint_forkserver`
|
|
||||||
submodule at three tiers:
|
|
||||||
|
|
||||||
1. the bare `fork_from_worker_thread()` primitive driven
from inside a real `trio.run()` in the parent process,

2. that same fork followed by
`run_subint_in_worker_thread()` in the forked child, i.e.
a `trio.run()` hosted inside a fresh subint,

3. the full `subint_forkserver_proc` spawn backend wired
through tractor's normal actor-nursery + portal-RPC
machinery — i.e. `open_root_actor` + `open_nursery` +
`run_in_actor` against a subactor spawned via fork from a
main-interp worker thread.
|
|
||||||
|
|
||||||
Background
|
|
||||||
----------
|
|
||||||
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
|
||||||
establishes that `os.fork()` from a non-main sub-interpreter
|
|
||||||
aborts the child at the CPython level. The sibling
|
|
||||||
`subint_fork_from_main_thread_smoketest.py` proves the escape
|
|
||||||
hatch: fork from a main-interp *worker thread* (one that has
|
|
||||||
never entered a subint) works, and the forked child can then
|
|
||||||
host its own `trio.run()` inside a fresh subint.
|
|
||||||
|
|
||||||
Those smoke-test scenarios are standalone — no trio runtime
|
|
||||||
in the *parent*. Tiers (1)+(2) here cover the primitives
|
|
||||||
driven from inside `trio.run()` in the parent, and tier (3)
|
|
||||||
(the `*_spawn_basic` test) drives the registered
|
|
||||||
`subint_forkserver` spawn backend end-to-end against the
|
|
||||||
tractor runtime.
|
|
||||||
|
|
||||||
Gating
|
|
||||||
------
|
|
||||||
- py3.14+ (via `concurrent.interpreters` presence)
|
|
||||||
- no `--spawn-backend` restriction — the backend-level test
|
|
||||||
flips `tractor.spawn._spawn._spawn_method` programmatically
|
|
||||||
(via `try_set_start_method('subint_forkserver')`) and
|
|
||||||
restores it on teardown, so these tests are independent of
|
|
||||||
the session-level CLI backend choice.
|
|
||||||
|
|
||||||
'''
|
|
||||||
from __future__ import annotations
|
|
||||||
from functools import partial
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
import platform
|
|
||||||
import select
|
|
||||||
import signal
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import trio
|
|
||||||
|
|
||||||
import tractor
|
|
||||||
from tractor.devx import dump_on_hang
|
|
||||||
|
|
||||||
|
|
||||||
# Gate: subint forkserver primitives require py3.14+. Check
|
|
||||||
# the public stdlib wrapper's presence (added in 3.14) rather
|
|
||||||
# than `_interpreters` directly — see
|
|
||||||
# `tractor.spawn._subint` for why.
|
|
||||||
pytest.importorskip('concurrent.interpreters')
|
|
||||||
|
|
||||||
from tractor.spawn._subint_forkserver import ( # noqa: E402
|
|
||||||
fork_from_worker_thread,
|
|
||||||
run_subint_in_worker_thread,
|
|
||||||
wait_child,
|
|
||||||
)
|
|
||||||
from tractor.spawn import _spawn as _spawn_mod # noqa: E402
|
|
||||||
from tractor.spawn._spawn import try_set_start_method # noqa: E402
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# child-side callables (passed via `child_target=` across fork)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# Source text executed inside the child's fresh subint: run a
# tiny `trio.run()` and assert on its return value.
_CHILD_TRIO_BOOTSTRAP: str = ''.join((
    'import trio\n',
    'async def _main():\n',
    '    await trio.sleep(0.05)\n',
    '    return 42\n',
    'result = trio.run(_main)\n',
    'assert result == 42, f"trio.run returned {result}"\n',
))
|
|
||||||
|
|
||||||
|
|
||||||
def _child_trio_in_subint() -> int:
    '''
    `child_target` for the trio-in-child scenario: run the
    `_CHILD_TRIO_BOOTSTRAP` source via
    `run_subint_in_worker_thread()` (a trivial `trio.run()`
    inside a fresh legacy-config subint on a worker thread).

    Returns an exit code suitable for `os._exit()`:
    - 0: subint-hosted `trio.run()` succeeded
    - 3: driver thread hang (timeout inside
      `run_subint_in_worker_thread`)
    - 4: subint bootstrap raised some other exception

    '''
    exit_code: int
    try:
        run_subint_in_worker_thread(
            _CHILD_TRIO_BOOTSTRAP,
            thread_name='child-subint-trio-thread',
        )
    except RuntimeError:
        # timeout / thread-never-returned
        exit_code = 3
    except BaseException:
        exit_code = 4
    else:
        exit_code = 0
    return exit_code
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# parent-side harnesses (run inside `trio.run()`)
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def run_fork_in_non_trio_thread(
    deadline: float,
    *,
    child_target=None,
) -> int:
    '''
    From inside a parent `trio.run()`, push the forkserver
    primitive onto a main-interp worker thread via
    `trio.to_thread.run_sync()`, then `wait_child()` on the
    resulting pid (also off-loaded, so `waitpid()` doesn't
    block trio's event loop).

    Asserts the fork produced a positive pid and that the
    child exited cleanly; returns the child's pid. The whole
    sequence is bounded by `deadline` seconds.

    '''
    do_fork = partial(
        fork_from_worker_thread,
        child_target,
        thread_name='test-subint-forkserver',
    )
    with trio.fail_after(deadline):
        # NOTE: `fork_from_worker_thread` internally spawns
        # its own dedicated `threading.Thread` (not from
        # trio's cache) and joins it before returning, so
        # off-loading via `to_thread.run_sync` is safe
        # without worrying about the trio-thread-cache
        # recycling the runner. `abandon_on_cancel=False`
        # follows the same "bounded + clean" rationale used
        # in `_subint.subint_proc`.
        pid: int = await trio.to_thread.run_sync(
            do_fork,
            abandon_on_cancel=False,
        )
        assert pid > 0

        do_reap = partial(
            wait_child,
            pid,
            expect_exit_ok=True,
        )
        ok, status_str = await trio.to_thread.run_sync(
            do_reap,
            abandon_on_cancel=False,
        )
        assert ok, (
            f'forked child did not exit cleanly: '
            f'{status_str}'
        )
        return pid
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# tests
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
# Bounded wall-clock via `pytest-timeout` (`method='thread'`)
|
|
||||||
# for the usual GIL-hostage safety reason documented in the
|
|
||||||
# sibling `test_subint_cancellation.py` / the class-A
|
|
||||||
# `subint_sigint_starvation_issue.md`. Each test also has an
|
|
||||||
# inner `trio.fail_after()` so assertion failures fire fast
|
|
||||||
# under normal conditions.
|
|
||||||
@pytest.mark.timeout(30, method='thread')
def test_fork_from_worker_thread_via_trio(
) -> None:
    '''
    Baseline: from within `trio.run()`, invoke
    `fork_from_worker_thread()` through
    `trio.to_thread.run_sync()`, receive the child's pid
    and reap the child cleanly.

    No trio-in-child. A regression here means the
    parent-side trio↔worker-thread plumbing is broken
    independent of any child-side subint machinery.

    '''
    budget: float = 10.0
    harness = partial(run_fork_in_non_trio_thread, budget)
    with dump_on_hang(
        seconds=budget,
        path='/tmp/subint_forkserver_baseline.dump',
    ):
        pid: int = trio.run(harness)
        # parent-side sanity: we got a real pid back.
        assert isinstance(pid, int) and pid > 0
        # the harness already waited on the child, so it
        # must not be reap-able a second time.
        with pytest.raises((ChildProcessError, OSError)):
            os.waitpid(pid, os.WNOHANG)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.timeout(30, method='thread')
def test_fork_and_run_trio_in_child() -> None:
    '''
    End-to-end: within the parent's `trio.run()`, off-load
    `fork_from_worker_thread()` to a worker thread and have
    the forked child create a fresh subint, then run
    `trio.run()` inside it on yet another worker thread.

    This is the full "forkserver + trio-in-subint-in-child"
    pattern the proposed `subint_forkserver` spawn backend
    would rest on.

    '''
    budget: float = 15.0
    harness = partial(
        run_fork_in_non_trio_thread,
        budget,
        child_target=_child_trio_in_subint,
    )
    with dump_on_hang(
        seconds=budget,
        path='/tmp/subint_forkserver_trio_in_child.dump',
    ):
        pid: int = trio.run(harness)
        assert isinstance(pid, int) and pid > 0
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# tier-3 backend test: drive the registered `subint_forkserver`
|
|
||||||
# spawn backend end-to-end through tractor's actor-nursery +
|
|
||||||
# portal-RPC machinery.
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def _trivial_rpc() -> str:
    '''
    Subactor-side RPC body kept as small as possible: hand
    back a sentinel string for the parent to assert on.

    '''
    sentinel: str = 'hello from subint-forkserver child'
    return sentinel
|
|
||||||
|
|
||||||
|
|
||||||
async def _happy_path_forkserver(
    reg_addr: tuple[str, int | str],
    deadline: float,
) -> None:
    '''
    Parent-side harness: bring up a root actor, open an
    actor nursery and spawn a single subactor through the
    currently-selected spawn backend (flipped to
    `subint_forkserver` by the calling test), then push a
    trivial RPC through its portal and assert on the
    round-trip result. Bounded by `deadline` seconds.

    '''
    with trio.fail_after(deadline):
        async with tractor.open_root_actor(
            registry_addrs=[reg_addr],
        ):
            async with tractor.open_nursery() as an:
                portal: tractor.Portal = await an.run_in_actor(
                    _trivial_rpc,
                    name='subint-forkserver-child',
                )
                reply: str = await portal.wait_for_result()
                assert reply == 'hello from subint-forkserver child'
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def forkserver_spawn_method():
    '''
    Switch `tractor.spawn._spawn._spawn_method` to
    `'subint_forkserver'` while a test runs, then put back
    whatever was set beforehand (usually the session-level
    CLI choice, typically `'trio'`).

    Without the restore, later tests in the same session
    would see the global flip and start spawning via fork,
    which is almost certainly NOT what their assertions were
    written against.

    '''
    # snapshot both module globals before flipping them
    saved = (
        _spawn_mod._spawn_method,
        _spawn_mod._ctx,
    )
    try_set_start_method('subint_forkserver')
    try:
        yield
    finally:
        (
            _spawn_mod._spawn_method,
            _spawn_mod._ctx,
        ) = saved
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.timeout(60, method='thread')
def test_subint_forkserver_spawn_basic(
    reg_addr: tuple[str, int | str],
    forkserver_spawn_method,
) -> None:
    '''
    Happy-path: spawn ONE subactor through the
    `subint_forkserver` backend (parent-side fork from a
    main-interp worker thread), perform a trivial
    portal-RPC round-trip and tear the nursery down cleanly.

    A pass proves the "forkserver + tractor runtime" arch
    end-to-end: the registered `subint_forkserver_proc`
    spawn target forks a child, the child runs
    `_actor_child_main()`, completes the IPC handshake and
    serves an RPC, and the parent reaps via
    `_ForkedProc.wait()` without regressing any of the
    normal nursery teardown invariants.

    '''
    budget: float = 20.0
    harness = partial(
        _happy_path_forkserver,
        reg_addr,
        budget,
    )
    with dump_on_hang(
        seconds=budget,
        path='/tmp/subint_forkserver_spawn_basic.dump',
    ):
        trio.run(harness)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
# tier-4 DRAFT: orphaned-subactor SIGINT survivability
|
|
||||||
#
|
|
||||||
# Motivating question: with `subint_forkserver`, the child's
|
|
||||||
# `trio.run()` lives on the fork-inherited worker thread which
|
|
||||||
# is NOT `threading.main_thread()` — so trio cannot install its
|
|
||||||
# `signal.set_wakeup_fd`-based SIGINT handler. If the parent
|
|
||||||
# goes away via `SIGKILL` (no IPC `Portal.cancel_actor()`
|
|
||||||
# possible), does SIGINT on the orphan child cleanly tear it
|
|
||||||
# down via CPython's default `KeyboardInterrupt` delivery, or
|
|
||||||
# does it hang?
|
|
||||||
#
|
|
||||||
# Working hypothesis (unverified pre-this-test): post-fork the
|
|
||||||
# child is effectively single-threaded (only the fork-worker
|
|
||||||
# tstate survived), so SIGINT → default handler → raises
|
|
||||||
# `KeyboardInterrupt` on the only thread — which happens to be
|
|
||||||
# the one driving trio's event loop — so trio observes it at
|
|
||||||
# the next checkpoint. If so, we're "fine" on this backend
|
|
||||||
# despite the missing trio SIGINT handler.
|
|
||||||
#
|
|
||||||
# Cross-backend generalization (decide after this passes):
|
|
||||||
# - applicable to any backend whose subactors are separate OS
|
|
||||||
# processes: `trio`, `mp_spawn`, `mp_forkserver`,
|
|
||||||
# `subint_forkserver`.
|
|
||||||
# - NOT applicable to plain `subint` (subactors are in-process
|
|
||||||
# subinterpreters, no orphan child process to SIGINT).
|
|
||||||
# - move path: lift the harness script into
|
|
||||||
# `tests/_orphan_harness.py`, parametrize on the session's
|
|
||||||
# `_spawn_method`, add `skipif _spawn_method == 'subint'`.
|
|
||||||
# ----------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
_ORPHAN_HARNESS_SCRIPT: str = '''
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import trio
|
|
||||||
import tractor
|
|
||||||
from tractor.spawn._spawn import try_set_start_method
|
|
||||||
|
|
||||||
async def _sleep_forever() -> None:
|
|
||||||
print(f"CHILD_PID={os.getpid()}", flush=True)
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
async def _main(reg_addr):
|
|
||||||
async with (
|
|
||||||
tractor.open_root_actor(registry_addrs=[reg_addr]),
|
|
||||||
tractor.open_nursery() as an,
|
|
||||||
):
|
|
||||||
portal = await an.run_in_actor(
|
|
||||||
_sleep_forever,
|
|
||||||
name="orphan-test-child",
|
|
||||||
)
|
|
||||||
print(f"PARENT_READY={os.getpid()}", flush=True)
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
backend = sys.argv[1]
|
|
||||||
host = sys.argv[2]
|
|
||||||
port = int(sys.argv[3])
|
|
||||||
try_set_start_method(backend)
|
|
||||||
trio.run(_main, (host, port))
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
def _read_marker(
    proc: subprocess.Popen,
    marker: str,
    timeout: float,
    _buf: dict,
) -> str:
    '''
    Block until `<marker>=<value>\\n` appears on `proc.stdout`
    and return `<value>` (decoded, whitespace-stripped).

    `_buf` is a per-proc dict whose `'remainder'` entry
    carries unconsumed bytes (partial lines, or lines
    following the matched one) across calls; it is updated
    in place. Complete lines which don't start with
    `<marker>=` are dropped.

    Raises `TimeoutError` when the marker is not seen within
    `timeout` seconds or when `proc.stdout` hits EOF first.

    '''
    deadline: float = time.monotonic() + timeout
    remainder: bytes = _buf.get('remainder', b'')
    prefix: bytes = f'{marker}='.encode()
    while True:
        # Scan what is already buffered BEFORE consulting the
        # clock: bytes carried over from a previous call, or
        # read in the iteration that crossed the deadline,
        # must still be allowed to satisfy the request.
        while b'\n' in remainder:
            line, remainder = remainder.split(b'\n', 1)
            if line.startswith(prefix):
                _buf['remainder'] = remainder
                return line[len(prefix):].decode().strip()

        time_left: float = deadline - time.monotonic()
        if time_left <= 0:
            break

        # never wait on the pipe past the caller's deadline
        ready, _, _ = select.select(
            [proc.stdout],
            [],
            [],
            min(0.2, time_left),
        )
        if not ready:
            continue
        chunk: bytes = os.read(proc.stdout.fileno(), 4096)
        if not chunk:
            # EOF: the harness closed stdout, nothing more
            # will ever arrive.
            break
        remainder += chunk

    _buf['remainder'] = remainder
    raise TimeoutError(
        f'Never observed marker {marker!r} on harness stdout '
        f'within {timeout}s'
    )
|
|
||||||
|
|
||||||
|
|
||||||
def _process_alive(pid: int) -> bool:
    '''
    Liveness probe for a pid we do NOT parent (post-orphan).

    Sends signal `0` via `os.kill()`, which delivers nothing
    but still performs the existence/permission checks.
    Returns `False` only when the pid does not exist; a
    `PermissionError` means the process exists but we are
    not allowed to signal it, so it counts as alive.

    '''
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # process is there, we just can't signal it
        return True
    return True
|
|
||||||
|
|
||||||
|
|
||||||
# Flakey under session-level env pollution (leftover
|
|
||||||
# subactor PIDs from earlier tests competing for ports /
|
|
||||||
# inheriting the harness subprocess's FDs). Passes
|
|
||||||
# cleanly in isolation, fails in suite; `strict=False`
|
|
||||||
# so either outcome is tolerated until the env isolation
|
|
||||||
# is improved. Tracker:
|
|
||||||
# `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.
|
|
||||||
@pytest.mark.xfail(
    strict=False,
    reason=(
        'Env-pollution sensitive. Passes in isolation, '
        'flakey in full-suite runs; orphan subactor may '
        'take longer than 10s to exit when competing for '
        'resources with leftover state from earlier tests.'
    ),
)
@pytest.mark.timeout(
    30,
    method='thread',
)
def test_orphaned_subactor_sigint_cleanup_DRAFT(
    reg_addr: tuple[str, int | str],
    tmp_path: Path,
) -> None:
    '''
    DRAFT — orphaned-subactor SIGINT survivability under the
    `subint_forkserver` backend.

    Sequence:
      1. Spawn a harness subprocess that brings up a root
         actor + one `sleep_forever` subactor via
         `subint_forkserver`.
      2. Read the harness's stdout for `PARENT_READY=<pid>`
         and `CHILD_PID=<pid>` markers (confirms the
         parent→child IPC handshake completed).
      3. `SIGKILL` the parent (no IPC cancel possible — the
         whole point of this test).
      4. `SIGINT` the orphan child.
      5. Poll `os.kill(child_pid, 0)` until the cleanup
         `timeout` (see below) elapses — assert the child
         exits.

    Empirical result (2026-04, py3.14): currently **FAILS** —
    SIGINT on the orphan child doesn't unwind the trio loop,
    despite trio's `KIManager` handler being correctly
    installed in the subactor (the post-fork thread IS
    `threading.main_thread()` on py3.14). `faulthandler` dump
    shows the subactor wedged in `trio/_core/_io_epoll.py::
    get_events` — the signal's supposed wakeup of the event
    loop isn't firing. Full analysis + diagnostic evidence
    in `ai/conc-anal/
    subint_forkserver_orphan_sigint_hang_issue.md`.

    The runtime's *intentional* "KBI-as-OS-cancel" path at
    `tractor/spawn/_entry.py::_trio_main:164` is therefore
    unreachable under this backend+config. Closing the gap is
    aligned with existing design intent (make the already-
    designed behavior actually fire), not a new feature.

    Marked `xfail(strict=False)`: the outcome is sensitive to
    the surrounding test-session env, so both XFAIL and XPASS
    are tolerated for now; tighten (or drop) the mark once the
    gap is closed and the result is stable.

    '''
    if platform.system() != 'Linux':
        pytest.skip(
            'orphan-reparenting semantics only exercised on Linux'
        )

    script_path = tmp_path / '_orphan_harness.py'
    script_path.write_text(_ORPHAN_HARNESS_SCRIPT)

    # Offset the port so we don't race the session reg_addr with
    # any concurrently-running backend test's listener.
    host: str = reg_addr[0]
    port: int = int(reg_addr[1]) + 17

    proc: subprocess.Popen = subprocess.Popen(
        [
            sys.executable,
            str(script_path),
            'subint_forkserver',
            host,
            str(port),
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    parent_pid: int | None = None
    child_pid: int | None = None
    buf: dict = {}
    try:
        # NOTE(review): `_read_marker()` drops any buffered line
        # that doesn't carry the requested marker, so this read
        # order presumably mirrors the order the harness prints
        # them in — confirm against `_ORPHAN_HARNESS_SCRIPT`.
        child_pid = int(_read_marker(proc, 'CHILD_PID', 15.0, buf))
        parent_pid = int(_read_marker(proc, 'PARENT_READY', 15.0, buf))

        # sanity: both alive before we start killing stuff
        assert _process_alive(parent_pid), (
            f'harness parent pid={parent_pid} gone before '
            f'SIGKILL — test premise broken'
        )
        assert _process_alive(child_pid), (
            f'orphan-candidate child pid={child_pid} gone '
            f'before test started'
        )

        # step 3: kill parent — no IPC cancel arrives at child.
        # `proc.wait()` reaps the zombie so it truly disappears
        # from the process table (otherwise `os.kill(pid, 0)`
        # keeps reporting it as alive).
        os.kill(parent_pid, signal.SIGKILL)
        try:
            proc.wait(timeout=3.0)
        except subprocess.TimeoutExpired:
            pytest.fail(
                f'harness parent pid={parent_pid} did not die '
                f'after SIGKILL — test premise broken'
            )
        assert _process_alive(child_pid), (
            f'child pid={child_pid} died along with parent — '
            f'did the parent reap it before SIGKILL took? '
            f'test premise requires an orphan.'
        )

        # step 4+5: SIGINT the orphan, poll for exit.
        os.kill(child_pid, signal.SIGINT)
        timeout: float = 6.0
        cleanup_deadline: float = time.monotonic() + timeout
        while time.monotonic() < cleanup_deadline:
            if not _process_alive(child_pid):
                return  # <- success path
            time.sleep(0.1)

        # report the deadline we actually waited for, not a
        # hard-coded number that can drift from `timeout`.
        pytest.fail(
            f'Orphan subactor (pid={child_pid}) did NOT exit '
            f'within {timeout}s of SIGINT under `subint_forkserver` '
            f'→ trio on non-main thread did not observe the '
            f'default CPython KeyboardInterrupt; backend needs '
            f'explicit SIGINT plumbing.'
        )
    finally:
        # best-effort cleanup to avoid leaking orphans across
        # the test session regardless of outcome.
        for pid in (parent_pid, child_pid):
            if pid is None:
                continue
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        try:
            proc.kill()
        except OSError:
            pass
        try:
            proc.wait(timeout=2.0)
        except subprocess.TimeoutExpired:
            pass
        # release our read-end of the harness's stdout pipe so
        # the fd doesn't leak into later tests in the session.
        if proc.stdout is not None:
            proc.stdout.close()
|
|
||||||
|
|
@ -1,12 +1,7 @@
|
||||||
'''
|
"""
|
||||||
Audit the simplest inter-actor bidirectional (streaming)
|
Bidirectional streaming.
|
||||||
msg patterns.
|
|
||||||
|
|
||||||
'''
|
"""
|
||||||
from __future__ import annotations
|
|
||||||
from typing import (
|
|
||||||
Callable,
|
|
||||||
)
|
|
||||||
import pytest
|
import pytest
|
||||||
import trio
|
import trio
|
||||||
import tractor
|
import tractor
|
||||||
|
|
@ -14,8 +9,10 @@ import tractor
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def simple_rpc(
|
async def simple_rpc(
|
||||||
|
|
||||||
ctx: tractor.Context,
|
ctx: tractor.Context,
|
||||||
data: int,
|
data: int,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
Test a small ping-pong server.
|
Test a small ping-pong server.
|
||||||
|
|
@ -42,13 +39,15 @@ async def simple_rpc(
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def simple_rpc_with_forloop(
|
async def simple_rpc_with_forloop(
|
||||||
|
|
||||||
ctx: tractor.Context,
|
ctx: tractor.Context,
|
||||||
data: int,
|
data: int,
|
||||||
) -> None:
|
|
||||||
'''
|
|
||||||
Same as previous test but using `async for` syntax/api.
|
|
||||||
|
|
||||||
'''
|
) -> None:
|
||||||
|
"""Same as previous test but using ``async for`` syntax/api.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
# signal to parent that we're up
|
# signal to parent that we're up
|
||||||
await ctx.started(data + 1)
|
await ctx.started(data + 1)
|
||||||
|
|
||||||
|
|
@ -69,78 +68,62 @@ async def simple_rpc_with_forloop(
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'use_async_for',
|
'use_async_for',
|
||||||
[
|
[True, False],
|
||||||
True,
|
|
||||||
False,
|
|
||||||
],
|
|
||||||
ids='use_async_for={}'.format,
|
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'server_func',
|
'server_func',
|
||||||
[
|
[simple_rpc, simple_rpc_with_forloop],
|
||||||
simple_rpc,
|
|
||||||
simple_rpc_with_forloop,
|
|
||||||
],
|
|
||||||
ids='server_func={}'.format,
|
|
||||||
)
|
)
|
||||||
def test_simple_rpc(
|
def test_simple_rpc(server_func, use_async_for):
|
||||||
server_func: Callable,
|
|
||||||
use_async_for: bool,
|
|
||||||
loglevel: str,
|
|
||||||
debug_mode: bool,
|
|
||||||
):
|
|
||||||
'''
|
'''
|
||||||
The simplest request response pattern.
|
The simplest request response pattern.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(6):
|
async with tractor.open_nursery() as n:
|
||||||
async with tractor.open_nursery(
|
|
||||||
loglevel=loglevel,
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
) as an:
|
|
||||||
portal: tractor.Portal = await an.start_actor(
|
|
||||||
'rpc_server',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
|
|
||||||
async with portal.open_context(
|
portal = await n.start_actor(
|
||||||
server_func, # taken from pytest parameterization
|
'rpc_server',
|
||||||
data=10,
|
enable_modules=[__name__],
|
||||||
) as (ctx, sent):
|
)
|
||||||
|
|
||||||
assert sent == 11
|
async with portal.open_context(
|
||||||
|
server_func, # taken from pytest parameterization
|
||||||
|
data=10,
|
||||||
|
) as (ctx, sent):
|
||||||
|
|
||||||
async with ctx.open_stream() as stream:
|
assert sent == 11
|
||||||
|
|
||||||
if use_async_for:
|
async with ctx.open_stream() as stream:
|
||||||
|
|
||||||
count = 0
|
if use_async_for:
|
||||||
# receive msgs using async for style
|
|
||||||
|
count = 0
|
||||||
|
# receive msgs using async for style
|
||||||
|
print('ping')
|
||||||
|
await stream.send('ping')
|
||||||
|
|
||||||
|
async for msg in stream:
|
||||||
|
assert msg == 'pong'
|
||||||
print('ping')
|
print('ping')
|
||||||
await stream.send('ping')
|
await stream.send('ping')
|
||||||
|
count += 1
|
||||||
|
|
||||||
async for msg in stream:
|
if count >= 9:
|
||||||
assert msg == 'pong'
|
break
|
||||||
print('ping')
|
|
||||||
await stream.send('ping')
|
|
||||||
count += 1
|
|
||||||
|
|
||||||
if count >= 9:
|
else:
|
||||||
break
|
# classic send/receive style
|
||||||
|
for _ in range(10):
|
||||||
|
|
||||||
else:
|
print('ping')
|
||||||
# classic send/receive style
|
await stream.send('ping')
|
||||||
for _ in range(10):
|
assert await stream.receive() == 'pong'
|
||||||
|
|
||||||
print('ping')
|
# stream should terminate here
|
||||||
await stream.send('ping')
|
|
||||||
assert await stream.receive() == 'pong'
|
|
||||||
|
|
||||||
# stream should terminate here
|
# final context result(s) should be consumed here in __aexit__()
|
||||||
|
|
||||||
# final context result(s) should be consumed here in __aexit__()
|
await portal.cancel_actor()
|
||||||
|
|
||||||
await portal.cancel_actor()
|
|
||||||
|
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -98,8 +98,7 @@ def test_ipc_channel_break_during_stream(
|
||||||
expect_final_exc = TransportClosed
|
expect_final_exc = TransportClosed
|
||||||
|
|
||||||
mod: ModuleType = import_path(
|
mod: ModuleType = import_path(
|
||||||
examples_dir()
|
examples_dir() / 'advanced_faults'
|
||||||
/ 'advanced_faults'
|
|
||||||
/ 'ipc_failure_during_stream.py',
|
/ 'ipc_failure_during_stream.py',
|
||||||
root=examples_dir(),
|
root=examples_dir(),
|
||||||
consider_namespace_packages=False,
|
consider_namespace_packages=False,
|
||||||
|
|
@ -114,9 +113,8 @@ def test_ipc_channel_break_during_stream(
|
||||||
if (
|
if (
|
||||||
# only expect EoC if trans is broken on the child side,
|
# only expect EoC if trans is broken on the child side,
|
||||||
ipc_break['break_child_ipc_after'] is not False
|
ipc_break['break_child_ipc_after'] is not False
|
||||||
and
|
|
||||||
# AND we tell the child to call `MsgStream.aclose()`.
|
# AND we tell the child to call `MsgStream.aclose()`.
|
||||||
pre_aclose_msgstream
|
and pre_aclose_msgstream
|
||||||
):
|
):
|
||||||
# expect_final_exc = trio.EndOfChannel
|
# expect_final_exc = trio.EndOfChannel
|
||||||
# ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this
|
# ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this
|
||||||
|
|
@ -146,6 +144,9 @@ def test_ipc_channel_break_during_stream(
|
||||||
# a user sending ctl-c by raising a KBI.
|
# a user sending ctl-c by raising a KBI.
|
||||||
if pre_aclose_msgstream:
|
if pre_aclose_msgstream:
|
||||||
expect_final_exc = KeyboardInterrupt
|
expect_final_exc = KeyboardInterrupt
|
||||||
|
if tpt_proto == 'uds':
|
||||||
|
expect_final_exc = TransportClosed
|
||||||
|
expect_final_cause = trio.BrokenResourceError
|
||||||
|
|
||||||
# XXX OLD XXX
|
# XXX OLD XXX
|
||||||
# if child calls `MsgStream.aclose()` then expect EoC.
|
# if child calls `MsgStream.aclose()` then expect EoC.
|
||||||
|
|
@ -159,13 +160,16 @@ def test_ipc_channel_break_during_stream(
|
||||||
ipc_break['break_child_ipc_after'] is not False
|
ipc_break['break_child_ipc_after'] is not False
|
||||||
and (
|
and (
|
||||||
ipc_break['break_parent_ipc_after']
|
ipc_break['break_parent_ipc_after']
|
||||||
>
|
> ipc_break['break_child_ipc_after']
|
||||||
ipc_break['break_child_ipc_after']
|
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
if pre_aclose_msgstream:
|
if pre_aclose_msgstream:
|
||||||
expect_final_exc = KeyboardInterrupt
|
expect_final_exc = KeyboardInterrupt
|
||||||
|
|
||||||
|
if tpt_proto == 'uds':
|
||||||
|
expect_final_exc = TransportClosed
|
||||||
|
expect_final_cause = trio.BrokenResourceError
|
||||||
|
|
||||||
# NOTE when the parent IPC side dies (even if the child does as well
|
# NOTE when the parent IPC side dies (even if the child does as well
|
||||||
# but the child fails BEFORE the parent) we always expect the
|
# but the child fails BEFORE the parent) we always expect the
|
||||||
# IPC layer to raise a closed-resource, NEVER do we expect
|
# IPC layer to raise a closed-resource, NEVER do we expect
|
||||||
|
|
@ -244,15 +248,8 @@ def test_ipc_channel_break_during_stream(
|
||||||
# get raw instance from pytest wrapper
|
# get raw instance from pytest wrapper
|
||||||
value = excinfo.value
|
value = excinfo.value
|
||||||
if isinstance(value, ExceptionGroup):
|
if isinstance(value, ExceptionGroup):
|
||||||
excs: tuple[Exception] = value.exceptions
|
excs = value.exceptions
|
||||||
assert (
|
assert len(excs) == 1
|
||||||
len(excs) <= 2
|
|
||||||
and
|
|
||||||
all(
|
|
||||||
isinstance(exc, TransportClosed)
|
|
||||||
for exc in excs
|
|
||||||
)
|
|
||||||
)
|
|
||||||
final_exc = excs[0]
|
final_exc = excs[0]
|
||||||
assert isinstance(final_exc, expect_final_exc)
|
assert isinstance(final_exc, expect_final_exc)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,18 +17,8 @@ from tractor._testing import (
|
||||||
from .conftest import no_windows
|
from .conftest import no_windows
|
||||||
|
|
||||||
|
|
||||||
_non_linux: bool = platform.system() != 'Linux'
|
def is_win():
|
||||||
_friggin_windows: bool = platform.system() == 'Windows'
|
return platform.system() == 'Windows'
|
||||||
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.skipon_spawn_backend(
|
|
||||||
'subint',
|
|
||||||
reason=(
|
|
||||||
'XXX SUBINT HANGING TEST XXX\n'
|
|
||||||
'See oustanding issue(s)\n'
|
|
||||||
# TODO, put issue link!
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
async def assert_err(delay=0):
|
async def assert_err(delay=0):
|
||||||
|
|
@ -120,17 +110,8 @@ def test_remote_error(reg_addr, args_err):
|
||||||
assert exc.boxed_type == errtype
|
assert exc.boxed_type == errtype
|
||||||
|
|
||||||
|
|
||||||
# @pytest.mark.skipon_spawn_backend(
|
|
||||||
# 'subint',
|
|
||||||
# reason=(
|
|
||||||
# 'XXX SUBINT HANGING TEST XXX\n'
|
|
||||||
# 'See oustanding issue(s)\n'
|
|
||||||
# # TODO, put issue link!
|
|
||||||
# )
|
|
||||||
# )
|
|
||||||
def test_multierror(
|
def test_multierror(
|
||||||
reg_addr: tuple[str, int],
|
reg_addr: tuple[str, int],
|
||||||
start_method: str,
|
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
||||||
|
|
@ -160,28 +141,15 @@ def test_multierror(
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('delay', (0, 0.5))
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'delay',
|
'num_subactors', range(25, 26),
|
||||||
(0, 0.5),
|
|
||||||
ids='delays={}'.format,
|
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay):
|
||||||
'num_subactors',
|
"""Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
||||||
range(25, 26),
|
|
||||||
ids= 'num_subs={}'.format,
|
|
||||||
)
|
|
||||||
def test_multierror_fast_nursery(
|
|
||||||
reg_addr: tuple,
|
|
||||||
start_method: str,
|
|
||||||
num_subactors: int,
|
|
||||||
delay: float,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
|
||||||
more than one actor errors and also with a delay before failure
|
more than one actor errors and also with a delay before failure
|
||||||
to test failure during an ongoing spawning.
|
to test failure during an ongoing spawning.
|
||||||
|
"""
|
||||||
'''
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
|
|
@ -221,15 +189,8 @@ async def do_nothing():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt])
|
||||||
'mechanism', [
|
def test_cancel_single_subactor(reg_addr, mechanism):
|
||||||
'nursery_cancel',
|
|
||||||
KeyboardInterrupt,
|
|
||||||
])
|
|
||||||
def test_cancel_single_subactor(
|
|
||||||
reg_addr: tuple,
|
|
||||||
mechanism: str|KeyboardInterrupt,
|
|
||||||
):
|
|
||||||
'''
|
'''
|
||||||
Ensure a ``ActorNursery.start_actor()`` spawned subactor
|
Ensure a ``ActorNursery.start_actor()`` spawned subactor
|
||||||
cancels when the nursery is cancelled.
|
cancels when the nursery is cancelled.
|
||||||
|
|
@ -271,13 +232,9 @@ async def stream_forever():
|
||||||
await trio.sleep(0.01)
|
await trio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
@tractor_test(
|
@tractor_test
|
||||||
timeout=6,
|
async def test_cancel_infinite_streamer(start_method):
|
||||||
)
|
|
||||||
async def test_cancel_infinite_streamer(
|
|
||||||
reg_addr: tuple,
|
|
||||||
start_method: str,
|
|
||||||
):
|
|
||||||
# stream for at most 1 seconds
|
# stream for at most 1 seconds
|
||||||
with (
|
with (
|
||||||
trio.fail_after(4),
|
trio.fail_after(4),
|
||||||
|
|
@ -300,14 +257,6 @@ async def test_cancel_infinite_streamer(
|
||||||
assert n.cancelled
|
assert n.cancelled
|
||||||
|
|
||||||
|
|
||||||
# @pytest.mark.skipon_spawn_backend(
|
|
||||||
# 'subint',
|
|
||||||
# reason=(
|
|
||||||
# 'XXX SUBINT HANGING TEST XXX\n'
|
|
||||||
# 'See oustanding issue(s)\n'
|
|
||||||
# # TODO, put issue link!
|
|
||||||
# )
|
|
||||||
# )
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'num_actors_and_errs',
|
'num_actors_and_errs',
|
||||||
[
|
[
|
||||||
|
|
@ -337,12 +286,9 @@ async def test_cancel_infinite_streamer(
|
||||||
'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail',
|
'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@tractor_test(
|
@tractor_test
|
||||||
timeout=10,
|
|
||||||
)
|
|
||||||
async def test_some_cancels_all(
|
async def test_some_cancels_all(
|
||||||
num_actors_and_errs: tuple,
|
num_actors_and_errs: tuple,
|
||||||
reg_addr: tuple,
|
|
||||||
start_method: str,
|
start_method: str,
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
):
|
):
|
||||||
|
|
@ -424,10 +370,7 @@ async def test_some_cancels_all(
|
||||||
pytest.fail("Should have gotten a remote assertion error?")
|
pytest.fail("Should have gotten a remote assertion error?")
|
||||||
|
|
||||||
|
|
||||||
async def spawn_and_error(
|
async def spawn_and_error(breadth, depth) -> None:
|
||||||
breadth: int,
|
|
||||||
depth: int,
|
|
||||||
) -> None:
|
|
||||||
name = tractor.current_actor().name
|
name = tractor.current_actor().name
|
||||||
async with tractor.open_nursery() as nursery:
|
async with tractor.open_nursery() as nursery:
|
||||||
for i in range(breadth):
|
for i in range(breadth):
|
||||||
|
|
@ -452,18 +395,8 @@ async def spawn_and_error(
|
||||||
await nursery.run_in_actor(*args, **kwargs)
|
await nursery.run_in_actor(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
# NOTE: subint_forkserver skip handled by file-level `pytestmark`
|
|
||||||
# above (same pytest-capture-fd hang class as siblings).
|
|
||||||
@pytest.mark.timeout(
|
|
||||||
10,
|
|
||||||
method='thread',
|
|
||||||
)
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_nested_multierrors(
|
async def test_nested_multierrors(loglevel, start_method):
|
||||||
reg_addr: tuple,
|
|
||||||
loglevel: str,
|
|
||||||
start_method: str,
|
|
||||||
):
|
|
||||||
'''
|
'''
|
||||||
Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
|
Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
|
||||||
test goes only 2 nurseries deep but we should eventually have tests
|
test goes only 2 nurseries deep but we should eventually have tests
|
||||||
|
|
@ -498,7 +431,7 @@ async def test_nested_multierrors(
|
||||||
for subexc in err.exceptions:
|
for subexc in err.exceptions:
|
||||||
|
|
||||||
# verify first level actor errors are wrapped as remote
|
# verify first level actor errors are wrapped as remote
|
||||||
if _friggin_windows:
|
if is_win():
|
||||||
|
|
||||||
# windows is often too slow and cancellation seems
|
# windows is often too slow and cancellation seems
|
||||||
# to happen before an actor is spawned
|
# to happen before an actor is spawned
|
||||||
|
|
@ -531,7 +464,7 @@ async def test_nested_multierrors(
|
||||||
# XXX not sure what's up with this..
|
# XXX not sure what's up with this..
|
||||||
# on windows sometimes spawning is just too slow and
|
# on windows sometimes spawning is just too slow and
|
||||||
# we get back the (sent) cancel signal instead
|
# we get back the (sent) cancel signal instead
|
||||||
if _friggin_windows:
|
if is_win():
|
||||||
if isinstance(subexc, tractor.RemoteActorError):
|
if isinstance(subexc, tractor.RemoteActorError):
|
||||||
assert subexc.boxed_type in (
|
assert subexc.boxed_type in (
|
||||||
BaseExceptionGroup,
|
BaseExceptionGroup,
|
||||||
|
|
@ -550,24 +483,20 @@ async def test_nested_multierrors(
|
||||||
|
|
||||||
@no_windows
|
@no_windows
|
||||||
def test_cancel_via_SIGINT(
|
def test_cancel_via_SIGINT(
|
||||||
reg_addr: tuple,
|
loglevel,
|
||||||
loglevel: str,
|
start_method,
|
||||||
start_method: str,
|
spawn_backend,
|
||||||
):
|
):
|
||||||
'''
|
"""Ensure that a control-C (SIGINT) signal cancels both the parent and
|
||||||
Ensure that a control-C (SIGINT) signal cancels both the parent and
|
|
||||||
child processes in trionic fashion
|
child processes in trionic fashion
|
||||||
|
"""
|
||||||
'''
|
pid = os.getpid()
|
||||||
pid: int = os.getpid()
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(2):
|
with trio.fail_after(2):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as tn:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as tn:
|
|
||||||
await tn.start_actor('sucka')
|
await tn.start_actor('sucka')
|
||||||
if 'mp' in start_method:
|
if 'mp' in spawn_backend:
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
os.kill(pid, signal.SIGINT)
|
os.kill(pid, signal.SIGINT)
|
||||||
await trio.sleep_forever()
|
await trio.sleep_forever()
|
||||||
|
|
@ -578,38 +507,23 @@ def test_cancel_via_SIGINT(
|
||||||
|
|
||||||
@no_windows
|
@no_windows
|
||||||
def test_cancel_via_SIGINT_other_task(
|
def test_cancel_via_SIGINT_other_task(
|
||||||
reg_addr: tuple,
|
loglevel,
|
||||||
loglevel: str,
|
start_method,
|
||||||
start_method: str,
|
spawn_backend,
|
||||||
spawn_backend: str,
|
|
||||||
):
|
):
|
||||||
'''
|
"""Ensure that a control-C (SIGINT) signal cancels both the parent
|
||||||
Ensure that a control-C (SIGINT) signal cancels both the parent
|
and child processes in trionic fashion even a subprocess is started
|
||||||
and child processes in trionic fashion even a subprocess is
|
from a separate ``trio`` child task.
|
||||||
started from a separate ``trio`` child task.
|
"""
|
||||||
|
pid = os.getpid()
|
||||||
'''
|
timeout: float = 2
|
||||||
from .conftest import cpu_scaling_factor
|
if is_win(): # smh
|
||||||
|
|
||||||
pid: int = os.getpid()
|
|
||||||
timeout: float = (
|
|
||||||
4 if _non_linux
|
|
||||||
else 2
|
|
||||||
)
|
|
||||||
if _friggin_windows: # smh
|
|
||||||
timeout += 1
|
timeout += 1
|
||||||
|
|
||||||
# add latency headroom for CPU freq scaling (auto-cpufreq et al.)
|
|
||||||
headroom: float = cpu_scaling_factor()
|
|
||||||
if headroom != 1.:
|
|
||||||
timeout *= headroom
|
|
||||||
|
|
||||||
async def spawn_and_sleep_forever(
|
async def spawn_and_sleep_forever(
|
||||||
task_status=trio.TASK_STATUS_IGNORED
|
task_status=trio.TASK_STATUS_IGNORED
|
||||||
):
|
):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as tn:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as tn:
|
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
sleep_forever,
|
sleep_forever,
|
||||||
|
|
@ -654,14 +568,6 @@ async def spawn_sub_with_sync_blocking_task():
|
||||||
print('exiting first subactor layer..\n')
|
print('exiting first subactor layer..\n')
|
||||||
|
|
||||||
|
|
||||||
# @pytest.mark.skipon_spawn_backend(
|
|
||||||
# 'subint',
|
|
||||||
# reason=(
|
|
||||||
# 'XXX SUBINT HANGING TEST XXX\n'
|
|
||||||
# 'See oustanding issue(s)\n'
|
|
||||||
# # TODO, put issue link!
|
|
||||||
# )
|
|
||||||
# )
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'man_cancel_outer',
|
'man_cancel_outer',
|
||||||
[
|
[
|
||||||
|
|
@ -738,11 +644,7 @@ def test_cancel_while_childs_child_in_sync_sleep(
|
||||||
#
|
#
|
||||||
# delay = 1 # no AssertionError in eg, TooSlowError raised.
|
# delay = 1 # no AssertionError in eg, TooSlowError raised.
|
||||||
# delay = 2 # is AssertionError in eg AND no TooSlowError !?
|
# delay = 2 # is AssertionError in eg AND no TooSlowError !?
|
||||||
# is AssertionError in eg AND no _cs cancellation.
|
delay = 4 # is AssertionError in eg AND no _cs cancellation.
|
||||||
delay = (
|
|
||||||
6 if _non_linux
|
|
||||||
else 4
|
|
||||||
)
|
|
||||||
|
|
||||||
with trio.fail_after(delay) as _cs:
|
with trio.fail_after(delay) as _cs:
|
||||||
# with trio.CancelScope() as cs:
|
# with trio.CancelScope() as cs:
|
||||||
|
|
@ -776,7 +678,7 @@ def test_cancel_while_childs_child_in_sync_sleep(
|
||||||
|
|
||||||
|
|
||||||
def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
|
def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
|
||||||
start_method: str,
|
start_method,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
This is a very subtle test which demonstrates how cancellation
|
This is a very subtle test which demonstrates how cancellation
|
||||||
|
|
@ -794,7 +696,7 @@ def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
|
||||||
kbi_delay = 0.5
|
kbi_delay = 0.5
|
||||||
timeout: float = 2.9
|
timeout: float = 2.9
|
||||||
|
|
||||||
if _friggin_windows: # smh
|
if is_win(): # smh
|
||||||
timeout += 1
|
timeout += 1
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|
|
||||||
|
|
@ -18,15 +18,16 @@ from tractor import RemoteActorError
|
||||||
|
|
||||||
|
|
||||||
async def aio_streamer(
|
async def aio_streamer(
|
||||||
chan: tractor.to_asyncio.LinkedTaskChannel,
|
from_trio: asyncio.Queue,
|
||||||
|
to_trio: trio.abc.SendChannel,
|
||||||
) -> trio.abc.ReceiveChannel:
|
) -> trio.abc.ReceiveChannel:
|
||||||
|
|
||||||
# required first msg to sync caller
|
# required first msg to sync caller
|
||||||
chan.started_nowait(None)
|
to_trio.send_nowait(None)
|
||||||
|
|
||||||
from itertools import cycle
|
from itertools import cycle
|
||||||
for i in cycle(range(10)):
|
for i in cycle(range(10)):
|
||||||
chan.send_nowait(i)
|
to_trio.send_nowait(i)
|
||||||
await asyncio.sleep(0.01)
|
await asyncio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -68,7 +69,7 @@ async def wrapper_mngr(
|
||||||
else:
|
else:
|
||||||
async with tractor.to_asyncio.open_channel_from(
|
async with tractor.to_asyncio.open_channel_from(
|
||||||
aio_streamer,
|
aio_streamer,
|
||||||
) as (from_aio, first):
|
) as (first, from_aio):
|
||||||
assert not first
|
assert not first
|
||||||
|
|
||||||
# cache it so next task uses broadcast receiver
|
# cache it so next task uses broadcast receiver
|
||||||
|
|
|
||||||
|
|
@ -10,19 +10,7 @@ from tractor._testing import tractor_test
|
||||||
MESSAGE = 'tractoring at full speed'
|
MESSAGE = 'tractoring at full speed'
|
||||||
|
|
||||||
|
|
||||||
def test_empty_mngrs_input_raises(
|
def test_empty_mngrs_input_raises() -> None:
|
||||||
tpt_proto: str,
|
|
||||||
) -> None:
|
|
||||||
# TODO, the `open_actor_cluster()` teardown hangs
|
|
||||||
# intermittently on UDS when `gather_contexts(mngrs=())`
|
|
||||||
# raises `ValueError` mid-setup; likely a race in the
|
|
||||||
# actor-nursery cleanup vs UDS socket shutdown. Needs
|
|
||||||
# a deeper look at `._clustering`/`._supervise` teardown
|
|
||||||
# paths with the UDS transport.
|
|
||||||
if tpt_proto == 'uds':
|
|
||||||
pytest.skip(
|
|
||||||
'actor-cluster teardown hangs intermittently on UDS'
|
|
||||||
)
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(3):
|
with trio.fail_after(3):
|
||||||
|
|
@ -68,39 +56,25 @@ async def worker(
|
||||||
print(msg)
|
print(msg)
|
||||||
assert msg == MESSAGE
|
assert msg == MESSAGE
|
||||||
|
|
||||||
# ?TODO, does this ever cause a hang?
|
# TODO: does this ever cause a hang
|
||||||
# assert 0
|
# assert 0
|
||||||
|
|
||||||
|
|
||||||
# ?TODO, but needs a fn-scoped tpt_proto fixture..
|
|
||||||
# @pytest.mark.no_tpt('uds')
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_streaming_to_actor_cluster(
|
async def test_streaming_to_actor_cluster() -> None:
|
||||||
tpt_proto: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Open an actor "cluster" using the (experimental) `._clustering`
|
|
||||||
API and conduct standard inter-task-ctx streaming.
|
|
||||||
|
|
||||||
'''
|
async with (
|
||||||
if tpt_proto == 'uds':
|
open_actor_cluster(modules=[__name__]) as portals,
|
||||||
pytest.skip(
|
|
||||||
f'Test currently fails with tpt-proto={tpt_proto!r}\n'
|
|
||||||
)
|
|
||||||
|
|
||||||
with trio.fail_after(6):
|
gather_contexts(
|
||||||
async with (
|
mngrs=[p.open_context(worker) for p in portals.values()],
|
||||||
open_actor_cluster(modules=[__name__]) as portals,
|
) as contexts,
|
||||||
|
|
||||||
gather_contexts(
|
gather_contexts(
|
||||||
mngrs=[p.open_context(worker) for p in portals.values()],
|
mngrs=[ctx[0].open_stream() for ctx in contexts],
|
||||||
) as contexts,
|
) as streams,
|
||||||
|
|
||||||
gather_contexts(
|
):
|
||||||
mngrs=[ctx[0].open_stream() for ctx in contexts],
|
with trio.move_on_after(1):
|
||||||
) as streams,
|
for stream in itertools.cycle(streams):
|
||||||
|
await stream.send(MESSAGE)
|
||||||
):
|
|
||||||
with trio.move_on_after(1):
|
|
||||||
for stream in itertools.cycle(streams):
|
|
||||||
await stream.send(MESSAGE)
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from itertools import count
|
||||||
import math
|
import math
|
||||||
import platform
|
import platform
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
import sys
|
|
||||||
from typing import (
|
from typing import (
|
||||||
Callable,
|
Callable,
|
||||||
)
|
)
|
||||||
|
|
@ -26,7 +25,7 @@ from tractor._exceptions import (
|
||||||
StreamOverrun,
|
StreamOverrun,
|
||||||
ContextCancelled,
|
ContextCancelled,
|
||||||
)
|
)
|
||||||
from tractor.runtime._state import current_ipc_ctx
|
from tractor._state import current_ipc_ctx
|
||||||
|
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
tractor_test,
|
tractor_test,
|
||||||
|
|
@ -939,14 +938,9 @@ def test_one_end_stream_not_opened(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
overrunner, buf_size_increase, entrypoint = overrun_by
|
overrunner, buf_size_increase, entrypoint = overrun_by
|
||||||
from tractor.runtime._runtime import Actor
|
from tractor._runtime import Actor
|
||||||
buf_size = buf_size_increase + Actor.msg_buffer_size
|
buf_size = buf_size_increase + Actor.msg_buffer_size
|
||||||
|
|
||||||
timeout: float = (
|
|
||||||
1 if sys.platform == 'linux'
|
|
||||||
else 3
|
|
||||||
)
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
|
|
@ -956,7 +950,7 @@ def test_one_end_stream_not_opened(
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
||||||
with trio.fail_after(timeout):
|
with trio.fail_after(1):
|
||||||
async with portal.open_context(
|
async with portal.open_context(
|
||||||
entrypoint,
|
entrypoint,
|
||||||
) as (ctx, sent):
|
) as (ctx, sent):
|
||||||
|
|
|
||||||
|
|
@ -1,33 +1,25 @@
|
||||||
'''
|
"""
|
||||||
Discovery subsystem via a "registrar" actor scenarios.
|
Actor "discovery" testing
|
||||||
|
"""
|
||||||
'''
|
|
||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
import platform
|
import platform
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import itertools
|
import itertools
|
||||||
import time
|
|
||||||
from typing import Callable
|
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
import pytest
|
import pytest
|
||||||
import subprocess
|
import subprocess
|
||||||
import tractor
|
import tractor
|
||||||
from tractor.devx import dump_on_hang
|
|
||||||
from tractor.trionics import collapse_eg
|
from tractor.trionics import collapse_eg
|
||||||
from tractor._testing import tractor_test
|
from tractor._testing import tractor_test
|
||||||
from tractor.discovery._addr import wrap_address
|
|
||||||
from tractor.discovery._multiaddr import mk_maddr
|
|
||||||
import trio
|
import trio
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_reg_then_unreg(
|
async def test_reg_then_unreg(reg_addr):
|
||||||
reg_addr: tuple,
|
|
||||||
):
|
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_registrar
|
assert actor.is_arbiter
|
||||||
assert len(actor._registry) == 1 # only self is registered
|
assert len(actor._registry) == 1 # only self is registered
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
|
@ -35,10 +27,10 @@ async def test_reg_then_unreg(
|
||||||
) as n:
|
) as n:
|
||||||
|
|
||||||
portal = await n.start_actor('actor', enable_modules=[__name__])
|
portal = await n.start_actor('actor', enable_modules=[__name__])
|
||||||
uid = portal.channel.aid.uid
|
uid = portal.channel.uid
|
||||||
|
|
||||||
async with tractor.get_registry(reg_addr) as aportal:
|
async with tractor.get_registry(reg_addr) as aportal:
|
||||||
# this local actor should be the registrar
|
# this local actor should be the arbiter
|
||||||
assert actor is aportal.actor
|
assert actor is aportal.actor
|
||||||
|
|
||||||
async with tractor.wait_for_actor('actor'):
|
async with tractor.wait_for_actor('actor'):
|
||||||
|
|
@ -56,49 +48,6 @@ async def test_reg_then_unreg(
|
||||||
assert not sockaddrs
|
assert not sockaddrs
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
|
||||||
async def test_reg_then_unreg_maddr(
|
|
||||||
reg_addr: tuple,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Same as `test_reg_then_unreg` but pass the registry
|
|
||||||
address as a multiaddr string to verify `wrap_address()`
|
|
||||||
multiaddr parsing end-to-end through the runtime.
|
|
||||||
|
|
||||||
'''
|
|
||||||
# tuple -> Address -> multiaddr string
|
|
||||||
addr_obj = wrap_address(reg_addr)
|
|
||||||
maddr_str: str = str(mk_maddr(addr_obj))
|
|
||||||
|
|
||||||
actor = tractor.current_actor()
|
|
||||||
assert actor.is_registrar
|
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
|
||||||
registry_addrs=[maddr_str],
|
|
||||||
) as n:
|
|
||||||
|
|
||||||
portal = await n.start_actor(
|
|
||||||
'actor_maddr',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
uid = portal.channel.aid.uid
|
|
||||||
|
|
||||||
async with tractor.get_registry(maddr_str) as aportal:
|
|
||||||
assert actor is aportal.actor
|
|
||||||
|
|
||||||
async with tractor.wait_for_actor('actor_maddr'):
|
|
||||||
assert uid in aportal.actor._registry
|
|
||||||
sockaddrs = actor._registry[uid]
|
|
||||||
assert sockaddrs
|
|
||||||
|
|
||||||
await n.cancel()
|
|
||||||
|
|
||||||
await trio.sleep(0.1)
|
|
||||||
assert uid not in aportal.actor._registry
|
|
||||||
sockaddrs = actor._registry.get(uid)
|
|
||||||
assert not sockaddrs
|
|
||||||
|
|
||||||
|
|
||||||
the_line = 'Hi my name is {}'
|
the_line = 'Hi my name is {}'
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -132,20 +81,12 @@ async def say_hello_use_wait(
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.timeout(
|
|
||||||
7,
|
|
||||||
method='thread',
|
|
||||||
)
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize('func', [say_hello, say_hello_use_wait])
|
||||||
'func',
|
|
||||||
[say_hello,
|
|
||||||
say_hello_use_wait]
|
|
||||||
)
|
|
||||||
async def test_trynamic_trio(
|
async def test_trynamic_trio(
|
||||||
func: Callable,
|
func,
|
||||||
start_method: str,
|
start_method,
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Root actor acting as the "director" and running one-shot-task-actors
|
Root actor acting as the "director" and running one-shot-task-actors
|
||||||
|
|
@ -178,10 +119,7 @@ async def stream_forever():
|
||||||
await trio.sleep(0.01)
|
await trio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
async def cancel(
|
async def cancel(use_signal, delay=0):
|
||||||
use_signal: bool,
|
|
||||||
delay: float = 0,
|
|
||||||
):
|
|
||||||
# hold on there sally
|
# hold on there sally
|
||||||
await trio.sleep(delay)
|
await trio.sleep(delay)
|
||||||
|
|
||||||
|
|
@ -194,17 +132,15 @@ async def cancel(
|
||||||
raise KeyboardInterrupt
|
raise KeyboardInterrupt
|
||||||
|
|
||||||
|
|
||||||
async def stream_from(portal: tractor.Portal):
|
async def stream_from(portal):
|
||||||
async with portal.open_stream_from(stream_forever) as stream:
|
async with portal.open_stream_from(stream_forever) as stream:
|
||||||
async for value in stream:
|
async for value in stream:
|
||||||
print(value)
|
print(value)
|
||||||
|
|
||||||
|
|
||||||
async def unpack_reg(
|
async def unpack_reg(actor_or_portal):
|
||||||
actor_or_portal: tractor.Portal|tractor.Actor,
|
|
||||||
):
|
|
||||||
'''
|
'''
|
||||||
Get and unpack a "registry" RPC request from the registrar
|
Get and unpack a "registry" RPC request from the "arbiter" registry
|
||||||
system.
|
system.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
@ -213,10 +149,7 @@ async def unpack_reg(
|
||||||
else:
|
else:
|
||||||
msg = await actor_or_portal.run_from_ns('self', 'get_registry')
|
msg = await actor_or_portal.run_from_ns('self', 'get_registry')
|
||||||
|
|
||||||
return {
|
return {tuple(key.split('.')): val for key, val in msg.items()}
|
||||||
tuple(key.split('.')): val
|
|
||||||
for key, val in msg.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
async def spawn_and_check_registry(
|
async def spawn_and_check_registry(
|
||||||
|
|
@ -240,26 +173,24 @@ async def spawn_and_check_registry(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
):
|
):
|
||||||
async with tractor.get_registry(
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
addr=reg_addr,
|
|
||||||
) as portal:
|
|
||||||
# runtime needs to be up to call this
|
# runtime needs to be up to call this
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
|
|
||||||
if remote_arbiter:
|
if remote_arbiter:
|
||||||
assert not actor.is_registrar
|
assert not actor.is_arbiter
|
||||||
|
|
||||||
if actor.is_registrar:
|
if actor.is_arbiter:
|
||||||
extra = 1 # registrar is local root actor
|
extra = 1 # arbiter is local root actor
|
||||||
get_reg = partial(unpack_reg, actor)
|
get_reg = partial(unpack_reg, actor)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
get_reg = partial(unpack_reg, portal)
|
get_reg = partial(unpack_reg, portal)
|
||||||
extra = 2 # local root actor + remote registrar
|
extra = 2 # local root actor + remote arbiter
|
||||||
|
|
||||||
# ensure current actor is registered
|
# ensure current actor is registered
|
||||||
registry: dict = await get_reg()
|
registry: dict = await get_reg()
|
||||||
assert actor.aid.uid in registry
|
assert actor.uid in registry
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery() as an:
|
||||||
|
|
@ -307,35 +238,22 @@ async def spawn_and_check_registry(
|
||||||
|
|
||||||
# all subactors should have de-registered
|
# all subactors should have de-registered
|
||||||
registry = await get_reg()
|
registry = await get_reg()
|
||||||
start: float = time.time()
|
assert len(registry) == extra
|
||||||
while (
|
assert actor.uid in registry
|
||||||
not (len(registry) == extra)
|
|
||||||
and
|
|
||||||
(time.time() - start) < 5
|
|
||||||
):
|
|
||||||
print(
|
|
||||||
f'Waiting for remaining subs to dereg..\n'
|
|
||||||
f'{registry!r}\n'
|
|
||||||
)
|
|
||||||
await trio.sleep(0.3)
|
|
||||||
else:
|
|
||||||
assert len(registry) == extra
|
|
||||||
|
|
||||||
assert actor.aid.uid in registry
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('use_signal', [False, True])
|
@pytest.mark.parametrize('use_signal', [False, True])
|
||||||
@pytest.mark.parametrize('with_streaming', [False, True])
|
@pytest.mark.parametrize('with_streaming', [False, True])
|
||||||
def test_subactors_unregister_on_cancel(
|
def test_subactors_unregister_on_cancel(
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
start_method: str,
|
start_method,
|
||||||
use_signal: bool,
|
use_signal,
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
with_streaming: bool,
|
with_streaming,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Verify that cancelling a nursery results in all subactors
|
Verify that cancelling a nursery results in all subactors
|
||||||
deregistering themselves with the registrar.
|
deregistering themselves with the arbiter.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
|
|
@ -356,17 +274,15 @@ def test_subactors_unregister_on_cancel(
|
||||||
def test_subactors_unregister_on_cancel_remote_daemon(
|
def test_subactors_unregister_on_cancel_remote_daemon(
|
||||||
daemon: subprocess.Popen,
|
daemon: subprocess.Popen,
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
start_method: str,
|
start_method,
|
||||||
use_signal: bool,
|
use_signal,
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
with_streaming: bool,
|
with_streaming,
|
||||||
):
|
):
|
||||||
'''
|
"""Verify that cancelling a nursery results in all subactors
|
||||||
Verify that cancelling a nursery results in all subactors
|
deregistering themselves with a **remote** (not in the local process
|
||||||
deregistering themselves with a **remote** (not in the local
|
tree) arbiter.
|
||||||
process tree) registrar.
|
"""
|
||||||
|
|
||||||
'''
|
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
|
|
@ -409,24 +325,20 @@ async def close_chans_before_nursery(
|
||||||
try:
|
try:
|
||||||
get_reg = partial(unpack_reg, aportal)
|
get_reg = partial(unpack_reg, aportal)
|
||||||
|
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery() as tn:
|
||||||
portal1 = await an.start_actor(
|
portal1 = await tn.start_actor(
|
||||||
name='consumer1',
|
name='consumer1', enable_modules=[__name__])
|
||||||
enable_modules=[__name__],
|
portal2 = await tn.start_actor(
|
||||||
)
|
'consumer2', enable_modules=[__name__])
|
||||||
portal2 = await an.start_actor(
|
|
||||||
'consumer2',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
|
|
||||||
async with (
|
# TODO: compact this back as was in last commit once
|
||||||
portal1.open_stream_from(
|
# 3.9+, see https://github.com/goodboy/tractor/issues/207
|
||||||
|
async with portal1.open_stream_from(
|
||||||
|
stream_forever
|
||||||
|
) as agen1:
|
||||||
|
async with portal2.open_stream_from(
|
||||||
stream_forever
|
stream_forever
|
||||||
) as agen1,
|
) as agen2:
|
||||||
portal2.open_stream_from(
|
|
||||||
stream_forever
|
|
||||||
) as agen2,
|
|
||||||
):
|
|
||||||
async with (
|
async with (
|
||||||
collapse_eg(),
|
collapse_eg(),
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
|
|
@ -437,7 +349,7 @@ async def close_chans_before_nursery(
|
||||||
await streamer(agen2)
|
await streamer(agen2)
|
||||||
finally:
|
finally:
|
||||||
# Kill the root nursery thus resulting in
|
# Kill the root nursery thus resulting in
|
||||||
# normal registrar channel ops to fail during
|
# normal arbiter channel ops to fail during
|
||||||
# teardown. It doesn't seem like this is
|
# teardown. It doesn't seem like this is
|
||||||
# reliably triggered by an external SIGINT.
|
# reliably triggered by an external SIGINT.
|
||||||
# tractor.current_actor()._root_nursery.cancel_scope.cancel()
|
# tractor.current_actor()._root_nursery.cancel_scope.cancel()
|
||||||
|
|
@ -449,30 +361,27 @@ async def close_chans_before_nursery(
|
||||||
# also kill off channels cuz why not
|
# also kill off channels cuz why not
|
||||||
await agen1.aclose()
|
await agen1.aclose()
|
||||||
await agen2.aclose()
|
await agen2.aclose()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
with trio.CancelScope(shield=True):
|
with trio.CancelScope(shield=True):
|
||||||
await trio.sleep(1)
|
await trio.sleep(1)
|
||||||
|
|
||||||
# all subactors should have de-registered
|
# all subactors should have de-registered
|
||||||
registry = await get_reg()
|
registry = await get_reg()
|
||||||
assert portal1.channel.aid.uid not in registry
|
assert portal1.channel.uid not in registry
|
||||||
assert portal2.channel.aid.uid not in registry
|
assert portal2.channel.uid not in registry
|
||||||
assert len(registry) == entries_at_end
|
assert len(registry) == entries_at_end
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('use_signal', [False, True])
|
@pytest.mark.parametrize('use_signal', [False, True])
|
||||||
def test_close_channel_explicit(
|
def test_close_channel_explicit(
|
||||||
start_method: str,
|
start_method,
|
||||||
use_signal: bool,
|
use_signal,
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
):
|
):
|
||||||
'''
|
"""Verify that closing a stream explicitly and killing the actor's
|
||||||
Verify that closing a stream explicitly and killing the actor's
|
|
||||||
"root nursery" **before** the containing nursery tears down also
|
"root nursery" **before** the containing nursery tears down also
|
||||||
results in subactor(s) deregistering from the registrar.
|
results in subactor(s) deregistering from the arbiter.
|
||||||
|
"""
|
||||||
'''
|
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
|
|
@ -485,18 +394,16 @@ def test_close_channel_explicit(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('use_signal', [False, True])
|
@pytest.mark.parametrize('use_signal', [False, True])
|
||||||
def test_close_channel_explicit_remote_registrar(
|
def test_close_channel_explicit_remote_arbiter(
|
||||||
daemon: subprocess.Popen,
|
daemon: subprocess.Popen,
|
||||||
start_method: str,
|
start_method,
|
||||||
use_signal: bool,
|
use_signal,
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
):
|
):
|
||||||
'''
|
"""Verify that closing a stream explicitly and killing the actor's
|
||||||
Verify that closing a stream explicitly and killing the actor's
|
|
||||||
"root nursery" **before** the containing nursery tears down also
|
"root nursery" **before** the containing nursery tears down also
|
||||||
results in subactor(s) deregistering from the registrar.
|
results in subactor(s) deregistering from the arbiter.
|
||||||
|
"""
|
||||||
'''
|
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
|
|
@ -506,138 +413,3 @@ def test_close_channel_explicit_remote_registrar(
|
||||||
remote_arbiter=True,
|
remote_arbiter=True,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@tractor.context
|
|
||||||
async def kill_transport(
|
|
||||||
ctx: tractor.Context,
|
|
||||||
) -> None:
|
|
||||||
|
|
||||||
await ctx.started()
|
|
||||||
actor: tractor.Actor = tractor.current_actor()
|
|
||||||
actor.ipc_server.cancel()
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Wall-clock bound via `pytest-timeout` (`method='thread'`).
|
|
||||||
# Under `--spawn-backend=subint` this test can wedge in an
|
|
||||||
# un-Ctrl-C-able state (abandoned-subint + shared-GIL
|
|
||||||
# starvation → signal-wakeup-fd pipe fills → SIGINT silently
|
|
||||||
# dropped; see `ai/conc-anal/subint_sigint_starvation_issue.md`).
|
|
||||||
# `method='thread'` is specifically required because `signal`-
|
|
||||||
# method SIGALRM suffers the same GIL-starvation path and
|
|
||||||
# wouldn't fire the Python-level handler.
|
|
||||||
# At timeout the plugin hard-kills the pytest process — that's
|
|
||||||
# the intended behavior here; the alternative is an unattended
|
|
||||||
# suite run that never returns.
|
|
||||||
@pytest.mark.timeout(
|
|
||||||
3, # NOTE should be a 2.1s happy path.
|
|
||||||
method='thread',
|
|
||||||
)
|
|
||||||
@pytest.mark.skipon_spawn_backend(
|
|
||||||
'subint',
|
|
||||||
reason=(
|
|
||||||
'XXX SUBINT HANGING TEST XXX\n'
|
|
||||||
'See oustanding issue(s)\n'
|
|
||||||
# TODO, put issue link!
|
|
||||||
)
|
|
||||||
)
|
|
||||||
# @pytest.mark.parametrize('use_signal', [False, True])
|
|
||||||
#
|
|
||||||
def test_stale_entry_is_deleted(
|
|
||||||
debug_mode: bool,
|
|
||||||
daemon: subprocess.Popen,
|
|
||||||
start_method: str,
|
|
||||||
reg_addr: tuple,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Ensure that when a stale entry is detected in the registrar's
|
|
||||||
table that the `find_actor()` API takes care of deleting the
|
|
||||||
stale entry and not delivering a bad portal.
|
|
||||||
|
|
||||||
'''
|
|
||||||
async def main():
|
|
||||||
|
|
||||||
name: str = 'transport_fails_actor'
|
|
||||||
_reg_ptl: tractor.Portal
|
|
||||||
an: tractor.ActorNursery
|
|
||||||
async with (
|
|
||||||
tractor.open_nursery(
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an,
|
|
||||||
tractor.get_registry(reg_addr) as _reg_ptl,
|
|
||||||
):
|
|
||||||
ptl: tractor.Portal = await an.start_actor(
|
|
||||||
name,
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
async with ptl.open_context(
|
|
||||||
kill_transport,
|
|
||||||
) as (first, ctx):
|
|
||||||
async with tractor.find_actor(
|
|
||||||
name,
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as maybe_portal:
|
|
||||||
# because the transitive
|
|
||||||
# `._api.maybe_open_portal()` call should
|
|
||||||
# fail and implicitly call `.delete_addr()`
|
|
||||||
assert maybe_portal is None
|
|
||||||
registry: dict = await unpack_reg(_reg_ptl)
|
|
||||||
assert ptl.chan.aid.uid not in registry
|
|
||||||
|
|
||||||
# should fail since we knocked out the IPC tpt XD
|
|
||||||
await ptl.cancel_actor()
|
|
||||||
await an.cancel()
|
|
||||||
|
|
||||||
# TODO, remove once the `[subint]` variant no longer hangs.
|
|
||||||
#
|
|
||||||
# Status (as of Phase B hard-kill landing):
|
|
||||||
#
|
|
||||||
# - `[trio]`/`[mp_*]` variants: completes normally; `dump_on_hang`
|
|
||||||
# is a no-op safety net here.
|
|
||||||
#
|
|
||||||
# - `[subint]` variant: hangs indefinitely AND is un-Ctrl-C-able.
|
|
||||||
# `strace -p <pytest_pid>` while in the hang reveals a silently-
|
|
||||||
# dropped SIGINT — the C signal handler tries to write the
|
|
||||||
# signum byte to Python's signal-wakeup fd and gets `EAGAIN`,
|
|
||||||
# meaning the pipe is full (nobody's draining it).
|
|
||||||
#
|
|
||||||
# Root-cause chain: our hard-kill in `spawn._subint` abandoned
|
|
||||||
# the driver OS-thread (which is `daemon=True`) after the soft-
|
|
||||||
# kill timeout, but the *sub-interpreter* inside that thread is
|
|
||||||
# still running `trio.run()` — `_interpreters.destroy()` can't
|
|
||||||
# force-stop a running subint (raises `InterpreterError`), and
|
|
||||||
# legacy-config subints share the main GIL. The abandoned subint
|
|
||||||
# starves the parent's trio event loop from iterating often
|
|
||||||
# enough to drain its wakeup pipe → SIGINT silently drops.
|
|
||||||
#
|
|
||||||
# This is structurally a CPython-level limitation: there's no
|
|
||||||
# public force-destroy primitive for a running subint. We
|
|
||||||
# escape on the harness side via a SIGINT-loop in the `daemon`
|
|
||||||
# fixture teardown (killing the bg registrar subproc closes its
|
|
||||||
# end of the IPC, which eventually unblocks a recv in main trio,
|
|
||||||
# which lets the loop drain the wakeup pipe). Long-term fix path:
|
|
||||||
# msgspec PEP 684 support (jcrist/msgspec#563) → isolated-mode
|
|
||||||
# subints with per-interp GIL.
|
|
||||||
#
|
|
||||||
# Full analysis:
|
|
||||||
# `ai/conc-anal/subint_sigint_starvation_issue.md`
|
|
||||||
#
|
|
||||||
# See also the *sibling* hang class documented in
|
|
||||||
# `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — same
|
|
||||||
# subint backend, different root cause (Ctrl-C-able hang, main
|
|
||||||
# trio loop iterating fine; ours to fix, not CPython's).
|
|
||||||
# Reproduced by `tests/test_subint_cancellation.py
|
|
||||||
# ::test_subint_non_checkpointing_child`.
|
|
||||||
#
|
|
||||||
# Kept here (and not behind a `pytestmark.skip`) so we can still
|
|
||||||
# inspect the dump file if the hang ever returns after a refactor.
|
|
||||||
# `pytest`'s stderr capture eats `faulthandler` output otherwise,
|
|
||||||
# so we route `dump_on_hang` to a file.
|
|
||||||
with dump_on_hang(
|
|
||||||
seconds=20,
|
|
||||||
path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump',
|
|
||||||
):
|
|
||||||
trio.run(main)
|
|
||||||
|
|
@ -9,17 +9,12 @@ import sys
|
||||||
import subprocess
|
import subprocess
|
||||||
import platform
|
import platform
|
||||||
import shutil
|
import shutil
|
||||||
from typing import Callable
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import tractor
|
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
examples_dir,
|
examples_dir,
|
||||||
)
|
)
|
||||||
|
|
||||||
_non_linux: bool = platform.system() != 'Linux'
|
|
||||||
_friggin_macos: bool = platform.system() == 'Darwin'
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def run_example_in_subproc(
|
def run_example_in_subproc(
|
||||||
|
|
@ -94,10 +89,8 @@ def run_example_in_subproc(
|
||||||
for f in p[2]
|
for f in p[2]
|
||||||
|
|
||||||
if (
|
if (
|
||||||
'__' not in f # ignore any pkg-mods
|
'__' not in f
|
||||||
# ignore any `__pycache__` subdir
|
and f[0] != '_'
|
||||||
and '__pycache__' not in str(p[0])
|
|
||||||
and f[0] != '_' # ignore any WIP "examplel mods"
|
|
||||||
and 'debugging' not in p[0]
|
and 'debugging' not in p[0]
|
||||||
and 'integration' not in p[0]
|
and 'integration' not in p[0]
|
||||||
and 'advanced_faults' not in p[0]
|
and 'advanced_faults' not in p[0]
|
||||||
|
|
@ -108,10 +101,8 @@ def run_example_in_subproc(
|
||||||
ids=lambda t: t[1],
|
ids=lambda t: t[1],
|
||||||
)
|
)
|
||||||
def test_example(
|
def test_example(
|
||||||
run_example_in_subproc: Callable,
|
run_example_in_subproc,
|
||||||
example_script: str,
|
example_script,
|
||||||
test_log: tractor.log.StackLevelAdapter,
|
|
||||||
ci_env: bool,
|
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Load and run scripts from this repo's ``examples/`` dir as a user
|
Load and run scripts from this repo's ``examples/`` dir as a user
|
||||||
|
|
@ -125,39 +116,9 @@ def test_example(
|
||||||
'''
|
'''
|
||||||
ex_file: str = os.path.join(*example_script)
|
ex_file: str = os.path.join(*example_script)
|
||||||
|
|
||||||
if (
|
if 'rpc_bidir_streaming' in ex_file and sys.version_info < (3, 9):
|
||||||
'rpc_bidir_streaming' in ex_file
|
|
||||||
and
|
|
||||||
sys.version_info < (3, 9)
|
|
||||||
):
|
|
||||||
pytest.skip("2-way streaming example requires py3.9 async with syntax")
|
pytest.skip("2-way streaming example requires py3.9 async with syntax")
|
||||||
|
|
||||||
if (
|
|
||||||
'full_fledged_streaming_service' in ex_file
|
|
||||||
and
|
|
||||||
_friggin_macos
|
|
||||||
and
|
|
||||||
ci_env
|
|
||||||
):
|
|
||||||
pytest.skip(
|
|
||||||
'Streaming example is too flaky in CI\n'
|
|
||||||
'AND their competitor runs this CI service..\n'
|
|
||||||
'This test does run just fine "in person" however..'
|
|
||||||
)
|
|
||||||
|
|
||||||
from .conftest import cpu_scaling_factor
|
|
||||||
|
|
||||||
timeout: float = (
|
|
||||||
60
|
|
||||||
if ci_env and _non_linux
|
|
||||||
else 16
|
|
||||||
)
|
|
||||||
|
|
||||||
# add latency headroom for CPU freq scaling (auto-cpufreq et al.)
|
|
||||||
headroom: float = cpu_scaling_factor()
|
|
||||||
if headroom != 1.:
|
|
||||||
timeout *= headroom
|
|
||||||
|
|
||||||
with open(ex_file, 'r') as ex:
|
with open(ex_file, 'r') as ex:
|
||||||
code = ex.read()
|
code = ex.read()
|
||||||
|
|
||||||
|
|
@ -165,12 +126,9 @@ def test_example(
|
||||||
err = None
|
err = None
|
||||||
try:
|
try:
|
||||||
if not proc.poll():
|
if not proc.poll():
|
||||||
_, err = proc.communicate(timeout=timeout)
|
_, err = proc.communicate(timeout=15)
|
||||||
|
|
||||||
except subprocess.TimeoutExpired as e:
|
except subprocess.TimeoutExpired as e:
|
||||||
test_log.exception(
|
|
||||||
f'Example failed to finish within {timeout}s ??\n'
|
|
||||||
)
|
|
||||||
proc.kill()
|
proc.kill()
|
||||||
err = e.stderr
|
err = e.stderr
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,8 +26,8 @@ from tractor import (
|
||||||
to_asyncio,
|
to_asyncio,
|
||||||
RemoteActorError,
|
RemoteActorError,
|
||||||
ContextCancelled,
|
ContextCancelled,
|
||||||
|
_state,
|
||||||
)
|
)
|
||||||
from tractor.runtime import _state
|
|
||||||
from tractor.trionics import BroadcastReceiver
|
from tractor.trionics import BroadcastReceiver
|
||||||
from tractor._testing import expect_ctxc
|
from tractor._testing import expect_ctxc
|
||||||
|
|
||||||
|
|
@ -47,11 +47,12 @@ async def sleep_and_err(
|
||||||
|
|
||||||
# just signature placeholders for compat with
|
# just signature placeholders for compat with
|
||||||
# ``to_asyncio.open_channel_from()``
|
# ``to_asyncio.open_channel_from()``
|
||||||
chan: to_asyncio.LinkedTaskChannel|None = None,
|
to_trio: trio.MemorySendChannel|None = None,
|
||||||
|
from_trio: asyncio.Queue|None = None,
|
||||||
|
|
||||||
):
|
):
|
||||||
if chan:
|
if to_trio:
|
||||||
chan.started_nowait('start')
|
to_trio.send_nowait('start')
|
||||||
|
|
||||||
await asyncio.sleep(sleep_for)
|
await asyncio.sleep(sleep_for)
|
||||||
assert 0
|
assert 0
|
||||||
|
|
@ -183,7 +184,6 @@ def test_tractor_cancels_aio(
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an:
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
asyncio_actor,
|
asyncio_actor,
|
||||||
|
|
@ -206,11 +206,11 @@ def test_trio_cancels_aio(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
# cancel the nursery shortly after boot
|
|
||||||
with trio.move_on_after(1):
|
with trio.move_on_after(1):
|
||||||
async with tractor.open_nursery(
|
# cancel the nursery shortly after boot
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as tn:
|
async with tractor.open_nursery() as tn:
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
asyncio_actor,
|
asyncio_actor,
|
||||||
target='aio_sleep_forever',
|
target='aio_sleep_forever',
|
||||||
|
|
@ -238,7 +238,7 @@ async def trio_ctx(
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
tractor.to_asyncio.open_channel_from(
|
tractor.to_asyncio.open_channel_from(
|
||||||
sleep_and_err,
|
sleep_and_err,
|
||||||
) as (chan, first),
|
) as (first, chan),
|
||||||
):
|
):
|
||||||
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
|
|
@ -278,9 +278,7 @@ def test_context_spawns_aio_task_that_errors(
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(1 + delay):
|
with trio.fail_after(1 + delay):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as an:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an:
|
|
||||||
p = await an.start_actor(
|
p = await an.start_actor(
|
||||||
'aio_daemon',
|
'aio_daemon',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
|
|
@ -363,9 +361,7 @@ def test_aio_cancelled_from_aio_causes_trio_cancelled(
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
an: tractor.ActorNursery
|
an: tractor.ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as an:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an:
|
|
||||||
p: tractor.Portal = await an.run_in_actor(
|
p: tractor.Portal = await an.run_in_actor(
|
||||||
asyncio_actor,
|
asyncio_actor,
|
||||||
target='aio_cancel',
|
target='aio_cancel',
|
||||||
|
|
@ -403,7 +399,7 @@ async def no_to_trio_in_args():
|
||||||
|
|
||||||
async def push_from_aio_task(
|
async def push_from_aio_task(
|
||||||
sequence: Iterable,
|
sequence: Iterable,
|
||||||
chan: to_asyncio.LinkedTaskChannel,
|
to_trio: trio.abc.SendChannel,
|
||||||
expect_cancel: False,
|
expect_cancel: False,
|
||||||
fail_early: bool,
|
fail_early: bool,
|
||||||
exit_early: bool,
|
exit_early: bool,
|
||||||
|
|
@ -411,12 +407,15 @@ async def push_from_aio_task(
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# print('trying breakpoint')
|
||||||
|
# breakpoint()
|
||||||
|
|
||||||
# sync caller ctx manager
|
# sync caller ctx manager
|
||||||
chan.started_nowait(True)
|
to_trio.send_nowait(True)
|
||||||
|
|
||||||
for i in sequence:
|
for i in sequence:
|
||||||
print(f'asyncio sending {i}')
|
print(f'asyncio sending {i}')
|
||||||
chan.send_nowait(i)
|
to_trio.send_nowait(i)
|
||||||
await asyncio.sleep(0.001)
|
await asyncio.sleep(0.001)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
|
@ -479,7 +478,7 @@ async def stream_from_aio(
|
||||||
trio_exit_early
|
trio_exit_early
|
||||||
))
|
))
|
||||||
|
|
||||||
) as (chan, first):
|
) as (first, chan):
|
||||||
|
|
||||||
assert first is True
|
assert first is True
|
||||||
|
|
||||||
|
|
@ -574,9 +573,7 @@ def test_basic_interloop_channel_stream(
|
||||||
async def main():
|
async def main():
|
||||||
# TODO, figure out min timeout here!
|
# TODO, figure out min timeout here!
|
||||||
with trio.fail_after(6):
|
with trio.fail_after(6):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as an:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an:
|
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
stream_from_aio,
|
stream_from_aio,
|
||||||
infect_asyncio=True,
|
infect_asyncio=True,
|
||||||
|
|
@ -589,13 +586,9 @@ def test_basic_interloop_channel_stream(
|
||||||
|
|
||||||
|
|
||||||
# TODO: parametrize the above test and avoid the duplication here?
|
# TODO: parametrize the above test and avoid the duplication here?
|
||||||
def test_trio_error_cancels_intertask_chan(
|
def test_trio_error_cancels_intertask_chan(reg_addr):
|
||||||
reg_addr: tuple[str, int],
|
|
||||||
):
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as an:
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an:
|
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
stream_from_aio,
|
stream_from_aio,
|
||||||
trio_raise_err=True,
|
trio_raise_err=True,
|
||||||
|
|
@ -630,7 +623,6 @@ def test_trio_closes_early_causes_aio_checkpoint_raise(
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
# enable_stack_on_sig=True,
|
# enable_stack_on_sig=True,
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
) as an:
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
stream_from_aio,
|
stream_from_aio,
|
||||||
|
|
@ -679,7 +671,6 @@ def test_aio_exits_early_relays_AsyncioTaskExited(
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(1 + delay):
|
with trio.fail_after(1 + delay):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
# enable_stack_on_sig=True,
|
# enable_stack_on_sig=True,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
@ -720,7 +711,6 @@ def test_aio_errors_and_channel_propagates_and_closes(
|
||||||
):
|
):
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as an:
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
|
|
@ -742,21 +732,15 @@ def test_aio_errors_and_channel_propagates_and_closes(
|
||||||
|
|
||||||
|
|
||||||
async def aio_echo_server(
|
async def aio_echo_server(
|
||||||
chan: to_asyncio.LinkedTaskChannel,
|
to_trio: trio.MemorySendChannel,
|
||||||
|
from_trio: asyncio.Queue,
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
|
||||||
An IPC-msg "echo server" with msgs received and relayed by
|
|
||||||
a parent `trio.Task` into a child `asyncio.Task`
|
|
||||||
and then repeated back to that local parent (`trio.Task`)
|
|
||||||
and sent again back to the original calling remote actor.
|
|
||||||
|
|
||||||
'''
|
to_trio.send_nowait('start')
|
||||||
# same semantics as `trio.TaskStatus.started()`
|
|
||||||
chan.started_nowait('start')
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
msg = await chan.get()
|
msg = await from_trio.get()
|
||||||
except to_asyncio.TrioTaskExited:
|
except to_asyncio.TrioTaskExited:
|
||||||
print(
|
print(
|
||||||
'breaking aio echo loop due to `trio` exit!'
|
'breaking aio echo loop due to `trio` exit!'
|
||||||
|
|
@ -764,7 +748,7 @@ async def aio_echo_server(
|
||||||
break
|
break
|
||||||
|
|
||||||
# echo the msg back
|
# echo the msg back
|
||||||
chan.send_nowait(msg)
|
to_trio.send_nowait(msg)
|
||||||
|
|
||||||
# if we get the terminate sentinel
|
# if we get the terminate sentinel
|
||||||
# break the echo loop
|
# break the echo loop
|
||||||
|
|
@ -781,10 +765,7 @@ async def trio_to_aio_echo_server(
|
||||||
):
|
):
|
||||||
async with to_asyncio.open_channel_from(
|
async with to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (
|
) as (first, chan):
|
||||||
chan,
|
|
||||||
first, # value from `chan.started_nowait()` above
|
|
||||||
):
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
|
|
||||||
await ctx.started(first)
|
await ctx.started(first)
|
||||||
|
|
@ -795,8 +776,7 @@ async def trio_to_aio_echo_server(
|
||||||
await chan.send(msg)
|
await chan.send(msg)
|
||||||
|
|
||||||
out = await chan.receive()
|
out = await chan.receive()
|
||||||
|
# echo back to parent actor-task
|
||||||
# echo back to parent-actor's remote parent-ctx-task!
|
|
||||||
await stream.send(out)
|
await stream.send(out)
|
||||||
|
|
||||||
if out is None:
|
if out is None:
|
||||||
|
|
@ -820,7 +800,6 @@ def test_echoserver_detailed_mechanics(
|
||||||
):
|
):
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as an:
|
) as an:
|
||||||
p = await an.start_actor(
|
p = await an.start_actor(
|
||||||
|
|
@ -999,7 +978,7 @@ async def manage_file(
|
||||||
],
|
],
|
||||||
ids=[
|
ids=[
|
||||||
'bg_aio_task',
|
'bg_aio_task',
|
||||||
'just_trio_sleep',
|
'just_trio_slee',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
@ -1015,14 +994,11 @@ async def manage_file(
|
||||||
)
|
)
|
||||||
def test_sigint_closes_lifetime_stack(
|
def test_sigint_closes_lifetime_stack(
|
||||||
tmp_path: Path,
|
tmp_path: Path,
|
||||||
reg_addr: tuple,
|
|
||||||
debug_mode: bool,
|
|
||||||
|
|
||||||
wait_for_ctx: bool,
|
wait_for_ctx: bool,
|
||||||
bg_aio_task: bool,
|
bg_aio_task: bool,
|
||||||
trio_side_is_shielded: bool,
|
trio_side_is_shielded: bool,
|
||||||
|
debug_mode: bool,
|
||||||
send_sigint_to: str,
|
send_sigint_to: str,
|
||||||
start_method: str,
|
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Ensure that an infected child can use the `Actor.lifetime_stack`
|
Ensure that an infected child can use the `Actor.lifetime_stack`
|
||||||
|
|
@ -1032,22 +1008,12 @@ def test_sigint_closes_lifetime_stack(
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
delay: float = (
|
delay = 999 if tractor.debug_mode() else 1
|
||||||
999
|
|
||||||
if debug_mode
|
|
||||||
else 1
|
|
||||||
)
|
|
||||||
try:
|
try:
|
||||||
an: tractor.ActorNursery
|
an: tractor.ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
||||||
# sanity
|
|
||||||
if debug_mode:
|
|
||||||
assert tractor.debug_mode()
|
|
||||||
|
|
||||||
p: tractor.Portal = await an.start_actor(
|
p: tractor.Portal = await an.start_actor(
|
||||||
'file_mngr',
|
'file_mngr',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
|
|
@ -1082,10 +1048,6 @@ def test_sigint_closes_lifetime_stack(
|
||||||
cpid if send_sigint_to == 'child'
|
cpid if send_sigint_to == 'child'
|
||||||
else os.getpid()
|
else os.getpid()
|
||||||
)
|
)
|
||||||
print(
|
|
||||||
f'Sending SIGINT to {send_sigint_to!r}\n'
|
|
||||||
f'pid: {pid!r}\n'
|
|
||||||
)
|
|
||||||
os.kill(
|
os.kill(
|
||||||
pid,
|
pid,
|
||||||
signal.SIGINT,
|
signal.SIGINT,
|
||||||
|
|
@ -1096,37 +1058,13 @@ def test_sigint_closes_lifetime_stack(
|
||||||
# timeout should trigger!
|
# timeout should trigger!
|
||||||
if wait_for_ctx:
|
if wait_for_ctx:
|
||||||
print('waiting for ctx outcome in parent..')
|
print('waiting for ctx outcome in parent..')
|
||||||
|
|
||||||
if debug_mode:
|
|
||||||
assert delay == 999
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with trio.fail_after(
|
with trio.fail_after(1 + delay):
|
||||||
1 + delay
|
|
||||||
):
|
|
||||||
await ctx.wait_for_result()
|
await ctx.wait_for_result()
|
||||||
except tractor.ContextCancelled as ctxc:
|
except tractor.ContextCancelled as ctxc:
|
||||||
assert ctxc.canceller == ctx.chan.uid
|
assert ctxc.canceller == ctx.chan.uid
|
||||||
raise
|
raise
|
||||||
|
|
||||||
except trio.TooSlowError:
|
|
||||||
if (
|
|
||||||
send_sigint_to == 'child'
|
|
||||||
and
|
|
||||||
start_method == 'subint_forkserver'
|
|
||||||
):
|
|
||||||
pytest.xfail(
|
|
||||||
reason=(
|
|
||||||
'SIGINT delivery to fork-child subactor is known '
|
|
||||||
'to NOT SUCCEED, precisely bc we have not wired up a'
|
|
||||||
'"trio SIGINT mode" in the child pre-fork.\n'
|
|
||||||
'Also see `test_orphaned_subactor_sigint_cleanup_DRAFT` for'
|
|
||||||
'a dedicated suite demonstrating this expected limitation as '
|
|
||||||
'well as the detailed doc:\n'
|
|
||||||
'`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.\n'
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# XXX CASE 2: this seems to be the source of the
|
# XXX CASE 2: this seems to be the source of the
|
||||||
# original issue which exhibited BEFORE we put
|
# original issue which exhibited BEFORE we put
|
||||||
# a `Actor.cancel_soon()` inside
|
# a `Actor.cancel_soon()` inside
|
||||||
|
|
@ -1152,21 +1090,24 @@ def test_sigint_closes_lifetime_stack(
|
||||||
|
|
||||||
|
|
||||||
# ?TODO asyncio.Task fn-deco?
|
# ?TODO asyncio.Task fn-deco?
|
||||||
|
# -[ ] do sig checkingat import time like @context?
|
||||||
|
# -[ ] maybe name it @aio_task ??
|
||||||
# -[ ] chan: to_asyncio.InterloopChannel ??
|
# -[ ] chan: to_asyncio.InterloopChannel ??
|
||||||
# -[ ] do fn-sig checking at import time like @context?
|
|
||||||
# |_[ ] maybe name it @a(sync)io_task ??
|
|
||||||
# @asyncio_task <- not bad ??
|
|
||||||
async def raise_before_started(
|
async def raise_before_started(
|
||||||
|
# from_trio: asyncio.Queue,
|
||||||
|
# to_trio: trio.abc.SendChannel,
|
||||||
chan: to_asyncio.LinkedTaskChannel,
|
chan: to_asyncio.LinkedTaskChannel,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
`asyncio.Task` entry point which RTEs before calling
|
`asyncio.Task` entry point which RTEs before calling
|
||||||
`chan.started_nowait()`.
|
`to_trio.send_nowait()`.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
await asyncio.sleep(0.2)
|
await asyncio.sleep(0.2)
|
||||||
raise RuntimeError('Some shite went wrong before `.send_nowait()`!!')
|
raise RuntimeError('Some shite went wrong before `.send_nowait()`!!')
|
||||||
|
|
||||||
|
# to_trio.send_nowait('Uhh we shouldve RTE-d ^^ ??')
|
||||||
chan.started_nowait('Uhh we shouldve RTE-d ^^ ??')
|
chan.started_nowait('Uhh we shouldve RTE-d ^^ ??')
|
||||||
await asyncio.sleep(float('inf'))
|
await asyncio.sleep(float('inf'))
|
||||||
|
|
||||||
|
|
@ -1226,7 +1167,6 @@ def test_aio_side_raises_before_started(
|
||||||
with trio.fail_after(3):
|
with trio.fail_after(3):
|
||||||
an: tractor.ActorNursery
|
an: tractor.ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
loglevel=loglevel,
|
loglevel=loglevel,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
|
||||||
|
|
@ -11,30 +11,18 @@ import trio
|
||||||
import tractor
|
import tractor
|
||||||
from tractor import ( # typing
|
from tractor import ( # typing
|
||||||
Actor,
|
Actor,
|
||||||
Context,
|
|
||||||
ContextCancelled,
|
|
||||||
MsgStream,
|
|
||||||
Portal,
|
|
||||||
RemoteActorError,
|
|
||||||
current_actor,
|
current_actor,
|
||||||
open_nursery,
|
open_nursery,
|
||||||
|
Portal,
|
||||||
|
Context,
|
||||||
|
ContextCancelled,
|
||||||
|
RemoteActorError,
|
||||||
)
|
)
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
# tractor_test,
|
# tractor_test,
|
||||||
expect_ctxc,
|
expect_ctxc,
|
||||||
)
|
)
|
||||||
|
|
||||||
from .conftest import cpu_scaling_factor
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.skipon_spawn_backend(
|
|
||||||
'subint',
|
|
||||||
reason=(
|
|
||||||
'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
|
|
||||||
'See oustanding issue(s)\n'
|
|
||||||
# TODO, put issue link!
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# XXX TODO cases:
|
# XXX TODO cases:
|
||||||
# - [x] WE cancelled the peer and thus should not see any raised
|
# - [x] WE cancelled the peer and thus should not see any raised
|
||||||
# `ContextCancelled` as it should be reaped silently?
|
# `ContextCancelled` as it should be reaped silently?
|
||||||
|
|
@ -212,7 +200,7 @@ async def stream_from_peer(
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# sanity
|
# sanity
|
||||||
assert tractor.debug_mode() == debug_mode
|
assert tractor._state.debug_mode() == debug_mode
|
||||||
|
|
||||||
peer: Portal
|
peer: Portal
|
||||||
try:
|
try:
|
||||||
|
|
@ -592,7 +580,7 @@ def test_peer_canceller(
|
||||||
assert (
|
assert (
|
||||||
re.canceller
|
re.canceller
|
||||||
==
|
==
|
||||||
root.aid.uid
|
root.uid
|
||||||
)
|
)
|
||||||
|
|
||||||
else: # the other 2 ctxs
|
else: # the other 2 ctxs
|
||||||
|
|
@ -601,7 +589,7 @@ def test_peer_canceller(
|
||||||
and (
|
and (
|
||||||
re.canceller
|
re.canceller
|
||||||
==
|
==
|
||||||
canceller.channel.aid.uid
|
canceller.channel.uid
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -756,7 +744,7 @@ def test_peer_canceller(
|
||||||
# -> each context should have received
|
# -> each context should have received
|
||||||
# a silently absorbed context cancellation
|
# a silently absorbed context cancellation
|
||||||
# in its remote nursery scope.
|
# in its remote nursery scope.
|
||||||
# assert ctx.chan.aid.uid == ctx.canceller
|
# assert ctx.chan.uid == ctx.canceller
|
||||||
|
|
||||||
# NOTE: when an inter-peer cancellation
|
# NOTE: when an inter-peer cancellation
|
||||||
# occurred, we DO NOT expect this
|
# occurred, we DO NOT expect this
|
||||||
|
|
@ -808,12 +796,12 @@ async def basic_echo_server(
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
Just the simplest `MsgStream` echo server which resays what you
|
Just the simplest `MsgStream` echo server which resays what
|
||||||
told it but with its uid in front ;)
|
you told it but with its uid in front ;)
|
||||||
|
|
||||||
'''
|
'''
|
||||||
actor: Actor = tractor.current_actor()
|
actor: Actor = tractor.current_actor()
|
||||||
uid: tuple = actor.aid.uid
|
uid: tuple = actor.uid
|
||||||
await ctx.started(uid)
|
await ctx.started(uid)
|
||||||
async with ctx.open_stream() as ipc:
|
async with ctx.open_stream() as ipc:
|
||||||
async for msg in ipc:
|
async for msg in ipc:
|
||||||
|
|
@ -852,7 +840,7 @@ async def serve_subactors(
|
||||||
async with open_nursery() as an:
|
async with open_nursery() as an:
|
||||||
|
|
||||||
# sanity
|
# sanity
|
||||||
assert tractor.debug_mode() == debug_mode
|
assert tractor._state.debug_mode() == debug_mode
|
||||||
|
|
||||||
await ctx.started(peer_name)
|
await ctx.started(peer_name)
|
||||||
async with ctx.open_stream() as ipc:
|
async with ctx.open_stream() as ipc:
|
||||||
|
|
@ -868,7 +856,7 @@ async def serve_subactors(
|
||||||
f'|_{peer}\n'
|
f'|_{peer}\n'
|
||||||
)
|
)
|
||||||
await ipc.send((
|
await ipc.send((
|
||||||
peer.chan.aid.uid,
|
peer.chan.uid,
|
||||||
peer.chan.raddr.unwrap(),
|
peer.chan.raddr.unwrap(),
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
@ -891,7 +879,7 @@ async def client_req_subactor(
|
||||||
) -> None:
|
) -> None:
|
||||||
# sanity
|
# sanity
|
||||||
if debug_mode:
|
if debug_mode:
|
||||||
assert tractor.debug_mode()
|
assert tractor._state.debug_mode()
|
||||||
|
|
||||||
# TODO: other cases to do with sub lifetimes:
|
# TODO: other cases to do with sub lifetimes:
|
||||||
# -[ ] test that we can have the server spawn a sub
|
# -[ ] test that we can have the server spawn a sub
|
||||||
|
|
@ -978,14 +966,9 @@ async def tell_little_bro(
|
||||||
|
|
||||||
caller: str = '',
|
caller: str = '',
|
||||||
err_after: float|None = None,
|
err_after: float|None = None,
|
||||||
rng_seed: int = 100,
|
rng_seed: int = 50,
|
||||||
# NOTE, ensure ^ is large enough (on fast hw anyway)
|
|
||||||
# to ensure the peer cancel req arrives before the
|
|
||||||
# echoing dialog does itself Bp
|
|
||||||
):
|
):
|
||||||
# contact target actor, do a stream dialog.
|
# contact target actor, do a stream dialog.
|
||||||
lb: Portal
|
|
||||||
echo_ipc: MsgStream
|
|
||||||
async with (
|
async with (
|
||||||
tractor.wait_for_actor(
|
tractor.wait_for_actor(
|
||||||
name=actor_name
|
name=actor_name
|
||||||
|
|
@ -1000,17 +983,17 @@ async def tell_little_bro(
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
) as (sub_ctx, first),
|
) as (sub_ctx, first),
|
||||||
|
|
||||||
sub_ctx.open_stream() as echo_ipc,
|
sub_ctx.open_stream() as echo_ipc,
|
||||||
):
|
):
|
||||||
actor: Actor = current_actor()
|
actor: Actor = current_actor()
|
||||||
uid: tuple = actor.aid.uid
|
uid: tuple = actor.uid
|
||||||
for i in range(rng_seed):
|
for i in range(rng_seed):
|
||||||
msg: tuple = (
|
msg: tuple = (
|
||||||
uid,
|
uid,
|
||||||
i,
|
i,
|
||||||
)
|
)
|
||||||
await echo_ipc.send(msg)
|
await echo_ipc.send(msg)
|
||||||
await trio.sleep(0.001)
|
|
||||||
resp = await echo_ipc.receive()
|
resp = await echo_ipc.receive()
|
||||||
print(
|
print(
|
||||||
f'{caller} => {actor_name}: {msg}\n'
|
f'{caller} => {actor_name}: {msg}\n'
|
||||||
|
|
@ -1023,9 +1006,6 @@ async def tell_little_bro(
|
||||||
assert sub_uid != uid
|
assert sub_uid != uid
|
||||||
assert _i == i
|
assert _i == i
|
||||||
|
|
||||||
# XXX, usually should never get here!
|
|
||||||
# await tractor.pause()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'raise_client_error',
|
'raise_client_error',
|
||||||
|
|
@ -1040,10 +1020,6 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
raise_client_error: str,
|
raise_client_error: str,
|
||||||
reg_addr: tuple[str, int],
|
reg_addr: tuple[str, int],
|
||||||
raise_sub_spawn_error_after: float|None,
|
raise_sub_spawn_error_after: float|None,
|
||||||
loglevel: str,
|
|
||||||
test_log: tractor.log.StackLevelAdapter,
|
|
||||||
# ^XXX, set to 'warning' to see masked-exc warnings
|
|
||||||
# that may transpire during actor-nursery teardown.
|
|
||||||
):
|
):
|
||||||
# NOTE: this tests for the modden `mod wks open piker` bug
|
# NOTE: this tests for the modden `mod wks open piker` bug
|
||||||
# discovered as part of implementing workspace ctx
|
# discovered as part of implementing workspace ctx
|
||||||
|
|
@ -1073,7 +1049,6 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
# NOTE: to halt the peer tasks on ctxc, uncomment this.
|
# NOTE: to halt the peer tasks on ctxc, uncomment this.
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
loglevel=loglevel,
|
|
||||||
) as an:
|
) as an:
|
||||||
server: Portal = await an.start_actor(
|
server: Portal = await an.start_actor(
|
||||||
(server_name := 'spawn_server'),
|
(server_name := 'spawn_server'),
|
||||||
|
|
@ -1109,7 +1084,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
) as (client_ctx, client_says),
|
) as (client_ctx, client_says),
|
||||||
):
|
):
|
||||||
root: Actor = current_actor()
|
root: Actor = current_actor()
|
||||||
spawner_uid: tuple = spawn_ctx.chan.aid.uid
|
spawner_uid: tuple = spawn_ctx.chan.uid
|
||||||
print(
|
print(
|
||||||
f'Server says: {first}\n'
|
f'Server says: {first}\n'
|
||||||
f'Client says: {client_says}\n'
|
f'Client says: {client_says}\n'
|
||||||
|
|
@ -1128,7 +1103,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
print(
|
print(
|
||||||
'Sub-spawn came online\n'
|
'Sub-spawn came online\n'
|
||||||
f'portal: {sub}\n'
|
f'portal: {sub}\n'
|
||||||
f'.uid: {sub.actor.aid.uid}\n'
|
f'.uid: {sub.actor.uid}\n'
|
||||||
f'chan.raddr: {sub.chan.raddr}\n'
|
f'chan.raddr: {sub.chan.raddr}\n'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -1162,7 +1137,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
|
|
||||||
assert isinstance(res, ContextCancelled)
|
assert isinstance(res, ContextCancelled)
|
||||||
assert client_ctx.cancel_acked
|
assert client_ctx.cancel_acked
|
||||||
assert res.canceller == root.aid.uid
|
assert res.canceller == root.uid
|
||||||
assert not raise_sub_spawn_error_after
|
assert not raise_sub_spawn_error_after
|
||||||
|
|
||||||
# cancelling the spawner sub should
|
# cancelling the spawner sub should
|
||||||
|
|
@ -1196,8 +1171,8 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
# little_bro: a `RuntimeError`.
|
# little_bro: a `RuntimeError`.
|
||||||
#
|
#
|
||||||
check_inner_rte(rae)
|
check_inner_rte(rae)
|
||||||
assert rae.relay_uid == client.chan.aid.uid
|
assert rae.relay_uid == client.chan.uid
|
||||||
assert rae.src_uid == sub.chan.aid.uid
|
assert rae.src_uid == sub.chan.uid
|
||||||
|
|
||||||
assert not client_ctx.cancel_acked
|
assert not client_ctx.cancel_acked
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -1226,12 +1201,12 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
except ContextCancelled as ctxc:
|
except ContextCancelled as ctxc:
|
||||||
_ctxc = ctxc
|
_ctxc = ctxc
|
||||||
print(
|
print(
|
||||||
f'{root.aid.uid} caught ctxc from ctx with {client_ctx.chan.aid.uid}\n'
|
f'{root.uid} caught ctxc from ctx with {client_ctx.chan.uid}\n'
|
||||||
f'{repr(ctxc)}\n'
|
f'{repr(ctxc)}\n'
|
||||||
)
|
)
|
||||||
|
|
||||||
if not raise_sub_spawn_error_after:
|
if not raise_sub_spawn_error_after:
|
||||||
assert ctxc.canceller == root.aid.uid
|
assert ctxc.canceller == root.uid
|
||||||
else:
|
else:
|
||||||
assert ctxc.canceller == spawner_uid
|
assert ctxc.canceller == spawner_uid
|
||||||
|
|
||||||
|
|
@ -1262,20 +1237,9 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
|
|
||||||
# assert spawn_ctx.cancelled_caught
|
# assert spawn_ctx.cancelled_caught
|
||||||
|
|
||||||
|
|
||||||
async def _main():
|
async def _main():
|
||||||
headroom: float = cpu_scaling_factor()
|
|
||||||
this_fast_on_linux: float = 3
|
|
||||||
this_fast = this_fast_on_linux * headroom
|
|
||||||
if headroom != 1.:
|
|
||||||
test_log.warning(
|
|
||||||
f'Adding latency headroom on linux bc CPU scaling,\n'
|
|
||||||
f'headroom: {headroom}\n'
|
|
||||||
f'this_fast_on_linux: {this_fast_on_linux} -> {this_fast}\n'
|
|
||||||
)
|
|
||||||
with trio.fail_after(
|
with trio.fail_after(
|
||||||
this_fast
|
3 if not debug_mode
|
||||||
if not debug_mode
|
|
||||||
else 999
|
else 999
|
||||||
):
|
):
|
||||||
await main()
|
await main()
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,9 @@
|
||||||
"""
|
"""
|
||||||
Streaming via the, now legacy, "async-gen API".
|
Streaming via async gen api
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import platform
|
import platform
|
||||||
from typing import Callable
|
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
import tractor
|
import tractor
|
||||||
|
|
@ -21,11 +19,7 @@ def test_must_define_ctx():
|
||||||
async def no_ctx():
|
async def no_ctx():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
assert (
|
assert "no_ctx must be `ctx: tractor.Context" in str(err.value)
|
||||||
"no_ctx must be `ctx: tractor.Context"
|
|
||||||
in
|
|
||||||
str(err.value)
|
|
||||||
)
|
|
||||||
|
|
||||||
@tractor.stream
|
@tractor.stream
|
||||||
async def has_ctx(ctx):
|
async def has_ctx(ctx):
|
||||||
|
|
@ -75,14 +69,14 @@ async def stream_from_single_subactor(
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
start_method=start_method,
|
start_method=start_method,
|
||||||
) as an:
|
) as nursery:
|
||||||
|
|
||||||
async with tractor.find_actor('streamerd') as portals:
|
async with tractor.find_actor('streamerd') as portals:
|
||||||
|
|
||||||
if not portals:
|
if not portals:
|
||||||
|
|
||||||
# no brokerd actor found
|
# no brokerd actor found
|
||||||
portal = await an.start_actor(
|
portal = await nursery.start_actor(
|
||||||
'streamerd',
|
'streamerd',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
@ -122,22 +116,11 @@ async def stream_from_single_subactor(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'stream_func',
|
'stream_func', [async_gen_stream, context_stream]
|
||||||
[
|
|
||||||
async_gen_stream,
|
|
||||||
context_stream,
|
|
||||||
],
|
|
||||||
ids='stream_func={}'.format
|
|
||||||
)
|
)
|
||||||
def test_stream_from_single_subactor(
|
def test_stream_from_single_subactor(reg_addr, start_method, stream_func):
|
||||||
reg_addr: tuple,
|
"""Verify streaming from a spawned async generator.
|
||||||
start_method: str,
|
"""
|
||||||
stream_func: Callable,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify streaming from a spawned async generator.
|
|
||||||
|
|
||||||
'''
|
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
stream_from_single_subactor,
|
stream_from_single_subactor,
|
||||||
|
|
@ -149,9 +132,10 @@ def test_stream_from_single_subactor(
|
||||||
|
|
||||||
|
|
||||||
# this is the first 2 actors, streamer_1 and streamer_2
|
# this is the first 2 actors, streamer_1 and streamer_2
|
||||||
async def stream_data(seed: int):
|
async def stream_data(seed):
|
||||||
|
|
||||||
for i in range(seed):
|
for i in range(seed):
|
||||||
|
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
# trigger scheduler to simulate practical usage
|
# trigger scheduler to simulate practical usage
|
||||||
|
|
@ -159,17 +143,15 @@ async def stream_data(seed: int):
|
||||||
|
|
||||||
|
|
||||||
# this is the third actor; the aggregator
|
# this is the third actor; the aggregator
|
||||||
async def aggregate(seed: int):
|
async def aggregate(seed):
|
||||||
'''
|
"""Ensure that the two streams we receive match but only stream
|
||||||
Ensure that the two streams we receive match but only stream
|
|
||||||
a single set of values to the parent.
|
a single set of values to the parent.
|
||||||
|
"""
|
||||||
'''
|
async with tractor.open_nursery() as nursery:
|
||||||
async with tractor.open_nursery() as an:
|
|
||||||
portals = []
|
portals = []
|
||||||
for i in range(1, 3):
|
for i in range(1, 3):
|
||||||
# fork point
|
# fork point
|
||||||
portal = await an.start_actor(
|
portal = await nursery.start_actor(
|
||||||
name=f'streamer_{i}',
|
name=f'streamer_{i}',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
@ -182,28 +164,20 @@ async def aggregate(seed: int):
|
||||||
async with send_chan:
|
async with send_chan:
|
||||||
|
|
||||||
async with portal.open_stream_from(
|
async with portal.open_stream_from(
|
||||||
stream_data,
|
stream_data, seed=seed,
|
||||||
seed=seed,
|
|
||||||
) as stream:
|
) as stream:
|
||||||
|
|
||||||
async for value in stream:
|
async for value in stream:
|
||||||
# leverage trio's built-in backpressure
|
# leverage trio's built-in backpressure
|
||||||
await send_chan.send(value)
|
await send_chan.send(value)
|
||||||
|
|
||||||
print(
|
print(f"FINISHED ITERATING {portal.channel.uid}")
|
||||||
f'FINISHED ITERATING!\n'
|
|
||||||
f'peer: {portal.channel.aid.uid}'
|
|
||||||
)
|
|
||||||
|
|
||||||
# spawn 2 trio tasks to collect streams and push to a local queue
|
# spawn 2 trio tasks to collect streams and push to a local queue
|
||||||
async with trio.open_nursery() as tn:
|
async with trio.open_nursery() as n:
|
||||||
|
|
||||||
for portal in portals:
|
for portal in portals:
|
||||||
tn.start_soon(
|
n.start_soon(push_to_chan, portal, send_chan.clone())
|
||||||
push_to_chan,
|
|
||||||
portal,
|
|
||||||
send_chan.clone(),
|
|
||||||
)
|
|
||||||
|
|
||||||
# close this local task's reference to send side
|
# close this local task's reference to send side
|
||||||
await send_chan.aclose()
|
await send_chan.aclose()
|
||||||
|
|
@ -220,21 +194,20 @@ async def aggregate(seed: int):
|
||||||
|
|
||||||
print("FINISHED ITERATING in aggregator")
|
print("FINISHED ITERATING in aggregator")
|
||||||
|
|
||||||
await an.cancel()
|
await nursery.cancel()
|
||||||
print("WAITING on `ActorNursery` to finish")
|
print("WAITING on `ActorNursery` to finish")
|
||||||
print("AGGREGATOR COMPLETE!")
|
print("AGGREGATOR COMPLETE!")
|
||||||
|
|
||||||
|
|
||||||
async def a_quadruple_example() -> list[int]:
|
# this is the main actor and *arbiter*
|
||||||
'''
|
async def a_quadruple_example():
|
||||||
Open the root-actor which is also a "registrar".
|
# a nursery which spawns "actors"
|
||||||
|
async with tractor.open_nursery() as nursery:
|
||||||
|
|
||||||
'''
|
|
||||||
async with tractor.open_nursery() as an:
|
|
||||||
seed = int(1e3)
|
seed = int(1e3)
|
||||||
pre_start = time.time()
|
pre_start = time.time()
|
||||||
|
|
||||||
portal = await an.start_actor(
|
portal = await nursery.start_actor(
|
||||||
name='aggregator',
|
name='aggregator',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
@ -255,14 +228,8 @@ async def a_quadruple_example() -> list[int]:
|
||||||
return result_stream
|
return result_stream
|
||||||
|
|
||||||
|
|
||||||
async def cancel_after(
|
async def cancel_after(wait, reg_addr):
|
||||||
wait: float,
|
async with tractor.open_root_actor(registry_addrs=[reg_addr]):
|
||||||
reg_addr: tuple,
|
|
||||||
) -> list[int]:
|
|
||||||
|
|
||||||
async with tractor.open_root_actor(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
):
|
|
||||||
with trio.move_on_after(wait):
|
with trio.move_on_after(wait):
|
||||||
return await a_quadruple_example()
|
return await a_quadruple_example()
|
||||||
|
|
||||||
|
|
@ -273,10 +240,6 @@ def time_quad_ex(
|
||||||
ci_env: bool,
|
ci_env: bool,
|
||||||
spawn_backend: str,
|
spawn_backend: str,
|
||||||
):
|
):
|
||||||
non_linux: bool = (_sys := platform.system()) != 'Linux'
|
|
||||||
if ci_env and non_linux:
|
|
||||||
pytest.skip(f'Test is too flaky on {_sys!r} in CI')
|
|
||||||
|
|
||||||
if spawn_backend == 'mp':
|
if spawn_backend == 'mp':
|
||||||
'''
|
'''
|
||||||
no idea but the mp *nix runs are flaking out here often...
|
no idea but the mp *nix runs are flaking out here often...
|
||||||
|
|
@ -284,59 +247,32 @@ def time_quad_ex(
|
||||||
'''
|
'''
|
||||||
pytest.skip("Test is too flaky on mp in CI")
|
pytest.skip("Test is too flaky on mp in CI")
|
||||||
|
|
||||||
timeout = 7 if non_linux else 4
|
timeout = 7 if platform.system() in ('Windows', 'Darwin') else 4
|
||||||
start = time.time()
|
start = time.time()
|
||||||
results: list[int] = trio.run(
|
results = trio.run(cancel_after, timeout, reg_addr)
|
||||||
cancel_after,
|
diff = time.time() - start
|
||||||
timeout,
|
|
||||||
reg_addr,
|
|
||||||
)
|
|
||||||
diff: float = time.time() - start
|
|
||||||
assert results
|
assert results
|
||||||
return results, diff
|
return results, diff
|
||||||
|
|
||||||
|
|
||||||
def test_a_quadruple_example(
|
def test_a_quadruple_example(
|
||||||
time_quad_ex: tuple[list[int], float],
|
time_quad_ex: tuple,
|
||||||
ci_env: bool,
|
ci_env: bool,
|
||||||
spawn_backend: str,
|
spawn_backend: str,
|
||||||
test_log: tractor.log.StackLevelAdapter,
|
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
This also serves as a "we'd like to be this fast" smoke test
|
This also serves as a kind of "we'd like to be this fast test".
|
||||||
given past empirical eval of this suite.
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
non_linux: bool = (_sys := platform.system()) != 'Linux'
|
|
||||||
|
|
||||||
this_fast_on_linux: float = 3
|
|
||||||
this_fast = (
|
|
||||||
6 if non_linux
|
|
||||||
else this_fast_on_linux
|
|
||||||
)
|
|
||||||
# ^ XXX NOTE,
|
|
||||||
# i've noticed that tweaking the CPU governor setting
|
|
||||||
# to not "always" enable "turbo" mode can result in latency
|
|
||||||
# which causes this limit to be too little. Not sure if it'd
|
|
||||||
# be worth it to adjust the linux value based on reading the
|
|
||||||
# CPU conf from the sys?
|
|
||||||
#
|
|
||||||
# For ex, see the `auto-cpufreq` docs on such settings,
|
|
||||||
# https://github.com/AdnanHodzic/auto-cpufreq?tab=readme-ov-file#example-config-file-contents
|
|
||||||
#
|
|
||||||
# HENCE this below latency-headroom compensation logic..
|
|
||||||
from .conftest import cpu_scaling_factor
|
|
||||||
headroom: float = cpu_scaling_factor()
|
|
||||||
if headroom != 1.:
|
|
||||||
this_fast = this_fast_on_linux * headroom
|
|
||||||
test_log.warning(
|
|
||||||
f'Adding latency headroom on linux bc CPU scaling,\n'
|
|
||||||
f'headroom: {headroom}\n'
|
|
||||||
f'this_fast_on_linux: {this_fast_on_linux} -> {this_fast}\n'
|
|
||||||
)
|
|
||||||
|
|
||||||
results, diff = time_quad_ex
|
results, diff = time_quad_ex
|
||||||
assert results
|
assert results
|
||||||
|
this_fast = (
|
||||||
|
6 if platform.system() in (
|
||||||
|
'Windows',
|
||||||
|
'Darwin',
|
||||||
|
)
|
||||||
|
else 3
|
||||||
|
)
|
||||||
assert diff < this_fast
|
assert diff < this_fast
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -345,58 +281,43 @@ def test_a_quadruple_example(
|
||||||
list(map(lambda i: i/10, range(3, 9)))
|
list(map(lambda i: i/10, range(3, 9)))
|
||||||
)
|
)
|
||||||
def test_not_fast_enough_quad(
|
def test_not_fast_enough_quad(
|
||||||
reg_addr: tuple,
|
reg_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend
|
||||||
time_quad_ex: tuple[list[int], float],
|
|
||||||
cancel_delay: float,
|
|
||||||
ci_env: bool,
|
|
||||||
spawn_backend: str,
|
|
||||||
):
|
):
|
||||||
'''
|
"""Verify we can cancel midway through the quad example and all actors
|
||||||
Verify we can cancel midway through the quad example and all
|
cancel gracefully.
|
||||||
actors cancel gracefully.
|
"""
|
||||||
|
|
||||||
'''
|
|
||||||
results, diff = time_quad_ex
|
results, diff = time_quad_ex
|
||||||
delay = max(diff - cancel_delay, 0)
|
delay = max(diff - cancel_delay, 0)
|
||||||
results = trio.run(
|
results = trio.run(cancel_after, delay, reg_addr)
|
||||||
cancel_after,
|
system = platform.system()
|
||||||
delay,
|
if system in ('Windows', 'Darwin') and results is not None:
|
||||||
reg_addr,
|
|
||||||
)
|
|
||||||
system: str = platform.system()
|
|
||||||
if (
|
|
||||||
system in ('Windows', 'Darwin')
|
|
||||||
and
|
|
||||||
results is not None
|
|
||||||
):
|
|
||||||
# In CI envoirments it seems later runs are quicker then the first
|
# In CI envoirments it seems later runs are quicker then the first
|
||||||
# so just ignore these
|
# so just ignore these
|
||||||
print(f'Woa there {system} caught your breath eh?')
|
print(f"Woa there {system} caught your breath eh?")
|
||||||
else:
|
else:
|
||||||
# should be cancelled mid-streaming
|
# should be cancelled mid-streaming
|
||||||
assert results is None
|
assert results is None
|
||||||
|
|
||||||
|
|
||||||
@tractor_test(timeout=20)
|
@tractor_test
|
||||||
async def test_respawn_consumer_task(
|
async def test_respawn_consumer_task(
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
spawn_backend: str,
|
spawn_backend,
|
||||||
loglevel: str,
|
loglevel,
|
||||||
):
|
):
|
||||||
'''
|
"""Verify that ``._portal.ReceiveStream.shield()``
|
||||||
Verify that ``._portal.ReceiveStream.shield()``
|
|
||||||
sucessfully protects the underlying IPC channel from being closed
|
sucessfully protects the underlying IPC channel from being closed
|
||||||
when cancelling and respawning a consumer task.
|
when cancelling and respawning a consumer task.
|
||||||
|
|
||||||
This also serves to verify that all values from the stream can be
|
This also serves to verify that all values from the stream can be
|
||||||
received despite the respawns.
|
received despite the respawns.
|
||||||
|
|
||||||
'''
|
"""
|
||||||
stream = None
|
stream = None
|
||||||
|
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery() as n:
|
||||||
|
|
||||||
portal = await an.start_actor(
|
portal = await n.start_actor(
|
||||||
name='streamer',
|
name='streamer',
|
||||||
enable_modules=[__name__]
|
enable_modules=[__name__]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
"""
|
"""
|
||||||
Registrar and "local" actor api
|
Arbiter and "local" actor api
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
@ -12,11 +12,11 @@ from tractor._testing import tractor_test
|
||||||
|
|
||||||
@pytest.mark.trio
|
@pytest.mark.trio
|
||||||
async def test_no_runtime():
|
async def test_no_runtime():
|
||||||
"""A registrar must be established before any nurseries
|
"""An arbitter must be established before any nurseries
|
||||||
can be created.
|
can be created.
|
||||||
|
|
||||||
(In other words ``tractor.open_root_actor()`` must be
|
(In other words ``tractor.open_root_actor()`` must be engaged at
|
||||||
engaged at some point?)
|
some point?)
|
||||||
"""
|
"""
|
||||||
with pytest.raises(RuntimeError) :
|
with pytest.raises(RuntimeError) :
|
||||||
async with tractor.find_actor('doggy'):
|
async with tractor.find_actor('doggy'):
|
||||||
|
|
@ -25,9 +25,9 @@ async def test_no_runtime():
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_self_is_registered(reg_addr):
|
async def test_self_is_registered(reg_addr):
|
||||||
"Verify waiting on the registrar to register itself using the standard api."
|
"Verify waiting on the arbiter to register itself using the standard api."
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_registrar
|
assert actor.is_arbiter
|
||||||
with trio.fail_after(0.2):
|
with trio.fail_after(0.2):
|
||||||
async with tractor.wait_for_actor('root') as portal:
|
async with tractor.wait_for_actor('root') as portal:
|
||||||
assert portal.channel.uid[0] == 'root'
|
assert portal.channel.uid[0] == 'root'
|
||||||
|
|
@ -35,11 +35,11 @@ async def test_self_is_registered(reg_addr):
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_self_is_registered_localportal(reg_addr):
|
async def test_self_is_registered_localportal(reg_addr):
|
||||||
"Verify waiting on the registrar to register itself using a local portal."
|
"Verify waiting on the arbiter to register itself using a local portal."
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_registrar
|
assert actor.is_arbiter
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
assert isinstance(portal, tractor.runtime._portal.LocalPortal)
|
assert isinstance(portal, tractor._portal.LocalPortal)
|
||||||
|
|
||||||
with trio.fail_after(0.2):
|
with trio.fail_after(0.2):
|
||||||
sockaddr = await portal.run_from_ns(
|
sockaddr = await portal.run_from_ns(
|
||||||
|
|
@ -57,8 +57,8 @@ def test_local_actor_async_func(reg_addr):
|
||||||
async with tractor.open_root_actor(
|
async with tractor.open_root_actor(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
):
|
):
|
||||||
# registrar is started in-proc if dne
|
# arbiter is started in-proc if dne
|
||||||
assert tractor.current_actor().is_registrar
|
assert tractor.current_actor().is_arbiter
|
||||||
|
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
nums.append(i)
|
nums.append(i)
|
||||||
|
|
|
||||||
|
|
@ -1,185 +0,0 @@
|
||||||
'''
|
|
||||||
`tractor.log`-wrapping unit tests.
|
|
||||||
|
|
||||||
'''
|
|
||||||
from pathlib import Path
|
|
||||||
import shutil
|
|
||||||
from types import ModuleType
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import tractor
|
|
||||||
from tractor import (
|
|
||||||
_code_load,
|
|
||||||
log,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_root_pkg_not_duplicated_in_logger_name():
|
|
||||||
'''
|
|
||||||
When both `pkg_name` and `name` are passed and they have
|
|
||||||
a common `<root_name>.< >` prefix, ensure that it is not
|
|
||||||
duplicated in the child's `StackLevelAdapter.name: str`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
project_name: str = 'pylib'
|
|
||||||
pkg_path: str = 'pylib.subpkg.mod'
|
|
||||||
|
|
||||||
assert not tractor.current_actor(
|
|
||||||
err_on_no_runtime=False,
|
|
||||||
)
|
|
||||||
proj_log = log.get_logger(
|
|
||||||
pkg_name=project_name,
|
|
||||||
mk_sublog=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
sublog = log.get_logger(
|
|
||||||
pkg_name=project_name,
|
|
||||||
name=pkg_path,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert proj_log is not sublog
|
|
||||||
assert sublog.name.count(proj_log.name) == 1
|
|
||||||
assert 'mod' not in sublog.name
|
|
||||||
|
|
||||||
|
|
||||||
def test_implicit_mod_name_applied_for_child(
|
|
||||||
testdir: pytest.Pytester,
|
|
||||||
loglevel: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify that when `.log.get_logger(pkg_name='pylib')` is called
|
|
||||||
from a given sub-mod from within the `pylib` pkg-path, we
|
|
||||||
implicitly set the equiv of `name=__name__` from the caller's
|
|
||||||
module.
|
|
||||||
|
|
||||||
'''
|
|
||||||
# tractor.log.get_console_log(level=loglevel)
|
|
||||||
proj_name: str = 'snakelib'
|
|
||||||
mod_code: str = (
|
|
||||||
f'import tractor\n'
|
|
||||||
f'\n'
|
|
||||||
# if you need to trace `testdir` stuff @ import-time..
|
|
||||||
# f'breakpoint()\n'
|
|
||||||
f'log = tractor.log.get_logger(pkg_name="{proj_name}")\n'
|
|
||||||
)
|
|
||||||
|
|
||||||
# create a sub-module for each pkg layer
|
|
||||||
_lib = testdir.mkpydir(proj_name)
|
|
||||||
pkg: Path = Path(_lib)
|
|
||||||
pkg_init_mod: Path = pkg / "__init__.py"
|
|
||||||
pkg_init_mod.write_text(mod_code)
|
|
||||||
|
|
||||||
subpkg: Path = pkg / 'subpkg'
|
|
||||||
subpkg.mkdir()
|
|
||||||
subpkgmod: Path = subpkg / "__init__.py"
|
|
||||||
subpkgmod.touch()
|
|
||||||
subpkgmod.write_text(mod_code)
|
|
||||||
|
|
||||||
_submod: Path = testdir.makepyfile(
|
|
||||||
_mod=mod_code,
|
|
||||||
)
|
|
||||||
|
|
||||||
pkg_submod = pkg / 'mod.py'
|
|
||||||
pkg_subpkg_submod = subpkg / 'submod.py'
|
|
||||||
shutil.copyfile(
|
|
||||||
_submod,
|
|
||||||
pkg_submod,
|
|
||||||
)
|
|
||||||
shutil.copyfile(
|
|
||||||
_submod,
|
|
||||||
pkg_subpkg_submod,
|
|
||||||
)
|
|
||||||
testdir.chdir()
|
|
||||||
# NOTE, to introspect the py-file-module-layout use (in .xsh
|
|
||||||
# syntax): `ranger @str(testdir)`
|
|
||||||
|
|
||||||
# XXX NOTE, once the "top level" pkg mod has been
|
|
||||||
# imported, we can then use `import` syntax to
|
|
||||||
# import it's sub-pkgs and modules.
|
|
||||||
subpkgmod: ModuleType = _code_load.load_module_from_path(
|
|
||||||
Path(pkg / '__init__.py'),
|
|
||||||
module_name=proj_name,
|
|
||||||
)
|
|
||||||
|
|
||||||
pkg_root_log = log.get_logger(
|
|
||||||
pkg_name=proj_name,
|
|
||||||
mk_sublog=False,
|
|
||||||
)
|
|
||||||
# the top level pkg-mod, created just now,
|
|
||||||
# by above API call.
|
|
||||||
assert pkg_root_log.name == proj_name
|
|
||||||
assert not pkg_root_log.logger.getChildren()
|
|
||||||
#
|
|
||||||
# ^TODO! test this same output but created via a `get_logger()`
|
|
||||||
# call in the `snakelib.__init__py`!!
|
|
||||||
|
|
||||||
# NOTE, the pkg-level "init mod" should of course
|
|
||||||
# have the same name as the package ns-path.
|
|
||||||
import snakelib as init_mod
|
|
||||||
assert init_mod.log.name == proj_name
|
|
||||||
|
|
||||||
# NOTE, a first-pkg-level sub-module should only
|
|
||||||
# use the package-name since the leaf-node-module
|
|
||||||
# will be included in log headers by default.
|
|
||||||
from snakelib import mod
|
|
||||||
assert mod.log.name == proj_name
|
|
||||||
|
|
||||||
from snakelib import subpkg
|
|
||||||
assert (
|
|
||||||
subpkg.log.name
|
|
||||||
==
|
|
||||||
subpkg.__package__
|
|
||||||
==
|
|
||||||
f'{proj_name}.subpkg'
|
|
||||||
)
|
|
||||||
|
|
||||||
from snakelib.subpkg import submod
|
|
||||||
assert (
|
|
||||||
submod.log.name
|
|
||||||
==
|
|
||||||
submod.__package__
|
|
||||||
==
|
|
||||||
f'{proj_name}.subpkg'
|
|
||||||
)
|
|
||||||
|
|
||||||
sub_logs = pkg_root_log.logger.getChildren()
|
|
||||||
assert len(sub_logs) == 1 # only one nested sub-pkg module
|
|
||||||
assert submod.log.logger in sub_logs
|
|
||||||
|
|
||||||
|
|
||||||
# TODO, moar tests against existing feats:
|
|
||||||
# ------ - ------
|
|
||||||
# - [ ] color settings?
|
|
||||||
# - [ ] header contents like,
|
|
||||||
# - actor + thread + task names from various conc-primitives,
|
|
||||||
# - [ ] `StackLevelAdapter` extensions,
|
|
||||||
# - our custom levels/methods: `transport|runtime|cance|pdb|devx`
|
|
||||||
# - [ ] custom-headers support?
|
|
||||||
#
|
|
||||||
|
|
||||||
# TODO, test driven dev of new-ideas/long-wanted feats,
|
|
||||||
# ------ - ------
|
|
||||||
# - [ ] https://github.com/goodboy/tractor/issues/244
|
|
||||||
# - [ ] @catern mentioned using a sync / deterministic sys
|
|
||||||
# and in particular `svlogd`?
|
|
||||||
# |_ https://smarden.org/runit/svlogd.8
|
|
||||||
|
|
||||||
# - [ ] using adapter vs. filters?
|
|
||||||
# - https://stackoverflow.com/questions/60691759/add-information-to-every-log-message-in-python-logging/61830838#61830838
|
|
||||||
|
|
||||||
# - [ ] `.at_least_level()` optimization which short circuits wtv
|
|
||||||
# `logging` is doing behind the scenes when the level filters
|
|
||||||
# the emission..?
|
|
||||||
|
|
||||||
# - [ ] use of `.log.get_console_log()` in subactors and the
|
|
||||||
# subtleties of ensuring it actually emits from a subproc.
|
|
||||||
|
|
||||||
# - [ ] this idea of activating per-subsys emissions with some
|
|
||||||
# kind of `.name` filter passed to the runtime or maybe configured
|
|
||||||
# via the root `StackLevelAdapter`?
|
|
||||||
|
|
||||||
# - [ ] use of `logging.dict.dictConfig()` to simplify the impl
|
|
||||||
# of any of ^^ ??
|
|
||||||
# - https://stackoverflow.com/questions/7507825/where-is-a-complete-example-of-logging-config-dictconfig
|
|
||||||
# - https://docs.python.org/3/library/logging.config.html#configuration-dictionary-schema
|
|
||||||
# - https://docs.python.org/3/library/logging.config.html#logging.config.dictConfig
|
|
||||||
|
|
@ -1,13 +1,8 @@
|
||||||
"""
|
"""
|
||||||
Multiple python programs invoking the runtime.
|
Multiple python programs invoking the runtime.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
|
||||||
import platform
|
import platform
|
||||||
import subprocess
|
|
||||||
import time
|
import time
|
||||||
from typing import (
|
|
||||||
TYPE_CHECKING,
|
|
||||||
)
|
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import trio
|
import trio
|
||||||
|
|
@ -15,32 +10,14 @@ import tractor
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
tractor_test,
|
tractor_test,
|
||||||
)
|
)
|
||||||
from tractor import (
|
|
||||||
current_actor,
|
|
||||||
Actor,
|
|
||||||
Context,
|
|
||||||
Portal,
|
|
||||||
)
|
|
||||||
from tractor.runtime import _state
|
|
||||||
from .conftest import (
|
from .conftest import (
|
||||||
sig_prog,
|
sig_prog,
|
||||||
_INT_SIGNAL,
|
_INT_SIGNAL,
|
||||||
_INT_RETURN_CODE,
|
_INT_RETURN_CODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from tractor.msg import Aid
|
|
||||||
from tractor.discovery._addr import (
|
|
||||||
UnwrappedAddress,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
def test_abort_on_sigint(daemon):
|
||||||
_non_linux: bool = platform.system() != 'Linux'
|
|
||||||
|
|
||||||
|
|
||||||
def test_abort_on_sigint(
|
|
||||||
daemon: subprocess.Popen,
|
|
||||||
):
|
|
||||||
assert daemon.returncode is None
|
assert daemon.returncode is None
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
sig_prog(daemon, _INT_SIGNAL)
|
sig_prog(daemon, _INT_SIGNAL)
|
||||||
|
|
@ -53,131 +30,39 @@ def test_abort_on_sigint(
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_cancel_remote_registrar(
|
async def test_cancel_remote_arbiter(daemon, reg_addr):
|
||||||
daemon: subprocess.Popen,
|
assert not tractor.current_actor().is_arbiter
|
||||||
reg_addr: UnwrappedAddress,
|
|
||||||
):
|
|
||||||
assert not current_actor().is_registrar
|
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
await portal.cancel_actor()
|
await portal.cancel_actor()
|
||||||
|
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
# the registrar channel server is cancelled but not its main task
|
# the arbiter channel server is cancelled but not its main task
|
||||||
assert daemon.returncode is None
|
assert daemon.returncode is None
|
||||||
|
|
||||||
# no registrar socket should exist
|
# no arbiter socket should exist
|
||||||
with pytest.raises(OSError):
|
with pytest.raises(OSError):
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def test_register_duplicate_name(
|
def test_register_duplicate_name(daemon, reg_addr):
|
||||||
daemon: subprocess.Popen,
|
|
||||||
reg_addr: UnwrappedAddress,
|
|
||||||
):
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
) as an:
|
) as n:
|
||||||
|
|
||||||
assert not current_actor().is_registrar
|
assert not tractor.current_actor().is_arbiter
|
||||||
|
|
||||||
p1 = await an.start_actor('doggy')
|
p1 = await n.start_actor('doggy')
|
||||||
p2 = await an.start_actor('doggy')
|
p2 = await n.start_actor('doggy')
|
||||||
|
|
||||||
async with tractor.wait_for_actor('doggy') as portal:
|
async with tractor.wait_for_actor('doggy') as portal:
|
||||||
assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)
|
assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)
|
||||||
|
|
||||||
await an.cancel()
|
await n.cancel()
|
||||||
|
|
||||||
# XXX, run manually since we want to start this root **after**
|
# run it manually since we want to start **after**
|
||||||
# the other "daemon" program with it's own root.
|
# the other "daemon" program
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
|
|
||||||
@tractor.context
|
|
||||||
async def get_root_portal(
|
|
||||||
ctx: Context,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Connect back to the root actor manually (using `._discovery` API)
|
|
||||||
and ensure it's contact info is the same as our immediate parent.
|
|
||||||
|
|
||||||
'''
|
|
||||||
sub: Actor = current_actor()
|
|
||||||
rtvs: dict = _state._runtime_vars
|
|
||||||
raddrs: list[UnwrappedAddress] = rtvs['_root_addrs']
|
|
||||||
|
|
||||||
# await tractor.pause()
|
|
||||||
# XXX, in case the sub->root discovery breaks you might need
|
|
||||||
# this (i know i did Xp)!!
|
|
||||||
# from tractor.devx import mk_pdb
|
|
||||||
# mk_pdb().set_trace()
|
|
||||||
|
|
||||||
assert (
|
|
||||||
len(raddrs) == 1
|
|
||||||
and
|
|
||||||
list(sub._parent_chan.raddr.unwrap()) in raddrs
|
|
||||||
)
|
|
||||||
|
|
||||||
# connect back to our immediate parent which should also
|
|
||||||
# be the actor-tree's root.
|
|
||||||
from tractor.discovery._api import get_root
|
|
||||||
ptl: Portal
|
|
||||||
async with get_root() as ptl:
|
|
||||||
root_aid: Aid = ptl.chan.aid
|
|
||||||
parent_ptl: Portal = current_actor().get_parent()
|
|
||||||
assert (
|
|
||||||
root_aid.name == 'root'
|
|
||||||
and
|
|
||||||
parent_ptl.chan.aid == root_aid
|
|
||||||
)
|
|
||||||
await ctx.started()
|
|
||||||
|
|
||||||
|
|
||||||
def test_non_registrar_spawns_child(
|
|
||||||
daemon: subprocess.Popen,
|
|
||||||
reg_addr: UnwrappedAddress,
|
|
||||||
loglevel: str,
|
|
||||||
debug_mode: bool,
|
|
||||||
ci_env: bool,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Ensure a non-regristar (serving) root actor can spawn a sub and
|
|
||||||
that sub can connect back (manually) to it's rent that is the
|
|
||||||
root without issue.
|
|
||||||
|
|
||||||
More or less this audits the global contact info in
|
|
||||||
`._state._runtime_vars`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
async def main():
|
|
||||||
|
|
||||||
# XXX, since apparently on macos in GH's CI it can be a race
|
|
||||||
# with the `daemon` registrar on grabbing the socket-addr..
|
|
||||||
if ci_env and _non_linux:
|
|
||||||
await trio.sleep(.5)
|
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
|
||||||
registry_addrs=[reg_addr],
|
|
||||||
loglevel=loglevel,
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
) as an:
|
|
||||||
|
|
||||||
actor: Actor = tractor.current_actor()
|
|
||||||
assert not actor.is_registrar
|
|
||||||
sub_ptl: Portal = await an.start_actor(
|
|
||||||
name='sub',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
|
|
||||||
async with sub_ptl.open_context(
|
|
||||||
get_root_portal,
|
|
||||||
) as (ctx, _):
|
|
||||||
print('Waiting for `sub` to connect back to us..')
|
|
||||||
|
|
||||||
await an.cancel()
|
|
||||||
|
|
||||||
# XXX, run manually since we want to start this root **after**
|
|
||||||
# the other "daemon" program with it's own root.
|
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,7 @@ async def maybe_expect_raises(
|
||||||
Async wrapper for ensuring errors propagate from the inner scope.
|
Async wrapper for ensuring errors propagate from the inner scope.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
if tractor.debug_mode():
|
if tractor._state.debug_mode():
|
||||||
timeout += 999
|
timeout += 999
|
||||||
|
|
||||||
with trio.fail_after(timeout):
|
with trio.fail_after(timeout):
|
||||||
|
|
@ -7,14 +7,6 @@ import tractor
|
||||||
from tractor.experimental import msgpub
|
from tractor.experimental import msgpub
|
||||||
from tractor._testing import tractor_test
|
from tractor._testing import tractor_test
|
||||||
|
|
||||||
pytestmark = pytest.mark.skipon_spawn_backend(
|
|
||||||
'subint',
|
|
||||||
reason=(
|
|
||||||
'XXX SUBINT HANGING TEST XXX\n'
|
|
||||||
'See oustanding issue(s)\n'
|
|
||||||
# TODO, put issue link!
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_type_checks():
|
def test_type_checks():
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,333 +0,0 @@
|
||||||
'''
|
|
||||||
Verify that externally registered remote actor error
|
|
||||||
types are correctly relayed, boxed, and re-raised across
|
|
||||||
IPC actor hops via `reg_err_types()`.
|
|
||||||
|
|
||||||
Also ensure that when custom error types are NOT registered
|
|
||||||
the framework indicates the lookup failure to the user.
|
|
||||||
|
|
||||||
'''
|
|
||||||
import pytest
|
|
||||||
import trio
|
|
||||||
import tractor
|
|
||||||
from tractor import (
|
|
||||||
Context,
|
|
||||||
Portal,
|
|
||||||
RemoteActorError,
|
|
||||||
)
|
|
||||||
from tractor._exceptions import (
|
|
||||||
get_err_type,
|
|
||||||
reg_err_types,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# -- custom app-level errors for testing --
|
|
||||||
class CustomAppError(Exception):
|
|
||||||
'''
|
|
||||||
A hypothetical user-app error that should be
|
|
||||||
boxed+relayed by `tractor` IPC when registered.
|
|
||||||
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
class AnotherAppError(Exception):
|
|
||||||
'''
|
|
||||||
A second custom error for multi-type registration.
|
|
||||||
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
class UnregisteredAppError(Exception):
|
|
||||||
'''
|
|
||||||
A custom error that is intentionally NEVER
|
|
||||||
registered via `reg_err_types()` so we can
|
|
||||||
verify the framework's failure indication.
|
|
||||||
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
# -- remote-task endpoints --
|
|
||||||
@tractor.context
|
|
||||||
async def raise_custom_err(
|
|
||||||
ctx: Context,
|
|
||||||
) -> None:
|
|
||||||
'''
|
|
||||||
Remote ep that raises a `CustomAppError`
|
|
||||||
after sync-ing with the caller.
|
|
||||||
|
|
||||||
'''
|
|
||||||
await ctx.started()
|
|
||||||
raise CustomAppError(
|
|
||||||
'the app exploded remotely'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@tractor.context
|
|
||||||
async def raise_another_err(
|
|
||||||
ctx: Context,
|
|
||||||
) -> None:
|
|
||||||
'''
|
|
||||||
Remote ep that raises `AnotherAppError`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
await ctx.started()
|
|
||||||
raise AnotherAppError(
|
|
||||||
'another app-level kaboom'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@tractor.context
|
|
||||||
async def raise_unreg_err(
|
|
||||||
ctx: Context,
|
|
||||||
) -> None:
|
|
||||||
'''
|
|
||||||
Remote ep that raises an `UnregisteredAppError`
|
|
||||||
which has NOT been `reg_err_types()`-registered.
|
|
||||||
|
|
||||||
'''
|
|
||||||
await ctx.started()
|
|
||||||
raise UnregisteredAppError(
|
|
||||||
'this error type is unknown to tractor'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# -- unit tests for the type-registry plumbing --
|
|
||||||
|
|
||||||
class TestRegErrTypesPlumbing:
|
|
||||||
'''
|
|
||||||
Low-level checks on `reg_err_types()` and
|
|
||||||
`get_err_type()` without requiring IPC.
|
|
||||||
|
|
||||||
'''
|
|
||||||
|
|
||||||
def test_unregistered_type_returns_none(self):
|
|
||||||
'''
|
|
||||||
An unregistered custom error name should yield
|
|
||||||
`None` from `get_err_type()`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
result = get_err_type('CustomAppError')
|
|
||||||
assert result is None
|
|
||||||
|
|
||||||
def test_register_and_lookup(self):
|
|
||||||
'''
|
|
||||||
After `reg_err_types()`, the custom type should
|
|
||||||
be discoverable via `get_err_type()`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_err_types([CustomAppError])
|
|
||||||
result = get_err_type('CustomAppError')
|
|
||||||
assert result is CustomAppError
|
|
||||||
|
|
||||||
def test_register_multiple_types(self):
|
|
||||||
'''
|
|
||||||
Registering a list of types should make each
|
|
||||||
one individually resolvable.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_err_types([
|
|
||||||
CustomAppError,
|
|
||||||
AnotherAppError,
|
|
||||||
])
|
|
||||||
assert (
|
|
||||||
get_err_type('CustomAppError')
|
|
||||||
is CustomAppError
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
get_err_type('AnotherAppError')
|
|
||||||
is AnotherAppError
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_builtin_types_always_resolve(self):
|
|
||||||
'''
|
|
||||||
Builtin error types like `RuntimeError` and
|
|
||||||
`ValueError` should always be found without
|
|
||||||
any prior registration.
|
|
||||||
|
|
||||||
'''
|
|
||||||
assert (
|
|
||||||
get_err_type('RuntimeError')
|
|
||||||
is RuntimeError
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
get_err_type('ValueError')
|
|
||||||
is ValueError
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_tractor_native_types_resolve(self):
|
|
||||||
'''
|
|
||||||
`tractor`-internal exc types (e.g.
|
|
||||||
`ContextCancelled`) should always resolve.
|
|
||||||
|
|
||||||
'''
|
|
||||||
assert (
|
|
||||||
get_err_type('ContextCancelled')
|
|
||||||
is tractor.ContextCancelled
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_boxed_type_str_without_ipc_msg(self):
|
|
||||||
'''
|
|
||||||
When a `RemoteActorError` is constructed
|
|
||||||
without an IPC msg (and no resolvable type),
|
|
||||||
`.boxed_type_str` should return `'<unknown>'`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
rae = RemoteActorError('test')
|
|
||||||
assert rae.boxed_type_str == '<unknown>'
|
|
||||||
|
|
||||||
|
|
||||||
# -- IPC-level integration tests --
|
|
||||||
|
|
||||||
def test_registered_custom_err_relayed(
|
|
||||||
debug_mode: bool,
|
|
||||||
tpt_proto: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
When a custom error type is registered via
|
|
||||||
`reg_err_types()` on BOTH sides of an IPC dialog,
|
|
||||||
the parent should receive a `RemoteActorError`
|
|
||||||
whose `.boxed_type` matches the original custom
|
|
||||||
error type.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_err_types([CustomAppError])
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
async with tractor.open_nursery(
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
enable_transports=[tpt_proto],
|
|
||||||
) as an:
|
|
||||||
ptl: Portal = await an.start_actor(
|
|
||||||
'custom-err-raiser',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
async with ptl.open_context(
|
|
||||||
raise_custom_err,
|
|
||||||
) as (ctx, sent):
|
|
||||||
assert not sent
|
|
||||||
try:
|
|
||||||
await ctx.wait_for_result()
|
|
||||||
except RemoteActorError as rae:
|
|
||||||
assert rae.boxed_type is CustomAppError
|
|
||||||
assert rae.src_type is CustomAppError
|
|
||||||
assert 'the app exploded remotely' in str(
|
|
||||||
rae.tb_str
|
|
||||||
)
|
|
||||||
raise
|
|
||||||
|
|
||||||
with pytest.raises(RemoteActorError) as excinfo:
|
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
rae = excinfo.value
|
|
||||||
assert rae.boxed_type is CustomAppError
|
|
||||||
|
|
||||||
|
|
||||||
def test_registered_another_err_relayed(
|
|
||||||
debug_mode: bool,
|
|
||||||
tpt_proto: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Same as above but for a different custom error
|
|
||||||
type to verify multi-type registration works
|
|
||||||
end-to-end over IPC.
|
|
||||||
|
|
||||||
'''
|
|
||||||
reg_err_types([AnotherAppError])
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
async with tractor.open_nursery(
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
enable_transports=[tpt_proto],
|
|
||||||
) as an:
|
|
||||||
ptl: Portal = await an.start_actor(
|
|
||||||
'another-err-raiser',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
async with ptl.open_context(
|
|
||||||
raise_another_err,
|
|
||||||
) as (ctx, sent):
|
|
||||||
assert not sent
|
|
||||||
try:
|
|
||||||
await ctx.wait_for_result()
|
|
||||||
except RemoteActorError as rae:
|
|
||||||
assert (
|
|
||||||
rae.boxed_type
|
|
||||||
is AnotherAppError
|
|
||||||
)
|
|
||||||
raise
|
|
||||||
|
|
||||||
await an.cancel()
|
|
||||||
|
|
||||||
with pytest.raises(RemoteActorError) as excinfo:
|
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
rae = excinfo.value
|
|
||||||
assert rae.boxed_type is AnotherAppError
|
|
||||||
|
|
||||||
|
|
||||||
def test_unregistered_err_still_relayed(
|
|
||||||
debug_mode: bool,
|
|
||||||
tpt_proto: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify that even when a custom error type is NOT registered via
|
|
||||||
`reg_err_types()`, the remote error is still relayed as
|
|
||||||
a `RemoteActorError` with all string-level info preserved
|
|
||||||
(traceback, type name, source actor uid).
|
|
||||||
|
|
||||||
The `.boxed_type` will be `None` (type obj can't be resolved) but
|
|
||||||
`.boxed_type_str` and `.src_type_str` still report the original
|
|
||||||
type name from the IPC msg.
|
|
||||||
|
|
||||||
This documents the expected limitation: without `reg_err_types()`
|
|
||||||
the `.boxed_type` property can NOT resolve to the original Python
|
|
||||||
type.
|
|
||||||
|
|
||||||
'''
|
|
||||||
# NOTE: intentionally do NOT call
|
|
||||||
# `reg_err_types([UnregisteredAppError])`
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
async with tractor.open_nursery(
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
enable_transports=[tpt_proto],
|
|
||||||
) as an:
|
|
||||||
ptl: Portal = await an.start_actor(
|
|
||||||
'unreg-err-raiser',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
async with ptl.open_context(
|
|
||||||
raise_unreg_err,
|
|
||||||
) as (ctx, sent):
|
|
||||||
assert not sent
|
|
||||||
await ctx.wait_for_result()
|
|
||||||
|
|
||||||
await an.cancel()
|
|
||||||
|
|
||||||
with pytest.raises(RemoteActorError) as excinfo:
|
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
rae = excinfo.value
|
|
||||||
|
|
||||||
# the error IS relayed even without
|
|
||||||
# registration; type obj is unresolvable but
|
|
||||||
# all string-level info is preserved.
|
|
||||||
assert rae.boxed_type is None # NOT `UnregisteredAppError`
|
|
||||||
assert rae.src_type is None
|
|
||||||
|
|
||||||
# string names survive the IPC round-trip
|
|
||||||
# via the `Error` msg fields.
|
|
||||||
assert (
|
|
||||||
rae.src_type_str
|
|
||||||
==
|
|
||||||
'UnregisteredAppError'
|
|
||||||
)
|
|
||||||
assert (
|
|
||||||
rae.boxed_type_str
|
|
||||||
==
|
|
||||||
'UnregisteredAppError'
|
|
||||||
)
|
|
||||||
|
|
||||||
# original traceback content is preserved
|
|
||||||
assert 'this error type is unknown' in rae.tb_str
|
|
||||||
assert 'UnregisteredAppError' in rae.tb_str
|
|
||||||
|
|
@ -12,14 +12,14 @@ import trio
|
||||||
import tractor
|
import tractor
|
||||||
from tractor.trionics import (
|
from tractor.trionics import (
|
||||||
maybe_open_context,
|
maybe_open_context,
|
||||||
collapse_eg,
|
|
||||||
)
|
)
|
||||||
from tractor.log import (
|
from tractor.log import (
|
||||||
get_console_log,
|
get_console_log,
|
||||||
get_logger,
|
get_logger,
|
||||||
)
|
)
|
||||||
|
log = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
log = get_logger()
|
|
||||||
|
|
||||||
_resource: int = 0
|
_resource: int = 0
|
||||||
|
|
||||||
|
|
@ -213,12 +213,9 @@ def test_open_local_sub_to_stream(
|
||||||
N local tasks using `trionics.maybe_open_context()`.
|
N local tasks using `trionics.maybe_open_context()`.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
from .conftest import cpu_scaling_factor
|
timeout: float = 3.6
|
||||||
timeout: float = (
|
if platform.system() == "Windows":
|
||||||
4
|
timeout: float = 10
|
||||||
if not platform.system() == "Windows"
|
|
||||||
else 10
|
|
||||||
) * cpu_scaling_factor()
|
|
||||||
|
|
||||||
if debug_mode:
|
if debug_mode:
|
||||||
timeout = 999
|
timeout = 999
|
||||||
|
|
@ -322,7 +319,7 @@ def test_open_local_sub_to_stream(
|
||||||
|
|
||||||
|
|
||||||
@acm
|
@acm
|
||||||
async def maybe_cancel_outer_cs(
|
async def cancel_outer_cs(
|
||||||
cs: trio.CancelScope|None = None,
|
cs: trio.CancelScope|None = None,
|
||||||
delay: float = 0,
|
delay: float = 0,
|
||||||
):
|
):
|
||||||
|
|
@ -336,31 +333,12 @@ async def maybe_cancel_outer_cs(
|
||||||
if cs:
|
if cs:
|
||||||
log.info('task calling cs.cancel()')
|
log.info('task calling cs.cancel()')
|
||||||
cs.cancel()
|
cs.cancel()
|
||||||
|
trio.lowlevel.checkpoint()
|
||||||
yield
|
yield
|
||||||
|
await trio.sleep_forever()
|
||||||
if cs:
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
# XXX, if not cancelled we'll leak this inf-blocking
|
|
||||||
# subtask to the actor's service tn..
|
|
||||||
else:
|
|
||||||
await trio.lowlevel.checkpoint()
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'delay',
|
|
||||||
[0.05, 0.5, 1],
|
|
||||||
ids="pre_sleep_delay={}".format,
|
|
||||||
)
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'cancel_by_cs',
|
|
||||||
[True, False],
|
|
||||||
ids="cancel_by_cs={}".format,
|
|
||||||
)
|
|
||||||
def test_lock_not_corrupted_on_fast_cancel(
|
def test_lock_not_corrupted_on_fast_cancel(
|
||||||
delay: float,
|
|
||||||
cancel_by_cs: bool,
|
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
):
|
):
|
||||||
|
|
@ -377,14 +355,17 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
due to it having erronously exited without calling
|
due to it having erronously exited without calling
|
||||||
`lock.release()`.
|
`lock.release()`.
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
delay: float = 1.
|
||||||
|
|
||||||
async def use_moc(
|
async def use_moc(
|
||||||
|
cs: trio.CancelScope|None,
|
||||||
delay: float,
|
delay: float,
|
||||||
cs: trio.CancelScope|None = None,
|
|
||||||
):
|
):
|
||||||
log.info('task entering moc')
|
log.info('task entering moc')
|
||||||
async with maybe_open_context(
|
async with maybe_open_context(
|
||||||
maybe_cancel_outer_cs,
|
cancel_outer_cs,
|
||||||
kwargs={
|
kwargs={
|
||||||
'cs': cs,
|
'cs': cs,
|
||||||
'delay': delay,
|
'delay': delay,
|
||||||
|
|
@ -395,13 +376,7 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
else:
|
else:
|
||||||
log.info('1st task entered')
|
log.info('1st task entered')
|
||||||
|
|
||||||
if cs:
|
await trio.sleep_forever()
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
else:
|
|
||||||
await trio.sleep(delay)
|
|
||||||
|
|
||||||
# ^END, exit shared ctx.
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(delay + 2):
|
with trio.fail_after(delay + 2):
|
||||||
|
|
@ -410,7 +385,6 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
loglevel=loglevel,
|
loglevel=loglevel,
|
||||||
),
|
),
|
||||||
# ?TODO, pass this as the parent tn?
|
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
):
|
):
|
||||||
get_console_log('info')
|
get_console_log('info')
|
||||||
|
|
@ -418,206 +392,15 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
cs = tn.cancel_scope
|
cs = tn.cancel_scope
|
||||||
tn.start_soon(
|
tn.start_soon(
|
||||||
use_moc,
|
use_moc,
|
||||||
|
cs,
|
||||||
delay,
|
delay,
|
||||||
cs if cancel_by_cs else None,
|
|
||||||
name='child',
|
name='child',
|
||||||
)
|
)
|
||||||
with trio.CancelScope() as rent_cs:
|
with trio.CancelScope() as rent_cs:
|
||||||
await use_moc(
|
await use_moc(
|
||||||
|
cs=rent_cs,
|
||||||
delay=delay,
|
delay=delay,
|
||||||
cs=rent_cs if cancel_by_cs else None,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
|
|
||||||
@acm
|
|
||||||
async def acm_with_resource(resource_id: str):
|
|
||||||
'''
|
|
||||||
Yield `resource_id` as the cached value.
|
|
||||||
|
|
||||||
Used to verify per-`ctx_key` isolation when the same
|
|
||||||
`acm_func` is called with different kwargs.
|
|
||||||
|
|
||||||
'''
|
|
||||||
yield resource_id
|
|
||||||
|
|
||||||
|
|
||||||
def test_per_ctx_key_resource_lifecycle(
|
|
||||||
debug_mode: bool,
|
|
||||||
loglevel: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify that `maybe_open_context()` correctly isolates resource
|
|
||||||
lifecycle **per `ctx_key`** when the same `acm_func` is called
|
|
||||||
with different kwargs.
|
|
||||||
|
|
||||||
Previously `_Cache.users` was a single global `int` and
|
|
||||||
`_Cache.locks` was keyed on `fid` (function ID), so calling
|
|
||||||
the same `acm_func` with different kwargs (producing different
|
|
||||||
`ctx_key`s) meant:
|
|
||||||
|
|
||||||
- teardown for one key was skipped bc the *other* key's users
|
|
||||||
kept the global count > 0,
|
|
||||||
- and re-entry could hit the old
|
|
||||||
`assert not resources.get(ctx_key)` crash during the
|
|
||||||
teardown window.
|
|
||||||
|
|
||||||
This was the root cause of a long-standing bug in piker's
|
|
||||||
`brokerd.kraken` backend.
|
|
||||||
|
|
||||||
'''
|
|
||||||
timeout: float = 6
|
|
||||||
if debug_mode:
|
|
||||||
timeout = 999
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
a_ready = trio.Event()
|
|
||||||
a_exit = trio.Event()
|
|
||||||
|
|
||||||
async def hold_resource_a():
|
|
||||||
'''
|
|
||||||
Open resource 'a' and keep it alive until signalled.
|
|
||||||
|
|
||||||
'''
|
|
||||||
async with maybe_open_context(
|
|
||||||
acm_with_resource,
|
|
||||||
kwargs={'resource_id': 'a'},
|
|
||||||
) as (cache_hit, value):
|
|
||||||
assert not cache_hit
|
|
||||||
assert value == 'a'
|
|
||||||
log.info("resource 'a' entered (holding)")
|
|
||||||
a_ready.set()
|
|
||||||
await a_exit.wait()
|
|
||||||
log.info("resource 'a' exiting")
|
|
||||||
|
|
||||||
with trio.fail_after(timeout):
|
|
||||||
async with (
|
|
||||||
tractor.open_root_actor(
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
loglevel=loglevel,
|
|
||||||
),
|
|
||||||
trio.open_nursery() as tn,
|
|
||||||
):
|
|
||||||
# Phase 1: bg task holds resource 'a' open.
|
|
||||||
tn.start_soon(hold_resource_a)
|
|
||||||
await a_ready.wait()
|
|
||||||
|
|
||||||
# Phase 2: open resource 'b' (different kwargs,
|
|
||||||
# same acm_func) then exit it while 'a' is still
|
|
||||||
# alive.
|
|
||||||
async with maybe_open_context(
|
|
||||||
acm_with_resource,
|
|
||||||
kwargs={'resource_id': 'b'},
|
|
||||||
) as (cache_hit, value):
|
|
||||||
assert not cache_hit
|
|
||||||
assert value == 'b'
|
|
||||||
log.info("resource 'b' entered")
|
|
||||||
|
|
||||||
log.info("resource 'b' exited, waiting for teardown")
|
|
||||||
await trio.lowlevel.checkpoint()
|
|
||||||
|
|
||||||
# Phase 3: re-open 'b'; must be a fresh cache MISS
|
|
||||||
# proving 'b' was torn down independently of 'a'.
|
|
||||||
#
|
|
||||||
# With the old global `_Cache.users` counter this
|
|
||||||
# would be a stale cache HIT (leaked resource) or
|
|
||||||
# trigger `assert not resources.get(ctx_key)`.
|
|
||||||
async with maybe_open_context(
|
|
||||||
acm_with_resource,
|
|
||||||
kwargs={'resource_id': 'b'},
|
|
||||||
) as (cache_hit, value):
|
|
||||||
assert not cache_hit, (
|
|
||||||
"resource 'b' was NOT torn down despite "
|
|
||||||
"having zero users! (global user count bug)"
|
|
||||||
)
|
|
||||||
assert value == 'b'
|
|
||||||
log.info(
|
|
||||||
"resource 'b' re-entered "
|
|
||||||
"(cache miss, correct)"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Phase 4: let 'a' exit, clean shutdown.
|
|
||||||
a_exit.set()
|
|
||||||
|
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
|
|
||||||
def test_moc_reentry_during_teardown(
|
|
||||||
debug_mode: bool,
|
|
||||||
loglevel: str,
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Reproduce the piker `open_cached_client('kraken')` race:
|
|
||||||
|
|
||||||
- same `acm_func`, NO kwargs (identical `ctx_key`)
|
|
||||||
- multiple tasks share the cached resource
|
|
||||||
- all users exit -> teardown starts
|
|
||||||
- a NEW task enters during `_Cache.run_ctx.__aexit__`
|
|
||||||
- `values[ctx_key]` is gone (popped in inner finally)
|
|
||||||
but `resources[ctx_key]` still exists (outer finally
|
|
||||||
hasn't run yet bc the acm cleanup has checkpoints)
|
|
||||||
- old code: `assert not resources.get(ctx_key)` FIRES
|
|
||||||
|
|
||||||
This models the real-world scenario where `brokerd.kraken`
|
|
||||||
tasks concurrently call `open_cached_client('kraken')`
|
|
||||||
(same `acm_func`, empty kwargs, shared `ctx_key`) and
|
|
||||||
the teardown/re-entry race triggers intermittently.
|
|
||||||
|
|
||||||
'''
|
|
||||||
async def main():
|
|
||||||
in_aexit = trio.Event()
|
|
||||||
|
|
||||||
@acm
|
|
||||||
async def cached_client():
|
|
||||||
'''
|
|
||||||
Simulates `kraken.api.get_client()`:
|
|
||||||
- no params (all callers share one `ctx_key`)
|
|
||||||
- slow-ish cleanup to widen the race window
|
|
||||||
between `values.pop()` and `resources.pop()`
|
|
||||||
inside `_Cache.run_ctx`.
|
|
||||||
|
|
||||||
'''
|
|
||||||
yield 'the-client'
|
|
||||||
# Signal that we're in __aexit__ — at this
|
|
||||||
# point `values` has already been popped by
|
|
||||||
# `run_ctx`'s inner finally, but `resources`
|
|
||||||
# is still alive (outer finally hasn't run).
|
|
||||||
in_aexit.set()
|
|
||||||
await trio.sleep(10)
|
|
||||||
|
|
||||||
first_done = trio.Event()
|
|
||||||
|
|
||||||
async def use_and_exit():
|
|
||||||
async with maybe_open_context(
|
|
||||||
cached_client,
|
|
||||||
) as (cache_hit, value):
|
|
||||||
assert value == 'the-client'
|
|
||||||
first_done.set()
|
|
||||||
|
|
||||||
async def reenter_during_teardown():
|
|
||||||
'''
|
|
||||||
Wait for the acm's `__aexit__` to start (meaning
|
|
||||||
`values` is popped but `resources` still exists),
|
|
||||||
then re-enter — triggering the assert.
|
|
||||||
|
|
||||||
'''
|
|
||||||
await in_aexit.wait()
|
|
||||||
async with maybe_open_context(
|
|
||||||
cached_client,
|
|
||||||
) as (cache_hit, value):
|
|
||||||
assert value == 'the-client'
|
|
||||||
|
|
||||||
with trio.fail_after(5):
|
|
||||||
async with (
|
|
||||||
tractor.open_root_actor(
|
|
||||||
debug_mode=debug_mode,
|
|
||||||
loglevel=loglevel,
|
|
||||||
),
|
|
||||||
collapse_eg(),
|
|
||||||
trio.open_nursery() as tn,
|
|
||||||
):
|
|
||||||
tn.start_soon(use_and_exit)
|
|
||||||
tn.start_soon(reenter_during_teardown)
|
|
||||||
|
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -4,10 +4,6 @@ import trio
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import tractor
|
import tractor
|
||||||
|
|
||||||
# XXX `cffi` dun build on py3.14 yet..
|
|
||||||
cffi = pytest.importorskip("cffi")
|
|
||||||
|
|
||||||
from tractor.ipc._ringbuf import (
|
from tractor.ipc._ringbuf import (
|
||||||
open_ringbuf,
|
open_ringbuf,
|
||||||
RBToken,
|
RBToken,
|
||||||
|
|
@ -18,7 +14,7 @@ from tractor._testing.samples import (
|
||||||
generate_sample_messages,
|
generate_sample_messages,
|
||||||
)
|
)
|
||||||
|
|
||||||
# XXX, in case you want to melt your cores, comment this skip line XD
|
# in case you don't want to melt your cores, uncomment dis!
|
||||||
pytestmark = pytest.mark.skip
|
pytestmark = pytest.mark.skip
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ def test_infected_root_actor(
|
||||||
),
|
),
|
||||||
to_asyncio.open_channel_from(
|
to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (chan, first),
|
) as (first, chan),
|
||||||
):
|
):
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
|
|
||||||
|
|
@ -91,12 +91,13 @@ def test_infected_root_actor(
|
||||||
async def sync_and_err(
|
async def sync_and_err(
|
||||||
# just signature placeholders for compat with
|
# just signature placeholders for compat with
|
||||||
# ``to_asyncio.open_channel_from()``
|
# ``to_asyncio.open_channel_from()``
|
||||||
chan: tractor.to_asyncio.LinkedTaskChannel,
|
to_trio: trio.MemorySendChannel,
|
||||||
|
from_trio: asyncio.Queue,
|
||||||
ev: asyncio.Event,
|
ev: asyncio.Event,
|
||||||
|
|
||||||
):
|
):
|
||||||
if chan:
|
if to_trio:
|
||||||
chan.started_nowait('start')
|
to_trio.send_nowait('start')
|
||||||
|
|
||||||
await ev.wait()
|
await ev.wait()
|
||||||
raise RuntimeError('asyncio-side')
|
raise RuntimeError('asyncio-side')
|
||||||
|
|
@ -173,7 +174,7 @@ def test_trio_prestarted_task_bubbles(
|
||||||
sync_and_err,
|
sync_and_err,
|
||||||
ev=aio_ev,
|
ev=aio_ev,
|
||||||
)
|
)
|
||||||
) as (chan, first),
|
) as (first, chan),
|
||||||
):
|
):
|
||||||
|
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
|
|
|
||||||
|
|
@ -94,15 +94,15 @@ def test_runtime_vars_unset(
|
||||||
after the root actor-runtime exits!
|
after the root actor-runtime exits!
|
||||||
|
|
||||||
'''
|
'''
|
||||||
assert not tractor.runtime._state._runtime_vars['_debug_mode']
|
assert not tractor._state._runtime_vars['_debug_mode']
|
||||||
async def main():
|
async def main():
|
||||||
assert not tractor.runtime._state._runtime_vars['_debug_mode']
|
assert not tractor._state._runtime_vars['_debug_mode']
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=True,
|
debug_mode=True,
|
||||||
):
|
):
|
||||||
assert tractor.runtime._state._runtime_vars['_debug_mode']
|
assert tractor._state._runtime_vars['_debug_mode']
|
||||||
|
|
||||||
# after runtime closure, should be reverted!
|
# after runtime closure, should be reverted!
|
||||||
assert not tractor.runtime._state._runtime_vars['_debug_mode']
|
assert not tractor._state._runtime_vars['_debug_mode']
|
||||||
|
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -110,7 +110,7 @@ def test_rpc_errors(
|
||||||
) as n:
|
) as n:
|
||||||
|
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_registrar
|
assert actor.is_arbiter
|
||||||
await n.run_in_actor(
|
await n.run_in_actor(
|
||||||
sleep_back_actor,
|
sleep_back_actor,
|
||||||
actor_name=subactor_requests_to,
|
actor_name=subactor_requests_to,
|
||||||
|
|
|
||||||
|
|
@ -22,10 +22,6 @@ def unlink_file():
|
||||||
async def crash_and_clean_tmpdir(
|
async def crash_and_clean_tmpdir(
|
||||||
tmp_file_path: str,
|
tmp_file_path: str,
|
||||||
error: bool = True,
|
error: bool = True,
|
||||||
rent_cancel: bool = True,
|
|
||||||
|
|
||||||
# XXX unused, but do we really need to test these cases?
|
|
||||||
self_cancel: bool = False,
|
|
||||||
):
|
):
|
||||||
global _file_path
|
global _file_path
|
||||||
_file_path = tmp_file_path
|
_file_path = tmp_file_path
|
||||||
|
|
@ -36,75 +32,43 @@ async def crash_and_clean_tmpdir(
|
||||||
assert os.path.isfile(tmp_file_path)
|
assert os.path.isfile(tmp_file_path)
|
||||||
await trio.sleep(0.1)
|
await trio.sleep(0.1)
|
||||||
if error:
|
if error:
|
||||||
print('erroring in subactor!')
|
|
||||||
assert 0
|
assert 0
|
||||||
|
else:
|
||||||
elif self_cancel:
|
|
||||||
print('SELF-cancelling subactor!')
|
|
||||||
actor.cancel_soon()
|
actor.cancel_soon()
|
||||||
|
|
||||||
elif rent_cancel:
|
|
||||||
await trio.sleep_forever()
|
|
||||||
|
|
||||||
print('subactor exiting task!')
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'error_in_child',
|
'error_in_child',
|
||||||
[True, False],
|
[True, False],
|
||||||
ids='error_in_child={}'.format,
|
|
||||||
)
|
)
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_lifetime_stack_wipes_tmpfile(
|
async def test_lifetime_stack_wipes_tmpfile(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
error_in_child: bool,
|
error_in_child: bool,
|
||||||
loglevel: str,
|
|
||||||
# log: tractor.log.StackLevelAdapter,
|
|
||||||
# ^TODO, once landed via macos support!
|
|
||||||
):
|
):
|
||||||
child_tmp_file = tmp_path / "child.txt"
|
child_tmp_file = tmp_path / "child.txt"
|
||||||
child_tmp_file.touch()
|
child_tmp_file.touch()
|
||||||
assert child_tmp_file.exists()
|
assert child_tmp_file.exists()
|
||||||
path = str(child_tmp_file)
|
path = str(child_tmp_file)
|
||||||
|
|
||||||
# NOTE, this is expected to cancel the sub
|
|
||||||
# in the `error_in_child=False` case!
|
|
||||||
timeout: float = (
|
|
||||||
1.6 if error_in_child
|
|
||||||
else 1
|
|
||||||
)
|
|
||||||
try:
|
try:
|
||||||
with trio.move_on_after(timeout) as cs:
|
with trio.move_on_after(0.5):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery() as n:
|
||||||
loglevel=loglevel,
|
await ( # inlined portal
|
||||||
) as an:
|
await n.run_in_actor(
|
||||||
await ( # inlined `tractor.Portal`
|
crash_and_clean_tmpdir,
|
||||||
await an.run_in_actor(
|
tmp_file_path=path,
|
||||||
crash_and_clean_tmpdir,
|
error=error_in_child,
|
||||||
tmp_file_path=path,
|
)
|
||||||
error=error_in_child,
|
).result()
|
||||||
)
|
|
||||||
).result()
|
|
||||||
except (
|
except (
|
||||||
tractor.RemoteActorError,
|
tractor.RemoteActorError,
|
||||||
|
# tractor.BaseExceptionGroup,
|
||||||
BaseExceptionGroup,
|
BaseExceptionGroup,
|
||||||
) as _exc:
|
):
|
||||||
exc = _exc
|
pass
|
||||||
from tractor.log import get_console_log
|
|
||||||
log = get_console_log(
|
|
||||||
level=loglevel,
|
|
||||||
name=__name__,
|
|
||||||
)
|
|
||||||
log.exception(
|
|
||||||
f'Subactor failed as expected with {type(exc)!r}\n'
|
|
||||||
)
|
|
||||||
|
|
||||||
# tmp file should have been wiped by
|
# tmp file should have been wiped by
|
||||||
# teardown stack.
|
# teardown stack.
|
||||||
assert not child_tmp_file.exists()
|
assert not child_tmp_file.exists()
|
||||||
|
|
||||||
if error_in_child:
|
|
||||||
assert not cs.cancel_called
|
|
||||||
else:
|
|
||||||
# expect timeout in some cases?
|
|
||||||
assert cs.cancel_called
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@
|
||||||
Shared mem primitives and APIs.
|
Shared mem primitives and APIs.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import platform
|
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
# import numpy
|
# import numpy
|
||||||
|
|
@ -14,18 +13,6 @@ from tractor.ipc._shm import (
|
||||||
attach_shm_list,
|
attach_shm_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
pytestmark = pytest.mark.skipon_spawn_backend(
|
|
||||||
'subint',
|
|
||||||
'subint_forkserver',
|
|
||||||
reason=(
|
|
||||||
'subint: GIL-contention hanging class.\n'
|
|
||||||
'subint_forkserver: `multiprocessing.SharedMemory` '
|
|
||||||
'has known issues with fork-without-exec (mp\'s '
|
|
||||||
'resource_tracker and SharedMemory internals assume '
|
|
||||||
'fresh-process state). RemoteActorError surfaces from '
|
|
||||||
'the shm-attach path. TODO, put issue link!\n'
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def child_attach_shml_alot(
|
async def child_attach_shml_alot(
|
||||||
|
|
@ -66,18 +53,7 @@ def test_child_attaches_alot():
|
||||||
shm_key=shml.key,
|
shm_key=shml.key,
|
||||||
) as (ctx, start_val),
|
) as (ctx, start_val),
|
||||||
):
|
):
|
||||||
assert (_key := shml.key) == start_val
|
assert start_val == key
|
||||||
|
|
||||||
if platform.system() != 'Darwin':
|
|
||||||
# XXX, macOS has a char limit..
|
|
||||||
# see `ipc._shm._shorten_key_for_macos`
|
|
||||||
assert (
|
|
||||||
start_val
|
|
||||||
==
|
|
||||||
key
|
|
||||||
==
|
|
||||||
_key
|
|
||||||
)
|
|
||||||
await ctx.result()
|
await ctx.result()
|
||||||
|
|
||||||
await portal.cancel_actor()
|
await portal.cancel_actor()
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,5 @@
|
||||||
"""
|
"""
|
||||||
Spawning basics including audit of,
|
Spawning basics
|
||||||
|
|
||||||
- subproc bootstrap, such as subactor runtime-data/config inheritance,
|
|
||||||
- basic (and mostly legacy) `ActorNursery` subactor starting and
|
|
||||||
cancel APIs.
|
|
||||||
|
|
||||||
Simple (and generally legacy) examples from the original
|
|
||||||
API design.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
@ -40,13 +33,13 @@ async def spawn(
|
||||||
assert actor is None # no runtime yet
|
assert actor is None # no runtime yet
|
||||||
async with (
|
async with (
|
||||||
tractor.open_root_actor(
|
tractor.open_root_actor(
|
||||||
registry_addrs=[reg_addr],
|
arbiter_addr=reg_addr,
|
||||||
),
|
),
|
||||||
tractor.open_nursery() as an,
|
tractor.open_nursery() as an,
|
||||||
):
|
):
|
||||||
# now runtime exists
|
# now runtime exists
|
||||||
actor: tractor.Actor = tractor.current_actor()
|
actor: tractor.Actor = tractor.current_actor()
|
||||||
assert actor.is_registrar == should_be_root
|
assert actor.is_arbiter == should_be_root
|
||||||
|
|
||||||
# spawns subproc here
|
# spawns subproc here
|
||||||
portal: tractor.Portal = await an.run_in_actor(
|
portal: tractor.Portal = await an.run_in_actor(
|
||||||
|
|
@ -75,7 +68,7 @@ async def spawn(
|
||||||
assert result == 10
|
assert result == 10
|
||||||
return result
|
return result
|
||||||
else:
|
else:
|
||||||
assert actor.is_registrar == should_be_root
|
assert actor.is_arbiter == should_be_root
|
||||||
return 10
|
return 10
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -105,9 +98,7 @@ async def movie_theatre_question():
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_movie_theatre_convo(
|
async def test_movie_theatre_convo(start_method):
|
||||||
start_method: str,
|
|
||||||
):
|
|
||||||
'''
|
'''
|
||||||
The main ``tractor`` routine.
|
The main ``tractor`` routine.
|
||||||
|
|
||||||
|
|
@ -160,16 +151,13 @@ async def test_most_beautiful_word(
|
||||||
name='some_linguist',
|
name='some_linguist',
|
||||||
)
|
)
|
||||||
|
|
||||||
res: Any = await portal.wait_for_result()
|
print(await portal.result())
|
||||||
assert res == return_value
|
|
||||||
# The ``async with`` will unblock here since the 'some_linguist'
|
# The ``async with`` will unblock here since the 'some_linguist'
|
||||||
# actor has completed its main task ``cellar_door``.
|
# actor has completed its main task ``cellar_door``.
|
||||||
|
|
||||||
# this should pull the cached final result already captured during
|
# this should pull the cached final result already captured during
|
||||||
# the nursery block exit.
|
# the nursery block exit.
|
||||||
res: Any = await portal.wait_for_result()
|
print(await portal.result())
|
||||||
assert res == return_value
|
|
||||||
print(res)
|
|
||||||
|
|
||||||
|
|
||||||
async def check_loglevel(level):
|
async def check_loglevel(level):
|
||||||
|
|
@ -180,35 +168,22 @@ async def check_loglevel(level):
|
||||||
log.critical('yoyoyo')
|
log.critical('yoyoyo')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
'level', [
|
|
||||||
'debug',
|
|
||||||
'cancel',
|
|
||||||
'critical'
|
|
||||||
],
|
|
||||||
ids='loglevel={}'.format,
|
|
||||||
)
|
|
||||||
def test_loglevel_propagated_to_subactor(
|
def test_loglevel_propagated_to_subactor(
|
||||||
capfd: pytest.CaptureFixture,
|
start_method,
|
||||||
start_method: str,
|
capfd,
|
||||||
reg_addr: tuple,
|
reg_addr,
|
||||||
level: str,
|
|
||||||
):
|
):
|
||||||
if start_method in ('mp_forkserver', 'subint_forkserver'):
|
if start_method == 'mp_forkserver':
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
"a bug with `capfd` seems to make forkserver capture not work? "
|
"a bug with `capfd` seems to make forkserver capture not work?")
|
||||||
"(same class as the `mp_forkserver` pre-existing skip — fork-"
|
|
||||||
"based backends inherit pytest's capfd temp-file fds into the "
|
level = 'critical'
|
||||||
"subactor and the IPC handshake reads garbage (`unclean EOF "
|
|
||||||
"read only X/HUGE_NUMBER bytes`). Work around by using "
|
|
||||||
"`capsys` instead or skip entirely."
|
|
||||||
)
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
name='registrar',
|
name='arbiter',
|
||||||
start_method=start_method,
|
start_method=start_method,
|
||||||
registry_addrs=[reg_addr],
|
arbiter_addr=reg_addr,
|
||||||
|
|
||||||
) as tn:
|
) as tn:
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
|
|
@ -222,121 +197,3 @@ def test_loglevel_propagated_to_subactor(
|
||||||
# ensure subactor spits log message on stderr
|
# ensure subactor spits log message on stderr
|
||||||
captured = capfd.readouterr()
|
captured = capfd.readouterr()
|
||||||
assert 'yoyoyo' in captured.err
|
assert 'yoyoyo' in captured.err
|
||||||
|
|
||||||
|
|
||||||
async def check_parent_main_inheritance(
|
|
||||||
expect_inherited: bool,
|
|
||||||
) -> bool:
|
|
||||||
'''
|
|
||||||
Assert that the child actor's ``_parent_main_data`` matches the
|
|
||||||
``inherit_parent_main`` flag it was spawned with.
|
|
||||||
|
|
||||||
With the trio spawn backend the parent's ``__main__`` bootstrap
|
|
||||||
data is captured and forwarded to each child so it can replay
|
|
||||||
the parent's ``__main__`` as ``__mp_main__``, mirroring the
|
|
||||||
stdlib ``multiprocessing`` bootstrap:
|
|
||||||
https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods
|
|
||||||
|
|
||||||
When ``inherit_parent_main=False`` the data dict is empty
|
|
||||||
(``{}``) so no fixup ever runs and the child keeps its own
|
|
||||||
``__main__`` untouched.
|
|
||||||
|
|
||||||
NOTE: under `pytest` the parent ``__main__`` is
|
|
||||||
``pytest.__main__`` whose ``_fixup_main_from_name()`` is a no-op
|
|
||||||
(the name ends with ``.__main__``), so we cannot observe
|
|
||||||
a difference in ``sys.modules['__main__'].__name__`` between the
|
|
||||||
two modes. Checking ``_parent_main_data`` directly is the most
|
|
||||||
reliable verification that the flag is threaded through
|
|
||||||
correctly; a ``RemoteActorError[AssertionError]`` propagates on
|
|
||||||
mismatch.
|
|
||||||
|
|
||||||
'''
|
|
||||||
import tractor
|
|
||||||
actor: tractor.Actor = tractor.current_actor()
|
|
||||||
has_data: bool = bool(actor._parent_main_data)
|
|
||||||
assert has_data == expect_inherited, (
|
|
||||||
f'Expected _parent_main_data to be '
|
|
||||||
f'{"non-empty" if expect_inherited else "empty"}, '
|
|
||||||
f'got: {actor._parent_main_data!r}'
|
|
||||||
)
|
|
||||||
return has_data
|
|
||||||
|
|
||||||
|
|
||||||
def test_run_in_actor_can_skip_parent_main_inheritance(
|
|
||||||
start_method: str, # <- only support on `trio` backend rn.
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify ``inherit_parent_main=False`` on ``run_in_actor()``
|
|
||||||
prevents parent ``__main__`` data from reaching the child.
|
|
||||||
|
|
||||||
'''
|
|
||||||
if start_method != 'trio':
|
|
||||||
pytest.skip(
|
|
||||||
'parent main-inheritance opt-out only affects the trio backend'
|
|
||||||
)
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
async with tractor.open_nursery(start_method='trio') as an:
|
|
||||||
|
|
||||||
# Default: child receives parent __main__ bootstrap data
|
|
||||||
replaying = await an.run_in_actor(
|
|
||||||
check_parent_main_inheritance,
|
|
||||||
name='replaying-parent-main',
|
|
||||||
expect_inherited=True,
|
|
||||||
)
|
|
||||||
await replaying.result()
|
|
||||||
|
|
||||||
# Opt-out: child gets no parent __main__ data
|
|
||||||
isolated = await an.run_in_actor(
|
|
||||||
check_parent_main_inheritance,
|
|
||||||
name='isolated-parent-main',
|
|
||||||
inherit_parent_main=False,
|
|
||||||
expect_inherited=False,
|
|
||||||
)
|
|
||||||
await isolated.result()
|
|
||||||
|
|
||||||
trio.run(main)
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_actor_can_skip_parent_main_inheritance(
|
|
||||||
start_method: str, # <- only support on `trio` backend rn.
|
|
||||||
):
|
|
||||||
'''
|
|
||||||
Verify ``inherit_parent_main=False`` on ``start_actor()``
|
|
||||||
prevents parent ``__main__`` data from reaching the child.
|
|
||||||
|
|
||||||
'''
|
|
||||||
if start_method != 'trio':
|
|
||||||
pytest.skip(
|
|
||||||
'parent main-inheritance opt-out only affects the trio backend'
|
|
||||||
)
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
async with tractor.open_nursery(start_method='trio') as an:
|
|
||||||
|
|
||||||
# Default: child receives parent __main__ bootstrap data
|
|
||||||
replaying = await an.start_actor(
|
|
||||||
'replaying-parent-main',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
)
|
|
||||||
result = await replaying.run(
|
|
||||||
check_parent_main_inheritance,
|
|
||||||
expect_inherited=True,
|
|
||||||
)
|
|
||||||
assert result is True
|
|
||||||
await replaying.cancel_actor()
|
|
||||||
|
|
||||||
# Opt-out: child gets no parent __main__ data
|
|
||||||
isolated = await an.start_actor(
|
|
||||||
'isolated-parent-main',
|
|
||||||
enable_modules=[__name__],
|
|
||||||
inherit_parent_main=False,
|
|
||||||
)
|
|
||||||
result = await isolated.run(
|
|
||||||
check_parent_main_inheritance,
|
|
||||||
expect_inherited=False,
|
|
||||||
)
|
|
||||||
assert result is False
|
|
||||||
await isolated.cancel_actor()
|
|
||||||
|
|
||||||
trio.run(main)
|
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,7 @@ async def open_sequence_streamer(
|
||||||
) -> tractor.MsgStream:
|
) -> tractor.MsgStream:
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
arbiter_addr=reg_addr,
|
||||||
start_method=start_method,
|
start_method=start_method,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,23 +30,21 @@ from ._streaming import (
|
||||||
MsgStream as MsgStream,
|
MsgStream as MsgStream,
|
||||||
stream as stream,
|
stream as stream,
|
||||||
)
|
)
|
||||||
from .discovery._api import (
|
from ._discovery import (
|
||||||
get_registry as get_registry,
|
get_registry as get_registry,
|
||||||
find_actor as find_actor,
|
find_actor as find_actor,
|
||||||
wait_for_actor as wait_for_actor,
|
wait_for_actor as wait_for_actor,
|
||||||
query_actor as query_actor,
|
query_actor as query_actor,
|
||||||
)
|
)
|
||||||
from .runtime._supervise import (
|
from ._supervise import (
|
||||||
open_nursery as open_nursery,
|
open_nursery as open_nursery,
|
||||||
ActorNursery as ActorNursery,
|
ActorNursery as ActorNursery,
|
||||||
)
|
)
|
||||||
from .runtime._state import (
|
from ._state import (
|
||||||
RuntimeVars as RuntimeVars,
|
|
||||||
current_actor as current_actor,
|
current_actor as current_actor,
|
||||||
current_ipc_ctx as current_ipc_ctx,
|
|
||||||
debug_mode as debug_mode,
|
|
||||||
get_runtime_vars as get_runtime_vars,
|
|
||||||
is_root_process as is_root_process,
|
is_root_process as is_root_process,
|
||||||
|
current_ipc_ctx as current_ipc_ctx,
|
||||||
|
debug_mode as debug_mode
|
||||||
)
|
)
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
ContextCancelled as ContextCancelled,
|
ContextCancelled as ContextCancelled,
|
||||||
|
|
@ -67,10 +65,6 @@ from ._root import (
|
||||||
open_root_actor as open_root_actor,
|
open_root_actor as open_root_actor,
|
||||||
)
|
)
|
||||||
from .ipc import Channel as Channel
|
from .ipc import Channel as Channel
|
||||||
from .runtime._portal import Portal as Portal
|
from ._portal import Portal as Portal
|
||||||
from .runtime._runtime import Actor as Actor
|
from ._runtime import Actor as Actor
|
||||||
from .discovery._registry import (
|
|
||||||
Registrar as Registrar,
|
|
||||||
Arbiter as Arbiter,
|
|
||||||
)
|
|
||||||
# from . import hilevel as hilevel
|
# from . import hilevel as hilevel
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue