Compare commits
312 Commits
main
...
subint_for
| Author | SHA1 | Date |
|---|---|---|
|
|
2ca0f41e61 | |
|
|
b350aa09ee | |
|
|
d6e70e9de4 | |
|
|
4c133ab541 | |
|
|
4106ba73ea | |
|
|
eceed29d4a | |
|
|
e312a68d8a | |
|
|
4d0555435b | |
|
|
ab86f7613d | |
|
|
458a35cf09 | |
|
|
7cd47ef7fb | |
|
|
76d12060aa | |
|
|
506617c695 | |
|
|
8ac3dfeb85 | |
|
|
c20b05e181 | |
|
|
9993db0193 | |
|
|
35da808905 | |
|
|
70d58c4bd2 | |
|
|
1af2121057 | |
|
|
e3f4f5a387 | |
|
|
d093c31979 | |
|
|
1e357dcf08 | |
|
|
e31eb8d7c9 | |
|
|
8bcbe730bf | |
|
|
5e85f184e0 | |
|
|
f5f37b69e6 | |
|
|
a72deef709 | |
|
|
dcd5c1ff40 | |
|
|
76605d5609 | |
|
|
7804a9feac | |
|
|
63ab7c986b | |
|
|
26914fde75 | |
|
|
cf2e71d87f | |
|
|
25e400d526 | |
|
|
82332fbceb | |
|
|
de4f470b6c | |
|
|
0f48ed2eb9 | |
|
|
eee79a0357 | |
|
|
4b2a0886c3 | |
|
|
3b26b59dad | |
|
|
f3cea714bc | |
|
|
985ea76de5 | |
|
|
5998774535 | |
|
|
a6cbac954d | |
|
|
189f4e3ffc | |
|
|
a65fded4c6 | |
|
|
4a3254583b | |
|
|
2ed5e6a6e8 | |
|
|
34d9d482e4 | |
|
|
09466a1e9d | |
|
|
99541feec7 | |
|
|
c041518bdb | |
|
|
31cbd11a5b | |
|
|
8a8d01e076 | |
|
|
03bf2b931e | |
|
|
b8f243e98d | |
|
|
d2ea8aa2de | |
|
|
d318f1f8f4 | |
|
|
64ddc42ad8 | |
|
|
b524ee4633 | |
|
|
b1a0753a3f | |
|
|
ba86d482e3 | |
|
|
d3d6f646f9 | |
|
|
9cf3d588e7 | |
|
|
e75e29b1dc | |
|
|
a7b1ee34ef | |
|
|
ae5b63c0bc | |
|
|
f75865fb2e | |
|
|
e0b8f23cbc | |
|
|
8d662999a4 | |
|
|
d7ca68cf61 | |
|
|
b5b0504918 | |
|
|
de78a6445b | |
|
|
5c98ab1fb6 | |
|
|
3867403fab | |
|
|
7c8e5a6732 | |
|
|
3152f423d8 | |
|
|
ed65301d32 | |
|
|
8817032c90 | |
|
|
70dc60a199 | |
|
|
cd287c7e93 | |
|
|
7b04b2cdfc | |
|
|
75b07c4b7c | |
|
|
86d4e0d3ed | |
|
|
ccb013a615 | |
|
|
c3d6cc9007 | |
|
|
cb7b76c44f | |
|
|
23677f8a3c | |
|
|
06ff2dd5f2 | |
|
|
a891e003b2 | |
|
|
e90241baaa | |
|
|
7079a597c5 | |
|
|
bc60aa1ec5 | |
|
|
f881683c97 | |
|
|
490fac432c | |
|
|
5f6e45e1d4 | |
|
|
5c4438bacc | |
|
|
90ba0e3658 | |
|
|
1f1e09a786 | |
|
|
7cf3b5d00d | |
|
|
c72d495d68 | |
|
|
926e861f52 | |
|
|
d9cb38372f | |
|
|
b4c975b48b | |
|
|
8344537aa6 | |
|
|
3a9f4ea383 | |
|
|
ca1b01f926 | |
|
|
570c975f14 | |
|
|
a0a7668670 | |
|
|
27bf566d75 | |
|
|
e8f1eca8d2 | |
|
|
656c6c30d1 | |
|
|
acf6568275 | |
|
|
c6c591e61a | |
|
|
b883b27646 | |
|
|
00637764d9 | |
|
|
ea971d25aa | |
|
|
83b6c4270a | |
|
|
6309c2e6fc | |
|
|
f5301d3fb0 | |
|
|
9f8e9eb739 | |
|
|
6b04650187 | |
|
|
a0607f363e | |
|
|
2ee86ddb1a | |
|
|
0286d36ed7 | |
|
|
2db6f97130 | |
|
|
9af6adc181 | |
|
|
452a32fb23 | |
|
|
f47010d7e9 | |
|
|
3f198bc86c | |
|
|
391c8d3566 | |
|
|
4fc477cfd6 | |
|
|
cd1cd03725 | |
|
|
febe587c6c | |
|
|
4d3c5b9163 | |
|
|
8b106b9144 | |
|
|
d6ece8eab3 | |
|
|
8494eb9b8a | |
|
|
cab366cd65 | |
|
|
85f9c5df6f | |
|
|
ebe9d5e4b5 | |
|
|
bbf01d5161 | |
|
|
ec8e8a2786 | |
|
|
c3d1ec22eb | |
|
|
8f44efa327 | |
|
|
5968a3c773 | |
|
|
80597b80bf | |
|
|
a41c6d5c70 | |
|
|
9c37b3f956 | |
|
|
8f6bc56174 | |
|
|
b14dbde77b | |
|
|
cd6509b724 | |
|
|
93d99ed2eb | |
|
|
6215e3b2dd | |
|
|
be5d8da8c0 | |
|
|
21ed181835 | |
|
|
9ec2749ab7 | |
|
|
f3441a6790 | |
|
|
cc42d38284 | |
|
|
6827ceba12 | |
|
|
94458807ce | |
|
|
be5e7e446b | |
|
|
571b2b320e | |
|
|
c7b5d00f19 | |
|
|
1049f7bf38 | |
|
|
cc3bfac741 | |
|
|
e71eec07de | |
|
|
b557ec20a7 | |
|
|
85457cb839 | |
|
|
850219f60c | |
|
|
d929fb75b5 | |
|
|
403c2174a1 | |
|
|
528012f35f | |
|
|
0dfa6f4a8a | |
|
|
a0d3741fac | |
|
|
149b800c9f | |
|
|
03f458a45c | |
|
|
e77198bb64 | |
|
|
5b8f6cf4c7 | |
|
|
8868ff19f3 | |
|
|
066011b83d | |
|
|
b1d003d850 | |
|
|
8991ec2bf5 | |
|
|
dfc153c228 | |
|
|
52e8fb43ee | |
|
|
99577b719a | |
|
|
4092db60b2 | |
|
|
4f333dee05 | |
|
|
8a2f74da2c | |
|
|
2bf155131d | |
|
|
0f29f5717a | |
|
|
5ea721683b | |
|
|
f84ef44992 | |
|
|
1e0c57c6c5 | |
|
|
65660c77c7 | |
|
|
c9b415475f | |
|
|
359bcf691f | |
|
|
b3ce5ab4f6 | |
|
|
e89fe03da7 | |
|
|
417b796169 | |
|
|
36cbc07602 | |
|
|
1f2fad22ee | |
|
|
ca5f6f50a8 | |
|
|
a7ff1387c7 | |
|
|
abbb4a79c8 | |
|
|
1529095c32 | |
|
|
8215a7ba34 | |
|
|
c1c4d85958 | |
|
|
88b084802f | |
|
|
bf1dcea9d1 | |
|
|
5c270b89d5 | |
|
|
6ee0149e8d | |
|
|
9c4cd869fb | |
|
|
afd66ce3b7 | |
|
|
f9bdb1b35d | |
|
|
d135ce94af | |
|
|
fb94aa0095 | |
|
|
b71e8575e5 | |
|
|
bbc028e84c | |
|
|
016306adf5 | |
|
|
712c009790 | |
|
|
79396b4a26 | |
|
|
5b2905b702 | |
|
|
776af3fce6 | |
|
|
4639685770 | |
|
|
98a7d69341 | |
|
|
ab6c955949 | |
|
|
a72bb9321e | |
|
|
0e2949ea59 | |
|
|
fb73935dbc | |
|
|
94dfeb1441 | |
|
|
9c1bcb23af | |
|
|
a1ea373f34 | |
|
|
e8f3d64e71 | |
|
|
b30faaca82 | |
|
|
51701fc8dc | |
|
|
7b89204afd | |
|
|
82d02ef404 | |
|
|
b7546fd221 | |
|
|
86c95539ca | |
|
|
706a4b761b | |
|
|
c5af2fa778 | |
|
|
86489cc453 | |
|
|
2631fb4ff3 | |
|
|
aee86f2544 | |
|
|
83c8a8ad78 | |
|
|
daae196048 | |
|
|
70efcb09a0 | |
|
|
a7e74acdff | |
|
|
9c3d3bcec1 | |
|
|
521fb97fe9 | |
|
|
d8a3969048 | |
|
|
01c0db651a | |
|
|
7bcd7aca2b | |
|
|
920d0043b4 | |
|
|
93b9a6cd97 | |
|
|
e7cefba67f | |
|
|
683476cc96 | |
|
|
ad24df0ed7 | |
|
|
a1622c0b94 | |
|
|
a385d20810 | |
|
|
7f9044c1ef | |
|
|
d0618e3cb4 | |
|
|
a5bebf76d5 | |
|
|
814b2e7e62 | |
|
|
1704f73504 | |
|
|
c735fc8544 | |
|
|
c5ea6040bf | |
|
|
d4f2fa547a | |
|
|
20896bfbab | |
|
|
70bb77280e | |
|
|
916f88a070 | |
|
|
91f2f3ec10 | |
|
|
3e5124e184 | |
|
|
fa86269e30 | |
|
|
d0b92bbeba | |
|
|
9470815f5a | |
|
|
592d918394 | |
|
|
0cddc67bdb | |
|
|
052fe2435f | |
|
|
28819bf5d3 | |
|
|
07c2ba5c0d | |
|
|
50f40f427b | |
|
|
bf6de55865 | |
|
|
5ded99a886 | |
|
|
7145fa364f | |
|
|
f8e25688c7 | |
|
|
c3f455a8ec | |
|
|
f78e842fba | |
|
|
3638b80c9d | |
|
|
2ed9e65530 | |
|
|
6cab363c51 | |
|
|
8aee24e83f | |
|
|
cdcc1b42fc | |
|
|
51ac0c623e | |
|
|
3f0bde1bf8 | |
|
|
fa1a15dce8 | |
|
|
5850844297 | |
|
|
ff02939213 | |
|
|
d61e8caab2 | |
|
|
0b0c83e9da | |
|
|
5e7c0f264d | |
|
|
edf1189fe0 | |
|
|
de24bfe052 | |
|
|
e235b96894 | |
|
|
dea4b9fd93 | |
|
|
557e2cec6a | |
|
|
0e3229f16d | |
|
|
448d25aef4 | |
|
|
343c9e0034 | |
|
|
1dc27c5161 | |
|
|
14aefa4b11 |
|
|
@ -0,0 +1,38 @@
|
||||||
|
# Docs TODOs
|
||||||
|
|
||||||
|
## Auto-sync README code examples with source
|
||||||
|
|
||||||
|
The `docs/README.rst` has inline code blocks that
|
||||||
|
duplicate actual example files (e.g.
|
||||||
|
`examples/infected_asyncio_echo_server.py`). Every time
|
||||||
|
the public API changes we have to manually sync both.
|
||||||
|
|
||||||
|
Sphinx's `literalinclude` directive can pull code directly
|
||||||
|
from source files:
|
||||||
|
|
||||||
|
```rst
|
||||||
|
.. literalinclude:: ../examples/infected_asyncio_echo_server.py
|
||||||
|
:language: python
|
||||||
|
:caption: examples/infected_asyncio_echo_server.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Or to include only a specific function/section:
|
||||||
|
|
||||||
|
```rst
|
||||||
|
.. literalinclude:: ../examples/infected_asyncio_echo_server.py
|
||||||
|
:language: python
|
||||||
|
:pyobject: aio_echo_server
|
||||||
|
```
|
||||||
|
|
||||||
|
This way the docs always reflect the actual code without
|
||||||
|
manual syncing.
|
||||||
|
|
||||||
|
### Considerations
|
||||||
|
- `README.rst` is also rendered on GitHub/PyPI which do
|
||||||
|
NOT support `literalinclude` - so we'd need a build
|
||||||
|
step or a separate `_sphinx_readme.rst` (which already
|
||||||
|
exists at `docs/github_readme/_sphinx_readme.rst`).
|
||||||
|
- Could use a pre-commit hook or CI step to extract code
|
||||||
|
from examples into the README for GitHub rendering.
|
||||||
|
- Another option: a `sphinx.ext.autodoc`-style approach where
|
||||||
|
docstrings from the actual module are pulled in.
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(cp .claude/*)",
|
||||||
|
"Read(.claude/**)",
|
||||||
|
"Read(.claude/skills/run-tests/**)",
|
||||||
|
"Write(.claude/**/*commit_msg*)",
|
||||||
|
"Write(.claude/git_commit_msg_LATEST.md)",
|
||||||
|
"Skill(run-tests)",
|
||||||
|
"Skill(close-wkt)",
|
||||||
|
"Skill(open-wkt)",
|
||||||
|
"Skill(prompt-io)",
|
||||||
|
"Bash(date *)",
|
||||||
|
"Bash(git diff *)",
|
||||||
|
"Bash(git log *)",
|
||||||
|
"Bash(git status)",
|
||||||
|
"Bash(git remote:*)",
|
||||||
|
"Bash(git stash:*)",
|
||||||
|
"Bash(git mv:*)",
|
||||||
|
"Bash(git rev-parse:*)",
|
||||||
|
"Bash(test:*)",
|
||||||
|
"Bash(ls:*)",
|
||||||
|
"Bash(grep:*)",
|
||||||
|
"Bash(find:*)",
|
||||||
|
"Bash(ln:*)",
|
||||||
|
"Bash(cat:*)",
|
||||||
|
"Bash(mkdir:*)",
|
||||||
|
"Bash(gh pr:*)",
|
||||||
|
"Bash(gh api:*)",
|
||||||
|
"Bash(gh issue:*)",
|
||||||
|
"Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)",
|
||||||
|
"Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)",
|
||||||
|
"Bash(echo EXIT:$?:*)",
|
||||||
|
"Bash(echo \"EXIT=$?\")",
|
||||||
|
"Read(//tmp/**)"
|
||||||
|
],
|
||||||
|
"deny": [],
|
||||||
|
"ask": []
|
||||||
|
},
|
||||||
|
"prefersReducedMotion": false,
|
||||||
|
"outputStyle": "default"
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,225 @@
|
||||||
|
# Commit Message Style Guide for `tractor`
|
||||||
|
|
||||||
|
Analysis based on 500 recent commits from the `tractor` repository.
|
||||||
|
|
||||||
|
## Core Principles
|
||||||
|
|
||||||
|
Write commit messages that are technically precise yet casual in
|
||||||
|
tone. Use abbreviations and informal language while maintaining
|
||||||
|
clarity about what changed and why.
|
||||||
|
|
||||||
|
## Subject Line Format
|
||||||
|
|
||||||
|
### Length and Structure
|
||||||
|
- Target: ~50 chars with a hard-max of 67.
|
||||||
|
- Use backticks around code elements (72.2% of commits)
|
||||||
|
- Rarely use colons (5.2%), except for file prefixes
|
||||||
|
- End with '?' for uncertain changes (rare: 0.8%)
|
||||||
|
- End with '!' for important changes (rare: 2.0%)
|
||||||
|
|
||||||
|
### Opening Verbs (Present Tense)
|
||||||
|
|
||||||
|
Most common verbs from analysis:
|
||||||
|
- `Add` (14.4%) - wholly new features/functionality
|
||||||
|
- `Use` (4.4%) - adopt new approach/tool
|
||||||
|
- `Drop` (3.6%) - remove code/feature
|
||||||
|
- `Fix` (2.4%) - bug fixes
|
||||||
|
- `Move`/`Mv` (3.6%) - relocate code
|
||||||
|
- `Adjust` (2.0%) - minor tweaks
|
||||||
|
- `Update` (1.6%) - enhance existing feature
|
||||||
|
- `Bump` (1.2%) - dependency updates
|
||||||
|
- `Rename` (1.2%) - identifier changes
|
||||||
|
- `Set` (1.2%) - configuration changes
|
||||||
|
- `Handle` (1.0%) - add handling logic
|
||||||
|
- `Raise` (1.0%) - add error raising
|
||||||
|
- `Pass` (0.8%) - pass parameters/values
|
||||||
|
- `Support` (0.8%) - add support for something
|
||||||
|
- `Hide` (1.4%) - make private/internal
|
||||||
|
- `Always` (1.4%) - enforce consistent behavior
|
||||||
|
- `Mk` (1.4%) - make/create (abbreviated)
|
||||||
|
- `Start` (1.0%) - begin implementation
|
||||||
|
|
||||||
|
Other frequent verbs: `More`, `Change`, `Extend`, `Disable`, `Log`,
|
||||||
|
`Enable`, `Ensure`, `Expose`, `Allow`
|
||||||
|
|
||||||
|
### Backtick Usage
|
||||||
|
|
||||||
|
Always use backticks for:
|
||||||
|
- Module names: `trio`, `asyncio`, `msgspec`, `greenback`, `stackscope`
|
||||||
|
- Class names: `Context`, `Actor`, `Address`, `PldRx`, `SpawnSpec`
|
||||||
|
- Method names: `.pause_from_sync()`, `._pause()`, `.cancel()`
|
||||||
|
- Function names: `breakpoint()`, `collapse_eg()`, `open_root_actor()`
|
||||||
|
- Decorators: `@acm`, `@context`
|
||||||
|
- Exceptions: `Cancelled`, `TransportClosed`, `MsgTypeError`
|
||||||
|
- Keywords: `finally`, `None`, `False`
|
||||||
|
- Variable names: `tn`, `debug_mode`
|
||||||
|
- Complex expressions: `trio.Cancelled`, `asyncio.Task`
|
||||||
|
|
||||||
|
Most backticked terms in tractor:
|
||||||
|
`trio`, `asyncio`, `Context`, `.pause_from_sync()`, `tn`,
|
||||||
|
`._pause()`, `breakpoint()`, `collapse_eg()`, `Actor`, `@acm`,
|
||||||
|
`.cancel()`, `Cancelled`, `open_root_actor()`, `greenback`
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
Good subject lines:
|
||||||
|
```
|
||||||
|
Add `uds` to `._multiaddr`, tweak typing
|
||||||
|
Drop `DebugStatus.shield` attr, add `.req_finished`
|
||||||
|
Use `stackscope` for all actor-tree rendered "views"
|
||||||
|
Fix `.to_asyncio` inter-task-cancellation!
|
||||||
|
Bump `ruff.toml` to target py313
|
||||||
|
Mv `load_module_from_path()` to new `._code_load` submod
|
||||||
|
Always use `tuple`-cast for singleton parent addrs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Body Format
|
||||||
|
|
||||||
|
### General Structure
|
||||||
|
- 43.2% of commits have no body (simple changes)
|
||||||
|
- Use blank line after subject
|
||||||
|
- Max line length: 67 chars
|
||||||
|
- Use `-` bullets for lists (28.0% of commits)
|
||||||
|
- Rarely use `*` bullets (2.4%)
|
||||||
|
|
||||||
|
### Section Markers
|
||||||
|
|
||||||
|
Use these markers to organize longer commit bodies:
|
||||||
|
- `Also,` (most common: 26 occurrences)
|
||||||
|
- `Other,` (13 occurrences)
|
||||||
|
- `Deats,` (11 occurrences) - for implementation details
|
||||||
|
- `Further,` (7 occurrences)
|
||||||
|
- `TODO,` (3 occurrences)
|
||||||
|
- `Impl details,` (2 occurrences)
|
||||||
|
- `Notes,` (1 occurrence)
|
||||||
|
|
||||||
|
### Common Abbreviations
|
||||||
|
|
||||||
|
Use these freely (sorted by frequency):
|
||||||
|
- `msg` (63) - message
|
||||||
|
- `bg` (37) - background
|
||||||
|
- `ctx` (30) - context
|
||||||
|
- `impl` (27) - implementation
|
||||||
|
- `mod` (26) - module
|
||||||
|
- `obvi` (17) - obviously
|
||||||
|
- `tn` (16) - task nursery
|
||||||
|
- `fn` (15) - function
|
||||||
|
- `vs` (15) - versus
|
||||||
|
- `bc` (14) - because
|
||||||
|
- `var` (14) - variable
|
||||||
|
- `prolly` (9) - probably
|
||||||
|
- `ep` (6) - entry point
|
||||||
|
- `OW` (5) - otherwise
|
||||||
|
- `rn` (4) - right now
|
||||||
|
- `sig` (4) - signal/signature
|
||||||
|
- `deps` (3) - dependencies
|
||||||
|
- `iface` (2) - interface
|
||||||
|
- `subproc` (2) - subprocess
|
||||||
|
- `tho` (2) - though
|
||||||
|
- `ofc` (2) - of course
|
||||||
|
|
||||||
|
### Tone and Style
|
||||||
|
|
||||||
|
- Casual but technical (use `XD` for humor: 23 times)
|
||||||
|
- Use `..` for trailing thoughts (108 occurrences)
|
||||||
|
- Use `Woops,` to acknowledge mistakes (4 subject lines)
|
||||||
|
- Don't be afraid to show personality while being precise
|
||||||
|
|
||||||
|
### Example Bodies
|
||||||
|
|
||||||
|
Simple with bullets:
|
||||||
|
```
|
||||||
|
Add `multiaddr` and bump up some deps
|
||||||
|
|
||||||
|
Since we're planning to use it for (discovery)
|
||||||
|
addressing, allowing replacement of the hacky (pretend)
|
||||||
|
attempt in `tractor._multiaddr` Bp
|
||||||
|
|
||||||
|
Also pin some deps,
|
||||||
|
- make us py312+
|
||||||
|
- use `pdbp` with my frame indexing fix.
|
||||||
|
- mv to latest `xonsh` for fancy cmd/suggestion injections.
|
||||||
|
|
||||||
|
Bump lock file to match obvi!
|
||||||
|
```
|
||||||
|
|
||||||
|
With section markers:
|
||||||
|
```
|
||||||
|
Use `stackscope` for all actor-tree rendered "views"
|
||||||
|
|
||||||
|
Instead of the (much more) limited and hacky `.devx._code`
|
||||||
|
impls, move to using the new `.devx._stackscope` API which
|
||||||
|
wraps the `stackscope` project.
|
||||||
|
|
||||||
|
Deats,
|
||||||
|
- make new `stackscope.extract_stack()` wrapper
|
||||||
|
- port over frame-descing to `_stackscope.pformat_stack()`
|
||||||
|
- move `PdbREPL` to use `stackscope` render approach
|
||||||
|
- update tests for new stack output format
|
||||||
|
|
||||||
|
Also,
|
||||||
|
- tweak log formatting for consistency
|
||||||
|
- add typing hints throughout
|
||||||
|
```
|
||||||
|
|
||||||
|
## Special Patterns
|
||||||
|
|
||||||
|
### WIP Commits
|
||||||
|
Rare (0.2%) - avoid committing WIP if possible
|
||||||
|
|
||||||
|
### Merge Commits
|
||||||
|
Auto-generated (4.4%), don't worry about style
|
||||||
|
|
||||||
|
### File References
|
||||||
|
- Use `module.py` or `.submodule` style
|
||||||
|
- Avoid `file.py:line` references (0 in analysis)
|
||||||
|
|
||||||
|
### Links
|
||||||
|
- GitHub links used sparingly (3 total)
|
||||||
|
- Prefer code references over external links
|
||||||
|
|
||||||
|
## Footer
|
||||||
|
|
||||||
|
The default footer should credit `claude` (you) for helping generate
|
||||||
|
the commit msg content:
|
||||||
|
|
||||||
|
```
|
||||||
|
(this commit msg was generated in some part by [`claude-code`][claude-code-gh])
|
||||||
|
[claude-code-gh]: https://github.com/anthropics/claude-code
|
||||||
|
```
|
||||||
|
|
||||||
|
Further, if the patch was solely or in part written
|
||||||
|
by `claude`, instead add:
|
||||||
|
|
||||||
|
```
|
||||||
|
(this patch was generated in some part by [`claude-code`][claude-code-gh])
|
||||||
|
[claude-code-gh]: https://github.com/anthropics/claude-code
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary Checklist
|
||||||
|
|
||||||
|
Before committing, verify:
|
||||||
|
- [ ] Subject line uses present tense verb
|
||||||
|
- [ ] Subject line ~50 chars (hard max 67)
|
||||||
|
- [ ] Code elements wrapped in backticks
|
||||||
|
- [ ] Body lines ≤67 chars
|
||||||
|
- [ ] Abbreviations used where natural
|
||||||
|
- [ ] Casual yet precise tone
|
||||||
|
- [ ] Section markers if body >3 paragraphs
|
||||||
|
- [ ] Technical accuracy maintained
|
||||||
|
|
||||||
|
## Analysis Metadata
|
||||||
|
|
||||||
|
```
|
||||||
|
Source: tractor repository
|
||||||
|
Commits analyzed: 500
|
||||||
|
Date range: 2019-2025
|
||||||
|
Analysis date: 2026-02-08
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(this style guide was generated by [`claude-code`][claude-code-gh]
|
||||||
|
analyzing commit history)
|
||||||
|
|
||||||
|
[claude-code-gh]: https://github.com/anthropics/claude-code
|
||||||
|
|
@ -0,0 +1,297 @@
|
||||||
|
---
|
||||||
|
name: conc-anal
|
||||||
|
description: >
|
||||||
|
Concurrency analysis for tractor's trio-based
|
||||||
|
async primitives. Trace task scheduling across
|
||||||
|
checkpoint boundaries, identify race windows in
|
||||||
|
shared mutable state, and verify synchronization
|
||||||
|
correctness. Invoke on code segments the user
|
||||||
|
points at, OR proactively when reviewing/writing
|
||||||
|
concurrent cache, lock, or multi-task acm code.
|
||||||
|
argument-hint: "[file:line-range or function name]"
|
||||||
|
allowed-tools:
|
||||||
|
- Read
|
||||||
|
- Grep
|
||||||
|
- Glob
|
||||||
|
- Task
|
||||||
|
---
|
||||||
|
|
||||||
|
Perform a structured concurrency analysis on the
|
||||||
|
target code. This skill should be invoked:
|
||||||
|
|
||||||
|
- **On demand**: user points at a code segment
|
||||||
|
(file:lines, function name, or pastes a snippet)
|
||||||
|
- **Proactively**: when writing or reviewing code
|
||||||
|
that touches shared mutable state across trio
|
||||||
|
tasks — especially `_Cache`, locks, events, or
|
||||||
|
multi-task `@acm` lifecycle management
|
||||||
|
|
||||||
|
## 0. Identify the target
|
||||||
|
|
||||||
|
If the user provides a file:line-range or function
|
||||||
|
name, read that code. If not explicitly provided,
|
||||||
|
identify the relevant concurrent code from context
|
||||||
|
(e.g. the current diff, a failing test, or the
|
||||||
|
function under discussion).
|
||||||
|
|
||||||
|
## 1. Inventory shared mutable state
|
||||||
|
|
||||||
|
List every piece of state that is accessed by
|
||||||
|
multiple tasks. For each, note:
|
||||||
|
|
||||||
|
- **What**: the variable/dict/attr (e.g.
|
||||||
|
`_Cache.values`, `_Cache.resources`,
|
||||||
|
`_Cache.users`)
|
||||||
|
- **Scope**: class-level, module-level, or
|
||||||
|
closure-captured
|
||||||
|
- **Writers**: which tasks/code-paths mutate it
|
||||||
|
- **Readers**: which tasks/code-paths read it
|
||||||
|
- **Guarded by**: which lock/event/ordering
|
||||||
|
protects it (or "UNGUARDED" if none)
|
||||||
|
|
||||||
|
Format as a table:
|
||||||
|
|
||||||
|
```
|
||||||
|
| State | Writers | Readers | Guard |
|
||||||
|
|---------------------|-----------------|-----------------|----------------|
|
||||||
|
| _Cache.values | run_ctx, moc¹ | moc | ctx_key lock |
|
||||||
|
| _Cache.resources | run_ctx, moc | moc, run_ctx | UNGUARDED |
|
||||||
|
```
|
||||||
|
|
||||||
|
¹ `moc` = `maybe_open_context`
|
||||||
|
|
||||||
|
## 2. Map checkpoint boundaries
|
||||||
|
|
||||||
|
For each code path through the target, mark every
|
||||||
|
**checkpoint** — any `await` expression where trio
|
||||||
|
can switch to another task. Use line numbers:
|
||||||
|
|
||||||
|
```
|
||||||
|
L325: await lock.acquire() ← CHECKPOINT
|
||||||
|
L395: await service_tn.start(...) ← CHECKPOINT
|
||||||
|
L411: lock.release() ← (not a checkpoint, but changes lock state)
|
||||||
|
L414: yield (False, yielded) ← SUSPEND (caller runs)
|
||||||
|
L485: no_more_users.set() ← (wakes run_ctx, no switch yet)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key trio scheduling rules to apply:**
|
||||||
|
- `Event.set()` makes waiters *ready* but does NOT
|
||||||
|
switch immediately
|
||||||
|
- `lock.release()` is not a checkpoint
|
||||||
|
- `await sleep(0)` IS a checkpoint
|
||||||
|
- Code in `finally` blocks CAN have checkpoints
|
||||||
|
(unlike asyncio)
|
||||||
|
- `await` inside `except` blocks can be
|
||||||
|
`trio.Cancelled`-masked
|
||||||
|
|
||||||
|
## 3. Trace concurrent task schedules
|
||||||
|
|
||||||
|
Write out the **interleaved execution trace** for
|
||||||
|
the problematic scenario. Number each step and tag
|
||||||
|
which task executes it:
|
||||||
|
|
||||||
|
```
|
||||||
|
[Task A] 1. acquires lock
|
||||||
|
[Task A] 2. cache miss → allocates resources
|
||||||
|
[Task A] 3. releases lock
|
||||||
|
[Task A] 4. yields to caller
|
||||||
|
[Task A] 5. caller exits → finally runs
|
||||||
|
[Task A] 6. users-- → 0, sets no_more_users
|
||||||
|
[Task A] 7. pops lock from _Cache.locks
|
||||||
|
[run_ctx] 8. wakes from no_more_users.wait()
|
||||||
|
[run_ctx] 9. values.pop(ctx_key)
|
||||||
|
[run_ctx] 10. acm __aexit__ → CHECKPOINT
|
||||||
|
[Task B] 11. creates NEW lock (old one popped)
|
||||||
|
[Task B] 12. acquires immediately
|
||||||
|
[Task B] 13. values[ctx_key] → KeyError
|
||||||
|
[Task B] 14. resources[ctx_key] → STILL EXISTS
|
||||||
|
[Task B] 15. 💥 RuntimeError
|
||||||
|
```
|
||||||
|
|
||||||
|
Identify the **race window**: the range of steps
|
||||||
|
where state is inconsistent. In the example above,
|
||||||
|
steps 9–10 are the window (values gone, resources
|
||||||
|
still alive).
|
||||||
|
|
||||||
|
## 4. Classify the bug
|
||||||
|
|
||||||
|
Categorize what kind of concurrency issue this is:
|
||||||
|
|
||||||
|
- **TOCTOU** (time-of-check-to-time-of-use): state
|
||||||
|
changes between a check and the action based on it
|
||||||
|
- **Stale reference**: a task holds a reference to
|
||||||
|
state that another task has invalidated
|
||||||
|
- **Lifetime mismatch**: a synchronization primitive
|
||||||
|
(lock, event) has a shorter lifetime than the
|
||||||
|
state it's supposed to protect
|
||||||
|
- **Missing guard**: shared state is accessed
|
||||||
|
without any synchronization
|
||||||
|
- **Atomicity gap**: two operations that should be
|
||||||
|
atomic have a checkpoint between them
|
||||||
|
|
||||||
|
## 5. Propose fixes
|
||||||
|
|
||||||
|
For each proposed fix, provide:
|
||||||
|
|
||||||
|
- **Sketch**: pseudocode or diff showing the change
|
||||||
|
- **How it closes the window**: which step(s) from
|
||||||
|
the trace it eliminates or reorders
|
||||||
|
- **Tradeoffs**: complexity, perf, new edge cases,
|
||||||
|
impact on other code paths
|
||||||
|
- **Risk**: what could go wrong (deadlocks, new
|
||||||
|
races, cancellation issues)
|
||||||
|
|
||||||
|
Rate each fix: `[simple|moderate|complex]` impl
|
||||||
|
effort.
|
||||||
|
|
||||||
|
## 6. Output format
|
||||||
|
|
||||||
|
Structure the full analysis as:
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
## Concurrency analysis: `<target>`
|
||||||
|
|
||||||
|
### Shared state
|
||||||
|
<table from step 1>
|
||||||
|
|
||||||
|
### Checkpoints
|
||||||
|
<list from step 2>
|
||||||
|
|
||||||
|
### Race trace
|
||||||
|
<interleaved trace from step 3>
|
||||||
|
|
||||||
|
### Classification
|
||||||
|
<bug type from step 4>
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
<proposals from step 5>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tractor-specific patterns to watch
|
||||||
|
|
||||||
|
These are known problem areas in tractor's
|
||||||
|
concurrency model. Flag them when encountered:
|
||||||
|
|
||||||
|
### `_Cache` lock vs `run_ctx` lifetime
|
||||||
|
|
||||||
|
The `_Cache.locks` entry is managed by
|
||||||
|
`maybe_open_context` callers, but `run_ctx` runs
|
||||||
|
in `service_tn` — a different task tree. Lock
|
||||||
|
pop/release in the caller's `finally` does NOT
|
||||||
|
wait for `run_ctx` to finish tearing down. Any
|
||||||
|
state that `run_ctx` cleans up in its `finally`
|
||||||
|
(e.g. `resources.pop()`) is vulnerable to
|
||||||
|
re-entry races after the lock is popped.
|
||||||
|
|
||||||
|
### `values.pop()` → acm `__aexit__` → `resources.pop()` gap
|
||||||
|
|
||||||
|
In `_Cache.run_ctx`, the inner `finally` pops
|
||||||
|
`values`, then the acm's `__aexit__` runs (which
|
||||||
|
has checkpoints), then the outer `finally` pops
|
||||||
|
`resources`. This creates a window where `values`
|
||||||
|
is gone but `resources` still exists — a classic
|
||||||
|
atomicity gap.
|
||||||
|
|
||||||
|
### Global vs per-key counters
|
||||||
|
|
||||||
|
`_Cache.users` as a single `int` (pre-fix) meant
|
||||||
|
that users of different `ctx_key`s inflated each
|
||||||
|
other's counts, preventing teardown when one key's
|
||||||
|
users hit zero. Always verify that per-key state
|
||||||
|
(`users`, `locks`) is actually keyed on `ctx_key`
|
||||||
|
and not on `fid` or some broader key.
|
||||||
|
|
||||||
|
### `Event.set()` wakes but doesn't switch
|
||||||
|
|
||||||
|
`trio.Event.set()` makes waiting tasks *ready* but
|
||||||
|
the current task continues executing until its next
|
||||||
|
checkpoint. Code between `.set()` and the next
|
||||||
|
`await` runs atomically from the scheduler's
|
||||||
|
perspective. Use this to your advantage (or watch
|
||||||
|
for bugs where code assumes the woken task runs
|
||||||
|
immediately).
|
||||||
|
|
||||||
|
### `except` block checkpoint masking
|
||||||
|
|
||||||
|
`await` expressions inside `except` handlers can
|
||||||
|
be masked by `trio.Cancelled`. If a `finally`
|
||||||
|
block runs from an `except` and contains
|
||||||
|
`lock.release()`, the release happens — but any
|
||||||
|
`await` after it in the same `except` may be
|
||||||
|
swallowed. This is why `maybe_open_context`'s
|
||||||
|
cache-miss path does `lock.release()` in a
|
||||||
|
`finally` inside the `except KeyError`.
|
||||||
|
|
||||||
|
### Cancellation in `finally`
|
||||||
|
|
||||||
|
Unlike asyncio, trio allows checkpoints in
|
||||||
|
`finally` blocks. This means `finally` cleanup
|
||||||
|
that does `await` can itself be cancelled (e.g.
|
||||||
|
by nursery shutdown). Watch for cleanup code that
|
||||||
|
assumes it will run to completion.
|
||||||
|
|
||||||
|
### Unbounded waits in cleanup paths
|
||||||
|
|
||||||
|
Any `await <event>.wait()` in a teardown path is
|
||||||
|
a latent deadlock unless the event's setter is
|
||||||
|
GUARANTEED to fire. If the setter depends on
|
||||||
|
external state (peer disconnects, child process
|
||||||
|
exit, subsequent task completion) that itself
|
||||||
|
depends on the current task's progress, you have
|
||||||
|
a mutual wait.
|
||||||
|
|
||||||
|
Rule: **bound every `await X.wait()` in cleanup
|
||||||
|
paths with `trio.move_on_after()`** unless you
|
||||||
|
can prove the setter is unconditionally reachable
|
||||||
|
from the state at the await site. Concrete recent
|
||||||
|
example: `ipc_server.wait_for_no_more_peers()` in
|
||||||
|
`async_main`'s finally (see
|
||||||
|
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
|
||||||
|
"probe iteration 3") — it was unbounded, and when
|
||||||
|
one peer-handler was stuck the wait-for-no-more-
|
||||||
|
peers event never fired, deadlocking the whole
|
||||||
|
actor-tree teardown cascade.
|
||||||
|
|
||||||
|
### The capture-pipe-fill hang pattern (grep this first)
|
||||||
|
|
||||||
|
When investigating any hang in the test suite
|
||||||
|
**especially under fork-based backends**, first
|
||||||
|
check whether the hang reproduces under `pytest
|
||||||
|
-s` (`--capture=no`). If `-s` makes it go away
|
||||||
|
you're not looking at a trio concurrency bug —
|
||||||
|
you're looking at a Linux pipe-buffer fill.
|
||||||
|
|
||||||
|
Mechanism: pytest replaces fds 1,2 with pipe
|
||||||
|
write-ends. Fork-child subactors inherit those
|
||||||
|
fds. High-volume error-log tracebacks (cancel
|
||||||
|
cascade spew) fill the 64KB pipe buffer. Child
|
||||||
|
`write()` blocks. Child can't exit. Parent's
|
||||||
|
`waitpid`/pidfd wait blocks. Deadlock cascades up
|
||||||
|
the tree.
|
||||||
|
|
||||||
|
Pre-existing guards in `tests/conftest.py` encode
|
||||||
|
this knowledge — grep these BEFORE blaming
|
||||||
|
concurrency:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/conftest.py:258
|
||||||
|
if loglevel in ('trace', 'debug'):
|
||||||
|
# XXX: too much logging will lock up the subproc (smh)
|
||||||
|
loglevel: str = 'info'
|
||||||
|
|
||||||
|
# tests/conftest.py:316
|
||||||
|
# can lock up on the `_io.BufferedReader` and hang..
|
||||||
|
stderr: str = proc.stderr.read().decode()
|
||||||
|
```
|
||||||
|
|
||||||
|
See the full post-mortem in
|
||||||
|
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`
|
||||||
|
for the canonical reproduction. Cost several
|
||||||
|
investigation sessions before catching it —
|
||||||
|
because the capture-pipe symptom was masked by
|
||||||
|
deeper cascade-deadlocks. Once the cascades were
|
||||||
|
fixed, the tree tore down enough to generate
|
||||||
|
pipe-filling log volume → capture-pipe finally
|
||||||
|
surfaced. Grep-note for future-self: **if a
|
||||||
|
multi-subproc tractor test hangs, `pytest -s`
|
||||||
|
first, conc-anal second.**
|
||||||
|
|
@ -0,0 +1,241 @@
|
||||||
|
# PR/Patch-Request Description Format Reference
|
||||||
|
|
||||||
|
Canonical structure for `tractor` patch-request
|
||||||
|
descriptions, designed to work across GitHub,
|
||||||
|
Gitea, SourceHut, and GitLab markdown renderers.
|
||||||
|
|
||||||
|
**Line length: wrap at 72 chars** for all prose
|
||||||
|
content (Summary bullets, Motivation paragraphs,
|
||||||
|
Scopes bullets, etc.). Fill lines *to* 72 — don't
|
||||||
|
stop short at 50-65. Only raw URLs in
|
||||||
|
reference-link definitions may exceed this.
|
||||||
|
|
||||||
|
## Template
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
<!-- pr-msg-meta
|
||||||
|
branch: <branch-name>
|
||||||
|
base: <base-branch>
|
||||||
|
submitted:
|
||||||
|
github: ___
|
||||||
|
gitea: ___
|
||||||
|
srht: ___
|
||||||
|
-->
|
||||||
|
|
||||||
|
## <Title: present-tense verb + backticked code>
|
||||||
|
|
||||||
|
### Summary
|
||||||
|
- [<hash>][<hash>] Description of change ending
|
||||||
|
with period.
|
||||||
|
- [<hash>][<hash>] Another change description
|
||||||
|
ending with period.
|
||||||
|
- [<hash>][<hash>] [<hash>][<hash>] Multi-commit
|
||||||
|
change description.
|
||||||
|
|
||||||
|
### Motivation
|
||||||
|
<1-2 paragraphs: problem/limitation first,
|
||||||
|
then solution. Hard-wrap at 72 chars.>
|
||||||
|
|
||||||
|
### Scopes changed
|
||||||
|
- [<hash>][<hash>] `pkg.mod.func()` — what
|
||||||
|
changed.
|
||||||
|
* [<hash>][<hash>] Also adjusts
|
||||||
|
`.related_thing()` in same module.
|
||||||
|
- [<hash>][<hash>] `tests.test_mod` — new/changed
|
||||||
|
test coverage.
|
||||||
|
|
||||||
|
<!--
|
||||||
|
### Cross-references
|
||||||
|
Also submitted as
|
||||||
|
[github-pr][] | [gitea-pr][] | [srht-patch][].
|
||||||
|
|
||||||
|
### Links
|
||||||
|
- [relevant-issue-or-discussion](url)
|
||||||
|
- [design-doc-or-screenshot](url)
|
||||||
|
-->
|
||||||
|
|
||||||
|
(this pr content was generated in some part by
|
||||||
|
[`claude-code`][claude-code-gh])
|
||||||
|
|
||||||
|
[<hash>]: https://<service>/<owner>/<repo>/commit/<hash>
|
||||||
|
[claude-code-gh]: https://github.com/anthropics/claude-code
|
||||||
|
|
||||||
|
<!-- cross-service pr refs (fill after submit):
|
||||||
|
[github-pr]: https://github.com/<owner>/<repo>/pull/___
|
||||||
|
[gitea-pr]: https://<host>/<owner>/<repo>/pulls/___
|
||||||
|
[srht-patch]: https://git.sr.ht/~<owner>/<repo>/patches/___
|
||||||
|
-->
|
||||||
|
```
|
||||||
|
|
||||||
|
## Markdown Reference-Link Strategy
|
||||||
|
|
||||||
|
Use reference-style links for ALL commit hashes
|
||||||
|
and cross-service PR refs to ensure cross-service
|
||||||
|
compatibility:
|
||||||
|
|
||||||
|
**Inline usage** (in bullets):
|
||||||
|
```markdown
|
||||||
|
- [f3726cf9][f3726cf9] Add `reg_err_types()`
|
||||||
|
for custom exc lookup.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Definition** (bottom of document):
|
||||||
|
```markdown
|
||||||
|
[f3726cf9]: https://github.com/goodboy/tractor/commit/f3726cf9
|
||||||
|
```
|
||||||
|
|
||||||
|
### Why reference-style?
|
||||||
|
- Keeps prose readable without long inline URLs.
|
||||||
|
- All URLs in one place — trivially swappable
|
||||||
|
per-service.
|
||||||
|
- Most git services auto-link bare SHAs anyway,
|
||||||
|
but explicit refs guarantee it works in *any*
|
||||||
|
md renderer.
|
||||||
|
- The `[hash][hash]` form is self-documenting —
|
||||||
|
display text matches the ref ID.
|
||||||
|
- Cross-service PR refs use the same mechanism:
|
||||||
|
`[github-pr][]` resolves via a ref-link def
|
||||||
|
at the bottom, trivially fillable post-submit.
|
||||||
|
|
||||||
|
## Cross-Service PR Placeholder Mechanism
|
||||||
|
|
||||||
|
The generated description includes three layers
|
||||||
|
of cross-service support, all using native md
|
||||||
|
reference-links:
|
||||||
|
|
||||||
|
### 1. Metadata comment (top of file)
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
<!-- pr-msg-meta
|
||||||
|
branch: remote_exc_type_registry
|
||||||
|
base: main
|
||||||
|
submitted:
|
||||||
|
github: ___
|
||||||
|
gitea: ___
|
||||||
|
srht: ___
|
||||||
|
-->
|
||||||
|
```
|
||||||
|
|
||||||
|
A YAML-ish HTML comment block. The `___`
|
||||||
|
placeholders get filled with PR/patch numbers
|
||||||
|
after submission. Machine-parseable for tooling
|
||||||
|
(e.g. `gish`) but invisible in rendered md.
|
||||||
|
|
||||||
|
### 2. Cross-references section (in body)
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
<!--
|
||||||
|
### Cross-references
|
||||||
|
Also submitted as
|
||||||
|
[github-pr][] | [gitea-pr][] | [srht-patch][].
|
||||||
|
-->
|
||||||
|
```
|
||||||
|
|
||||||
|
Commented out at generation time. After submitting
|
||||||
|
to multiple services, uncomment and the ref-links
|
||||||
|
resolve via the stubs at the bottom.
|
||||||
|
|
||||||
|
### 3. Ref-link stubs (bottom of file)
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
<!-- cross-service pr refs (fill after submit):
|
||||||
|
[github-pr]: https://github.com/goodboy/tractor/pull/___
|
||||||
|
[gitea-pr]: https://pikers.dev/goodboy/tractor/pulls/___
|
||||||
|
[srht-patch]: https://git.sr.ht/~goodboy/tractor/patches/___
|
||||||
|
-->
|
||||||
|
```
|
||||||
|
|
||||||
|
Commented out with `___` number placeholders.
|
||||||
|
After submission: uncomment, replace `___` with
|
||||||
|
the actual number. Each service-specific copy
|
||||||
|
fills in all services' numbers so any copy can
|
||||||
|
cross-reference the others.
|
||||||
|
|
||||||
|
### Post-submission file layout
|
||||||
|
|
||||||
|
```
|
||||||
|
pr_msg_LATEST.md # latest draft (skill root)
|
||||||
|
msgs/
|
||||||
|
20260325T002027Z_mybranch_pr_msg.md # timestamped
|
||||||
|
github/
|
||||||
|
42_pr_msg.md # github PR #42
|
||||||
|
gitea/
|
||||||
|
17_pr_msg.md # gitea PR #17
|
||||||
|
srht/
|
||||||
|
5_pr_msg.md # srht patch #5
|
||||||
|
```
|
||||||
|
|
||||||
|
Each `<service>/<num>_pr_msg.md` is a copy with:
|
||||||
|
- metadata `submitted:` fields filled in
|
||||||
|
- cross-references section uncommented
|
||||||
|
- ref-link stubs uncommented with real numbers
|
||||||
|
- all services cross-linked in each copy
|
||||||
|
|
||||||
|
This mirrors the `gish` skill's
|
||||||
|
`<backend>/<num>.md` pattern.
|
||||||
|
|
||||||
|
## Commit-Link URL Patterns by Service
|
||||||
|
|
||||||
|
| Service | Pattern |
|
||||||
|
|-----------|-------------------------------------|
|
||||||
|
| GitHub | `https://github.com/<o>/<r>/commit/<h>` |
|
||||||
|
| Gitea | `https://<host>/<o>/<r>/commit/<h>` |
|
||||||
|
| SourceHut | `https://git.sr.ht/~<o>/<r>/commit/<h>` |
|
||||||
|
| GitLab | `https://gitlab.com/<o>/<r>/-/commit/<h>` |
|
||||||
|
|
||||||
|
## PR/Patch URL Patterns by Service
|
||||||
|
|
||||||
|
| Service | Pattern |
|
||||||
|
|-----------|-------------------------------------|
|
||||||
|
| GitHub | `https://github.com/<o>/<r>/pull/<n>` |
|
||||||
|
| Gitea | `https://<host>/<o>/<r>/pulls/<n>` |
|
||||||
|
| SourceHut | `https://git.sr.ht/~<o>/<r>/patches/<n>` |
|
||||||
|
| GitLab | `https://gitlab.com/<o>/<r>/-/merge_requests/<n>` |
|
||||||
|
|
||||||
|
## Scope Naming Convention
|
||||||
|
|
||||||
|
Use Python namespace-resolution syntax for
|
||||||
|
referencing changed code scopes:
|
||||||
|
|
||||||
|
| File path | Scope reference |
|
||||||
|
|---------------------------|-------------------------------|
|
||||||
|
| `tractor/_exceptions.py` | `tractor._exceptions` |
|
||||||
|
| `tractor/_state.py` | `tractor._state` |
|
||||||
|
| `tests/test_foo.py` | `tests.test_foo` |
|
||||||
|
| Function in module | `tractor._exceptions.func()` |
|
||||||
|
| Method on class | `.RemoteActorError.src_type` |
|
||||||
|
| Class | `tractor._exceptions.RAE` |
|
||||||
|
|
||||||
|
Prefix with the package path for top-level refs;
|
||||||
|
use leading-dot shorthand (`.ClassName.method()`)
|
||||||
|
for sub-bullets where the parent module is already
|
||||||
|
established.
|
||||||
|
|
||||||
|
## Title Conventions
|
||||||
|
|
||||||
|
Same verb vocabulary as commit messages:
|
||||||
|
- `Add` — wholly new feature/API
|
||||||
|
- `Fix` — bug fix
|
||||||
|
- `Drop` — removal
|
||||||
|
- `Use` — adopt new approach
|
||||||
|
- `Move`/`Mv` — relocate code
|
||||||
|
- `Adjust` — minor tweak
|
||||||
|
- `Update` — enhance existing feature
|
||||||
|
- `Support` — add support for something
|
||||||
|
|
||||||
|
Target 50 chars, hard max 70. Always backtick
|
||||||
|
code elements.
|
||||||
|
|
||||||
|
## Tone
|
||||||
|
|
||||||
|
Casual yet technically precise — matching the
|
||||||
|
project's commit-msg style. Terse but every bullet
|
||||||
|
carries signal. Use project abbreviations freely
|
||||||
|
(msg, bg, ctx, impl, mod, obvi, fn, bc, var,
|
||||||
|
prolly, ep, etc.).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
(this format reference was generated by
|
||||||
|
[`claude-code`][claude-code-gh])
|
||||||
|
[claude-code-gh]: https://github.com/anthropics/claude-code
|
||||||
|
|
@ -0,0 +1,523 @@
|
||||||
|
---
|
||||||
|
name: run-tests
|
||||||
|
description: >
|
||||||
|
Run tractor test suite (or subsets). Use when the user wants
|
||||||
|
to run tests, verify changes, or check for regressions.
|
||||||
|
argument-hint: "[test-path-or-pattern] [--opts]"
|
||||||
|
allowed-tools:
|
||||||
|
- Bash(python -m pytest *)
|
||||||
|
- Bash(python -c *)
|
||||||
|
- Bash(python --version *)
|
||||||
|
- Bash(UV_PROJECT_ENVIRONMENT=py* uv run python *)
|
||||||
|
- Bash(UV_PROJECT_ENVIRONMENT=py* uv run pytest *)
|
||||||
|
- Bash(UV_PROJECT_ENVIRONMENT=py* uv sync *)
|
||||||
|
- Bash(UV_PROJECT_ENVIRONMENT=py* uv pip show *)
|
||||||
|
- Bash(git rev-parse *)
|
||||||
|
- Bash(ls *)
|
||||||
|
- Bash(cat *)
|
||||||
|
- Bash(jq * .pytest_cache/*)
|
||||||
|
- Read
|
||||||
|
- Grep
|
||||||
|
- Glob
|
||||||
|
- Task
|
||||||
|
- AskUserQuestion
|
||||||
|
---
|
||||||
|
|
||||||
|
Run the `tractor` test suite using `pytest`. Follow this
|
||||||
|
process:
|
||||||
|
|
||||||
|
## 1. Parse user intent
|
||||||
|
|
||||||
|
From the user's message and any arguments, determine:
|
||||||
|
|
||||||
|
- **scope**: full suite, specific file(s), specific
|
||||||
|
test(s), or a keyword pattern (`-k`).
|
||||||
|
- **transport**: which IPC transport protocol to test
|
||||||
|
against (default: `tcp`, also: `uds`).
|
||||||
|
- **options**: any extra pytest flags the user wants
|
||||||
|
(e.g. `--ll debug`, `--tpdb`, `-x`, `-v`).
|
||||||
|
|
||||||
|
If the user provides a bare path or pattern as argument,
|
||||||
|
treat it as the test target. Examples:
|
||||||
|
|
||||||
|
- `/run-tests` → full suite
|
||||||
|
- `/run-tests test_local.py` → single file
|
||||||
|
- `/run-tests test_registrar -v` → file + verbose
|
||||||
|
- `/run-tests -k cancel` → keyword filter
|
||||||
|
- `/run-tests tests/ipc/ --tpt-proto uds` → subdir + UDS
|
||||||
|
|
||||||
|
## 2. Construct the pytest command
|
||||||
|
|
||||||
|
Base command:
|
||||||
|
```
|
||||||
|
python -m pytest
|
||||||
|
```
|
||||||
|
|
||||||
|
### Default flags (always include unless user overrides):
|
||||||
|
- `-x` (stop on first failure)
|
||||||
|
- `--tb=short` (concise tracebacks)
|
||||||
|
- `--no-header` (reduce noise)
|
||||||
|
|
||||||
|
### Path resolution:
|
||||||
|
- If the user gives a bare filename like `test_local.py`,
|
||||||
|
resolve it under `tests/`.
|
||||||
|
- If the user gives a subdirectory like `ipc/`, resolve
|
||||||
|
under `tests/ipc/`.
|
||||||
|
- Glob if needed: `tests/**/test_*<pattern>*.py`
|
||||||
|
|
||||||
|
### Key pytest options for this project:
|
||||||
|
|
||||||
|
| Flag | Purpose |
|
||||||
|
|---|---|
|
||||||
|
| `--ll <level>` | Set tractor log level (e.g. `debug`, `info`, `runtime`) |
|
||||||
|
| `--tpdb` / `--debug-mode` | Enable tractor's multi-proc debugger |
|
||||||
|
| `--tpt-proto <key>` | IPC transport: `tcp` (default) or `uds` |
|
||||||
|
| `--spawn-backend <be>` | Spawn method: `trio` (default), `mp_spawn`, `mp_forkserver` |
|
||||||
|
| `-k <expr>` | pytest keyword filter |
|
||||||
|
| `-v` / `-vv` | Verbosity |
|
||||||
|
| `-s` | No output capture (useful with `--tpdb`) |
|
||||||
|
|
||||||
|
### Common combos:
|
||||||
|
```sh
|
||||||
|
# quick smoke test of core modules
|
||||||
|
python -m pytest tests/test_local.py tests/test_rpc.py -x --tb=short --no-header
|
||||||
|
|
||||||
|
# full suite, stop on first failure
|
||||||
|
python -m pytest tests/ -x --tb=short --no-header
|
||||||
|
|
||||||
|
# specific test with debug
|
||||||
|
python -m pytest tests/discovery/test_registrar.py::test_reg_then_unreg -x -s --tpdb --ll debug
|
||||||
|
|
||||||
|
# run with UDS transport
|
||||||
|
python -m pytest tests/ -x --tb=short --no-header --tpt-proto uds
|
||||||
|
|
||||||
|
# keyword filter
|
||||||
|
python -m pytest tests/ -x --tb=short --no-header -k "cancel and not slow"
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Pre-flight: venv detection (MANDATORY)
|
||||||
|
|
||||||
|
**Always verify a `uv` venv is active before running
|
||||||
|
`python` or `pytest`.** This project uses
|
||||||
|
`UV_PROJECT_ENVIRONMENT=py<MINOR>` naming (e.g.
|
||||||
|
`py313`) — never `.venv`.
|
||||||
|
|
||||||
|
### Step 1: detect active venv
|
||||||
|
|
||||||
|
Run this check first:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python -c "
|
||||||
|
import sys, os
|
||||||
|
venv = os.environ.get('VIRTUAL_ENV', '')
|
||||||
|
prefix = sys.prefix
|
||||||
|
print(f'VIRTUAL_ENV={venv}')
|
||||||
|
print(f'sys.prefix={prefix}')
|
||||||
|
print(f'executable={sys.executable}')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: interpret results
|
||||||
|
|
||||||
|
**Case A — venv is active** (`VIRTUAL_ENV` is set
|
||||||
|
and points to a `py<MINOR>/` dir under the project
|
||||||
|
root or worktree):
|
||||||
|
|
||||||
|
Use bare `python` / `python -m pytest` for all
|
||||||
|
commands. This is the normal, fast path.
|
||||||
|
|
||||||
|
**Case B — no venv active** (`VIRTUAL_ENV` is empty
|
||||||
|
or `sys.prefix` points to a system Python):
|
||||||
|
|
||||||
|
Use `AskUserQuestion` to ask the user:
|
||||||
|
|
||||||
|
> "No uv venv is active. Should I activate one
|
||||||
|
> via `UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync`,
|
||||||
|
> or would you prefer to activate your shell venv
|
||||||
|
> first?"
|
||||||
|
|
||||||
|
Options:
|
||||||
|
1. **"Create/sync venv"** — run
|
||||||
|
`UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync` where
|
||||||
|
`<MINOR>` is detected from `python --version`
|
||||||
|
(e.g. `313` for 3.13). Then use
|
||||||
|
`py<MINOR>/bin/python` for all subsequent
|
||||||
|
commands in this session.
|
||||||
|
2. **"I'll activate it myself"** — stop and let the
|
||||||
|
user `source py<MINOR>/bin/activate` or similar.
|
||||||
|
|
||||||
|
**Case C — inside a git worktree** (`git rev-parse
|
||||||
|
--git-common-dir` differs from `--git-dir`):
|
||||||
|
|
||||||
|
Verify Python resolves from the **worktree's own
|
||||||
|
venv**, not the main repo's:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python -c "import tractor; print(tractor.__file__)"
|
||||||
|
```
|
||||||
|
|
||||||
|
If the path points outside the worktree, create a
|
||||||
|
worktree-local venv:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
UV_PROJECT_ENVIRONMENT=py<MINOR> uv sync
|
||||||
|
```
|
||||||
|
|
||||||
|
Then use `py<MINOR>/bin/python` for all commands.
|
||||||
|
|
||||||
|
**Why this matters**: without the correct venv,
|
||||||
|
subprocesses spawned by tractor resolve modules
|
||||||
|
from the wrong editable install, causing spurious
|
||||||
|
`AttributeError` / `ModuleNotFoundError`.
|
||||||
|
|
||||||
|
### Fallback: `uv run`
|
||||||
|
|
||||||
|
If the user can't or won't activate a venv, all
|
||||||
|
`python` and `pytest` commands can be prefixed
|
||||||
|
with `UV_PROJECT_ENVIRONMENT=py<MINOR> uv run`:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# instead of: python -m pytest tests/ -x
|
||||||
|
UV_PROJECT_ENVIRONMENT=py313 uv run pytest tests/ -x
|
||||||
|
|
||||||
|
# instead of: python -c 'import tractor'
|
||||||
|
UV_PROJECT_ENVIRONMENT=py313 uv run python -c 'import tractor'
|
||||||
|
```
|
||||||
|
|
||||||
|
`uv run` auto-discovers the project and venv,
|
||||||
|
but is slower than a pre-activated venv due to
|
||||||
|
lock-file resolution on each invocation. Prefer
|
||||||
|
activating the venv when possible.
|
||||||
|
|
||||||
|
### Step 3: import + collection checks
|
||||||
|
|
||||||
|
After venv is confirmed, always run these
|
||||||
|
(especially after refactors or module moves):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# 1. package import smoke check
|
||||||
|
python -c 'import tractor; print(tractor)'
|
||||||
|
|
||||||
|
# 2. verify all tests collect (no import errors)
|
||||||
|
python -m pytest tests/ -x -q --co 2>&1 | tail -5
|
||||||
|
```
|
||||||
|
|
||||||
|
If either fails, fix the import error before running
|
||||||
|
any actual tests.
|
||||||
|
|
||||||
|
### Step 4: zombie-actor / stale-registry check (MANDATORY)
|
||||||
|
|
||||||
|
The tractor runtime's default registry address is
|
||||||
|
**`127.0.0.1:1616`** (TCP) / `/tmp/registry@1616.sock`
|
||||||
|
(UDS). Whenever any prior test run — especially one
|
||||||
|
using a fork-based backend like `subint_forkserver` —
|
||||||
|
leaks a child actor process, that zombie keeps the
|
||||||
|
registry port bound and **every subsequent test
|
||||||
|
session fails to bind**, often presenting as 50+
|
||||||
|
unrelated failures ("all tests broken"!) across
|
||||||
|
backends.
|
||||||
|
|
||||||
|
**This has to be checked before the first run AND
|
||||||
|
after any cancelled/SIGINT'd run** — signal failures
|
||||||
|
in the middle of a test can leave orphan children.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# 1. TCP registry — any listener on :1616? (primary signal)
|
||||||
|
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 free'
|
||||||
|
|
||||||
|
# 2. leftover actor/forkserver procs — scoped to THIS
|
||||||
|
# repo's python path, so we don't false-flag legit
|
||||||
|
# long-running tractor-using apps (e.g. `piker`,
|
||||||
|
# downstream projects that embed tractor).
|
||||||
|
pgrep -af "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" \
|
||||||
|
| grep -v 'grep\|pgrep' \
|
||||||
|
|| echo 'no leaked actor procs from this repo'
|
||||||
|
|
||||||
|
# 3. stale UDS registry sockets
|
||||||
|
ls -la /tmp/registry@*.sock 2>/dev/null \
|
||||||
|
|| echo 'no leaked UDS registry sockets'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Interpretation:**
|
||||||
|
|
||||||
|
- **TCP :1616 free AND no stale sockets** → clean,
|
||||||
|
proceed. The actor-procs probe is secondary — false
|
||||||
|
positives are common (piker, any other tractor-
|
||||||
|
embedding app); only clean up if `:1616` is bound or
|
||||||
|
sockets linger.
|
||||||
|
- **TCP :1616 bound OR stale sockets present** →
|
||||||
|
surface PIDs + cmdlines to the user, offer cleanup:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# 1. GRACEFUL FIRST (tractor is structured concurrent — it
|
||||||
|
# catches SIGINT as an OS-cancel in `_trio_main` and
|
||||||
|
# cascades Portal.cancel_actor via IPC to every descendant.
|
||||||
|
# So always try SIGINT first with a bounded timeout; only
|
||||||
|
# escalate to SIGKILL if graceful cleanup doesn't complete).
|
||||||
|
pkill -INT -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
||||||
|
|
||||||
|
# 2. bounded wait for graceful teardown (usually sub-second).
|
||||||
|
# Loop until the processes exit, or timeout. Keep the
|
||||||
|
# bound tight — hung/abrupt-killed descendants usually
|
||||||
|
# hang forever, so don't wait more than a few seconds.
|
||||||
|
for i in $(seq 1 10); do
|
||||||
|
pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null || break
|
||||||
|
sleep 0.3
|
||||||
|
done
|
||||||
|
|
||||||
|
# 3. ESCALATE TO SIGKILL only if graceful didn't finish.
|
||||||
|
if pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null; then
|
||||||
|
echo 'graceful teardown timed out — escalating to SIGKILL'
|
||||||
|
pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 4. if a test zombie holds :1616 specifically and doesn't
|
||||||
|
# match the above pattern, find its PID the hard way:
|
||||||
|
ss -tlnp 2>/dev/null | grep ':1616' # prints `users:(("<name>",pid=NNNN,...))`
|
||||||
|
# then (same SIGINT-first ladder):
|
||||||
|
# kill -INT <NNNN>; sleep 1; kill -9 <NNNN> 2>/dev/null
|
||||||
|
|
||||||
|
# 5. remove stale UDS sockets
|
||||||
|
rm -f /tmp/registry@*.sock
|
||||||
|
|
||||||
|
# 6. re-verify
|
||||||
|
ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Never ignore stale registry state.** If you see the
|
||||||
|
"all tests failing" pattern — especially
|
||||||
|
`trio.TooSlowError` / connection refused / address in
|
||||||
|
use on many unrelated tests — check registry **before**
|
||||||
|
spelunking into test code. The failure signature will
|
||||||
|
be identical across backends because they're all
|
||||||
|
fighting for the same port.
|
||||||
|
|
||||||
|
**False-positive warning for step 2:** a plain
|
||||||
|
`pgrep -af '_actor_child_main'` will also match
|
||||||
|
legit long-running tractor-embedding apps (e.g.
|
||||||
|
`piker` at `~/repos/piker/py*/bin/python3 -m
|
||||||
|
tractor._child ...`). Always scope to the current
|
||||||
|
repo's python path, or only use step 1 (`:1616`) as
|
||||||
|
the authoritative signal.
|
||||||
|
|
||||||
|
## 4. Run and report
|
||||||
|
|
||||||
|
- Run the constructed command.
|
||||||
|
- Use a timeout of **600000ms** (10min) for full suite
|
||||||
|
runs, **120000ms** (2min) for single-file runs.
|
||||||
|
- If the suite is large (full `tests/`), consider running
|
||||||
|
in the background and checking output when done.
|
||||||
|
- Use `--lf` (last-failed) to re-run only previously
|
||||||
|
failing tests when iterating on a fix.
|
||||||
|
|
||||||
|
### On failure:
|
||||||
|
- Show the failing test name(s) and short traceback.
|
||||||
|
- If the failure looks related to recent changes, point
|
||||||
|
out the likely cause and suggest a fix.
|
||||||
|
- **Check the known-flaky list** (section 8) before
|
||||||
|
investigating — don't waste time on pre-existing
|
||||||
|
timeout issues.
|
||||||
|
- **NEVER auto-commit fixes.** If you apply a code fix
|
||||||
|
during test iteration, leave it unstaged. Tell the
|
||||||
|
user what changed and suggest they review the
|
||||||
|
worktree state, stage files manually, and use
|
||||||
|
`/commit-msg` (inline or in a separate session) to
|
||||||
|
generate the commit message. The human drives all
|
||||||
|
`git add` and `git commit` operations.
|
||||||
|
|
||||||
|
### On success:
|
||||||
|
- Report the pass/fail/skip counts concisely.
|
||||||
|
|
||||||
|
## 5. Test directory layout (reference)
|
||||||
|
|
||||||
|
```
|
||||||
|
tests/
|
||||||
|
├── conftest.py # root fixtures, daemon, signals
|
||||||
|
├── devx/ # debugger/tooling tests
|
||||||
|
├── ipc/ # transport protocol tests
|
||||||
|
├── msg/ # messaging layer tests
|
||||||
|
├── discovery/ # discovery subsystem tests
|
||||||
|
│ ├── test_multiaddr.py # multiaddr construction
|
||||||
|
│ └── test_registrar.py # registry/discovery protocol
|
||||||
|
├── test_local.py # registrar + local actor basics
|
||||||
|
├── test_rpc.py # RPC error handling
|
||||||
|
├── test_spawning.py # subprocess spawning
|
||||||
|
├── test_multi_program.py # multi-process tree tests
|
||||||
|
├── test_cancellation.py # cancellation semantics
|
||||||
|
├── test_context_stream_semantics.py # ctx streaming
|
||||||
|
├── test_inter_peer_cancellation.py # peer cancel
|
||||||
|
├── test_infected_asyncio.py # trio-in-asyncio
|
||||||
|
└── ...
|
||||||
|
```
|
||||||
|
|
||||||
|
## 6. Change-type → test mapping
|
||||||
|
|
||||||
|
After modifying specific modules, run the corresponding
|
||||||
|
test subset first for fast feedback:
|
||||||
|
|
||||||
|
| Changed module(s) | Run these tests first |
|
||||||
|
|---|---|
|
||||||
|
| `runtime/_runtime.py`, `runtime/_state.py` | `test_local.py test_rpc.py test_spawning.py test_root_runtime.py` |
|
||||||
|
| `discovery/` (`_registry`, `_discovery`, `_addr`) | `tests/discovery/ test_multi_program.py test_local.py` |
|
||||||
|
| `_context.py`, `_streaming.py` | `test_context_stream_semantics.py test_advanced_streaming.py` |
|
||||||
|
| `ipc/` (`_chan`, `_server`, `_transport`) | `tests/ipc/ test_2way.py` |
|
||||||
|
| `runtime/_portal.py`, `runtime/_rpc.py` | `test_rpc.py test_cancellation.py` |
|
||||||
|
| `spawn/` (`_spawn`, `_entry`) | `test_spawning.py test_multi_program.py` |
|
||||||
|
| `devx/debug/` | `tests/devx/test_debugger.py` (slow!) |
|
||||||
|
| `to_asyncio.py` | `test_infected_asyncio.py test_root_infect_asyncio.py` |
|
||||||
|
| `msg/` | `tests/msg/` |
|
||||||
|
| `_exceptions.py` | `test_remote_exc_relay.py test_inter_peer_cancellation.py` |
|
||||||
|
| `runtime/_supervise.py` | `test_cancellation.py test_spawning.py` |
|
||||||
|
|
||||||
|
## 7. Quick-check shortcuts
|
||||||
|
|
||||||
|
### After refactors (fastest first-pass):
|
||||||
|
```sh
|
||||||
|
# import + collect check
|
||||||
|
python -c 'import tractor' && python -m pytest tests/ -x -q --co 2>&1 | tail -3
|
||||||
|
|
||||||
|
# core subset (~10s)
|
||||||
|
python -m pytest tests/test_local.py tests/test_rpc.py tests/test_spawning.py tests/discovery/test_registrar.py -x --tb=short --no-header
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inspect last failures (without re-running):
|
||||||
|
|
||||||
|
When the user asks "what failed?", "show failures",
|
||||||
|
or wants to check the last-failed set before
|
||||||
|
re-running — read the pytest cache directly. This
|
||||||
|
is instant and avoids test collection overhead.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python -c "
|
||||||
|
import json, pathlib, sys
|
||||||
|
p = pathlib.Path('.pytest_cache/v/cache/lastfailed')
|
||||||
|
if not p.exists():
|
||||||
|
print('No lastfailed cache found.'); sys.exit()
|
||||||
|
data = json.loads(p.read_text())
|
||||||
|
# filter to real test node IDs (ignore junk
|
||||||
|
# entries that can accumulate from system paths)
|
||||||
|
tests = sorted(k for k in data if k.startswith('tests/'))
|
||||||
|
if not tests:
|
||||||
|
print('No failures recorded.')
|
||||||
|
else:
|
||||||
|
print(f'{len(tests)} last-failed test(s):')
|
||||||
|
for t in tests:
|
||||||
|
print(f' {t}')
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why not `--cache-show` or `--co --lf`?**
|
||||||
|
|
||||||
|
- `pytest --cache-show 'cache/lastfailed'` works
|
||||||
|
but dumps raw dict repr including junk entries
|
||||||
|
(stale system paths that leak into the cache).
|
||||||
|
- `pytest --co --lf` actually *collects* tests which
|
||||||
|
triggers import resolution and is slow (~0.5s+).
|
||||||
|
Worse, when cached node IDs don't exactly match
|
||||||
|
current parametrize IDs (e.g. param names changed
|
||||||
|
between runs), pytest falls back to collecting
|
||||||
|
the *entire file*, giving false positives.
|
||||||
|
- Reading the JSON directly is instant, filterable
|
||||||
|
to `tests/`-prefixed entries, and shows exactly
|
||||||
|
what pytest recorded — no interpretation.
|
||||||
|
|
||||||
|
**After inspecting**, re-run the failures:
|
||||||
|
```sh
|
||||||
|
python -m pytest --lf -x --tb=short --no-header
|
||||||
|
```
|
||||||
|
|
||||||
|
### Full suite in background:
|
||||||
|
When core tests pass and you want full coverage while
|
||||||
|
continuing other work, run in background:
|
||||||
|
```sh
|
||||||
|
python -m pytest tests/ -x --tb=short --no-header -q
|
||||||
|
```
|
||||||
|
(use `run_in_background=true` on the Bash tool)
|
||||||
|
|
||||||
|
## 8. Known flaky tests
|
||||||
|
|
||||||
|
These tests have **pre-existing** timing/environment
|
||||||
|
sensitivity. If they fail with `TooSlowError` or
|
||||||
|
pexpect `TIMEOUT`, they are almost certainly NOT caused
|
||||||
|
by your changes — note them and move on.
|
||||||
|
|
||||||
|
| Test | Typical error | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `devx/test_debugger.py::test_multi_nested_subactors_error_through_nurseries` | pexpect TIMEOUT | Debugger pexpect timing |
|
||||||
|
| `test_cancellation.py::test_cancel_via_SIGINT_other_task` | TooSlowError | Signal handling race |
|
||||||
|
| `test_inter_peer_cancellation.py::test_peer_spawns_and_cancels_service_subactor` | TooSlowError | Async timing (both param variants) |
|
||||||
|
| `test_docs_examples.py::test_example[we_are_processes.py]` | `assert None == 0` | `__main__` missing `__file__` in subproc |
|
||||||
|
|
||||||
|
**Rule of thumb**: if a test fails with `TooSlowError`,
|
||||||
|
`trio.TooSlowError`, or `pexpect.TIMEOUT` and you didn't
|
||||||
|
touch the relevant code path, it's flaky — skip it.
|
||||||
|
|
||||||
|
## 9. The pytest-capture hang pattern (CHECK THIS FIRST)
|
||||||
|
|
||||||
|
**Symptom:** a tractor test hangs indefinitely under
|
||||||
|
default `pytest` but passes promptly when you add
|
||||||
|
`-s` (`--capture=no`).
|
||||||
|
|
||||||
|
**Cause:** tractor subactors (especially under fork-
|
||||||
|
based backends) inherit pytest's stdout/stderr
|
||||||
|
capture pipes via fds 1,2. Under high-volume error
|
||||||
|
logging (e.g. multi-level cancel cascade, nested
|
||||||
|
`run_in_actor` failures, anything triggering
|
||||||
|
`RemoteActorError` + `ExceptionGroup` traceback
|
||||||
|
spew), the **64KB Linux pipe buffer fills** faster
|
||||||
|
than pytest drains it. Subactor writes block → can't
|
||||||
|
finish exit → parent's `waitpid`/pidfd wait blocks →
|
||||||
|
deadlock cascades up the tree.
|
||||||
|
|
||||||
|
**Pre-existing guards in the tractor harness** that
|
||||||
|
encode this same knowledge — grep these FIRST
|
||||||
|
before spelunking:
|
||||||
|
|
||||||
|
- `tests/conftest.py:258-260` (in the `daemon`
|
||||||
|
fixture): `# XXX: too much logging will lock up
|
||||||
|
the subproc (smh)` — downgrades `trace`/`debug`
|
||||||
|
loglevel to `info` to prevent the hang.
|
||||||
|
- `tests/conftest.py:316`: `# can lock up on the
|
||||||
|
_io.BufferedReader and hang..` — noted on the
|
||||||
|
`proc.stderr.read()` post-SIGINT.
|
||||||
|
|
||||||
|
**Debug recipe (in priority order):**
|
||||||
|
|
||||||
|
1. **Try `-s` first.** If the hang disappears with
|
||||||
|
`pytest -s`, you've confirmed it's capture-pipe
|
||||||
|
fill. Skip spelunking.
|
||||||
|
2. **Lower the loglevel.** Default `--ll=error` on
|
||||||
|
this project; if you've bumped it to `debug` /
|
||||||
|
`info`, try dropping back. Each log level
|
||||||
|
multiplies pipe-pressure under fault cascades.
|
||||||
|
3. **If you MUST use default capture + high log
|
||||||
|
volume**, redirect subactor stdout/stderr in the
|
||||||
|
child prelude (e.g.
|
||||||
|
`tractor.spawn._subint_forkserver._child_target`
|
||||||
|
post-`_close_inherited_fds`) to `/dev/null` or a
|
||||||
|
file.
|
||||||
|
|
||||||
|
**Signature tells you it's THIS bug (vs. a real
|
||||||
|
code hang):**
|
||||||
|
|
||||||
|
- Multi-actor test under fork-based backend
|
||||||
|
(`subint_forkserver`, eventually `trio_proc` too
|
||||||
|
under enough log volume).
|
||||||
|
- Multiple `RemoteActorError` / `ExceptionGroup`
|
||||||
|
tracebacks in the error path.
|
||||||
|
- Test passes with `-s` in the 5-10s range, hangs
|
||||||
|
past pytest-timeout (usually 30+ s) without `-s`.
|
||||||
|
- Subactor processes visible via `pgrep -af
|
||||||
|
subint-forkserv` or similar after the hang —
|
||||||
|
they're alive but blocked on `write()` to an
|
||||||
|
inherited stdout fd.
|
||||||
|
|
||||||
|
**Historical reference:** this deadlock cost a
|
||||||
|
multi-session investigation (4 genuine cascade
|
||||||
|
fixes landed along the way) that only surfaced the
|
||||||
|
capture-pipe issue AFTER the deeper fixes let the
|
||||||
|
tree actually tear down enough to produce pipe-
|
||||||
|
filling log volume. Full post-mortem in
|
||||||
|
`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`.
|
||||||
|
Lesson codified here so future-me grep-finds the
|
||||||
|
workaround before digging.
|
||||||
|
|
@ -1,10 +1,18 @@
|
||||||
name: CI
|
name: CI
|
||||||
|
|
||||||
|
# NOTE distilled from,
|
||||||
|
# https://github.com/orgs/community/discussions/26276
|
||||||
on:
|
on:
|
||||||
# any time someone pushes a new branch to origin
|
# any time there is a new update to 'main'
|
||||||
push:
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
# Allows you to run this workflow manually from the Actions tab
|
# run on all (forked) PRs to the repo
|
||||||
|
# NOTE, use a draft PR if you just want CI triggered..
|
||||||
|
pull_request:
|
||||||
|
|
||||||
|
# to run workflow manually from the "Actions" tab
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
|
|
@ -74,24 +82,44 @@ jobs:
|
||||||
# run: mypy tractor/ --ignore-missing-imports --show-traceback
|
# run: mypy tractor/ --ignore-missing-imports --show-traceback
|
||||||
|
|
||||||
|
|
||||||
testing-linux:
|
testing:
|
||||||
name: '${{ matrix.os }} Python ${{ matrix.python }} - ${{ matrix.spawn_backend }}'
|
name: '${{ matrix.os }} Python${{ matrix.python-version }} spawn_backend=${{ matrix.spawn_backend }} tpt_proto=${{ matrix.tpt_proto }}'
|
||||||
timeout-minutes: 10
|
timeout-minutes: 16
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
os: [ubuntu-latest]
|
os: [
|
||||||
python-version: ['3.13']
|
ubuntu-latest,
|
||||||
|
macos-latest,
|
||||||
|
]
|
||||||
|
python-version: [
|
||||||
|
'3.13',
|
||||||
|
# '3.14',
|
||||||
|
]
|
||||||
spawn_backend: [
|
spawn_backend: [
|
||||||
'trio',
|
'trio',
|
||||||
# 'mp_spawn',
|
# 'mp_spawn',
|
||||||
# 'mp_forkserver',
|
# 'mp_forkserver',
|
||||||
|
# ?TODO^ is it worth it to get these running again?
|
||||||
|
#
|
||||||
|
# - [ ] next-gen backends, on 3.13+
|
||||||
|
# https://github.com/goodboy/tractor/issues/379
|
||||||
|
# 'subinterpreter',
|
||||||
|
# 'subint',
|
||||||
]
|
]
|
||||||
|
tpt_proto: [
|
||||||
|
'tcp',
|
||||||
|
'uds',
|
||||||
|
]
|
||||||
|
# https://github.com/orgs/community/discussions/26253#discussioncomment-3250989
|
||||||
|
exclude:
|
||||||
|
# don't do UDS run on macOS (for now)
|
||||||
|
- os: macos-latest
|
||||||
|
tpt_proto: 'uds'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
|
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: 'Install uv + py-${{ matrix.python-version }}'
|
- name: 'Install uv + py-${{ matrix.python-version }}'
|
||||||
|
|
@ -118,7 +146,11 @@ jobs:
|
||||||
run: uv tree
|
run: uv tree
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: uv run pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} -rsx
|
run: >
|
||||||
|
uv run
|
||||||
|
pytest tests/ -rsx
|
||||||
|
--spawn-backend=${{ matrix.spawn_backend }}
|
||||||
|
--tpt-proto=${{ matrix.tpt_proto }}
|
||||||
|
|
||||||
# XXX legacy NOTE XXX
|
# XXX legacy NOTE XXX
|
||||||
#
|
#
|
||||||
|
|
|
||||||
|
|
@ -102,3 +102,69 @@ venv.bak/
|
||||||
|
|
||||||
# mypy
|
# mypy
|
||||||
.mypy_cache/
|
.mypy_cache/
|
||||||
|
|
||||||
|
# all files under
|
||||||
|
.git/
|
||||||
|
|
||||||
|
# require very explicit staging for anything we **really**
|
||||||
|
# want put/kept in repo.
|
||||||
|
notes_to_self/
|
||||||
|
snippets/
|
||||||
|
|
||||||
|
# ------- AI shiz -------
|
||||||
|
# `ai.skillz` symlinks,
|
||||||
|
# (machine-local, deploy via deploy-skill.sh)
|
||||||
|
.claude/skills/py-codestyle
|
||||||
|
.claude/skills/close-wkt
|
||||||
|
.claude/skills/plan-io
|
||||||
|
.claude/skills/prompt-io
|
||||||
|
.claude/skills/resolve-conflicts
|
||||||
|
.claude/skills/inter-skill-review
|
||||||
|
|
||||||
|
# /open-wkt specifics
|
||||||
|
.claude/skills/open-wkt
|
||||||
|
.claude/wkts/
|
||||||
|
claude_wkts
|
||||||
|
|
||||||
|
# /code-review-changes specifics
|
||||||
|
.claude/skills/code-review-changes
|
||||||
|
# review-skill ephemeral ctx (per-PR, single-use)
|
||||||
|
.claude/review_context.md
|
||||||
|
.claude/review_regression.md
|
||||||
|
|
||||||
|
# /pr-msg specifics
|
||||||
|
.claude/skills/pr-msg/*
|
||||||
|
# repo-specific
|
||||||
|
!.claude/skills/pr-msg/format-reference.md
|
||||||
|
# XXX, so u can nvim-telescope this file.
|
||||||
|
# !.claude/skills/pr-msg/pr_msg_LATEST.md
|
||||||
|
|
||||||
|
# /commit-msg specifics
|
||||||
|
# - any commit-msg gen tmp files
|
||||||
|
.claude/*_commit_*.md
|
||||||
|
.claude/*_commit*.txt
|
||||||
|
.claude/skills/commit-msg/*
|
||||||
|
!.claude/skills/commit-msg/style-duie-reference.md
|
||||||
|
|
||||||
|
# use prompt-io instead?
|
||||||
|
.claude/plans
|
||||||
|
|
||||||
|
# nix develop --profile .nixdev
|
||||||
|
.nixdev*
|
||||||
|
|
||||||
|
# :Obsession .
|
||||||
|
Session.vim
|
||||||
|
|
||||||
|
# `gish` local `.md`-files
|
||||||
|
# TODO? better all around automation!
|
||||||
|
# -[ ] it'd be handy to also commit and sync with wtv git service?
|
||||||
|
# -[ ] everything should be put under a `.gish/` no?
|
||||||
|
gitea/
|
||||||
|
gh/
|
||||||
|
|
||||||
|
# ------ macOS ------
|
||||||
|
# Finder metadata
|
||||||
|
**/.DS_Store
|
||||||
|
|
||||||
|
# LLM conversations that should remain private
|
||||||
|
docs/conversations/
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,161 @@
|
||||||
|
# `subint` backend: parent trio loop parks after subint teardown (Ctrl-C works; not a CPython-level issue)
|
||||||
|
|
||||||
|
Follow-up to the Phase B subint spawn-backend PR (see
|
||||||
|
`tractor.spawn._subint`, issue #379). Distinct from the
|
||||||
|
`subint_sigint_starvation_issue.md` (SIGINT-unresponsive
|
||||||
|
starvation hang): this one is **Ctrl-C-able**, which means
|
||||||
|
it's *not* the shared-GIL-hostage class and is ours to fix
|
||||||
|
from inside tractor rather than waiting on upstream CPython
|
||||||
|
/ msgspec progress.
|
||||||
|
|
||||||
|
## TL;DR
|
||||||
|
|
||||||
|
After a stuck-subint subactor is torn down via the
|
||||||
|
hard-kill path, a parent-side trio task parks on an
|
||||||
|
*orphaned resource* (most likely a `chan.recv()` /
|
||||||
|
`process_messages` loop on the now-dead subint's IPC
|
||||||
|
channel) and waits forever for bytes that can't arrive —
|
||||||
|
because the channel was torn down without emitting a clean
|
||||||
|
EOF/`BrokenResourceError` to the waiting receiver.
|
||||||
|
|
||||||
|
Unlike `subint_sigint_starvation_issue.md`, the main trio
|
||||||
|
loop **is** iterating normally — SIGINT delivers cleanly
|
||||||
|
and the test unhangs. But absent Ctrl-C, the test suite
|
||||||
|
wedges indefinitely.
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
Running `test_subint_non_checkpointing_child` under
|
||||||
|
`--spawn-backend=subint` (in
|
||||||
|
`tests/test_subint_cancellation.py`):
|
||||||
|
|
||||||
|
1. Test spawns a subactor whose main task runs
|
||||||
|
`threading.Event.wait(1.0)` in a loop — releases the
|
||||||
|
GIL but never inserts a trio checkpoint.
|
||||||
|
2. Parent does `an.cancel_scope.cancel()`. Our
|
||||||
|
`subint_proc` cancel path fires: soft-kill sends
|
||||||
|
`Portal.cancel_actor()` over the live IPC channel →
|
||||||
|
subint's trio loop *should* process the cancel msg on
|
||||||
|
its IPC dispatcher task (since the GIL releases are
|
||||||
|
happening).
|
||||||
|
3. Expected: subint's `trio.run()` unwinds, driver thread
|
||||||
|
exits naturally, parent returns.
|
||||||
|
4. Actual: parent `trio.run()` never completes. Test
|
||||||
|
hangs past its `trio.fail_after()` deadline.
|
||||||
|
|
||||||
|
## Evidence
|
||||||
|
|
||||||
|
### `strace` on the hung pytest process during SIGINT
|
||||||
|
|
||||||
|
```
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(17, "\2", 1) = 1
|
||||||
|
```
|
||||||
|
|
||||||
|
Contrast with the SIGINT-starvation hang (see
|
||||||
|
`subint_sigint_starvation_issue.md`) where that same
|
||||||
|
`write()` returned `EAGAIN`. Here the SIGINT byte is
|
||||||
|
written successfully → Python's signal handler pipe is
|
||||||
|
being drained → main trio loop **is** iterating → SIGINT
|
||||||
|
gets turned into `trio.Cancelled` → the test unhangs (if
|
||||||
|
the operator happens to be there to hit Ctrl-C).
|
||||||
|
|
||||||
|
### Stack dump (via `tractor.devx.dump_on_hang`)
|
||||||
|
|
||||||
|
Single main thread visible, parked in
|
||||||
|
`trio._core._io_epoll.get_events` inside `trio.run` at the
|
||||||
|
test's `trio.run(...)` call site. No subint driver thread
|
||||||
|
(subint was destroyed successfully — this is *after* the
|
||||||
|
hard-kill path, not during it).
|
||||||
|
|
||||||
|
## Root cause hypothesis
|
||||||
|
|
||||||
|
Most consistent with the evidence: a parent-side trio
|
||||||
|
task is awaiting a `chan.recv()` / `process_messages` loop
|
||||||
|
on the dead subint's IPC channel. The sequence:
|
||||||
|
|
||||||
|
1. Soft-kill in `subint_proc` sends `Portal.cancel_actor()`
|
||||||
|
over the channel. The subint's trio dispatcher *may* or
|
||||||
|
may not have processed the cancel msg before the subint
|
||||||
|
was destroyed — timing-dependent.
|
||||||
|
2. Hard-kill timeout fires (because the subint's main
|
||||||
|
task was in `threading.Event.wait()` with no trio
|
||||||
|
checkpoint — cancel-msg processing couldn't race the
|
||||||
|
timeout).
|
||||||
|
3. Driver thread abandoned, `_interpreters.destroy()`
|
||||||
|
runs. Subint is gone.
|
||||||
|
4. But the parent-side trio task holding a
|
||||||
|
`chan.recv()` / `process_messages` loop against that
|
||||||
|
channel was **not** explicitly cancelled. The channel's
|
||||||
|
underlying socket got torn down, but without a clean
|
||||||
|
EOF delivered to the waiting recv, the task parks
|
||||||
|
forever on `trio.lowlevel.wait_readable` (or similar).
|
||||||
|
|
||||||
|
This matches the "main loop fine, task parked on
|
||||||
|
orphaned I/O" signature.
|
||||||
|
|
||||||
|
## Why this is ours to fix (not CPython's)
|
||||||
|
|
||||||
|
- Main trio loop iterates normally → GIL isn't starved.
|
||||||
|
- SIGINT is deliverable → not a signal-pipe-full /
|
||||||
|
wakeup-fd contention scenario.
|
||||||
|
- The hang is in *our* supervision code, specifically in
|
||||||
|
how `subint_proc` tears down its side of the IPC when
|
||||||
|
the subint is abandoned/destroyed.
|
||||||
|
|
||||||
|
## Possible fix directions
|
||||||
|
|
||||||
|
1. **Explicit parent-side channel abort on subint
|
||||||
|
abandon.** In `subint_proc`'s teardown block, after the
|
||||||
|
hard-kill timeout fires, explicitly close the parent's
|
||||||
|
end of the IPC channel to the subint. Any waiting
|
||||||
|
`chan.recv()` / `process_messages` task sees
|
||||||
|
`BrokenResourceError` (or `ClosedResourceError`) and
|
||||||
|
unwinds.
|
||||||
|
2. **Cancel parent-side RPC tasks tied to the dead
|
||||||
|
subint's channel.** The `Actor._rpc_tasks` / nursery
|
||||||
|
machinery should have a handle on any
|
||||||
|
`process_messages` loops bound to a specific peer
|
||||||
|
channel. Iterate those and cancel explicitly.
|
||||||
|
3. **Bound the top-level `await actor_nursery
|
||||||
|
._join_procs.wait()` shield in `subint_proc`** (same
|
||||||
|
pattern as the other bounded shields the hard-kill
|
||||||
|
patch added). If the nursery never sets `_join_procs`
|
||||||
|
because a child task is parked, the bound would at
|
||||||
|
least let the teardown proceed.
|
||||||
|
|
||||||
|
Of these, (1) is the most surgical and directly addresses
|
||||||
|
the root cause. (2) is a defense-in-depth companion. (3)
|
||||||
|
is a band-aid but cheap to add.
|
||||||
|
|
||||||
|
## Current workaround
|
||||||
|
|
||||||
|
None in-tree. The test's `trio.fail_after()` bound
|
||||||
|
currently fires and raises `TooSlowError`, so the test
|
||||||
|
visibly **fails** rather than hangs — which is
|
||||||
|
intentional (an unbounded cancellation-audit test would
|
||||||
|
defeat itself). But in interactive test runs the operator
|
||||||
|
has to hit Ctrl-C to move past the parked state before
|
||||||
|
pytest reports the failure.
|
||||||
|
|
||||||
|
## Reproducer
|
||||||
|
|
||||||
|
```
|
||||||
|
./py314/bin/python -m pytest \
|
||||||
|
tests/test_subint_cancellation.py::test_subint_non_checkpointing_child \
|
||||||
|
--spawn-backend=subint --tb=short --no-header -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: hangs until `trio.fail_after(15)` fires, or
|
||||||
|
Ctrl-C unwedges it manually.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- `tractor.spawn._subint.subint_proc` — current subint
|
||||||
|
teardown code; see the `_HARD_KILL_TIMEOUT` bounded
|
||||||
|
shields + `daemon=True` driver-thread abandonment
|
||||||
|
(commit `b025c982`).
|
||||||
|
- `ai/conc-anal/subint_sigint_starvation_issue.md` — the
|
||||||
|
sibling CPython-level hang (GIL-starvation,
|
||||||
|
SIGINT-unresponsive) which is **not** this issue.
|
||||||
|
- Phase B tracking: issue #379.
|
||||||
|
|
@ -0,0 +1,337 @@
|
||||||
|
# `os.fork()` from a non-main sub-interpreter aborts the child (CPython refuses post-fork cleanup)
|
||||||
|
|
||||||
|
Third `subint`-class analysis in this project. Unlike its
|
||||||
|
two siblings (`subint_sigint_starvation_issue.md`,
|
||||||
|
`subint_cancel_delivery_hang_issue.md`), this one is not a
|
||||||
|
hang — it's a **hard CPython-level refusal** of an
|
||||||
|
experimental spawn strategy we wanted to try.
|
||||||
|
|
||||||
|
## TL;DR
|
||||||
|
|
||||||
|
An in-process sub-interpreter cannot be used as a
|
||||||
|
"launchpad" for `os.fork()` on current CPython. The fork
|
||||||
|
syscall succeeds in the parent, but the forked CHILD
|
||||||
|
process is aborted immediately by CPython's post-fork
|
||||||
|
cleanup with:
|
||||||
|
|
||||||
|
```
|
||||||
|
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
||||||
|
```
|
||||||
|
|
||||||
|
This is enforced by a hard `_PyStatus_ERR` gate in
|
||||||
|
`Python/pystate.c`. The CPython devs acknowledge the
|
||||||
|
fragility with an in-source comment (`// Ideally we could
|
||||||
|
guarantee tstate is running main.`) but provide no
|
||||||
|
mechanism to satisfy the precondition from user code.
|
||||||
|
|
||||||
|
**Implication for tractor**: the `subint_fork` backend
|
||||||
|
sketched in `tractor.spawn._subint_fork` is structurally
|
||||||
|
dead on current CPython. The submodule is kept as
|
||||||
|
documentation of the attempt; `--spawn-backend=subint_fork`
|
||||||
|
raises `NotImplementedError` pointing here.
|
||||||
|
|
||||||
|
## Context — why we tried this
|
||||||
|
|
||||||
|
The motivation is issue #379's "Our own thoughts, ideas
|
||||||
|
for `fork()`-workaround/hacks..." section. The existing
|
||||||
|
trio-backend (`tractor.spawn._trio.trio_proc`) spawns
|
||||||
|
subactors via `trio.lowlevel.open_process()` → ultimately
|
||||||
|
`posix_spawn()` or `fork+exec`, from the parent's main
|
||||||
|
interpreter that is currently running `trio.run()`. This
|
||||||
|
brushes against a known-fragile interaction between
|
||||||
|
`trio` and `fork()` tracked in
|
||||||
|
[python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614)
|
||||||
|
and siblings — mostly mitigated in `tractor`'s case only
|
||||||
|
incidentally (we `exec()` immediately post-fork).
|
||||||
|
|
||||||
|
The idea was:
|
||||||
|
|
||||||
|
1. Create a subint that has *never* imported `trio`.
|
||||||
|
2. From a worker thread in that subint, call `os.fork()`.
|
||||||
|
3. In the child, `execv()` back into
|
||||||
|
`python -m tractor._child` — same as `trio_proc` does.
|
||||||
|
4. The fork is from a trio-free context → trio+fork
|
||||||
|
hazards avoided regardless of downstream behavior.
|
||||||
|
|
||||||
|
The parent-side orchestration (`ipc_server.wait_for_peer`,
|
||||||
|
`SpawnSpec`, `Portal` yield) would reuse
|
||||||
|
`trio_proc`'s flow verbatim, with only the subproc-spawn
|
||||||
|
mechanics swapped.
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
Running the prototype (`tractor.spawn._subint_fork.subint_fork_proc`,
|
||||||
|
see git history prior to the stub revert) on py3.14:
|
||||||
|
|
||||||
|
```
|
||||||
|
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
||||||
|
Python runtime state: initialized
|
||||||
|
|
||||||
|
Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first):
|
||||||
|
File "<script>", line 2 in <module>
|
||||||
|
<script>:2: DeprecationWarning: This process (pid=802985) is multi-threaded, use of fork() may lead to deadlocks in the child.
|
||||||
|
```
|
||||||
|
|
||||||
|
Key clues:
|
||||||
|
|
||||||
|
- The **`DeprecationWarning`** fires in the parent (before
|
||||||
|
fork completes) — fork *is* executing, we get that far.
|
||||||
|
- The **`Fatal Python error`** comes from the child — it
|
||||||
|
aborts during CPython's post-fork C initialization
|
||||||
|
before any user Python runs in the child.
|
||||||
|
- The thread name `subint-fork-lau[nchpad]` is ours —
|
||||||
|
confirms the fork is being called from the launchpad
|
||||||
|
subint's driver thread.
|
||||||
|
|
||||||
|
## CPython source walkthrough
|
||||||
|
|
||||||
|
### Call site — `Modules/posixmodule.c:728-793`
|
||||||
|
|
||||||
|
The post-fork-child hook CPython runs in the child process:
|
||||||
|
|
||||||
|
```c
|
||||||
|
void
|
||||||
|
PyOS_AfterFork_Child(void)
|
||||||
|
{
|
||||||
|
PyStatus status;
|
||||||
|
_PyRuntimeState *runtime = &_PyRuntime;
|
||||||
|
|
||||||
|
// re-creates runtime->interpreters.mutex (HEAD_UNLOCK)
|
||||||
|
status = _PyRuntimeState_ReInitThreads(runtime);
|
||||||
|
...
|
||||||
|
|
||||||
|
PyThreadState *tstate = _PyThreadState_GET();
|
||||||
|
_Py_EnsureTstateNotNULL(tstate);
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
// Ideally we could guarantee tstate is running main. ← !!!
|
||||||
|
_PyInterpreterState_ReinitRunningMain(tstate);
|
||||||
|
|
||||||
|
status = _PyEval_ReInitThreads(tstate);
|
||||||
|
...
|
||||||
|
|
||||||
|
status = _PyInterpreterState_DeleteExceptMain(runtime);
|
||||||
|
if (_PyStatus_EXCEPTION(status)) {
|
||||||
|
goto fatal_error;
|
||||||
|
}
|
||||||
|
...
|
||||||
|
|
||||||
|
fatal_error:
|
||||||
|
Py_ExitStatusException(status);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `// Ideally we could guarantee tstate is running
|
||||||
|
main.` comment is a flashing warning sign — the CPython
|
||||||
|
devs *know* this path is fragile when fork is called from
|
||||||
|
a non-main subint, but they've chosen to abort rather than
|
||||||
|
silently corrupt state. Arguably the right call.
|
||||||
|
|
||||||
|
### The refusal — `Python/pystate.c:1035-1075`
|
||||||
|
|
||||||
|
```c
|
||||||
|
/*
|
||||||
|
* Delete all interpreter states except the main interpreter. If there
|
||||||
|
* is a current interpreter state, it *must* be the main interpreter.
|
||||||
|
*/
|
||||||
|
PyStatus
|
||||||
|
_PyInterpreterState_DeleteExceptMain(_PyRuntimeState *runtime)
|
||||||
|
{
|
||||||
|
struct pyinterpreters *interpreters = &runtime->interpreters;
|
||||||
|
|
||||||
|
PyThreadState *tstate = _PyThreadState_Swap(runtime, NULL);
|
||||||
|
if (tstate != NULL && tstate->interp != interpreters->main) {
|
||||||
|
return _PyStatus_ERR("not main interpreter"); ← our error
|
||||||
|
}
|
||||||
|
|
||||||
|
HEAD_LOCK(runtime);
|
||||||
|
PyInterpreterState *interp = interpreters->head;
|
||||||
|
interpreters->head = NULL;
|
||||||
|
while (interp != NULL) {
|
||||||
|
if (interp == interpreters->main) {
|
||||||
|
interpreters->main->next = NULL;
|
||||||
|
interpreters->head = interp;
|
||||||
|
interp = interp->next;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// XXX Won't this fail since PyInterpreterState_Clear() requires
|
||||||
|
// the "current" tstate to be set?
|
||||||
|
PyInterpreterState_Clear(interp); // XXX must activate?
|
||||||
|
zapthreads(interp);
|
||||||
|
...
|
||||||
|
}
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The function's header comment (`If there is a current
|
||||||
|
interpreter state, it *must* be the main interpreter.`) is
|
||||||
|
the formal API contract. The `XXX` comments further in
|
||||||
|
suggest the CPython team is already aware this function
|
||||||
|
has latent issues even in the happy path.
|
||||||
|
|
||||||
|
## Chain summary
|
||||||
|
|
||||||
|
1. Our launchpad subint's driver OS-thread calls
|
||||||
|
`os.fork()`.
|
||||||
|
2. `fork()` succeeds. Child wakes up with:
|
||||||
|
- The parent's full memory image (including all
|
||||||
|
subints).
|
||||||
|
- Only the *calling* thread alive (the driver thread).
|
||||||
|
- `_PyThreadState_GET()` on that thread returns the
|
||||||
|
**launchpad subint's tstate**, *not* main's.
|
||||||
|
3. CPython runs `PyOS_AfterFork_Child()`.
|
||||||
|
4. It reaches `_PyInterpreterState_DeleteExceptMain()`.
|
||||||
|
5. Gate check fails: `tstate->interp != interpreters->main`.
|
||||||
|
6. `_PyStatus_ERR("not main interpreter")` → `fatal_error`
|
||||||
|
goto → `Py_ExitStatusException()` → child aborts.
|
||||||
|
|
||||||
|
Parent-side consequence: `os.fork()` in the subint
|
||||||
|
bootstrap returned successfully with the child's PID, but
|
||||||
|
the child died before connecting back. Our parent's
|
||||||
|
`ipc_server.wait_for_peer(uid)` would hang forever — the
|
||||||
|
child never gets to `_actor_child_main`.
|
||||||
|
|
||||||
|
## Definitive answer to "Open Question 1"
|
||||||
|
|
||||||
|
From the (now-stub) `subint_fork_proc` docstring:
|
||||||
|
|
||||||
|
> Does CPython allow `os.fork()` from a non-main
|
||||||
|
> sub-interpreter under the legacy config?
|
||||||
|
|
||||||
|
**No.** Not in a usable-by-user-code sense. The fork
|
||||||
|
syscall is not blocked, but the child cannot survive
|
||||||
|
CPython's post-fork initialization. This is enforced, not
|
||||||
|
accidental, and the CPython devs have acknowledged the
|
||||||
|
fragility in-source.
|
||||||
|
|
||||||
|
## What we'd need from CPython to unblock
|
||||||
|
|
||||||
|
Any one of these, from least-to-most invasive:
|
||||||
|
|
||||||
|
1. **A pre-fork hook mechanism** that lets user code (or
|
||||||
|
tractor itself via `os.register_at_fork(before=...)`)
|
||||||
|
swap the current tstate to main before fork runs. The
|
||||||
|
swap would need to work across the subint→main
|
||||||
|
boundary, which is the actual hard part —
|
||||||
|
`_PyThreadState_Swap()` exists but is internal.
|
||||||
|
|
||||||
|
2. **A `_PyInterpreterState_DeleteExceptFor(tstate->interp)`
|
||||||
|
variant** that cleans up all *other* subints while
|
||||||
|
preserving the calling subint's state. Lets the child
|
||||||
|
continue executing in the subint after fork; a
|
||||||
|
subsequent `execv()` clears everything at the OS
|
||||||
|
level anyway.
|
||||||
|
|
||||||
|
3. **A cleaner error** than `Fatal Python error` aborting
|
||||||
|
the child. Even without fixing the underlying
|
||||||
|
capability, a raised Python-level exception in the
|
||||||
|
parent's `fork()` call (rather than a silent child
|
||||||
|
abort) would at least make the failure mode
|
||||||
|
debuggable.
|
||||||
|
|
||||||
|
## Upstream-report draft (for CPython issue tracker)
|
||||||
|
|
||||||
|
### Title
|
||||||
|
|
||||||
|
> `os.fork()` from a non-main sub-interpreter aborts the
|
||||||
|
> child with a fatal error in `PyOS_AfterFork_Child`; can
|
||||||
|
> we at least make it a clean `RuntimeError` in the
|
||||||
|
> parent?
|
||||||
|
|
||||||
|
### Body
|
||||||
|
|
||||||
|
> **Version**: Python 3.14.x
|
||||||
|
>
|
||||||
|
> **Summary**: Calling `os.fork()` from a thread currently
|
||||||
|
> executing inside a sub-interpreter causes the forked
|
||||||
|
> child process to abort during CPython's post-fork
|
||||||
|
> cleanup, with the following output in the child:
|
||||||
|
>
|
||||||
|
> ```
|
||||||
|
> Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
||||||
|
> ```
|
||||||
|
>
|
||||||
|
> From the **parent's** point of view the fork succeeded
|
||||||
|
> (returned a valid child PID). The failure is completely
|
||||||
|
> opaque to parent-side Python code — unless the parent
|
||||||
|
> does `os.waitpid()` it won't even notice the child
|
||||||
|
> died.
|
||||||
|
>
|
||||||
|
> **Root cause** (as I understand it from reading sources):
|
||||||
|
> `Modules/posixmodule.c::PyOS_AfterFork_Child()` calls
|
||||||
|
> `_PyInterpreterState_DeleteExceptMain()` with a
|
||||||
|
> precondition that `_PyThreadState_GET()->interp` be the
|
||||||
|
> main interpreter. When `fork()` is called from a thread
|
||||||
|
> executing inside a subinterpreter, the child wakes up
|
||||||
|
> with its tstate still pointing at the subint, and the
|
||||||
|
> gate in `Python/pystate.c:1044-1047` fails.
|
||||||
|
>
|
||||||
|
> A comment in the source
|
||||||
|
> (`Modules/posixmodule.c:753` — `// Ideally we could
|
||||||
|
> guarantee tstate is running main.`) suggests this is a
|
||||||
|
> known-fragile path rather than an intentional
|
||||||
|
> invariant.
|
||||||
|
>
|
||||||
|
> **Use case**: I was experimenting with using a
|
||||||
|
> sub-interpreter as a "fork launchpad" — have a subint
|
||||||
|
> that has never imported `trio`, call `os.fork()` from
|
||||||
|
> that subint's thread, and in the child `execv()` back
|
||||||
|
> into a fresh Python interpreter process. The goal was
|
||||||
|
> to sidestep known issues with `trio` + `fork()`
|
||||||
|
> interaction (see
|
||||||
|
> [python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614))
|
||||||
|
> by guaranteeing the forking context had never been
|
||||||
|
> "contaminated" by trio's imports or globals. This
|
||||||
|
> approach would allow `trio`-using applications to
|
||||||
|
> combine `fork`-based subprocess spawning with
|
||||||
|
> per-worker `trio.run()` runtimes — a fairly common
|
||||||
|
> pattern that currently requires workarounds.
|
||||||
|
>
|
||||||
|
> **Request**:
|
||||||
|
>
|
||||||
|
> Ideally: make fork-from-subint work (e.g., by swapping
|
||||||
|
> the caller's tstate to main in the pre-fork hook), or
|
||||||
|
> provide a `_PyInterpreterState_DeleteExceptFor(interp)`
|
||||||
|
> variant that permits the caller's subint to survive
|
||||||
|
> post-fork so user code can subsequently `execv()`.
|
||||||
|
>
|
||||||
|
> Minimally: convert the fatal child-side abort into a
|
||||||
|
> clean `RuntimeError` (or similar) raised in the
|
||||||
|
> parent's `fork()` call. Even if the capability isn't
|
||||||
|
> expanded, the failure mode should be debuggable by
|
||||||
|
> user-code in the parent — right now it's a silent
|
||||||
|
> child death with an error message buried in the
|
||||||
|
> child's stderr that parent code can't programmatically
|
||||||
|
> see.
|
||||||
|
>
|
||||||
|
> **Related**: PEP 684 (per-interpreter GIL), PEP 734
|
||||||
|
> (`concurrent.interpreters` public API). The private
|
||||||
|
> `_interpreters` module is what I used to create the
|
||||||
|
> launchpad — behavior should be the same whether using
|
||||||
|
> `_interpreters.create('legacy')` or
|
||||||
|
> `concurrent.interpreters.create()` (the latter was not
|
||||||
|
> tested but the gate is identical).
|
||||||
|
>
|
||||||
|
> Happy to contribute a minimal reproducer + test case if
|
||||||
|
> this is something the team wants to pursue.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- `Modules/posixmodule.c:728` —
|
||||||
|
[`PyOS_AfterFork_Child`](https://github.com/python/cpython/blob/main/Modules/posixmodule.c#L728)
|
||||||
|
- `Python/pystate.c:1040` —
|
||||||
|
[`_PyInterpreterState_DeleteExceptMain`](https://github.com/python/cpython/blob/main/Python/pystate.c#L1040)
|
||||||
|
- PEP 684 (per-interpreter GIL):
|
||||||
|
<https://peps.python.org/pep-0684/>
|
||||||
|
- PEP 734 (`concurrent.interpreters` public API):
|
||||||
|
<https://peps.python.org/pep-0734/>
|
||||||
|
- [python-trio/trio#1614](https://github.com/python-trio/trio/issues/1614)
|
||||||
|
— the original motivation for the launchpad idea.
|
||||||
|
- tractor issue #379 — "Our own thoughts, ideas for
|
||||||
|
`fork()`-workaround/hacks..." section where this was
|
||||||
|
first sketched.
|
||||||
|
- `tractor.spawn._subint_fork` — in-tree stub preserving
|
||||||
|
the attempted impl's shape in git history.
|
||||||
|
|
@ -0,0 +1,373 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
'''
|
||||||
|
Standalone CPython-level feasibility check for the "main-interp
|
||||||
|
worker-thread forkserver + subint-hosted trio" architecture
|
||||||
|
proposed as a workaround to the CPython-level refusal
|
||||||
|
documented in
|
||||||
|
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`.
|
||||||
|
|
||||||
|
Purpose
|
||||||
|
-------
|
||||||
|
Deliberately NOT a `tractor` test; only the fork/subint primitives are imported from `tractor`.
|
||||||
|
Uses `_interpreters` (private stdlib) + `os.fork()` directly so
|
||||||
|
the signal is unambiguous — pass/fail here is a property of
|
||||||
|
CPython alone, independent of our runtime.
|
||||||
|
|
||||||
|
Run each scenario in isolation; the child's fate is observable
|
||||||
|
only via `os.waitpid()` in the parent and the scenario's own
|
||||||
|
status prints.
|
||||||
|
|
||||||
|
Scenarios (pick one with `--scenario <name>`)
|
||||||
|
---------------------------------------------
|
||||||
|
|
||||||
|
- `control_subint_thread_fork` — the KNOWN-BROKEN case we
|
||||||
|
documented in `subint_fork_blocked_by_cpython_post_fork_issue.md`:
|
||||||
|
drive a subint from a thread, call `os.fork()` inside its
|
||||||
|
`_interpreters.exec()`, watch the child abort. **Included as
|
||||||
|
a control** — if this scenario DOESN'T abort the child, our
|
||||||
|
analysis is wrong and we should re-check everything.
|
||||||
|
|
||||||
|
- `main_thread_fork` — baseline sanity. Call `os.fork()` from
|
||||||
|
the process's main thread. Must always succeed; if this
|
||||||
|
fails something much bigger is broken.
|
||||||
|
|
||||||
|
- `worker_thread_fork` — the architectural assertion. Spawn a
|
||||||
|
regular `threading.Thread` (attached to main interp, NOT a
|
||||||
|
subint), have IT call `os.fork()`. Child should survive
|
||||||
|
post-fork cleanup.
|
||||||
|
|
||||||
|
- `full_architecture` — end-to-end: main-interp worker thread
|
||||||
|
forks. In the child, fork-thread (still main-interp) creates
|
||||||
|
a subint, drives a second worker thread inside it that runs
|
||||||
|
a trivial `trio.run()`. Validates the "root runtime lives in
|
||||||
|
a subint in the child" piece of the proposed arch.
|
||||||
|
|
||||||
|
All scenarios print a self-contained pass/fail banner. Exit
|
||||||
|
code 0 on expected outcome (which for `control_*` means "child
|
||||||
|
aborted", not "child succeeded"!).
|
||||||
|
|
||||||
|
Requires Python 3.14+.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
::
|
||||||
|
|
||||||
|
python subint_fork_from_main_thread_smoketest.py \\
|
||||||
|
--scenario main_thread_fork
|
||||||
|
|
||||||
|
python subint_fork_from_main_thread_smoketest.py \\
|
||||||
|
--scenario full_architecture
|
||||||
|
|
||||||
|
'''
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
# Hard-require py3.14 for the public `concurrent.interpreters`
|
||||||
|
# API (we still drop to `_interpreters` internally, same as
|
||||||
|
# `tractor.spawn._subint`).
|
||||||
|
try:
|
||||||
|
from concurrent import interpreters as _public_interpreters # noqa: F401
|
||||||
|
import _interpreters # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
print(
|
||||||
|
'FAIL (setup): requires Python 3.14+ '
|
||||||
|
'(missing `concurrent.interpreters`)',
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
|
||||||
|
# The actual primitives this script exercises live in
|
||||||
|
# `tractor.spawn._subint_forkserver` — we re-import them here
|
||||||
|
# rather than inlining so the module and the validation stay
|
||||||
|
# in sync. (Early versions of this file had them inline for
|
||||||
|
# the "zero tractor imports" isolation guarantee; now that
|
||||||
|
# CPython-level feasibility is confirmed, the validated
|
||||||
|
# primitives have moved into tractor proper.)
|
||||||
|
from tractor.spawn._subint_forkserver import (
|
||||||
|
fork_from_worker_thread,
|
||||||
|
run_subint_in_worker_thread,
|
||||||
|
wait_child,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# small observability helpers (test-harness only)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _banner(title: str) -> None:
    '''
    Print `title` framed above and below by a 60-char `=` rule,
    preceded by an empty line; stdout is flushed right away.

    '''
    rule: str = '=' * 60
    # leading '' yields the blank line ahead of the top rule
    print('', rule, title, rule, sep='\n', flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _report(
    label: str,
    *,
    ok: bool,
    status_str: str,
    expect_exit_ok: bool,
) -> None:
    '''
    Print a one-line `[PASS]`/`[FAIL]` verdict for scenario `label`.

    `ok` picks the verdict tag, `expect_exit_ok` picks the wording
    of the expected outcome, and `status_str` is echoed as the
    observed outcome. Stdout is flushed.

    '''
    if expect_exit_ok:
        wanted: str = 'normal exit (rc=0)'
    else:
        wanted = 'abnormal death (signal or nonzero exit)'

    tag: str = ('FAIL', 'PASS')[bool(ok)]
    summary: str = (
        f'[{tag}] {label}: expected {wanted}; observed {status_str}'
    )
    print(summary, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# scenario: `control_subint_thread_fork` (known-broken)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def scenario_control_subint_thread_fork() -> int:
    '''
    Control scenario: run `os.fork()` from source exec'd INSIDE a
    legacy-config subinterpreter (driven from a daemon thread) and
    check whether the forked child got far enough to write a
    sentinel file.

    Returns 0 when the sentinel is absent (the outcome the analysis
    predicts), 1 when the child managed to write it.

    '''
    _banner(
        '[control] fork from INSIDE a subint (expected: child aborts)'
    )
    interp_id = _interpreters.create('legacy')
    print(f' created subint {interp_id}', flush=True)

    # Shared flag: child writes a sentinel file we can detect from
    # the parent. If the child manages to write this, CPython's
    # post-fork refusal is NOT happening → analysis is wrong.
    #
    # The name is per-process (pid suffix) and lives under the
    # platform temp dir so concurrent runs can't read each other's
    # sentinel and `TMPDIR` is honoured.
    sentinel: str = os.path.join(
        tempfile.gettempdir(),
        f'subint_fork_smoketest_control_child_ran.{os.getpid()}',
    )
    try:
        os.unlink(sentinel)
    except FileNotFoundError:
        pass

    # NOTE, the nested indentation inside these literals is
    # significant: it is the source text compiled by the subint.
    bootstrap: str = (
        'import os\n'
        'pid = os.fork()\n'
        'if pid == 0:\n'
        # child — if CPython's refusal fires this code never runs
        f'    with open({sentinel!r}, "w") as f:\n'
        '        f.write("ran")\n'
        '    os._exit(0)\n'
        'else:\n'
        # parent side (inside the launchpad subint) — stash the
        # forked PID on the subint's `builtins`. Nothing in this
        # function reads it back; `_interpreters.exec()` gives us
        # no direct way to return it.
        '    import builtins\n'
        '    builtins._forked_child_pid = pid\n'
    )

    # NOTE, we can't easily pull state back from the subint.
    # For the CONTROL scenario we just time-bound the fork +
    # check the sentinel. If sentinel exists → child ran →
    # analysis wrong. If not → child aborted → analysis
    # confirmed.
    done = threading.Event()

    def _drive() -> None:
        try:
            # NOTE(review): on CPython 3.13+ an uncaught exception
            # in the exec'd source is RETURNED as an excinfo object
            # instead of being raised — confirm for the targeted
            # build. Both paths are reported.
            excinfo = _interpreters.exec(interp_id, bootstrap)
        except Exception as err:
            print(
                f' subint bootstrap raised (expected on some '
                f'CPython versions): {type(err).__name__}: {err}',
                flush=True,
            )
        else:
            if excinfo is not None:
                detail = getattr(excinfo, 'formatted', None) or excinfo
                print(
                    f' subint bootstrap raised (expected on some '
                    f'CPython versions): {detail}',
                    flush=True,
                )
        finally:
            done.set()

    t = threading.Thread(
        target=_drive,
        name='control-subint-fork-launchpad',
        daemon=True,
    )
    t.start()
    if not done.wait(timeout=5.0):
        # informational only: the verdict still hinges on the
        # sentinel, but a wedged launchpad is worth surfacing.
        print(
            ' control: launchpad thread did not finish within 5.0s',
            flush=True,
        )
    t.join(timeout=2.0)

    # Give the (possibly-aborted) child a moment to die.
    time.sleep(0.5)

    sentinel_present: bool = os.path.exists(sentinel)
    verdict: str = (
        # "PASS" for our analysis means sentinel NOT present.
        'PASS' if not sentinel_present else 'FAIL (UNEXPECTED)'
    )
    print(
        f'[{verdict}] control: sentinel present={sentinel_present} '
        f'(analysis predicts False — child should abort before '
        f'writing)',
        flush=True,
    )
    if sentinel_present:
        os.unlink(sentinel)

    # best-effort teardown; destroy may refuse (e.g. if the
    # launchpad thread is still inside the subint).
    try:
        _interpreters.destroy(interp_id)
    except _interpreters.InterpreterError:
        pass

    return 0 if not sentinel_present else 1
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# scenario: `main_thread_fork` (baseline sanity)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def scenario_main_thread_fork() -> int:
    '''
    Baseline sanity: call `os.fork()` directly from the calling
    (main) thread; the child exits immediately with rc=0.

    Returns 0 when the child is observed exiting normally, else 1.

    '''
    label: str = 'main_thread_fork'
    _banner(
        '[baseline] fork from MAIN thread (expected: child exits normally)'
    )

    pid: int = os.fork()
    if pid == 0:
        # child side: nothing to validate beyond surviving the fork
        os._exit(0)

    # Wait + report with the same imported `wait_child()` /
    # `_report()` pair the worker-thread scenarios use; the
    # previously referenced `_wait_child()` helper is not defined
    # in this module.
    ok, status_str = wait_child(pid, expect_exit_ok=True)
    _report(
        label,
        ok=ok,
        status_str=status_str,
        expect_exit_ok=True,
    )
    return 0 if ok else 1
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# scenario: `worker_thread_fork` (architectural assertion)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _run_worker_thread_fork_scenario(
    label: str,
    *,
    child_target=None,
) -> int:
    '''
    Shared driver for the worker-thread scenarios.

    Hands the fork off to `fork_from_worker_thread()` (from
    `tractor.spawn._subint_forkserver`), waits on the resulting
    pid via `wait_child()` and prints the verdict with `_report()`.

    Returns 0 if `wait_child()` reports ok, otherwise 1 (also 1
    when the fork primitive raises `RuntimeError`).

    '''
    worker_name: str = f'worker-fork-thread[{label}]'
    try:
        child_pid: int = fork_from_worker_thread(
            child_target=child_target,
            thread_name=worker_name,
        )
    except RuntimeError as fork_err:
        print(f'[FAIL] {label}: {fork_err}', flush=True)
        return 1
    else:
        print(f' forked child pid={child_pid}', flush=True)

    passed, observed = wait_child(child_pid, expect_exit_ok=True)
    _report(
        label,
        ok=passed,
        status_str=observed,
        expect_exit_ok=True,
    )
    if passed:
        return 0
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
def scenario_worker_thread_fork() -> int:
    '''
    Architectural assertion: fork via the shared worker-thread
    driver with no `child_target`; returns that driver's 0/1
    result.

    '''
    headline: str = (
        '[arch] fork from MAIN-INTERP WORKER thread '
        '(expected: child exits normally — this is the one '
        'that matters)'
    )
    _banner(headline)
    return _run_worker_thread_fork_scenario('worker_thread_fork')
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# scenario: `full_architecture`
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Source text handed to `run_subint_in_worker_thread()` by
# `_child_trio_in_subint()`: runs a trivial `trio.run()` and
# asserts its result. One tuple entry per source line.
_CHILD_TRIO_BOOTSTRAP: str = '\n'.join((
    'import trio',
    'async def _main():',
    ' await trio.sleep(0.05)',
    ' return 42',
    'result = trio.run(_main)',
    'assert result == 42, f"trio.run returned {result}"',
    'print(" CHILD subint: trio.run OK, result=42", flush=True)',
    '',  # produces the trailing newline
))
|
||||||
|
|
||||||
|
|
||||||
|
def _child_trio_in_subint() -> int:
    '''
    CHILD-side `child_target`: feed `_CHILD_TRIO_BOOTSTRAP` to
    `tractor.spawn._subint_forkserver.run_subint_in_worker_thread`
    and translate the outcome into an exit code.

    Returns 0 on success, 3 when the primitive raises
    `RuntimeError`, 4 for any other exception (incl. non-
    `Exception` `BaseException`s).

    '''
    try:
        run_subint_in_worker_thread(
            _CHILD_TRIO_BOOTSTRAP,
            thread_name='child-subint-trio-thread',
        )
    except BaseException as err:
        # `RuntimeError` is singled out first, everything else
        # falls through to the generic report.
        if isinstance(err, RuntimeError):
            print(
                f' CHILD: run_subint_in_worker_thread timed out / thread '
                f'never returned: {err}',
                flush=True,
            )
            return 3

        err_name: str = type(err).__name__
        print(
            f' CHILD: subint bootstrap raised: '
            f'{err_name}: {err}',
            flush=True,
        )
        return 4
    else:
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
def scenario_full_architecture() -> int:
    '''
    End-to-end scenario: same worker-thread fork driver, but the
    child runs `_child_trio_in_subint()` as its `child_target`.
    Returns the driver's 0/1 result.

    '''
    headline: str = (
        '[arch-full] worker-thread fork + child runs trio in a '
        'subint (end-to-end proposed arch)'
    )
    _banner(headline)
    exit_code: int = _run_worker_thread_fork_scenario(
        'full_architecture',
        child_target=_child_trio_in_subint,
    )
    return exit_code
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# main
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# `--scenario` name -> zero-arg callable returning the process
# exit code. `main()` derives the CLI choices from these keys.
# (`Callable` comes from `collections.abc`, imported at the top
# of the file, so the annotation resolves for linters and
# `typing.get_type_hints()`.)
SCENARIOS: dict[str, Callable[[], int]] = {
    'control_subint_thread_fork': scenario_control_subint_thread_fork,
    'main_thread_fork': scenario_main_thread_fork,
    'worker_thread_fork': scenario_worker_thread_fork,
    'full_architecture': scenario_full_architecture,
}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    '''
    CLI entry point: parse the required `--scenario` flag (one of
    the `SCENARIOS` keys), run the matching scenario and return
    its exit code.

    '''
    parser = argparse.ArgumentParser(
        description=__doc__,
        # keep the module docstring's layout in `--help`
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--scenario',
        choices=sorted(SCENARIOS),
        required=True,
    )
    chosen: str = parser.parse_args().scenario
    scenario_fn = SCENARIOS[chosen]
    return scenario_fn()
|
||||||
|
|
||||||
|
|
||||||
|
# script entry: the scenario's return value becomes the process
# exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|
||||||
|
|
@ -0,0 +1,385 @@
|
||||||
|
# `subint_forkserver` backend: orphaned-subactor SIGINT wedged in `epoll_wait`
|
||||||
|
|
||||||
|
Follow-up to the Phase C `subint_forkserver` spawn-backend
|
||||||
|
PR (see `tractor.spawn._subint_forkserver`, issue #379).
|
||||||
|
Surfaced by the xfail'd
|
||||||
|
`tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`.
|
||||||
|
|
||||||
|
Related-but-distinct from
|
||||||
|
`subint_cancel_delivery_hang_issue.md` (orphaned-channel
|
||||||
|
park AFTER subint teardown) and
|
||||||
|
`subint_sigint_starvation_issue.md` (GIL-starvation,
|
||||||
|
SIGINT never delivered): here the SIGINT IS delivered,
|
||||||
|
trio's handler IS installed, but trio's event loop never
|
||||||
|
wakes — so the KBI-at-checkpoint → `_trio_main` catch path
|
||||||
|
(which is the runtime's *intentional* OS-cancel design)
|
||||||
|
never fires.
|
||||||
|
|
||||||
|
## TL;DR
|
||||||
|
|
||||||
|
When a `subint_forkserver`-spawned subactor is orphaned
|
||||||
|
(parent `SIGKILL`'d, no IPC cancel path available) and then
|
||||||
|
externally `SIGINT`'d, the subactor hangs in
|
||||||
|
`trio/_core/_io_epoll.py::get_events` (epoll_wait)
|
||||||
|
indefinitely — even though:
|
||||||
|
|
||||||
|
1. `threading.current_thread() is threading.main_thread()`
|
||||||
|
post-fork (CPython 3.14 re-designates correctly).
|
||||||
|
2. Trio's SIGINT handler IS installed in the subactor
|
||||||
|
(`signal.getsignal(SIGINT)` returns
|
||||||
|
`<function KIManager.install.<locals>.handler at 0x...>`).
|
||||||
|
3. The kernel does deliver SIGINT — the signal arrives at
|
||||||
|
the only thread in the process (the fork-inherited
|
||||||
|
worker which IS now "main" per Python).
|
||||||
|
|
||||||
|
Yet `epoll_wait` does not return. Trio's wakeup-fd mechanism
|
||||||
|
— the machinery that turns SIGINT into an epoll-wake — is
|
||||||
|
somehow not firing the wakeup. Until that's fixed, the
|
||||||
|
intentional "KBI-as-OS-cancel" path in
|
||||||
|
`tractor/spawn/_entry.py::_trio_main:164` is unreachable
|
||||||
|
for forkserver-spawned subactors whose parent dies.
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
Test: `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`
|
||||||
|
(currently marked `@pytest.mark.xfail(strict=True)`).
|
||||||
|
|
||||||
|
1. Harness subprocess brings up a tractor root actor +
|
||||||
|
one `run_in_actor(_sleep_forever)` subactor via
|
||||||
|
`try_set_start_method('subint_forkserver')`.
|
||||||
|
2. Harness prints `CHILD_PID` (subactor) and
|
||||||
|
`PARENT_READY` (root actor) markers to stdout.
|
||||||
|
3. Test `os.kill(parent_pid, SIGKILL)` + `proc.wait()`
|
||||||
|
to fully reap the root-actor harness.
|
||||||
|
4. Child (now reparented to pid 1) is still alive.
|
||||||
|
5. Test `os.kill(child_pid, SIGINT)` and polls
|
||||||
|
`os.kill(child_pid, 0)` for up to 10s.
|
||||||
|
6. **Observed**: the child is still alive at deadline —
|
||||||
|
SIGINT did not unwedge the trio loop.
|
||||||
|
|
||||||
|
## What the "intentional" cancel path IS
|
||||||
|
|
||||||
|
`tractor/spawn/_entry.py::_trio_main:157-186` —
|
||||||
|
|
||||||
|
```python
|
||||||
|
try:
|
||||||
|
if infect_asyncio:
|
||||||
|
actor._infected_aio = True
|
||||||
|
run_as_asyncio_guest(trio_main)
|
||||||
|
else:
|
||||||
|
trio.run(trio_main)
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
logmeth = log.cancel
|
||||||
|
exit_status: str = (
|
||||||
|
'Actor received KBI (aka an OS-cancel)\n'
|
||||||
|
...
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
The "KBI == OS-cancel" mapping IS the runtime's
|
||||||
|
deliberate, documented design. An OS-level SIGINT should
|
||||||
|
flow as: kernel → trio handler → KBI at trio checkpoint
|
||||||
|
→ unwinds `async_main` → surfaces at `_trio_main`'s
|
||||||
|
`except KeyboardInterrupt:` → `log.cancel` + clean `rc=0`.
|
||||||
|
|
||||||
|
**So fixing this hang is not "add a new SIGINT behavior" —
|
||||||
|
it's "make the existing designed behavior actually fire in
|
||||||
|
this backend config".** That's why option (B) ("fix root
|
||||||
|
cause") is aligned with existing design intent, not a
|
||||||
|
scope expansion.
|
||||||
|
|
||||||
|
## Evidence
|
||||||
|
|
||||||
|
### Positive control: standalone fork-from-worker + `trio.run(sleep_forever)` + SIGINT WORKS
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os, signal, time, trio
|
||||||
|
from tractor.spawn._subint_forkserver import (
|
||||||
|
fork_from_worker_thread, wait_child,
|
||||||
|
)
|
||||||
|
|
||||||
|
def child_target() -> int:
|
||||||
|
async def _main():
|
||||||
|
try:
|
||||||
|
await trio.sleep_forever()
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
print('CHILD: caught KBI — trio SIGINT works!')
|
||||||
|
return
|
||||||
|
trio.run(_main)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
pid = fork_from_worker_thread(child_target, thread_name='trio-sigint-test')
|
||||||
|
time.sleep(1.0)
|
||||||
|
os.kill(pid, signal.SIGINT)
|
||||||
|
wait_child(pid)
|
||||||
|
```
|
||||||
|
|
||||||
|
Result: `CHILD: caught KBI — trio SIGINT works!` + clean
|
||||||
|
exit. So the fork-child + trio signal plumbing IS healthy
|
||||||
|
in isolation. The hang appears only with the full tractor
|
||||||
|
subactor runtime on top.
|
||||||
|
|
||||||
|
### Negative test: full tractor subactor + orphan-SIGINT
|
||||||
|
|
||||||
|
Equivalent to the xfail test. Traceback dump via
|
||||||
|
`faulthandler.register(SIGUSR1, all_threads=True)` at the
|
||||||
|
stuck moment:
|
||||||
|
|
||||||
|
```
|
||||||
|
Current thread 0x00007... [subint-forkserv] (most recent call first):
|
||||||
|
File ".../trio/_core/_io_epoll.py", line 245 in get_events
|
||||||
|
File ".../trio/_core/_run.py", line 2415 in run
|
||||||
|
File "tractor/spawn/_entry.py", line 162 in _trio_main
|
||||||
|
File "tractor/_child.py", line 72 in _actor_child_main
|
||||||
|
File "tractor/spawn/_subint_forkserver.py", line 650 in _child_target
|
||||||
|
File "tractor/spawn/_subint_forkserver.py", line 308 in _worker
|
||||||
|
File ".../threading.py", line 1024 in run
|
||||||
|
```
|
||||||
|
|
||||||
|
### Thread + signal-mask inventory of the stuck subactor
|
||||||
|
|
||||||
|
Single thread (`tid == pid`, comm `'subint-forkserv'`,
|
||||||
|
which IS `threading.main_thread()` post-fork):
|
||||||
|
|
||||||
|
```
|
||||||
|
SigBlk: 0000000000000000 # nothing blocked
|
||||||
|
SigIgn: 0000000001001000 # SIGPIPE etc (Python defaults)
|
||||||
|
SigCgt: 0000000108000202 # bit 1 = SIGINT caught
|
||||||
|
```
|
||||||
|
|
||||||
|
Bit 1 (zero-indexed: signal N maps to bit N-1, and SIGINT is
signal 2) is set in `SigCgt` → a SIGINT handler IS installed. So
trio's handler IS in place at the kernel level — not a
"handler missing" situation.
|
||||||
|
|
||||||
|
### Handler identity
|
||||||
|
|
||||||
|
Inside the subactor's RPC body, `signal.getsignal(SIGINT)`
|
||||||
|
returns `<function KIManager.install.<locals>.handler at
|
||||||
|
0x...>` — trio's own `KIManager` handler. tractor's only
|
||||||
|
SIGINT touches are `signal.getsignal()` *reads* (to stash
|
||||||
|
into `debug.DebugStatus._trio_handler`); nothing writes
|
||||||
|
over trio's handler outside the debug-REPL shielding path
|
||||||
|
(`devx/debug/_tty_lock.py::shield_sigint`) which isn't
|
||||||
|
engaged here (no debug_mode).
|
||||||
|
|
||||||
|
## Ruled out
|
||||||
|
|
||||||
|
- **GIL starvation / signal-pipe-full** (class A,
|
||||||
|
`subint_sigint_starvation_issue.md`): subactor runs on
|
||||||
|
its own GIL (separate OS process), not sharing with the
|
||||||
|
parent → no cross-process GIL contention. And the
signal-mask inventory above shows SIGINT IS caught
(`SigCgt`) and not blocked (`SigBlk` is empty), so it is
not sitting queued.
|
||||||
|
- **Orphaned channel park** (`subint_cancel_delivery_hang_issue.md`):
|
||||||
|
different failure mode — that one has trio iterating
|
||||||
|
normally and getting wedged on an orphaned
|
||||||
|
`chan.recv()` AFTER teardown. Here trio's event loop
|
||||||
|
itself never wakes.
|
||||||
|
- **Tractor explicitly catching + swallowing KBI**:
|
||||||
|
greppable — the one `except KeyboardInterrupt:` in the
|
||||||
|
runtime is the INTENTIONAL cancel-path catch at
|
||||||
|
`_trio_main:164`. `async_main` uses `except Exception`
|
||||||
|
(not BaseException), so KBI should propagate through
|
||||||
|
cleanly if it ever fires.
|
||||||
|
- **Missing `signal.set_wakeup_fd` (main-thread
|
||||||
|
restriction)**: post-fork, the fork-worker thread IS
|
||||||
|
`threading.main_thread()`, so trio's main-thread check
|
||||||
|
passes and its wakeup-fd install should succeed.
|
||||||
|
|
||||||
|
## Root cause hypothesis (unverified)
|
||||||
|
|
||||||
|
The SIGINT handler fires but trio's wakeup-fd write does
|
||||||
|
not wake `epoll_wait`. Candidate causes, ranked by
|
||||||
|
plausibility:
|
||||||
|
|
||||||
|
1. **Wakeup-fd lifecycle race around tractor IPC setup.**
|
||||||
|
`async_main` spins up an IPC server + `process_messages`
|
||||||
|
loops early. Somewhere in that path the wakeup-fd that
|
||||||
|
trio registered with its epoll instance may be
|
||||||
|
closed/replaced/clobbered, so subsequent SIGINT writes
|
||||||
|
land on an fd that's no longer in the epoll set.
|
||||||
|
Evidence needed: compare
|
||||||
|
`signal.set_wakeup_fd(-1)` return value inside a
|
||||||
|
post-tractor-bringup RPC body vs. a pre-bringup
|
||||||
|
equivalent. If they differ, that's it.
|
||||||
|
2. **Shielded cancel scope around `process_messages`.**
|
||||||
|
The RPC message loop is likely wrapped in a trio cancel
|
||||||
|
scope; if that scope is `shield=True` at any outer
|
||||||
|
layer, KBI scheduled at a checkpoint could be absorbed
|
||||||
|
by the shield and never bubble out to `_trio_main`.
|
||||||
|
3. **Pre-fork wakeup-fd inheritance.** trio in the PARENT
|
||||||
|
process registered a wakeup-fd with its own epoll. The
|
||||||
|
child inherits the fd number but not the parent's
|
||||||
|
epoll instance — if tractor/trio re-uses the parent's
|
||||||
|
stale fd number anywhere, writes would go to a no-op
|
||||||
|
fd. (This is the least likely — `trio.run()` on the
|
||||||
|
child calls `KIManager.install` which should install a
|
||||||
|
fresh wakeup-fd from scratch.)
|
||||||
|
|
||||||
|
## Cross-backend scope question
|
||||||
|
|
||||||
|
**Untested**: does the same orphan-SIGINT hang reproduce
|
||||||
|
against the `trio_proc` backend (stock subprocess + exec)?
|
||||||
|
If yes → pre-existing tractor bug, independent of
|
||||||
|
`subint_forkserver`. If no → something specific to the
|
||||||
|
fork-from-worker path (e.g. inherited fds, mid-epoll-setup
|
||||||
|
interference).
|
||||||
|
|
||||||
|
**Quick repro for trio_proc**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# save as /tmp/trio_proc_orphan_sigint_repro.py
|
||||||
|
import os, sys, signal, time, glob
|
||||||
|
import subprocess as sp
|
||||||
|
|
||||||
|
SCRIPT = '''
|
||||||
|
import os, sys, trio, tractor
|
||||||
|
async def _sleep_forever():
|
||||||
|
print(f"CHILD_PID={os.getpid()}", flush=True)
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with (
|
||||||
|
tractor.open_root_actor(registry_addrs=[("127.0.0.1", 12350)]),
|
||||||
|
tractor.open_nursery() as an,
|
||||||
|
):
|
||||||
|
await an.run_in_actor(_sleep_forever, name="sf-child")
|
||||||
|
print(f"PARENT_READY={os.getpid()}", flush=True)
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
'''
|
||||||
|
|
||||||
|
proc = sp.Popen(
|
||||||
|
[sys.executable, '-c', SCRIPT],
|
||||||
|
stdout=sp.PIPE, stderr=sp.STDOUT,
|
||||||
|
)
|
||||||
|
# parse CHILD_PID + PARENT_READY off proc.stdout ...
|
||||||
|
# SIGKILL parent, SIGINT child, poll.
|
||||||
|
```
|
||||||
|
|
||||||
|
If that hangs too, open a broader issue; if not, this is
|
||||||
|
`subint_forkserver`-specific (likely fd-inheritance-related).
|
||||||
|
|
||||||
|
## Why this is ours to fix (not CPython's)
|
||||||
|
|
||||||
|
- Signal IS delivered (`SigCgt` bitmask confirms).
|
||||||
|
- Handler IS installed (trio's `KIManager`).
|
||||||
|
- Thread identity is correct post-fork.
|
||||||
|
- `_trio_main` already has the intentional KBI→clean-exit
|
||||||
|
path waiting to fire.
|
||||||
|
|
||||||
|
Every CPython-level precondition is met. Something in
|
||||||
|
tractor's runtime or trio's integration with it is
|
||||||
|
breaking the SIGINT→wakeup→event-loop-wake pipeline.
|
||||||
|
|
||||||
|
## Possible fix directions
|
||||||
|
|
||||||
|
1. **Audit the wakeup-fd across tractor's IPC bringup.**
|
||||||
|
Add a trio startup hook that captures
|
||||||
|
`signal.set_wakeup_fd(-1)` at `_trio_main` entry,
|
||||||
|
after `async_main` enters, and periodically — assert
|
||||||
|
it's unchanged. If it moves, track down the writer.
|
||||||
|
2. **Explicit `signal.set_wakeup_fd` reset after IPC
|
||||||
|
setup.** Brute force: re-install a fresh wakeup-fd
|
||||||
|
mid-bringup. Band-aid, but fast to try.
|
||||||
|
3. **Ensure no `shield=True` cancel scope envelopes the
|
||||||
|
RPC-message-loop / IPC-server task.** If one does,
|
||||||
|
KBI-at-checkpoint never escapes.
|
||||||
|
4. **Once fixed, the `child_sigint='trio'` mode on
|
||||||
|
`subint_forkserver_proc`** becomes effectively a no-op
|
||||||
|
or a doc-only mode — trio's natural handler already
|
||||||
|
does the right thing. Might end up removing the flag
|
||||||
|
entirely if there's no behavioral difference between
|
||||||
|
modes.
|
||||||
|
|
||||||
|
## Current workaround
|
||||||
|
|
||||||
|
None; `child_sigint` defaults to `'ipc'` (IPC cancel is
|
||||||
|
the only reliable cancel path today), and the xfail test
|
||||||
|
documents the gap. Operators hitting orphan-SIGINT get a
|
||||||
|
hung process that needs `SIGKILL`.
|
||||||
|
|
||||||
|
## Reproducer
|
||||||
|
|
||||||
|
Inline, standalone (no pytest):
|
||||||
|
|
||||||
|
```python
|
||||||
|
# save as /tmp/orphan_sigint_repro.py (py3.14+)
|
||||||
|
import os, sys, signal, time, glob, trio
|
||||||
|
import tractor
|
||||||
|
from tractor.spawn._subint_forkserver import (
|
||||||
|
fork_from_worker_thread,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _sleep_forever():
|
||||||
|
print(f'SUBACTOR[{os.getpid()}]', flush=True)
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with (
|
||||||
|
tractor.open_root_actor(
|
||||||
|
registry_addrs=[('127.0.0.1', 12349)],
|
||||||
|
),
|
||||||
|
tractor.open_nursery() as an,
|
||||||
|
):
|
||||||
|
await an.run_in_actor(_sleep_forever, name='sf-child')
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
def child_target() -> int:
|
||||||
|
from tractor.spawn._spawn import try_set_start_method
|
||||||
|
try_set_start_method('subint_forkserver')
|
||||||
|
trio.run(_main)
|
||||||
|
return 0
|
||||||
|
|
||||||
|
pid = fork_from_worker_thread(child_target, thread_name='repro')
|
||||||
|
time.sleep(3.0)
|
||||||
|
|
||||||
|
# find the subactor pid via /proc
|
||||||
|
children = []
|
||||||
|
for path in glob.glob(f'/proc/{pid}/task/*/children'):
|
||||||
|
with open(path) as f:
|
||||||
|
children.extend(int(x) for x in f.read().split() if x)
|
||||||
|
subactor_pid = children[0]
|
||||||
|
|
||||||
|
# SIGKILL root → orphan the subactor
|
||||||
|
os.kill(pid, signal.SIGKILL)
|
||||||
|
os.waitpid(pid, 0)
|
||||||
|
time.sleep(0.3)
|
||||||
|
|
||||||
|
# SIGINT the orphan — should cause clean trio exit
|
||||||
|
os.kill(subactor_pid, signal.SIGINT)
|
||||||
|
|
||||||
|
# poll for exit
|
||||||
|
for _ in range(100):
|
||||||
|
try:
|
||||||
|
os.kill(subactor_pid, 0)
|
||||||
|
time.sleep(0.1)
|
||||||
|
except ProcessLookupError:
|
||||||
|
print('HARNESS: subactor exited cleanly ✔')
|
||||||
|
sys.exit(0)
|
||||||
|
os.kill(subactor_pid, signal.SIGKILL)
|
||||||
|
print('HARNESS: subactor hung — reproduced')
|
||||||
|
sys.exit(1)
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected (current): `HARNESS: subactor hung — reproduced`.
|
||||||
|
|
||||||
|
After fix: `HARNESS: subactor exited cleanly ✔`.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- `tractor/spawn/_entry.py::_trio_main:157-186` — the
|
||||||
|
intentional KBI→clean-exit path this bug makes
|
||||||
|
unreachable.
|
||||||
|
- `tractor/spawn/_subint_forkserver` — the backend whose
|
||||||
|
orphan cancel-robustness this blocks.
|
||||||
|
- `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`
|
||||||
|
— the xfail'd reproducer in the test suite.
|
||||||
|
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` —
|
||||||
|
sibling "orphaned channel park" hang (different class).
|
||||||
|
- `ai/conc-anal/subint_sigint_starvation_issue.md` —
|
||||||
|
sibling "GIL starvation SIGINT drop" hang (different
|
||||||
|
class).
|
||||||
|
- tractor issue #379 — subint backend tracking.
|
||||||
|
|
@ -0,0 +1,849 @@
|
||||||
|
# `subint_forkserver` backend: `test_cancellation.py` multi-level cancel cascade hang
|
||||||
|
|
||||||
|
Follow-up tracker: surfaced while wiring the new
|
||||||
|
`subint_forkserver` spawn backend into the full tractor
|
||||||
|
test matrix (step 2 of the post-backend-lands plan).
|
||||||
|
See also
|
||||||
|
`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
|
||||||
|
— sibling tracker for a different forkserver-teardown
|
||||||
|
class which probably shares the same fundamental root
|
||||||
|
cause (fork-FD-inheritance across nested spawns).
|
||||||
|
|
||||||
|
## TL;DR
|
||||||
|
|
||||||
|
`tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]`
|
||||||
|
hangs indefinitely under our new backend. The hang is
|
||||||
|
**inside the graceful IPC cancel cascade** — every actor
|
||||||
|
in the multi-level tree parks in `epoll_wait` waiting
|
||||||
|
for IPC messages that never arrive. Not a hard-kill /
|
||||||
|
tree-reap issue (we don't reach the hard-kill fallback
|
||||||
|
path at all).
|
||||||
|
|
||||||
|
Working hypothesis (unverified): **`os.fork()` from a
|
||||||
|
subactor inherits the root parent's IPC listener socket
|
||||||
|
FDs**. When a first-level subactor forkserver-spawns a
|
||||||
|
grandchild, that grandchild inherits both its direct
|
||||||
|
spawner's FDs AND the root's FDs — IPC message routing
|
||||||
|
becomes ambiguous (or silently sends to the wrong
|
||||||
|
channel), so the cancel cascade can't reach its target.
|
||||||
|
|
||||||
|
## Corrected diagnosis vs. earlier draft
|
||||||
|
|
||||||
|
An earlier version of this doc claimed the root cause
|
||||||
|
was **"forkserver teardown doesn't tree-kill
|
||||||
|
descendants"** (SIGKILL only reaches the direct child,
|
||||||
|
grandchildren survive and hold TCP `:1616`). That
|
||||||
|
diagnosis was **wrong**, caused by conflating two
|
||||||
|
observations:
|
||||||
|
|
||||||
|
1. *5-zombie leak holding :1616* — happened in my own
|
||||||
|
workflow when I aborted a bg pytest task with
|
||||||
|
`pkill` (SIGTERM/SIGKILL, not SIGINT). The abrupt
|
||||||
|
kill skipped the graceful `ActorNursery.__aexit__`
|
||||||
|
cancel cascade entirely, orphaning descendants to
|
||||||
|
init. **This was my cleanup bug, not a forkserver
|
||||||
|
teardown bug.** Codified the fix (SIGINT-first +
|
||||||
|
bounded wait before SIGKILL) in
|
||||||
|
`feedback_sc_graceful_cancel_first.md` +
|
||||||
|
`.claude/skills/run-tests/SKILL.md`.
|
||||||
|
2. *`test_nested_multierrors` hangs indefinitely* —
|
||||||
|
the real, separate, forkserver-specific bug
|
||||||
|
captured by this doc.
|
||||||
|
|
||||||
|
The two symptoms are unrelated. The tree-kill / setpgrp
|
||||||
|
fix direction proposed earlier would not help (1) (SC-
|
||||||
|
graceful-cleanup is the right answer there) and would
|
||||||
|
not help (2) (the hang is in the cancel cascade, not
|
||||||
|
in the hard-kill fallback).
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
Reproducer (py3.14, clean env):
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# preflight: ensure clean env
|
||||||
|
ss -tlnp 2>/dev/null | grep ':1616' && echo 'FOUL — cleanup first!' || echo 'clean'
|
||||||
|
|
||||||
|
./py314/bin/python -m pytest --spawn-backend=subint_forkserver \
|
||||||
|
'tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]' \
|
||||||
|
--timeout=30 --timeout-method=thread --tb=short -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected (current): `pytest-timeout` fires at 30s with a thread-
|
||||||
|
dump banner, but the process itself **remains alive
|
||||||
|
after timeout** and doesn't unwedge on subsequent
|
||||||
|
SIGINT. Requires SIGKILL to reap.
|
||||||
|
|
||||||
|
## Evidence (tree structure at hang point)
|
||||||
|
|
||||||
|
All 5 processes are kernel-level `S` (sleeping) in
|
||||||
|
`do_epoll_wait` (trio's event loop waiting on I/O):
|
||||||
|
|
||||||
|
```
|
||||||
|
PID PPID THREADS NAME ROLE
|
||||||
|
333986 1 2 subint-forkserv pytest main (the test body)
|
||||||
|
333993 333986 3 subint-forkserv "child 1" spawner subactor
|
||||||
|
334003 333993 1 subint-forkserv grandchild errorer under child-1
|
||||||
|
334014 333993 1 subint-forkserv grandchild errorer under child-1
|
||||||
|
333999 333986 1 subint-forkserv "child 2" spawner subactor (NO grandchildren!)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Asymmetric tree depth
|
||||||
|
|
||||||
|
The test's `spawn_and_error(breadth=2, depth=3)` should
|
||||||
|
have BOTH direct children spawning 2 grandchildren
|
||||||
|
each, going 3 levels deep. Reality:
|
||||||
|
|
||||||
|
- Child 1 (333993, 3 threads) DID spawn its two
|
||||||
|
grandchildren as expected — fully booted trio
|
||||||
|
runtime.
|
||||||
|
- Child 2 (333999, 1 thread) did NOT spawn any
|
||||||
|
grandchildren — clearly never completed its
|
||||||
|
nursery's first `run_in_actor`. Its 1-thread state
|
||||||
|
suggests the runtime never fully booted (no trio
|
||||||
|
worker threads for `waitpid`/IPC).
|
||||||
|
|
||||||
|
This asymmetry is the key clue: the two direct
|
||||||
|
children started identically but diverged. Probably a
|
||||||
|
race around fork-inherited state (listener FDs,
|
||||||
|
subactor-nursery channel state) that happens to land
|
||||||
|
differently depending on spawn ordering.
|
||||||
|
|
||||||
|
### Parent-side state
|
||||||
|
|
||||||
|
Thread-dump of pytest main (333986) at the hang:
|
||||||
|
|
||||||
|
- Main trio thread — parked in
|
||||||
|
`trio._core._io_epoll.get_events` (epoll_wait on
|
||||||
|
its event loop). Waiting for IPC from children.
|
||||||
|
- Two trio-cache worker threads — each parked in
|
||||||
|
`outcome.capture(sync_fn)` calling
|
||||||
|
`os.waitpid(child_pid, 0)`. These are our
|
||||||
|
`_ForkedProc.wait()` off-loads. They're waiting for
|
||||||
|
the direct children to exit — but children are
|
||||||
|
stuck in their own epoll_wait waiting for IPC from
|
||||||
|
the parent.
|
||||||
|
|
||||||
|
**It's a deadlock, not a leak:** the parent is
|
||||||
|
correctly running `soft_kill(proc, _ForkedProc.wait,
|
||||||
|
portal)` (graceful IPC cancel via
|
||||||
|
`Portal.cancel_actor()`), but the children never
|
||||||
|
acknowledge the cancel message (or the message never
|
||||||
|
reaches them through the tangled post-fork IPC).
|
||||||
|
|
||||||
|
## What's NOT the cause (ruled out)
|
||||||
|
|
||||||
|
- **`_ForkedProc.kill()` only SIGKILLs direct pid /
|
||||||
|
missing tree-kill**: doesn't apply — we never reach
|
||||||
|
the hard-kill path. The deadlock is in the graceful
|
||||||
|
cancel cascade.
|
||||||
|
- **Port `:1616` contention**: ruled out after the
|
||||||
|
`reg_addr` fixture-wiring fix; each test session
|
||||||
|
gets a unique port now.
|
||||||
|
- **GIL starvation / SIGINT pipe filling** (class-A,
|
||||||
|
`subint_sigint_starvation_issue.md`): doesn't apply
|
||||||
|
— each subactor is its own OS process with its own
|
||||||
|
GIL (not legacy-config subint).
|
||||||
|
- **Child-side `_trio_main` absorbing KBI**: grep
|
||||||
|
confirmed; `_trio_main` only catches KBI at the
|
||||||
|
`trio.run()` callsite, which is reached only if the
|
||||||
|
trio loop exits normally. The children here never
|
||||||
|
exit trio.run() — they're wedged inside.
|
||||||
|
|
||||||
|
## Hypothesis: FD inheritance across nested forks
|
||||||
|
|
||||||
|
`subint_forkserver_proc` calls
|
||||||
|
`fork_from_worker_thread()` which ultimately does
|
||||||
|
`os.fork()` from a dedicated worker thread. Standard
|
||||||
|
Linux/POSIX fork semantics: **the child inherits ALL
|
||||||
|
open FDs from the parent**, including listener
|
||||||
|
sockets, epoll fds, trio wakeup pipes, and the
|
||||||
|
parent's IPC channel sockets.
|
||||||
|
|
||||||
|
At root-actor fork-spawn time, the root's IPC server
|
||||||
|
listener FDs are open in the parent. Those get
|
||||||
|
inherited by child 1. Child 1 then forkserver-spawns
|
||||||
|
its OWN subactor (grandchild). The grandchild
|
||||||
|
inherits FDs from child 1 — but child 1's FD
|
||||||
|
table still contains **the root's IPC listener FDs
|
||||||
|
too** (inherited at first fork). So the grandchild
|
||||||
|
has THREE sets of FDs:
|
||||||
|
|
||||||
|
1. Its own (created after becoming a subactor).
|
||||||
|
2. Its direct parent child-1's.
|
||||||
|
3. The ROOT's (grandparent's) — inherited transitively.
|
||||||
|
|
||||||
|
IPC message routing may be ambiguous in this tangled
|
||||||
|
state. Or a listener socket that the root thinks it
|
||||||
|
owns is actually open in multiple processes, and
|
||||||
|
messages sent to it go to an arbitrary one. That
|
||||||
|
would exactly match the observed "graceful cancel
|
||||||
|
never propagates".
|
||||||
|
|
||||||
|
This hypothesis predicts the bug **scales with fork
|
||||||
|
depth**: single-level forkserver spawn
|
||||||
|
(`test_subint_forkserver_spawn_basic`) works
|
||||||
|
perfectly, but any test that spawns a second level
|
||||||
|
deadlocks. Matches observations so far.
|
||||||
|
|
||||||
|
## Fix directions (to validate)
|
||||||
|
|
||||||
|
### 1. `close_fds=True` equivalent in `fork_from_worker_thread()`
|
||||||
|
|
||||||
|
`subprocess.Popen` / `trio.lowlevel.open_process` have
|
||||||
|
`close_fds=True` by default on POSIX — they
|
||||||
|
enumerate open FDs in the child post-fork and close
|
||||||
|
everything except stdio + any explicitly-passed FDs.
|
||||||
|
Our raw `os.fork()` doesn't. Adding the equivalent to
|
||||||
|
our `_worker` prelude would isolate each fork
|
||||||
|
generation's FD set.
|
||||||
|
|
||||||
|
Implementation sketch in
|
||||||
|
`tractor.spawn._subint_forkserver.fork_from_worker_thread._worker`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _worker() -> None:
|
||||||
|
pid: int = os.fork()
|
||||||
|
if pid == 0:
|
||||||
|
# CHILD: close inherited FDs except stdio + the
|
||||||
|
# pid-pipe we just opened.
|
||||||
|
keep: set[int] = {0, 1, 2, rfd, wfd}
|
||||||
|
import resource
|
||||||
|
soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
|
||||||
|
os.closerange(3, soft) # blunt; or enumerate /proc/self/fd
|
||||||
|
# ... then child_target() as before
|
||||||
|
```
|
||||||
|
|
||||||
|
Problem: overly aggressive — closes FDs the
|
||||||
|
grandchild might legitimately need (e.g. its parent's
|
||||||
|
IPC channel for the spawn-spec handshake, if we rely
|
||||||
|
on that). Needs thought about which FDs are
|
||||||
|
"inheritable and safe" vs. "inherited by accident".
|
||||||
|
|
||||||
|
### 2. Cloexec on tractor's own FDs
|
||||||
|
|
||||||
|
Set `FD_CLOEXEC` on tractor-created sockets (listener
|
||||||
|
sockets, IPC channel sockets, pipes). This flag
|
||||||
|
causes automatic close on `execve`, but since we
|
||||||
|
`fork()` without `exec()`, this alone doesn't help.
|
||||||
|
BUT — combined with a child-side explicit close-
|
||||||
|
non-cloexec loop, it gives us a way to mark "my
|
||||||
|
private FDs" vs. "safe to inherit". Most robust, but
|
||||||
|
requires tractor-wide audit.
|
||||||
|
|
||||||
|
### 3. Explicit FD cleanup in `_ForkedProc`/`_child_target`
|
||||||
|
|
||||||
|
Have `subint_forkserver_proc`'s `_child_target`
|
||||||
|
closure explicitly close the parent-side IPC listener
|
||||||
|
FDs before calling `_actor_child_main`. Requires
|
||||||
|
being able to enumerate "the parent's listener FDs
|
||||||
|
that the child shouldn't keep" — plausible via
|
||||||
|
`Actor.ipc_server`'s socket objects.
|
||||||
|
|
||||||
|
### 4. Use `os.posix_spawn` with explicit `file_actions`
|
||||||
|
|
||||||
|
Instead of raw `os.fork()`, use `os.posix_spawn()`
|
||||||
|
which supports explicit file-action specifications
|
||||||
|
(close this FD, dup2 that FD). Cleaner semantics, but
|
||||||
|
probably incompatible with our "no exec" requirement
|
||||||
|
(subint_forkserver is a fork-without-exec design).
|
||||||
|
|
||||||
|
**Likely correct answer: (3) — targeted FD cleanup
|
||||||
|
via `actor.ipc_server` handle.** (1) is too blunt,
|
||||||
|
(2) is too wide-ranging, (4) changes the spawn
|
||||||
|
mechanism.
|
||||||
|
|
||||||
|
## Reproducer (standalone, no pytest)
|
||||||
|
|
||||||
|
```python
|
||||||
|
# save as /tmp/forkserver_nested_hang_repro.py (py3.14+)
|
||||||
|
import trio, tractor
|
||||||
|
|
||||||
|
async def assert_err():
|
||||||
|
assert 0
|
||||||
|
|
||||||
|
async def spawn_and_error(breadth: int = 2, depth: int = 1):
|
||||||
|
async with tractor.open_nursery() as n:
|
||||||
|
for i in range(breadth):
|
||||||
|
if depth > 0:
|
||||||
|
await n.run_in_actor(
|
||||||
|
spawn_and_error,
|
||||||
|
breadth=breadth,
|
||||||
|
depth=depth - 1,
|
||||||
|
name=f'spawner_{i}_{depth}',
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
await n.run_in_actor(
|
||||||
|
assert_err,
|
||||||
|
name=f'errorer_{i}',
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_nursery() as n:
|
||||||
|
for i in range(2):
|
||||||
|
await n.run_in_actor(
|
||||||
|
spawn_and_error,
|
||||||
|
name=f'top_{i}',
|
||||||
|
breadth=2,
|
||||||
|
depth=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from tractor.spawn._spawn import try_set_start_method
|
||||||
|
try_set_start_method('subint_forkserver')
|
||||||
|
with trio.fail_after(20):
|
||||||
|
trio.run(_main)
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected (current): hangs until `trio.fail_after(20)` fires
|
||||||
|
— children never ack the error-propagation cancel
|
||||||
|
cascade. Pattern: top 2 direct children, 4
|
||||||
|
grandchildren, 1 errorer deadlocks while trying to
|
||||||
|
unwind through its parent chain.
|
||||||
|
|
||||||
|
After fix: `trio.TooSlowError`-free completion; the
|
||||||
|
root's `open_nursery` receives the
|
||||||
|
`BaseExceptionGroup` containing the `AssertionError`
|
||||||
|
from the errorer and unwinds cleanly.
|
||||||
|
|
||||||
|
## Update — 2026-04-23: partial fix landed, deeper layer surfaced
|
||||||
|
|
||||||
|
Three improvements landed as separate commits in the
|
||||||
|
`subint_forkserver_backend` branch (see `git log`):
|
||||||
|
|
||||||
|
1. **`_close_inherited_fds()` in fork-child prelude**
|
||||||
|
(`tractor/spawn/_subint_forkserver.py`). POSIX
|
||||||
|
close-fds-equivalent enumeration via
|
||||||
|
`/proc/self/fd` (or `RLIMIT_NOFILE` fallback), keep
|
||||||
|
only stdio. This is fix-direction (1) from the list
|
||||||
|
above — went with the blunt form rather than the
|
||||||
|
targeted enum-via-`actor.ipc_server` form, turns
|
||||||
|
out the aggressive close is safe because every
|
||||||
|
inheritable resource the fresh child needs
|
||||||
|
(IPC-channel socket, etc.) is opened AFTER the
|
||||||
|
fork anyway.
|
||||||
|
2. **`_ForkedProc.wait()` via `os.pidfd_open()` +
|
||||||
|
`trio.lowlevel.wait_readable()`** — matches the
|
||||||
|
`trio.Process.wait` / `mp.Process.sentinel` pattern
|
||||||
|
used by `trio_proc` and `proc_waiter`. Gives us
|
||||||
|
fully trio-cancellable child-wait (prior impl
|
||||||
|
blocked a cache thread on a sync `os.waitpid` that
|
||||||
|
was NOT trio-cancellable due to
|
||||||
|
`abandon_on_cancel=False`).
|
||||||
|
3. **`_parent_chan_cs` wiring** in
|
||||||
|
`tractor/runtime/_runtime.py`: capture the shielded
|
||||||
|
`loop_cs` for the parent-channel `process_messages`
|
||||||
|
task in `async_main`; explicitly cancel it in
|
||||||
|
`Actor.cancel()` teardown. This breaks the shield
|
||||||
|
during teardown so the parent-chan loop exits when
|
||||||
|
cancel is issued, instead of parking on a parent-
|
||||||
|
socket EOF that might never arrive under fork
|
||||||
|
semantics.
|
||||||
|
|
||||||
|
**Concrete wins from (1):** the sibling
|
||||||
|
`subint_forkserver_orphan_sigint_hang_issue.md` class
|
||||||
|
is **now fixed** — `test_orphaned_subactor_sigint_cleanup_DRAFT`
|
||||||
|
went from strict-xfail to pass. The xfail mark was
|
||||||
|
removed; the test remains as a regression guard.
|
||||||
|
|
||||||
|
**test_nested_multierrors STILL hangs** though.
|
||||||
|
|
||||||
|
### Updated diagnosis (narrowed)
|
||||||
|
|
||||||
|
DIAGDEBUG instrumentation of `process_messages` ENTER/
|
||||||
|
EXIT pairs + `_parent_chan_cs.cancel()` call sites
|
||||||
|
showed (captured during a 20s-timeout repro):
|
||||||
|
|
||||||
|
- 80 `process_messages` ENTERs, 75 EXITs → 5 stuck.
|
||||||
|
- **All 40 `shield=True` ENTERs matched EXIT** — every
|
||||||
|
shielded parent-chan loop exits cleanly. The
|
||||||
|
`_parent_chan_cs` wiring works as intended.
|
||||||
|
- **The 5 stuck loops are all `shield=False`** — peer-
|
||||||
|
channel handlers (inbound connections handled by
|
||||||
|
`handle_stream_from_peer` in stream_handler_tn).
|
||||||
|
- After our `_parent_chan_cs.cancel()` fires, NEW
|
||||||
|
shielded process_messages loops start (on the
|
||||||
|
session reg_addr port — probably discovery-layer
|
||||||
|
reconnection attempts). These don't block teardown
|
||||||
|
(they all exit) but indicate the cancel cascade has
|
||||||
|
more moving parts than expected.
|
||||||
|
|
||||||
|
### Remaining unknown
|
||||||
|
|
||||||
|
Why don't the 5 peer-channel loops exit when
|
||||||
|
`service_tn.cancel_scope.cancel()` fires? They're in
|
||||||
|
`stream_handler_tn` which IS `service_tn` in the
|
||||||
|
current configuration (`open_ipc_server(parent_tn=
|
||||||
|
service_tn, stream_handler_tn=service_tn)`). A
|
||||||
|
standard nursery-scope-cancel should propagate through
|
||||||
|
them — no shield, no special handler. Something
|
||||||
|
specific to the fork-spawned configuration keeps them
|
||||||
|
alive.
|
||||||
|
|
||||||
|
Candidate follow-up experiments:
|
||||||
|
|
||||||
|
- Dump the trio task tree at the hang point (via
|
||||||
|
`stackscope` or direct trio introspection) to see
|
||||||
|
what each stuck loop is awaiting. `chan.__anext__`
|
||||||
|
on a socket recv? An inner lock? A shielded sub-task?
|
||||||
|
- Compare peer-channel handler lifecycle under
|
||||||
|
`trio_proc` vs `subint_forkserver` with equivalent
|
||||||
|
logging to spot the divergence.
|
||||||
|
- Investigate whether the peer handler is caught in
|
||||||
|
the `except trio.Cancelled:` path at
|
||||||
|
`tractor/ipc/_server.py:448` that re-raises — but
|
||||||
|
re-raise means it should still exit. Unless
|
||||||
|
something higher up swallows it.
|
||||||
|
|
||||||
|
### Attempted fix (DID NOT work) — hypothesis (3)
|
||||||
|
|
||||||
|
Tried: in `_serve_ipc_eps` finally, after closing
|
||||||
|
listeners, also iterate `server._peers` and
|
||||||
|
sync-close each peer channel's underlying stream
|
||||||
|
socket fd:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for _uid, _chans in list(server._peers.items()):
|
||||||
|
for _chan in _chans:
|
||||||
|
try:
|
||||||
|
_stream = _chan._transport.stream if _chan._transport else None
|
||||||
|
if _stream is not None:
|
||||||
|
_stream.socket.close() # sync fd close
|
||||||
|
except (AttributeError, OSError):
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
Theory: closing the socket fd from outside the stuck
|
||||||
|
recv task would make the recv see EBADF /
|
||||||
|
ClosedResourceError and unblock.
|
||||||
|
|
||||||
|
Result: `test_nested_multierrors[subint_forkserver]`
|
||||||
|
still hangs identically. Either:
|
||||||
|
- The sync `socket.close()` doesn't propagate into
|
||||||
|
trio's in-flight `recv_some()` the way I expected
|
||||||
|
(trio may hold an internal reference that keeps the
|
||||||
|
fd open even after an external close), or
|
||||||
|
- The stuck recv isn't even the root blocker and the
|
||||||
|
peer handlers never reach the finally for some
|
||||||
|
reason I haven't understood yet.
|
||||||
|
|
||||||
|
Either way, the sync-close hypothesis is **ruled
|
||||||
|
out**. Reverted the experiment, restored the skip-
|
||||||
|
mark on the test.
|
||||||
|
|
||||||
|
### Aside: `-s` flag does NOT change `test_nested_multierrors` behavior
|
||||||
|
|
||||||
|
Tested explicitly: both with and without `-s`, the
|
||||||
|
test hangs identically. So the capture-pipe-fill
|
||||||
|
hypothesis is **ruled out** for this test.
|
||||||
|
|
||||||
|
The earlier `test_context_stream_semantics.py` `-s`
|
||||||
|
observation was most likely caused by a competing
|
||||||
|
pytest run in my session (confirmed via process list
|
||||||
|
— my leftover pytest was alive at that time and
|
||||||
|
could have been holding state on the default
|
||||||
|
registry port).
|
||||||
|
|
||||||
|
## Update — 2026-04-23 (late): cancel delivery ruled in, nursery-wait ruled BLOCKER
|
||||||
|
|
||||||
|
**New diagnostic run** instrumented
|
||||||
|
`handle_stream_from_peer` at ENTER / `except
|
||||||
|
trio.Cancelled:` / finally, plus `Actor.cancel()`
|
||||||
|
just before `self._parent_chan_cs.cancel()`. Result:
|
||||||
|
|
||||||
|
- **40 `handle_stream_from_peer` ENTERs**.
|
||||||
|
- **0 `except trio.Cancelled:` hits** — cancel
|
||||||
|
never fires on any peer-handler.
|
||||||
|
- **35 finally hits** — those handlers exit via
|
||||||
|
peer-initiated EOF (normal return), NOT cancel.
|
||||||
|
- **5 handlers never reach finally** — stuck forever.
|
||||||
|
- **`Actor.cancel()` fired in 12 PIDs** — but the
|
||||||
|
PIDs with peer handlers that DIDN'T fire
|
||||||
|
Actor.cancel are exactly **root + 2 direct
|
||||||
|
spawners**. These 3 actors have peer handlers
|
||||||
|
(for their own subactors) that stay stuck because
|
||||||
|
**`Actor.cancel()` at these levels never runs**.
|
||||||
|
|
||||||
|
### The actual deadlock shape
|
||||||
|
|
||||||
|
`Actor.cancel()` lives in
|
||||||
|
`open_root_actor.__aexit__` / `async_main` teardown.
|
||||||
|
That only runs when the enclosing `async with
|
||||||
|
tractor.open_nursery()` exits. The nursery's
|
||||||
|
`__aexit__` calls the backend `*_proc` spawn target's
|
||||||
|
teardown, which does `soft_kill() →
|
||||||
|
_ForkedProc.wait()` on its child PID. That wait is
|
||||||
|
trio-cancellable via pidfd now (good) — but nothing
|
||||||
|
CANCELS it because the outer scope only cancels when
|
||||||
|
`Actor.cancel()` runs, which only runs when the
|
||||||
|
nursery completes, which waits on the child.
|
||||||
|
|
||||||
|
It's a **multi-level mutual wait**:
|
||||||
|
|
||||||
|
```
|
||||||
|
root blocks on spawner.wait()
|
||||||
|
spawner blocks on grandchild.wait()
|
||||||
|
grandchild blocks on errorer.wait()
|
||||||
|
errorer Actor.cancel() ran, but process
|
||||||
|
may not have fully exited yet
|
||||||
|
(something in root_tn holding on?)
|
||||||
|
```
|
||||||
|
|
||||||
|
Each level waits for the level below. The bottom
|
||||||
|
level (errorer) reaches Actor.cancel(), but its
|
||||||
|
process may not fully exit — meaning its pidfd
|
||||||
|
doesn't go readable, meaning the grandchild's
|
||||||
|
`_ForkedProc.wait()` doesn't return, meaning the grandchild's
|
||||||
|
nursery doesn't unwind, etc. all the way up.
|
||||||
|
|
||||||
|
### Refined question
|
||||||
|
|
||||||
|
**Why does an errorer process not exit after its
|
||||||
|
`Actor.cancel()` completes?**
|
||||||
|
|
||||||
|
Possibilities:
|
||||||
|
1. `_parent_chan_cs.cancel()` fires (shielded
|
||||||
|
parent-chan loop unshielded), but the task is
|
||||||
|
stuck INSIDE the shielded loop's recv in a way
|
||||||
|
that cancel still can't break.
|
||||||
|
2. After `Actor.cancel()` returns, `async_main`
|
||||||
|
still has other tasks in `root_tn` waiting for
|
||||||
|
something that never arrives (e.g. outbound
|
||||||
|
IPC reply delivery).
|
||||||
|
3. The `os._exit(rc)` in `_worker` (at
|
||||||
|
`_subint_forkserver.py`) doesn't run because
|
||||||
|
`_child_target` never returns.
|
||||||
|
|
||||||
|
Next-session candidate probes (in priority order):
|
||||||
|
|
||||||
|
1. **Instrument `_worker`'s fork-child branch** to
|
||||||
|
confirm whether `child_target()` returns (and
|
||||||
|
thus `os._exit(rc)` is reached) for errorer
|
||||||
|
PIDs. If yes → process should die; if no →
|
||||||
|
trace back into `_actor_child_main` /
|
||||||
|
`_trio_main` / `async_main` to find the stuck
|
||||||
|
spot.
|
||||||
|
2. **Instrument `async_main`'s final unwind** to
|
||||||
|
see which await in the teardown doesn't
|
||||||
|
complete.
|
||||||
|
3. **Compare under `trio_proc` backend** at the
|
||||||
|
same `_worker`-equivalent level to see where
|
||||||
|
the flows diverge.
|
||||||
|
|
||||||
|
### Rule-out: NOT a stuck peer-chan recv
|
||||||
|
|
||||||
|
Earlier hypothesis was that the 5 stuck peer-chan
|
||||||
|
loops were blocked on a socket recv that cancel
|
||||||
|
couldn't interrupt. This pass revealed the real
|
||||||
|
cause: cancel **never reaches those tasks** because
|
||||||
|
their owning actor's `Actor.cancel()` never runs.
|
||||||
|
The recvs are fine — they're just parked because
|
||||||
|
nothing is telling them to stop.
|
||||||
|
|
||||||
|
## Update — 2026-04-23 (very late): leaves exit, middle actors stuck in `trio.run`
|
||||||
|
|
||||||
|
Yet another instrumentation pass — this time
|
||||||
|
printing at:
|
||||||
|
|
||||||
|
- `_worker` child branch: `pre child_target()` /
|
||||||
|
`child_target RETURNED rc=N` / `about to
|
||||||
|
os._exit(rc)`
|
||||||
|
- `_trio_main`: `about to trio.run` /
|
||||||
|
`trio.run RETURNED NORMALLY` / `FINALLY`
|
||||||
|
|
||||||
|
**Fresh-run results** (`test_nested_multierrors[
|
||||||
|
subint_forkserver]`, depth=1/breadth=2, 1 root + 14
|
||||||
|
forked = 15 actors total):
|
||||||
|
|
||||||
|
- **10 processes completed the full flow** —
|
||||||
|
`trio.run RETURNED NORMALLY` → `child_target
|
||||||
|
RETURNED rc=0` → `about to os._exit(0)`. These
|
||||||
|
are the LEAVES of the tree (errorer actors) plus
|
||||||
|
their direct parents (depth-0 spawners). They
|
||||||
|
actually exit their processes.
|
||||||
|
- **5 processes are stuck INSIDE `trio.run(trio_main)`**
|
||||||
|
— they hit "about to trio.run" but NEVER see
|
||||||
|
"trio.run RETURNED NORMALLY". These are root +
|
||||||
|
top-level spawners + two intermediates.
|
||||||
|
|
||||||
|
**What this means:** `async_main` itself is the
|
||||||
|
deadlock holder, not the peer-channel loops.
|
||||||
|
Specifically, the outer `async with root_tn:` in
|
||||||
|
`async_main` never exits for the 5 stuck actors.
|
||||||
|
Their `trio.run` never returns → `_trio_main`
|
||||||
|
catch/finally never runs → `_worker` never reaches
|
||||||
|
`os._exit(rc)` → the PROCESS never dies → its
|
||||||
|
parent's `_ForkedProc.wait()` blocks → parent's
|
||||||
|
nursery hangs → parent's `async_main` hangs → ...
|
||||||
|
|
||||||
|
### The new precise question
|
||||||
|
|
||||||
|
**What task in the 5 stuck actors' `async_main`
|
||||||
|
never completes?** Candidates:
|
||||||
|
|
||||||
|
1. The shielded parent-chan `process_messages`
|
||||||
|
task in `root_tn` — but we explicitly cancel it
|
||||||
|
via `_parent_chan_cs.cancel()` in `Actor.cancel()`.
|
||||||
|
However, `Actor.cancel()` only runs during
|
||||||
|
`open_root_actor.__aexit__`, which itself runs
|
||||||
|
only after `async_main`'s outer unwind — which
|
||||||
|
doesn't happen. So the shield isn't broken.
|
||||||
|
|
||||||
|
2. `await actor_nursery._join_procs.wait()` or
|
||||||
|
similar in the inline backend `*_proc` flow.
|
||||||
|
|
||||||
|
3. `_ForkedProc.wait()` on a grandchild that
|
||||||
|
actually DID exit — but the pidfd_open watch
|
||||||
|
didn't fire for some reason (race between
|
||||||
|
pidfd_open and the child exiting?).
|
||||||
|
|
||||||
|
The most specific next probe: **add DIAG around
|
||||||
|
`_ForkedProc.wait()` enter/exit** to see whether
|
||||||
|
the pidfd-based wait returns for every grandchild
|
||||||
|
exit. If a stuck parent's `_ForkedProc.wait()`
|
||||||
|
NEVER returns despite its child exiting, the
|
||||||
|
pidfd mechanism has a race bug under nested
|
||||||
|
forkserver.
|
||||||
|
|
||||||
|
Alternative probe: instrument `async_main`'s outer
|
||||||
|
nursery exits to find which nursery's `__aexit__`
|
||||||
|
is stuck, drilling down from `trio.run` to the
|
||||||
|
specific `async with` that never completes.
|
||||||
|
|
||||||
|
### Cascade summary (updated tree view)
|
||||||
|
|
||||||
|
```
|
||||||
|
ROOT (pytest) STUCK in trio.run
|
||||||
|
├── top_0 (spawner, d=1) STUCK in trio.run
|
||||||
|
│ ├── spawner_0_d1_0 (d=0) exited (os._exit 0)
|
||||||
|
│ │ ├── errorer_0_0 exited (os._exit 0)
|
||||||
|
│ │ └── errorer_0_1 exited (os._exit 0)
|
||||||
|
│ └── spawner_0_d1_1 (d=0) exited (os._exit 0)
|
||||||
|
│ ├── errorer_0_2 exited (os._exit 0)
|
||||||
|
│ └── errorer_0_3 exited (os._exit 0)
|
||||||
|
└── top_1 (spawner, d=1) STUCK in trio.run
|
||||||
|
├── spawner_1_d1_0 (d=0) STUCK in trio.run (sibling race?)
|
||||||
|
│ ├── errorer_1_0 exited
|
||||||
|
│ └── errorer_1_1 exited
|
||||||
|
└── spawner_1_d1_1 (d=0) STUCK in trio.run
|
||||||
|
├── errorer_1_2 exited
|
||||||
|
└── errorer_1_3 exited
|
||||||
|
```
|
||||||
|
|
||||||
|
Grandchildren (d=0 spawners) exit OR stick —
|
||||||
|
asymmetric. Not purely depth-determined. Some race
|
||||||
|
condition in nursery teardown when multiple
|
||||||
|
siblings error simultaneously.
|
||||||
|
|
||||||
|
## Update — 2026-04-23 (late, probe iteration 3): hang pinpointed to `wait_for_no_more_peers()`
|
||||||
|
|
||||||
|
Further DIAGDEBUG at every milestone in `async_main`
|
||||||
|
(runtime UP / EXITED service_tn / EXITED root_tn /
|
||||||
|
FINALLY ENTER / RETURNING) plus `_ForkedProc.wait`
|
||||||
|
ENTER/RETURNED per-pidfd. Result:
|
||||||
|
|
||||||
|
**Every stuck actor reaches `async_main: FINALLY
|
||||||
|
ENTER` but NOT `async_main: RETURNING`.**
|
||||||
|
|
||||||
|
That isolates the hang to a specific await in
|
||||||
|
`async_main`'s finally block at
|
||||||
|
`tractor/runtime/_runtime.py:1837+`. The suspect:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Ensure all peers (actors connected to us as clients) are finished
|
||||||
|
if (ipc_server := actor.ipc_server) and ipc_server.has_peers(check_chans=True):
|
||||||
|
...
|
||||||
|
await ipc_server.wait_for_no_more_peers() # ← UNBOUNDED, blocks forever
|
||||||
|
```
|
||||||
|
|
||||||
|
`_no_more_peers` is an `Event` set only when
|
||||||
|
`server._peers` empties (see
|
||||||
|
`ipc/_server.py:526-530`). If ANY peer-handler is
|
||||||
|
stuck (the 5 unclosed loops from the earlier pass),
|
||||||
|
it keeps its channel in `server._peers`, so the
|
||||||
|
event never fires, so the wait hangs.
|
||||||
|
|
||||||
|
### Applied fix (partial, landed as defensive-in-depth)
|
||||||
|
|
||||||
|
`tractor/runtime/_runtime.py:1981` —
|
||||||
|
`wait_for_no_more_peers()` call now wrapped in
|
||||||
|
`trio.move_on_after(3.0)` + a warning log when the
|
||||||
|
timeout fires. Commented with the full rationale.
|
||||||
|
|
||||||
|
**Verified:** with this fix, ALL 15 actors reach
|
||||||
|
`async_main: RETURNING` cleanly (up from 10/15
|
||||||
|
reaching end before).
|
||||||
|
|
||||||
|
**Unfortunately:** the test still hangs past 45s
|
||||||
|
total — meaning there's YET ANOTHER unbounded wait
|
||||||
|
downstream of `async_main`. The bounded
|
||||||
|
`wait_for_no_more_peers` unblocks one level, but
|
||||||
|
the cascade has another level above it.
|
||||||
|
|
||||||
|
### Candidates for the remaining hang
|
||||||
|
|
||||||
|
1. `open_root_actor`'s own finally / post-
|
||||||
|
`async_main` flow in `_root.py` — specifically
|
||||||
|
`await actor.cancel(None)` which has its own
|
||||||
|
internal waits.
|
||||||
|
2. The `trio.run()` itself doesn't return even
|
||||||
|
after the root task completes because trio's
|
||||||
|
nursery still has background tasks running.
|
||||||
|
3. Maybe `_serve_ipc_eps`'s finally has an await
|
||||||
|
that blocks when peers aren't clearing.
|
||||||
|
|
||||||
|
### Current stance
|
||||||
|
|
||||||
|
- Defensive `wait_for_no_more_peers` bound landed
|
||||||
|
(good hygiene regardless); it also reveals a real
|
||||||
|
deadlock-avoidance gap in tractor's cleanup.
|
||||||
|
- Test still hangs → skip-mark restored on
|
||||||
|
`test_nested_multierrors[subint_forkserver]`.
|
||||||
|
- The full chain of unbounded waits needs another
|
||||||
|
session of drilling, probably at
|
||||||
|
`open_root_actor` / `actor.cancel` level.
|
||||||
|
|
||||||
|
### Summary of this investigation's wins
|
||||||
|
|
||||||
|
1. **FD hygiene fix** (`_close_inherited_fds`) —
|
||||||
|
correct, closed orphan-SIGINT sibling issue.
|
||||||
|
2. **pidfd-based `_ForkedProc.wait`** — cancellable,
|
||||||
|
matches trio_proc pattern.
|
||||||
|
3. **`_parent_chan_cs` wiring** —
|
||||||
|
`Actor.cancel()` now breaks the shielded parent-
|
||||||
|
chan `process_messages` loop.
|
||||||
|
4. **`wait_for_no_more_peers` bounded** —
|
||||||
|
prevents the actor-level finally hang.
|
||||||
|
5. **Ruled-out hypotheses:** tree-kill missing
|
||||||
|
(wrong), stuck socket recv (wrong).
|
||||||
|
6. **Pinpointed remaining unknown:** at least one
|
||||||
|
more unbounded wait in the teardown cascade
|
||||||
|
above `async_main`. Concrete candidates
|
||||||
|
enumerated above.
|
||||||
|
|
||||||
|
## Update — 2026-04-23 (VERY late): pytest capture pipe IS the final gate
|
||||||
|
|
||||||
|
After landing fixes 1-4 and instrumenting every
|
||||||
|
layer down to `tractor_test`'s `trio.run(_main)`:
|
||||||
|
|
||||||
|
**Empirical result: with `pytest -s` the test PASSES
|
||||||
|
in 6.20s.** Without `-s` (default `--capture=fd`) it
|
||||||
|
hangs forever.
|
||||||
|
|
||||||
|
DIAG timeline for the root pytest PID (with `-s`
|
||||||
|
implied from later verification):
|
||||||
|
|
||||||
|
```
|
||||||
|
tractor_test: about to trio.run(_main)
|
||||||
|
open_root_actor: async_main task started, yielding to test body
|
||||||
|
_main: about to await wrapped test fn
|
||||||
|
_main: wrapped RETURNED cleanly ← test body completed!
|
||||||
|
open_root_actor: about to actor.cancel(None)
|
||||||
|
Actor.cancel ENTER req_chan=False
|
||||||
|
Actor.cancel RETURN
|
||||||
|
open_root_actor: actor.cancel RETURNED
|
||||||
|
open_root_actor: outer FINALLY
|
||||||
|
open_root_actor: finally END (returning from ctxmgr)
|
||||||
|
tractor_test: trio.run FINALLY (returned or raised) ← trio.run fully returned!
|
||||||
|
```
|
||||||
|
|
||||||
|
`trio.run()` fully returns. The test body itself
|
||||||
|
completes successfully (pytest.raises absorbed the
|
||||||
|
expected `BaseExceptionGroup`). What blocks is
|
||||||
|
**pytest's own stdout/stderr capture** — under
|
||||||
|
`--capture=fd` default, pytest replaces the parent
|
||||||
|
process's fd 1,2 with pipe write-ends it's reading
|
||||||
|
from. Fork children inherit those pipe fds
|
||||||
|
(because `_close_inherited_fds` correctly preserves
|
||||||
|
stdio). High-volume subactor error-log tracebacks
|
||||||
|
(7+ actors each logging multiple
|
||||||
|
`RemoteActorError`/`ExceptionGroup` tracebacks on
|
||||||
|
the error-propagation cascade) fill the 64KB Linux
|
||||||
|
pipe buffer. Subactor writes block. Subactor can't
|
||||||
|
progress. Process doesn't exit. Parent's
|
||||||
|
`_ForkedProc.wait` (now pidfd-based and
|
||||||
|
cancellable, but nothing's cancelling here since
|
||||||
|
the test body already completed) keeps the pipe
|
||||||
|
reader alive... but pytest isn't draining its end
|
||||||
|
fast enough because test-teardown/fixture-cleanup
|
||||||
|
is in progress.
|
||||||
|
|
||||||
|
**Actually** the exact mechanism is slightly
|
||||||
|
different: pytest's capture fixture MIGHT be
|
||||||
|
actively reading, but faster-than-reader subactors
|
||||||
|
overflow its internal buffer. Or pytest might be
|
||||||
|
blocked itself on the finalization step.
|
||||||
|
|
||||||
|
Either way, `-s` conclusively fixes it.
|
||||||
|
|
||||||
|
### Why I ruled this out earlier (and shouldn't have)
|
||||||
|
|
||||||
|
Earlier in this investigation I tested
|
||||||
|
`test_nested_multierrors` with/without `-s` and
|
||||||
|
both hung. That's because AT THAT TIME, fixes 1-4
|
||||||
|
weren't all in place yet. The test was hanging at
|
||||||
|
multiple deeper levels long before reaching the
|
||||||
|
"generate lots of error-log output" phase. Once
|
||||||
|
the cascade actually tore down cleanly, enough
|
||||||
|
output was produced to hit the capture-pipe limit.
|
||||||
|
|
||||||
|
**Classic order-of-operations mistake in
|
||||||
|
debugging:** ruling something out too early based
|
||||||
|
on a test that was actually failing for a
|
||||||
|
different reason.
|
||||||
|
|
||||||
|
### Fix direction (next session)
|
||||||
|
|
||||||
|
Redirect subactor stdout/stderr to `/dev/null` (or
|
||||||
|
a session-scoped log file) in the fork-child
|
||||||
|
prelude, right after `_close_inherited_fds()`. This
|
||||||
|
severs the inherited pytest-capture pipes and lets
|
||||||
|
subactor output flow elsewhere. Under normal
|
||||||
|
production use (non-pytest), stdout/stderr would
|
||||||
|
be the TTY — we'd want to keep that. So the
|
||||||
|
redirect should be conditional or opt-in via the
|
||||||
|
`child_sigint`/proc_kwargs flag family.
|
||||||
|
|
||||||
|
Alternative: document as a gotcha and recommend
|
||||||
|
`pytest -s` for any tests using the
|
||||||
|
`subint_forkserver` backend with multi-level actor
|
||||||
|
trees. Simpler, user-visible, no code change.
|
||||||
|
|
||||||
|
### Current state
|
||||||
|
|
||||||
|
- Skip-mark on `test_nested_multierrors[subint_forkserver]`
|
||||||
|
restored with reason pointing here.
|
||||||
|
- Test confirmed passing with `-s` after all 4
|
||||||
|
cascade fixes applied.
|
||||||
|
- The 4 cascade fixes are NOT wasted — they're
|
||||||
|
correct hardening regardless of the capture-pipe
|
||||||
|
issue, AND without them we'd never reach the
|
||||||
|
"actually produces enough output to fill the
|
||||||
|
pipe" state.
|
||||||
|
|
||||||
|
## Stopgap (landed)
|
||||||
|
|
||||||
|
`test_nested_multierrors` skip-marked under
|
||||||
|
`subint_forkserver` via
|
||||||
|
`@pytest.mark.skipon_spawn_backend('subint_forkserver',
|
||||||
|
reason='...')`, cross-referenced to this doc. Mark
|
||||||
|
should be dropped once the peer-channel-loop exit
|
||||||
|
issue is fixed.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- `tractor/spawn/_subint_forkserver.py::fork_from_worker_thread`
|
||||||
|
— the primitive whose post-fork FD hygiene is
|
||||||
|
probably the culprit.
|
||||||
|
- `tractor/spawn/_subint_forkserver.py::subint_forkserver_proc`
|
||||||
|
— the backend function that orchestrates the
|
||||||
|
graceful cancel path hitting this bug.
|
||||||
|
- `tractor/spawn/_subint_forkserver.py::_ForkedProc`
|
||||||
|
— the `trio.Process`-compatible shim; NOT the
|
||||||
|
failing component (confirmed via thread-dump).
|
||||||
|
- `tests/test_cancellation.py::test_nested_multierrors`
|
||||||
|
— the test that surfaced the hang.
|
||||||
|
- `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`
|
||||||
|
— sibling hang class; probably same underlying
|
||||||
|
fork-FD-inheritance root cause.
|
||||||
|
- tractor issue #379 — subint backend tracking.
|
||||||
|
|
@ -0,0 +1,184 @@
|
||||||
|
# Revisit `subint_forkserver` thread-cache constraints once msgspec PEP 684 support lands
|
||||||
|
|
||||||
|
Follow-up tracker for cleanup work gated on the msgspec
|
||||||
|
PEP 684 adoption upstream ([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
|
||||||
|
|
||||||
|
Context — why this exists
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
The `tractor.spawn._subint_forkserver` submodule currently
|
||||||
|
carries two "non-trio" thread-hygiene constraints whose
|
||||||
|
necessity is tangled with issues that *should* dissolve
|
||||||
|
under PEP 684 isolated-mode subinterpreters:
|
||||||
|
|
||||||
|
1. `fork_from_worker_thread()` / `run_subint_in_worker_thread()`
|
||||||
|
internally allocate a **dedicated `threading.Thread`**
|
||||||
|
rather than using `trio.to_thread.run_sync()`.
|
||||||
|
2. The test helper is named
|
||||||
|
`run_fork_in_non_trio_thread()` — the
|
||||||
|
`non_trio` qualifier is load-bearing today.
|
||||||
|
|
||||||
|
This doc catalogs *why* those constraints exist, which of
|
||||||
|
them isolated-mode would fix, and what the
|
||||||
|
audit-and-cleanup path looks like once msgspec #563 is
|
||||||
|
resolved.
|
||||||
|
|
||||||
|
The three reasons the constraints exist
|
||||||
|
---------------------------------------
|
||||||
|
|
||||||
|
### 1. GIL-starvation class → fixed by PEP 684 isolated mode
|
||||||
|
|
||||||
|
The class-A hang documented in
|
||||||
|
`subint_sigint_starvation_issue.md` is entirely about
|
||||||
|
legacy-config subints **sharing the main GIL**. Once
|
||||||
|
msgspec #563 lands and tractor flips
|
||||||
|
`tractor.spawn._subint` to
|
||||||
|
`concurrent.interpreters.create()` (isolated config), each
|
||||||
|
subint gets its own GIL. Abandoned subint threads can't
|
||||||
|
contend for main's GIL → can't starve the main trio loop
|
||||||
|
→ signal-wakeup-pipe drains normally → no SIGINT-drop.
|
||||||
|
|
||||||
|
This class of hazard **dissolves entirely**. The
|
||||||
|
non-trio-thread requirement for *this reason* disappears.
|
||||||
|
|
||||||
|
### 2. Destroy race / tstate-recycling → orthogonal; unclear
|
||||||
|
|
||||||
|
The `subint_proc` dedicated-thread fix (commit `26fb8206`)
|
||||||
|
addressed a different issue: `_interpreters.destroy(interp_id)`
|
||||||
|
was blocking on a trio-cache worker that had run an
|
||||||
|
earlier `interp.exec()` for that subint. Working
|
||||||
|
hypothesis at the time was "the cached thread retains the
|
||||||
|
subint's tstate".
|
||||||
|
|
||||||
|
But tstate-handling is **not specific to GIL mode** —
|
||||||
|
`_PyXI_Enter` / `_PyXI_Exit` (the C-level machinery both
|
||||||
|
configs use to enter/leave a subint from a thread) should
|
||||||
|
restore the caller's tstate regardless of GIL config. So
|
||||||
|
isolated mode **doesn't obviously fix this**. It might be:
|
||||||
|
|
||||||
|
- A py3.13 bug fixed in later versions — we saw the race
|
||||||
|
first on 3.13 and never re-tested on 3.14 after moving
|
||||||
|
to dedicated threads.
|
||||||
|
- A genuine CPython quirk around cached threads that
|
||||||
|
exec'd into a subint, persisting across GIL modes.
|
||||||
|
- Something else we misdiagnosed — the empirical fix
|
||||||
|
(dedicated thread) worked but the analysis may have
|
||||||
|
been incomplete.
|
||||||
|
|
||||||
|
Only way to know: once we're on isolated mode, empirically
|
||||||
|
retry `trio.to_thread.run_sync(interp.exec, ...)` and see
|
||||||
|
if `destroy()` still blocks. If it does, keep the
|
||||||
|
dedicated thread; if not, one constraint relaxed.
|
||||||
|
|
||||||
|
### 3. Fork-from-main-interp-tstate (the constraint in this module's helper names)
|
||||||
|
|
||||||
|
The fork-from-main-interp-tstate invariant — CPython's
|
||||||
|
`PyOS_AfterFork_Child` →
|
||||||
|
`_PyInterpreterState_DeleteExceptMain` gate documented in
|
||||||
|
`subint_fork_blocked_by_cpython_post_fork_issue.md` — is
|
||||||
|
about the calling thread's **current** tstate at the
|
||||||
|
moment `os.fork()` runs. If trio's cache threads never
|
||||||
|
enter subints at all, their tstate is plain main-interp,
|
||||||
|
and fork from them would be fine.
|
||||||
|
|
||||||
|
The reason the smoke test +
|
||||||
|
`run_fork_in_non_trio_thread` test helper
|
||||||
|
currently use a dedicated `threading.Thread` is narrow:
|
||||||
|
**we don't want to risk a trio cache thread that has
|
||||||
|
previously been used as a subint driver being the one that
|
||||||
|
picks up the fork job**. If cached tstate doesn't get
|
||||||
|
cleared (back to reason #2), the fork's child-side
|
||||||
|
post-init would see the wrong interp and abort.
|
||||||
|
|
||||||
|
In an isolated-mode world where msgspec works:
|
||||||
|
|
||||||
|
- `subint_proc` would use the public
|
||||||
|
`concurrent.interpreters.create()` + `Interpreter.exec()`
|
||||||
|
/ `Interpreter.close()` — which *should* handle tstate
|
||||||
|
cleanly (they're the "blessed" API).
|
||||||
|
- If so, trio's cache threads are safe to fork from
|
||||||
|
regardless of whether they've previously driven subints.
|
||||||
|
- → the `non_trio` qualifier in
|
||||||
|
`run_fork_in_non_trio_thread` becomes
|
||||||
|
*overcautious* rather than load-bearing, and the
|
||||||
|
dedicated-thread primitives in `_subint_forkserver.py`
|
||||||
|
can likely be replaced with straight
|
||||||
|
`trio.to_thread.run_sync()` wrappers.
|
||||||
|
|
||||||
|
TL;DR
|
||||||
|
-----
|
||||||
|
|
||||||
|
| constraint | fixed by isolated mode? |
|
||||||
|
|---|---|
|
||||||
|
| GIL-starvation (class A) | **yes** |
|
||||||
|
| destroy race on cached worker | unclear — empirical test on py3.14 + isolated API required |
|
||||||
|
| fork-from-main-tstate requirement on worker | **probably yes, conditional on the destroy-race question above** |
|
||||||
|
|
||||||
|
If #2 also resolves on py3.14+ with isolated mode,
|
||||||
|
tractor could drop the `non_trio` qualifier from the fork
|
||||||
|
helper's name and just use `trio.to_thread.run_sync(...)`
|
||||||
|
for everything. But **we shouldn't do that preemptively**
|
||||||
|
— the current cautious design is cheap (one dedicated
|
||||||
|
thread per fork / per subint-exec) and correct.
|
||||||
|
|
||||||
|
Audit plan when msgspec #563 lands
|
||||||
|
----------------------------------
|
||||||
|
|
||||||
|
Assuming msgspec grows `Py_mod_multiple_interpreters`
|
||||||
|
support:
|
||||||
|
|
||||||
|
1. **Flip `tractor.spawn._subint` to isolated mode.** Drop
|
||||||
|
the `_interpreters.create('legacy')` call in favor of
|
||||||
|
the public API (`concurrent.interpreters.create()` +
|
||||||
|
`Interpreter.exec()` / `Interpreter.close()`). Run the
|
||||||
|
three `ai/conc-anal/subint_*_issue.md` reproducers —
|
||||||
|
class-A (`test_stale_entry_is_deleted` etc.) should
|
||||||
|
pass without the `skipon_spawn_backend('subint')` marks
|
||||||
|
(revisit the marker inventory).
|
||||||
|
|
||||||
|
2. **Empirical destroy-race retest.** In `subint_proc`,
|
||||||
|
swap the dedicated `threading.Thread` back to
|
||||||
|
`trio.to_thread.run_sync(Interpreter.exec, ...,
|
||||||
|
abandon_on_cancel=False)` and run the full subint test
|
||||||
|
suite. If `Interpreter.close()` (or the backing
|
||||||
|
destroy) blocks the same way as the legacy version
|
||||||
|
did, revert and keep the dedicated thread.
|
||||||
|
|
||||||
|
3. **If #2 clean**, audit `_subint_forkserver.py`:
|
||||||
|
- Rename `run_fork_in_non_trio_thread` → drop the
|
||||||
|
`_non_trio_` qualifier (e.g. `run_fork_in_thread`) or
|
||||||
|
inline the two-line `trio.to_thread.run_sync` call at
|
||||||
|
the call sites and drop the helper entirely.
|
||||||
|
- Consider whether `fork_from_worker_thread` +
|
||||||
|
`run_subint_in_worker_thread` still warrant being
|
||||||
|
separate module-level primitives or whether they
|
||||||
|
collapse into a compound
|
||||||
|
`trio.to_thread.run_sync`-driven pattern inside the
|
||||||
|
(future) `subint_forkserver_proc` backend.
|
||||||
|
|
||||||
|
4. **Doc fallout.** `subint_sigint_starvation_issue.md`
|
||||||
|
and `subint_cancel_delivery_hang_issue.md` both cite
|
||||||
|
the legacy-GIL-sharing architecture as the root cause.
|
||||||
|
Close them with commit-refs to the isolated-mode
|
||||||
|
migration. This doc itself should get a closing
|
||||||
|
post-mortem section noting which of #1/#2/#3 actually
|
||||||
|
resolved vs persisted.
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
|
||||||
|
- `tractor.spawn._subint_forkserver` — the in-tree module
|
||||||
|
whose constraints this doc catalogs.
|
||||||
|
- `ai/conc-anal/subint_sigint_starvation_issue.md` — the
|
||||||
|
GIL-starvation class.
|
||||||
|
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` —
|
||||||
|
sibling Ctrl-C-able hang class.
|
||||||
|
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
||||||
|
— why fork-from-subint is blocked (this drives the
|
||||||
|
forkserver-via-non-subint-thread workaround).
|
||||||
|
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
|
||||||
|
— empirical validation for the workaround.
|
||||||
|
- [PEP 684 — per-interpreter GIL](https://peps.python.org/pep-0684/)
|
||||||
|
- [PEP 734 — `concurrent.interpreters` public API](https://peps.python.org/pep-0734/)
|
||||||
|
- [jcrist/msgspec#563 — PEP 684 support tracker](https://github.com/jcrist/msgspec/issues/563)
|
||||||
|
- tractor issue #379 — subint backend tracking.
|
||||||
|
|
@ -0,0 +1,350 @@
|
||||||
|
# `subint` backend: abandoned-subint thread can wedge main trio event loop (Ctrl-C unresponsive)
|
||||||
|
|
||||||
|
Follow-up to the Phase B subint spawn-backend PR (see
|
||||||
|
`tractor.spawn._subint`, issue #379). The hard-kill escape
|
||||||
|
hatch we landed (`_HARD_KILL_TIMEOUT`, bounded shields,
|
||||||
|
`daemon=True` driver-thread abandonment) handles *most*
|
||||||
|
stuck-subint scenarios cleanly, but there's one class of
|
||||||
|
hang that can't be fully escaped from within tractor: a
|
||||||
|
still-running abandoned sub-interpreter can starve the
|
||||||
|
**parent's** trio event loop to the point where **SIGINT is
|
||||||
|
effectively dropped by the kernel ↔ Python boundary** —
|
||||||
|
making the pytest process un-Ctrl-C-able.
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
Running `test_stale_entry_is_deleted[subint]` under
|
||||||
|
`--spawn-backend=subint`:
|
||||||
|
|
||||||
|
1. Test spawns a subactor (`transport_fails_actor`) which
|
||||||
|
kills its own IPC server and then
|
||||||
|
`trio.sleep_forever()`.
|
||||||
|
2. Parent tries `Portal.cancel_actor()` → channel
|
||||||
|
disconnected → fast return.
|
||||||
|
3. Nursery teardown triggers our `subint_proc` cancel path.
|
||||||
|
Portal-cancel fails (dead channel),
|
||||||
|
`_HARD_KILL_TIMEOUT` fires, driver thread is abandoned
|
||||||
|
(`daemon=True`), `_interpreters.destroy(interp_id)`
|
||||||
|
raises `InterpreterError` (because the subint is still
|
||||||
|
running).
|
||||||
|
4. Test appears to hang indefinitely at the *outer*
|
||||||
|
`async with tractor.open_nursery() as an:` exit.
|
||||||
|
5. `Ctrl-C` at the terminal does nothing. The pytest
|
||||||
|
process is un-interruptable.
|
||||||
|
|
||||||
|
## Evidence
|
||||||
|
|
||||||
|
### `strace` on the hung pytest process
|
||||||
|
|
||||||
|
```
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(37, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140585542325792
|
||||||
|
```
|
||||||
|
|
||||||
|
Translated:
|
||||||
|
|
||||||
|
- Kernel delivers `SIGINT` to pytest.
|
||||||
|
- CPython's C-level signal handler fires and tries to
|
||||||
|
write the signal number byte (`0x02` = SIGINT) to fd 37
|
||||||
|
— the **Python signal-wakeup fd** (set via
|
||||||
|
`signal.set_wakeup_fd()`, which trio uses to wake its
|
||||||
|
event loop on signals).
|
||||||
|
- Write returns `EAGAIN` — **the pipe is full**. Nothing
|
||||||
|
is draining it.
|
||||||
|
- `rt_sigreturn` with the signal masked off — signal is
|
||||||
|
"handled" from the kernel's perspective but the actual
|
||||||
|
Python-level handler (and therefore trio's
|
||||||
|
`KeyboardInterrupt` delivery) never runs.
|
||||||
|
|
||||||
|
### Stack dump (via `tractor.devx.dump_on_hang`)
|
||||||
|
|
||||||
|
At 20s into the hang, only the **main thread** is visible:
|
||||||
|
|
||||||
|
```
|
||||||
|
Thread 0x...7fdca0191780 [python] (most recent call first):
|
||||||
|
File ".../trio/_core/_io_epoll.py", line 245 in get_events
|
||||||
|
File ".../trio/_core/_run.py", line 2415 in run
|
||||||
|
File ".../tests/discovery/test_registrar.py", line 575 in test_stale_entry_is_deleted
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
No driver thread shows up. The abandoned-legacy-subint
|
||||||
|
thread still exists from the OS's POV (it's still running
|
||||||
|
inside `_interpreters.exec()` driving the subint's
|
||||||
|
`trio.run()` on `trio.sleep_forever()`) but the **main
|
||||||
|
interp's faulthandler can't see threads currently executing
|
||||||
|
inside a sub-interpreter's tstate**. Concretely: the thread
|
||||||
|
is alive, holding state we can't introspect from here.
|
||||||
|
|
||||||
|
## Root cause analysis
|
||||||
|
|
||||||
|
The most consistent explanation for both observations:
|
||||||
|
|
||||||
|
1. **Legacy-config subinterpreters share the main GIL.**
|
||||||
|
PEP 734's public `concurrent.interpreters.create()`
|
||||||
|
defaults to `'isolated'` (per-interp GIL), but tractor
|
||||||
|
uses `_interpreters.create('legacy')` as a workaround
|
||||||
|
for C extensions that don't yet support PEP 684
|
||||||
|
(notably `msgspec`, see
|
||||||
|
[jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
|
||||||
|
Legacy-mode subints share process-global state
|
||||||
|
including the GIL.
|
||||||
|
|
||||||
|
2. **Our abandoned subint thread never exits.** After our
|
||||||
|
hard-kill timeout, `driver_thread.join()` is abandoned
|
||||||
|
via `abandon_on_cancel=True` and the thread is
|
||||||
|
`daemon=True` so proc-exit won't block on it — but the
|
||||||
|
thread *itself* is still alive inside
|
||||||
|
`_interpreters.exec()`, driving a `trio.run()` that
|
||||||
|
will never return (the subint actor is in
|
||||||
|
`trio.sleep_forever()`).
|
||||||
|
|
||||||
|
3. **`_interpreters.destroy()` cannot force-stop a running
|
||||||
|
subint.** It raises `InterpreterError` on any
|
||||||
|
still-running subinterpreter; there is no public
|
||||||
|
CPython API to force-destroy one.
|
||||||
|
|
||||||
|
4. **Shared-GIL + non-terminating subint thread → main
|
||||||
|
trio loop starvation.** Under enough load (the subint's
|
||||||
|
trio event loop iterating in the background, IPC-layer
|
||||||
|
tasks still in the subint, etc.) the main trio event
|
||||||
|
loop can fail to iterate frequently enough to drain its
|
||||||
|
wakeup pipe. Once that pipe fills, `SIGINT` writes from
|
||||||
|
the C signal handler return `EAGAIN` and signals are
|
||||||
|
silently dropped — exactly what `strace` shows.
|
||||||
|
|
||||||
|
The shielded
|
||||||
|
`await actor_nursery._join_procs.wait()` at the top of
|
||||||
|
`subint_proc` (inherited unchanged from the `trio_proc`
|
||||||
|
pattern) is structurally involved too: if main trio *does*
|
||||||
|
get a schedule slice, it'd find the `subint_proc` task
|
||||||
|
parked on `_join_procs` under shield — which traps whatever
|
||||||
|
`Cancelled` arrives. But that's a second-order effect; the
|
||||||
|
signal-pipe-full condition is the primary "Ctrl-C doesn't
|
||||||
|
work" cause.
|
||||||
|
|
||||||
|
## Why we can't fix this from inside tractor
|
||||||
|
|
||||||
|
- **No force-destroy API.** CPython provides neither a
|
||||||
|
`_interpreters.force_destroy()` nor a thread-
|
||||||
|
cancellation primitive (`pthread_cancel` is actively
|
||||||
|
discouraged and unavailable on Windows). A subint stuck
|
||||||
|
in pure-Python loops (or worse, C code that doesn't poll
|
||||||
|
for signals) is structurally unreachable from outside.
|
||||||
|
- **Shared GIL is the root scheduling issue.** As long as
|
||||||
|
we're forced into legacy-mode subints for `msgspec`
|
||||||
|
compatibility, the abandoned-thread scenario is
|
||||||
|
fundamentally a process-global GIL-starvation window.
|
||||||
|
- **`signal.set_wakeup_fd()` is process-global.** Even if
|
||||||
|
we wanted to put our own drainer on the wakeup pipe,
|
||||||
|
only one party owns it at a time.
|
||||||
|
|
||||||
|
## Current workaround
|
||||||
|
|
||||||
|
- **Fixture-side SIGINT loop on the `daemon` subproc** (in
|
||||||
|
this test's `daemon: subprocess.Popen` fixture in
|
||||||
|
`tests/conftest.py`). The daemon dying closes its end of
|
||||||
|
the registry IPC, which unblocks a pending recv in main
|
||||||
|
trio's IPC-server task, which lets the event loop
|
||||||
|
iterate, which drains the wakeup pipe, which finally
|
||||||
|
delivers the test-harness SIGINT.
|
||||||
|
- **Module-level skip on py3.13**
|
||||||
|
(`pytest.importorskip('concurrent.interpreters')`) — the
|
||||||
|
private `_interpreters` C module exists on 3.13 but the
|
||||||
|
multi-trio-task interaction hangs silently there
|
||||||
|
independently of this issue.
|
||||||
|
|
||||||
|
## Path forward
|
||||||
|
|
||||||
|
1. **Primary**: upstream `msgspec` PEP 684 adoption
|
||||||
|
([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)).
|
||||||
|
Unlocks `concurrent.interpreters.create()` isolated
|
||||||
|
mode → per-interp GIL → abandoned subint threads no
|
||||||
|
longer starve the parent's main trio loop. At that
|
||||||
|
point we can flip `_subint.py` back to the public API
|
||||||
|
(`create()` / `Interpreter.exec()` / `Interpreter.close()`)
|
||||||
|
and drop the private `_interpreters` path.
|
||||||
|
|
||||||
|
2. **Secondary**: watch CPython for a public
|
||||||
|
force-destroy primitive. If something like
|
||||||
|
`Interpreter.close(force=True)` lands, we can use it as
|
||||||
|
a hard-kill final stage and actually tear down
|
||||||
|
abandoned subints.
|
||||||
|
|
||||||
|
3. **Harness-level**: document the fixture-side SIGINT
|
||||||
|
loop pattern as the "known workaround" for subint-
|
||||||
|
backend tests that can leave background state holding
|
||||||
|
the main event loop hostage.
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- PEP 734 (`concurrent.interpreters`):
|
||||||
|
<https://peps.python.org/pep-0734/>
|
||||||
|
- PEP 684 (per-interpreter GIL):
|
||||||
|
<https://peps.python.org/pep-0684/>
|
||||||
|
- `msgspec` PEP 684 tracker:
|
||||||
|
<https://github.com/jcrist/msgspec/issues/563>
|
||||||
|
- CPython `_interpretersmodule.c` source:
|
||||||
|
<https://github.com/python/cpython/blob/main/Modules/_interpretersmodule.c>
|
||||||
|
- `tractor.spawn._subint` module docstring (in-tree
|
||||||
|
explanation of the legacy-mode choice and its
|
||||||
|
tradeoffs).
|
||||||
|
|
||||||
|
## Reproducer
|
||||||
|
|
||||||
|
```
|
||||||
|
./py314/bin/python -m pytest \
|
||||||
|
tests/discovery/test_registrar.py::test_stale_entry_is_deleted \
|
||||||
|
--spawn-backend=subint \
|
||||||
|
--tb=short --no-header -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Hangs indefinitely without the fixture-side SIGINT loop;
|
||||||
|
with the loop, the test completes (albeit with the
|
||||||
|
abandoned-thread warning in logs).
|
||||||
|
|
||||||
|
## Additional known-hanging tests (same class)
|
||||||
|
|
||||||
|
All three tests below exhibit the same
|
||||||
|
signal-wakeup-fd-starvation fingerprint (`write() → EAGAIN`
|
||||||
|
on the wakeup pipe after enough SIGINT attempts) and
|
||||||
|
share the same structural cause — abandoned legacy-subint
|
||||||
|
driver threads contending with the main interpreter for
|
||||||
|
the shared GIL until the main trio loop can no longer
|
||||||
|
drain its wakeup pipe fast enough to deliver signals.
|
||||||
|
|
||||||
|
They're listed separately because each exposes the class
|
||||||
|
under a different load pattern worth documenting.
|
||||||
|
|
||||||
|
### `tests/discovery/test_registrar.py::test_stale_entry_is_deleted[subint]`
|
||||||
|
|
||||||
|
Original exemplar — see the **Symptom** and **Evidence**
|
||||||
|
sections above. One abandoned subint
|
||||||
|
(`transport_fails_actor`, stuck in `trio.sleep_forever()`
|
||||||
|
after self-cancelling its IPC server) is sufficient to
|
||||||
|
tip main into starvation once the harness's `daemon`
|
||||||
|
fixture subproc keeps its half of the registry IPC alive.
|
||||||
|
|
||||||
|
### `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep[subint-False]`
|
||||||
|
|
||||||
|
Cancel a grandchild that's in sync Python sleep from 2
|
||||||
|
nurseries up. The test's own docstring declares the
|
||||||
|
dependency: "its parent should issue a 'zombie reaper' to
|
||||||
|
hard kill it after sufficient timeout" — which for
|
||||||
|
`trio`/`mp_*` is an OS-level `SIGKILL` of the grandchild
|
||||||
|
subproc. **Under `subint` there's no equivalent** (no
|
||||||
|
public CPython API to force-destroy a running
|
||||||
|
sub-interpreter), so the grandchild's sync-sleeping
|
||||||
|
`trio.run()` persists inside its abandoned driver thread
|
||||||
|
indefinitely. The nested actor-tree (parent → child →
|
||||||
|
grandchild, all subints) means a single cancel triggers
|
||||||
|
multiple concurrent hard-kill abandonments, each leaving
|
||||||
|
a live driver thread.
|
||||||
|
|
||||||
|
This test often only manifests the starvation under
|
||||||
|
**full-suite runs** rather than solo execution —
|
||||||
|
earlier-in-session subint tests also leave abandoned
|
||||||
|
driver threads behind, and the combined population is
|
||||||
|
what actually tips main trio into starvation. Solo runs
|
||||||
|
may stay Ctrl-C-able with fewer abandoned threads in the
|
||||||
|
mix.
|
||||||
|
|
||||||
|
### `tests/test_cancellation.py::test_multierror_fast_nursery[subint-25-0.5]`
|
||||||
|
|
||||||
|
Nursery-error-path throughput stress-test parametrized
|
||||||
|
for **25 concurrent subactors**. When the multierror
|
||||||
|
fires and the nursery cancels, every subactor goes
|
||||||
|
through our `subint_proc` teardown. The bounded
|
||||||
|
hard-kills run in parallel (all `subint_proc` tasks are
|
||||||
|
sibling trio tasks), so the timeout budget is ~3s total
|
||||||
|
rather than 3s × 25. After that, **25 abandoned
|
||||||
|
`daemon=True` driver threads are simultaneously alive** —
|
||||||
|
an extreme pressure multiplier on the same mechanism.
|
||||||
|
|
||||||
|
The `strace` fingerprint is striking under this load: six
|
||||||
|
or more **successful** `write(16, "\2", 1) = 1` calls
|
||||||
|
(main trio getting brief GIL slices, each long enough to
|
||||||
|
drain exactly one wakeup-pipe byte) before finally
|
||||||
|
saturating with `EAGAIN`:
|
||||||
|
|
||||||
|
```
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = 1
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = 1
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = 1
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = 1
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = 1
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = 1
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 140141623162400
|
||||||
|
```
|
||||||
|
|
||||||
|
Those successful writes indicate CPython's
|
||||||
|
`sys.getswitchinterval()`-based GIL round-robin *is*
|
||||||
|
giving main brief slices — just never long enough to run
|
||||||
|
the Python-level signal handler through to the point
|
||||||
|
where trio converts the delivered SIGINT into a
|
||||||
|
`Cancelled` on the appropriate scope. Once the
|
||||||
|
accumulated write rate outpaces main's drain rate, the
|
||||||
|
pipe saturates and subsequent signals are silently
|
||||||
|
dropped.
|
||||||
|
|
||||||
|
The `pstree` below (pid `530060` = hung `pytest`) shows
|
||||||
|
the subint-driver thread population at the moment of
|
||||||
|
capture. Even with fewer than the full 25 shown (pstree
|
||||||
|
truncates thread names to `subint-driver[<interp_id>` —
|
||||||
|
interpreters `3` and `4` visible across 16 thread
|
||||||
|
entries), the GIL-contender count is more than enough to
|
||||||
|
explain the starvation:
|
||||||
|
|
||||||
|
```
|
||||||
|
>>> pstree -snapt 530060
|
||||||
|
systemd,1 --switched-root --system --deserialize=40
|
||||||
|
└─login,1545 --
|
||||||
|
└─bash,1872
|
||||||
|
└─sway,2012
|
||||||
|
└─alacritty,70471 -e xonsh
|
||||||
|
└─xonsh,70487 .../bin/xonsh
|
||||||
|
└─uv,70955 run xonsh
|
||||||
|
└─xonsh,70959 .../py314/bin/xonsh
|
||||||
|
└─python,530060 .../py314/bin/pytest -v tests/test_cancellation.py --spawn-backend=subint
|
||||||
|
├─{subint-driver[3},531857
|
||||||
|
├─{subint-driver[3},531860
|
||||||
|
├─{subint-driver[3},531862
|
||||||
|
├─{subint-driver[3},531866
|
||||||
|
├─{subint-driver[3},531877
|
||||||
|
├─{subint-driver[3},531882
|
||||||
|
├─{subint-driver[3},531884
|
||||||
|
├─{subint-driver[3},531945
|
||||||
|
├─{subint-driver[3},531950
|
||||||
|
├─{subint-driver[3},531952
|
||||||
|
├─{subint-driver[4},531956
|
||||||
|
├─{subint-driver[4},531959
|
||||||
|
├─{subint-driver[4},531961
|
||||||
|
├─{subint-driver[4},531965
|
||||||
|
├─{subint-driver[4},531968
|
||||||
|
└─{subint-driver[4},531979
|
||||||
|
```
|
||||||
|
|
||||||
|
(`pstree` uses `{...}` to denote threads rather than
|
||||||
|
processes — these are all the **driver OS-threads** our
|
||||||
|
`subint_proc` creates with name
|
||||||
|
`f'subint-driver[{interp_id}]'`. Every one of them is
|
||||||
|
still alive, executing `_interpreters.exec()` inside a
|
||||||
|
sub-interpreter our hard-kill has abandoned. At 16+
|
||||||
|
abandoned driver threads competing for the main GIL, the
|
||||||
|
main-interpreter trio loop gets starved and signal
|
||||||
|
delivery stalls.)
|
||||||
|
|
@ -0,0 +1,54 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
session: (ad-hoc, not tracked via conf.toml)
|
||||||
|
timestamp: 2026-04-06T17:28:48Z
|
||||||
|
git_ref: 02b2ef1
|
||||||
|
scope: tests
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260406T172848Z_02b2ef1_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
User asked to extend `tests/test_resource_cache.py` with a test
|
||||||
|
that reproduces the edge case fixed in commit `02b2ef18` (per-key
|
||||||
|
locking+user tracking in `maybe_open_context()`). The bug was
|
||||||
|
originally triggered in piker's `brokerd.kraken` backend where the
|
||||||
|
same `acm_func` was called with different kwargs, and the old
|
||||||
|
global `_Cache.users` counter caused:
|
||||||
|
|
||||||
|
- teardown skipped for one `ctx_key` because another key's users kept
|
||||||
|
the global count > 0
|
||||||
|
- re-entry hitting `assert not resources.get(ctx_key)` during the
|
||||||
|
teardown window
|
||||||
|
|
||||||
|
User requested a test that would fail under the old code and pass
|
||||||
|
with the fix.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Designed and implemented `test_per_ctx_key_resource_lifecycle`
|
||||||
|
which verifies per-`ctx_key` resource isolation by:
|
||||||
|
|
||||||
|
1. Holding resource `'a'` open in a bg task
|
||||||
|
2. Opening+closing resource `'b'` (same `acm_func`, different
|
||||||
|
kwargs) while `'a'` is still alive
|
||||||
|
3. Re-opening `'b'` and asserting cache MISS — proving `'b'` was
|
||||||
|
torn down independently despite `'a'` keeping its own user
|
||||||
|
count > 0
|
||||||
|
|
||||||
|
With the old global counter, step 3 would produce a stale cache
|
||||||
|
HIT (leaked resource) or crash on the assert.
|
||||||
|
|
||||||
|
Also added a trivial `acm_with_resource(resource_id)` ACM helper
|
||||||
|
at module level.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `tests/test_resource_cache.py` — add `acm_with_resource` ACM +
|
||||||
|
`test_per_ctx_key_resource_lifecycle` test fn
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated (pending user review).
|
||||||
|
|
@ -0,0 +1,57 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
session: (ad-hoc, not tracked via conf.toml)
|
||||||
|
timestamp: 2026-04-06T19:31:25Z
|
||||||
|
git_ref: 85f9c5d
|
||||||
|
scope: tests
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260406T193125Z_85f9c5d_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
User asked to reproduce the exact `assert not resources.get(ctx_key)`
|
||||||
|
crash originally seen in piker's `brokerd.kraken` backend via
|
||||||
|
`open_cached_client('kraken')`. Key constraints from user:
|
||||||
|
|
||||||
|
- In piker, kwargs were the **same** (empty) so all callers
|
||||||
|
share one `ctx_key = (fid, ())`
|
||||||
|
- The root issue is `_Cache.locks` being indexed by `fid`
|
||||||
|
rather than `ctx_key`, plus the race window between
|
||||||
|
`values.pop()` and `resources.pop()` in `_Cache.run_ctx`
|
||||||
|
(the acm `__aexit__` has checkpoints in between)
|
||||||
|
- Mark the test `@pytest.mark.xfail` so it can be committed
|
||||||
|
against the unpatched code and later adjusted once the fix
|
||||||
|
from `9e49eddd` is applied
|
||||||
|
|
||||||
|
User pointed to piker's `open_cached_client('kraken')` usage
|
||||||
|
as the real-world pattern to model.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Added `test_moc_reentry_during_teardown` which deterministically
|
||||||
|
reproduces the teardown race:
|
||||||
|
|
||||||
|
1. A `cached_client()` acm (no params, like
|
||||||
|
`kraken.api.get_client()`) signals via `in_aexit` event
|
||||||
|
when its `__aexit__` starts, then sleeps to hold the
|
||||||
|
window open
|
||||||
|
2. `use_and_exit` task enters+exits the cached resource
|
||||||
|
3. `reenter_during_teardown` waits on `in_aexit` then
|
||||||
|
re-enters — landing exactly in the window where
|
||||||
|
`values` is popped but `resources` still exists
|
||||||
|
4. The old `assert not resources.get(ctx_key)` fires
|
||||||
|
5. `@pytest.mark.xfail(raises=AssertionError)` catches it
|
||||||
|
|
||||||
|
Test confirmed: `XFAIL` with the exact traceback at
|
||||||
|
`_mngrs.py:327`.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `tests/test_resource_cache.py` — add
|
||||||
|
`test_moc_reentry_during_teardown` xfail test fn
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated (pending user review).
|
||||||
|
|
@ -0,0 +1,52 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
session: (ad-hoc, not tracked via conf.toml)
|
||||||
|
timestamp: 2026-04-06T22:26:48Z
|
||||||
|
git_ref: f086222
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260406T222648Z_f086222_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
User applied the per-key locking patch (`f086222d`),
|
||||||
|
removed the `@pytest.mark.xfail` from
|
||||||
|
`test_moc_reentry_during_teardown`, and found the
|
||||||
|
test STILL fails — now with `RuntimeError` instead
|
||||||
|
of `AssertionError`. Asked to diagnose why the fix
|
||||||
|
doesn't actually resolve this race.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Traced the exact scheduling sequence across three
|
||||||
|
concurrent tasks to show that the per-`ctx_key`
|
||||||
|
locking change doesn't close the fundamental gap:
|
||||||
|
|
||||||
|
- `maybe_open_context`'s finally pops the lock and
|
||||||
|
sets `no_more_users` in the **exiting caller's
|
||||||
|
task**
|
||||||
|
- But the actual resource cleanup (`values.pop`,
|
||||||
|
acm `__aexit__`, `resources.pop`) runs in the
|
||||||
|
**`run_ctx` task** inside `service_tn`
|
||||||
|
- A re-entering task creates a fresh lock (old one
|
||||||
|
popped), acquires immediately, and hits stale
|
||||||
|
`resources[ctx_key]` while `run_ctx` is still in
|
||||||
|
`__aexit__`
|
||||||
|
|
||||||
|
Identified three possible fix strategies:
|
||||||
|
1. Let `run_ctx` own lock release/pop after
|
||||||
|
`resources.pop()`
|
||||||
|
2. Add a "teardown complete" event that new callers
|
||||||
|
wait on
|
||||||
|
3. Move `resources.pop()` into `run_ctx`'s inner
|
||||||
|
finally (before acm `__aexit__`)
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
No code changes — analysis only.
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
N/A — diagnostic output, no code generated.
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-06T22:26:48Z
|
||||||
|
git_ref: f086222
|
||||||
|
diff_cmd: git diff f086222~1..f086222
|
||||||
|
---
|
||||||
|
|
||||||
|
Here's the problem. The per-key locking doesn't fix this race because the **lock lifetime is decoupled from `run_ctx`'s lifetime**. Trace through what happens:
|
||||||
|
|
||||||
|
**`maybe_open_context` finally** (lines 468-495):
|
||||||
|
1. `users[ctx_key] -= 1` → 0
|
||||||
|
2. `no_more_users.set()` → wakes `run_ctx` (but it doesn't run yet, just becomes ready)
|
||||||
|
3. `_Cache.locks.pop(ctx_key)` → **lock is gone**
|
||||||
|
4. `maybe_open_context` exits
|
||||||
|
|
||||||
|
**`run_ctx`** now gets scheduled (lines 237-241):
|
||||||
|
5. Inner finally: `values.pop(ctx_key)` → values gone
|
||||||
|
6. `mng.__aexit__()` runs → `in_aexit.set()`, `await trio.sleep(10)` → **CHECKPOINT**
|
||||||
|
|
||||||
|
**`reenter_during_teardown`** wakes:
|
||||||
|
7. Enters `maybe_open_context` → no lock exists for `ctx_key` → creates **fresh** lock, acquires immediately
|
||||||
|
8. `values[ctx_key]` → `KeyError`
|
||||||
|
9. `resources.get(ctx_key)` → **still exists!** (step 6 hasn't reached `resources.pop()` in `run_ctx`'s outer finally yet)
|
||||||
|
10. `raise RuntimeError('Caching resources ALREADY exist?!')`
|
||||||
|
|
||||||
|
The core issue: `no_more_users.set()` (step 2) and `locks.pop()` (step 3) happen in the **exiting caller's task**, but the actual resource cleanup (`values.pop`, acm `__aexit__`, `resources.pop`) happens in the **`run_ctx` task** inside `service_tn`. There's no synchronization between "lock removed, new callers welcome" and "run_ctx has actually finished tearing down". The per-`ctx_key` lock change just made the lock more granular — it didn't close this gap.
|
||||||
|
|
||||||
|
The fix needs to ensure a new caller can't acquire a lock for `ctx_key` until `run_ctx` has fully exited (including `resources.pop()`). Either:
|
||||||
|
- **don't pop the lock in `maybe_open_context`** — let `run_ctx` release/pop it after `resources.pop()`
|
||||||
|
- **wait on a "teardown complete" event** in the cache-miss path when stale `resources` exist
|
||||||
|
- **pop `resources` before the acm `__aexit__`** (move `resources.pop()` into the inner finally alongside `values.pop()`)
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
session: (ad-hoc, not tracked via conf.toml)
|
||||||
|
timestamp: 2026-04-13T19:21:16Z
|
||||||
|
git_ref: f851f28
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
User provided a detailed implementation plan for
|
||||||
|
a new `test_tpt_bind_addrs.py` test suite covering
|
||||||
|
`open_root_actor(tpt_bind_addrs=...)` — the three
|
||||||
|
runtime code paths in `_root.py:385-450`:
|
||||||
|
|
||||||
|
1. Non-registrar, no explicit bind -> random addrs
|
||||||
|
2. Registrar, no explicit bind -> registry_addrs
|
||||||
|
3. Explicit bind given -> `wrap_address()` + merge
|
||||||
|
|
||||||
|
Plan specified 6 test functions (~10 parametrized
|
||||||
|
variants), predicted a type-mixing bug at line 446,
|
||||||
|
and asked for an in-flight fix if confirmed.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Created `tests/discovery/test_tpt_bind_addrs.py`
|
||||||
|
with 9 collected test variants across 6 functions:
|
||||||
|
|
||||||
|
- `test_registrar_root_tpt_bind_addrs` (3 variants:
|
||||||
|
`bind-eq-reg`, `bind-subset-reg`,
|
||||||
|
`bind-disjoint-reg`)
|
||||||
|
- `test_non_registrar_root_tpt_bind_addrs`
|
||||||
|
(2 variants: `bind-same-bindspace`,
|
||||||
|
`bind-disjoint`)
|
||||||
|
- `test_non_registrar_default_random_bind`
|
||||||
|
(baseline, no explicit bind)
|
||||||
|
- `test_tpt_bind_addrs_as_maddr_str`
|
||||||
|
(multiaddr string input)
|
||||||
|
- `test_registrar_merge_binds_union`
|
||||||
|
(registrar + disjoint bind -> union)
|
||||||
|
- `test_open_nursery_forwards_tpt_bind_addrs`
|
||||||
|
(`open_nursery(**kwargs)` forwarding)
|
||||||
|
|
||||||
|
Confirmed and fixed the predicted bug at
|
||||||
|
`_root.py:446`: the registrar merge path mixed
|
||||||
|
`Address` objects (`tpt_bind_addrs`) with raw tuples
|
||||||
|
(`uw_reg_addrs`) inside `set()`, preventing
|
||||||
|
deduplication and causing double-bind `OSError`.
|
||||||
|
|
||||||
|
Fix: wrap `uw_reg_addrs` before the set union:
|
||||||
|
```python
|
||||||
|
# before (broken)
|
||||||
|
tpt_bind_addrs = list(set(
|
||||||
|
tpt_bind_addrs + uw_reg_addrs
|
||||||
|
))
|
||||||
|
# after (fixed)
|
||||||
|
tpt_bind_addrs = list(set(
|
||||||
|
tpt_bind_addrs
|
||||||
|
+ [wrap_address(a) for a in uw_reg_addrs]
|
||||||
|
))
|
||||||
|
```
|
||||||
|
|
||||||
|
All 9 tests pass after the fix.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `tests/discovery/test_tpt_bind_addrs.py` (new)
|
||||||
|
- `tractor/_root.py:446` (bug fix, 1 line)
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
N/A — pending review.
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
session: 76154e65-d8e1-4b5f-9275-0ea45ba7e98a
|
||||||
|
timestamp: 2026-04-13T20:50:48Z
|
||||||
|
git_ref: 269d939c
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260413T205048Z_269d939c_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Implement a `parse_endpoints()` API in
|
||||||
|
`tractor.discovery._multiaddr` that lets downstream
|
||||||
|
projects declare per-actor transport bind addresses
|
||||||
|
as a service table of actor-name -> multiaddr
|
||||||
|
strings (loaded from config, e.g. TOML `[network]`
|
||||||
|
section). Add type aliases `EndpointsTable` and
|
||||||
|
`ParsedEndpoints`, plus 7 unit tests covering TCP,
|
||||||
|
UDS, tuples, mixed input, unsupported protocols, and
|
||||||
|
edge cases.
|
||||||
|
|
||||||
|
A detailed implementation plan was provided
|
||||||
|
specifying insertion points, type signatures, test
|
||||||
|
names, and verification commands.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Add `parse_endpoints()` function with two supporting
|
||||||
|
type aliases to `_multiaddr.py`. The function
|
||||||
|
iterates a `dict[str, list[str|tuple]]` service
|
||||||
|
table and delegates each entry to the existing
|
||||||
|
`wrap_address()` helper, which already handles maddr
|
||||||
|
strings, raw tuples, and pre-wrapped `Address`
|
||||||
|
objects. Seven pure unit tests (no trio runtime)
|
||||||
|
validate all input variants and edge cases. All 22
|
||||||
|
multiaddr tests pass.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `tractor/discovery/_multiaddr.py` — add
|
||||||
|
`EndpointsTable`, `ParsedEndpoints` type aliases
|
||||||
|
and `parse_endpoints()` function after line 133
|
||||||
|
- `tests/discovery/test_multiaddr.py` — add
|
||||||
|
`parse_endpoints` import and 7 new test functions
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated.
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-13T20:50:48Z
|
||||||
|
git_ref: 269d939c
|
||||||
|
diff_cmd: git diff 269d939c~1..269d939c
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation output
|
||||||
|
|
||||||
|
### `tractor/discovery/_multiaddr.py`
|
||||||
|
|
||||||
|
> `git diff 269d939c~1..269d939c -- tractor/discovery/_multiaddr.py`
|
||||||
|
|
||||||
|
Type aliases `EndpointsTable` and `ParsedEndpoints`
|
||||||
|
plus `parse_endpoints()` helper that wraps each
|
||||||
|
entry via `wrap_address()`.
|
||||||
|
|
||||||
|
### `tests/discovery/test_multiaddr.py`
|
||||||
|
|
||||||
|
> `git diff 269d939c~1..269d939c -- tests/discovery/test_multiaddr.py`
|
||||||
|
|
||||||
|
Add import of `parse_endpoints` to existing import
|
||||||
|
block, then 7 new test functions.
|
||||||
|
|
||||||
|
### Test run output
|
||||||
|
|
||||||
|
```
|
||||||
|
22 passed, 1 warning in 0.05s
|
||||||
|
```
|
||||||
|
|
||||||
|
All 22 tests pass (15 existing + 7 new).
|
||||||
|
|
@ -0,0 +1,38 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
session: multiaddr-support-rename-prefer
|
||||||
|
timestamp: 2026-04-14T16:33:00Z
|
||||||
|
git_ref: befedc49
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260414T163300Z_befedc49_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Create a helper function that determines the best transport given
|
||||||
|
actor locality (distributed vs same host). Use PID/hostname
|
||||||
|
comparison for locality detection, apply at registry addr selection
|
||||||
|
only (not spawn-time).
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
New `prefer_addr()` + `_is_local_addr()` helpers
|
||||||
|
in `_api.py` using `socket.getaddrinfo()` and
|
||||||
|
`ipaddress` for PID/hostname locality detection.
|
||||||
|
Preference: UDS > local TCP > remote TCP.
|
||||||
|
Integrated into `query_actor()` and
|
||||||
|
`wait_for_actor()`. Also changed
|
||||||
|
`Registrar.find_actor()` to return full addr list
|
||||||
|
so callers can apply preference.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `tractor/discovery/_discovery.py` → `_api.py`
|
||||||
|
— renamed + added `prefer_addr()`,
|
||||||
|
`_is_local_addr()`; updated `query_actor()` and
|
||||||
|
`wait_for_actor()` call sites
|
||||||
|
- `tractor/discovery/_registry.py`
|
||||||
|
— `Registrar.find_actor()` returns
|
||||||
|
`list[UnwrappedAddress]|None`
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-6
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-14T16:33:00Z
|
||||||
|
git_ref: befedc49
|
||||||
|
diff_cmd: git diff befedc49~1..befedc49
|
||||||
|
---
|
||||||
|
|
||||||
|
### `tractor/discovery/_api.py`
|
||||||
|
|
||||||
|
> `git diff befedc49~1..befedc49 -- tractor/discovery/_api.py`
|
||||||
|
|
||||||
|
Add `_is_local_addr()` and `prefer_addr()` transport
|
||||||
|
preference helpers.
|
||||||
|
|
||||||
|
#### `_is_local_addr(addr: Address) -> bool`
|
||||||
|
|
||||||
|
Determines whether an `Address` is reachable on the
|
||||||
|
local host:
|
||||||
|
|
||||||
|
- `UDSAddress`: always returns `True`
|
||||||
|
(filesystem-bound, inherently local)
|
||||||
|
- `TCPAddress`: checks if `._host` is a loopback IP
|
||||||
|
via `ipaddress.ip_address().is_loopback`, then
|
||||||
|
falls back to comparing against the machine's own
|
||||||
|
interface IPs via
|
||||||
|
`socket.getaddrinfo(socket.gethostname(), None)`
|
||||||
|
|
||||||
|
#### `prefer_addr(addrs: list[UnwrappedAddress]) -> UnwrappedAddress`
|
||||||
|
|
||||||
|
Selects the "best" transport address from a
|
||||||
|
multihomed actor's address list. Wraps each
|
||||||
|
candidate via `wrap_address()` to get typed
|
||||||
|
`Address` objects, then classifies into three tiers:
|
||||||
|
|
||||||
|
1. **UDS** (same-host guaranteed, lowest overhead)
|
||||||
|
2. **TCP loopback / same-host IP** (local network)
|
||||||
|
3. **TCP remote** (only option for distributed)
|
||||||
|
|
||||||
|
Within each tier, the last-registered (latest) entry
|
||||||
|
is preferred. Falls back to `addrs[-1]` if no
|
||||||
|
heuristic matches.
|
||||||
|
|
||||||
|
### `tractor/discovery/_registry.py`
|
||||||
|
|
||||||
|
> `git diff befedc49~1..befedc49 -- tractor/discovery/_registry.py`
|
||||||
|
|
||||||
|
`Registrar.find_actor()` return type broadened from
|
||||||
|
single addr to `list[UnwrappedAddress]|None` — full
|
||||||
|
addr list lets callers apply transport preference.
|
||||||
|
|
||||||
|
#### Integration
|
||||||
|
|
||||||
|
`query_actor()` and `wait_for_actor()` now call
|
||||||
|
`prefer_addr(addrs)` instead of `addrs[-1]`.
|
||||||
|
|
||||||
|
### Verification
|
||||||
|
|
||||||
|
All discovery tests pass (13/13 non-daemon).
|
||||||
|
`test_local.py` and `test_multi_program.py` also
|
||||||
|
pass (daemon fixture teardown failures are
|
||||||
|
pre-existing and unrelated).
|
||||||
|
|
@ -0,0 +1,101 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
session: subints-spawner-design-kickoff
|
||||||
|
timestamp: 2026-04-17T03:49:18Z
|
||||||
|
git_ref: 9703210
|
||||||
|
scope: docs
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260417T034918Z_9703210_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Drive the "first big boi, from GH issue" task seeded by
|
||||||
|
`ai/prompt-io/prompts/subints_spawner.md`: design, plan
|
||||||
|
and implement sub-interpreter (subint) spawn-backend
|
||||||
|
support per issue #379, including (1) modularizing
|
||||||
|
`tractor.spawn._spawn` into per-backend submods, (2) a new
|
||||||
|
`._subint` backend, and (3) harness parametrization via the
|
||||||
|
existing `--spawn-backend` / `start_method` pytest fixture
|
||||||
|
in `tractor._testing.pytest`.
|
||||||
|
|
||||||
|
Follow-up clarifications from the user (this turn):
|
||||||
|
1. Pin `<3.15` on this dev branch and feature-gate subint
|
||||||
|
tests — chose option (a).
|
||||||
|
2. Split Phase A (modularization) into its own PR first.
|
||||||
|
3. Defer the `fork()`-via-subint hack to a follow-up.
|
||||||
|
4. Harness flag is `pytest --spawn-backend <key>` CLI →
|
||||||
|
`start_method` fixture (prompt file updated to match).
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Produced a three-phase plan and a concrete Phase A (pure
|
||||||
|
modularization) file-split plan for user review; no code
|
||||||
|
written yet — the green-light to start Phase A was given
|
||||||
|
in this same turn conditional on logging this prompt-io
|
||||||
|
entry first.
|
||||||
|
|
||||||
|
Phases:
|
||||||
|
- **A — modularize** `tractor/spawn/_spawn.py` (847 LOC):
|
||||||
|
keep generic machinery in `_spawn.py`, extract
|
||||||
|
`trio_proc` → `spawn/_trio.py`, `mp_proc` →
|
||||||
|
`spawn/_mp.py`. No pin bump.
|
||||||
|
- **B — `_subint` backend**: bump `pyproject.toml`
|
||||||
|
`requires-python` upper to `<3.15`; add `'subint'` to
|
||||||
|
`SpawnMethodKey`; reuse existing UDS transport; shm
|
||||||
|
escape-hatch deferred.
|
||||||
|
- **C — harness**: drive the valid-backend tuple in
|
||||||
|
`tractor/_testing/pytest.py:345-349` from
|
||||||
|
`typing.get_args(SpawnMethodKey)`; skip subint tests on
|
||||||
|
Python < 3.14.
|
||||||
|
|
||||||
|
Key findings surfaced to the user:
|
||||||
|
- `pyproject.toml:12` currently pins `<3.14`; PEP 734
|
||||||
|
`concurrent.interpreters` only ships as of 3.14 — the
|
||||||
|
load-bearing constraint.
|
||||||
|
- `_testing/pytest.py:345-349` hardcodes valid backends
|
||||||
|
as a string tuple (`'mp_spawn'`, `'mp_forkserver'`,
|
||||||
|
`'trio'`) — should be `get_args(SpawnMethodKey)`.
|
||||||
|
- `_testing/pytest.py:228` already imports
|
||||||
|
`try_set_start_method` from `tractor.spawn._spawn` —
|
||||||
|
keeping the `_spawn.py` path as the "core" module
|
||||||
|
avoids breaking external refs during Phase A.
|
||||||
|
|
||||||
|
Phase A commit breakdown (each green via `/run-tests`):
|
||||||
|
1. Drive `_testing/pytest.py` backend-tuple from
|
||||||
|
`get_args(SpawnMethodKey)` — prep.
|
||||||
|
2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
|
||||||
|
registry imports it.
|
||||||
|
3. Extract `mp_proc` → `spawn/_mp.py`; `_methods` imports
|
||||||
|
it.
|
||||||
|
4. Tidy `spawn/__init__.py` docstring for the new layout.
|
||||||
|
|
||||||
|
Workflow: Phase A runs in a new worktree via `/open-wkt
|
||||||
|
spawn_modularize` (branch `wkt/spawn_modularize`) and
|
||||||
|
lands as its own PR before Phase B/C work resumes on the
|
||||||
|
current `claude_bun_opus4.7_subint_spawner_backend`
|
||||||
|
branch.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
None this turn — design/planning only. The user updated
|
||||||
|
`ai/prompt-io/prompts/subints_spawner.md` themselves to
|
||||||
|
complete the truncated sentence on line 32 (harness-flag
|
||||||
|
reference).
|
||||||
|
|
||||||
|
Upcoming (Phase A, next turn):
|
||||||
|
- `tractor/spawn/_spawn.py` — trimmed to "core"
|
||||||
|
- `tractor/spawn/_trio.py` — new; holds `trio_proc()`
|
||||||
|
- `tractor/spawn/_mp.py` — new; holds `mp_proc()`
|
||||||
|
- `tractor/spawn/__init__.py` — docstring refresh
|
||||||
|
- `tractor/_testing/pytest.py` — backend tuple from
|
||||||
|
`get_args(SpawnMethodKey)`
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated. User completed the
|
||||||
|
truncated sentence on line 32 of
|
||||||
|
`ai/prompt-io/prompts/subints_spawner.md` themselves
|
||||||
|
*before* this prompt-io log was written; that edit was
|
||||||
|
the user's authoring, not an edit to AI output.
|
||||||
|
|
@ -0,0 +1,117 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-17T03:49:18Z
|
||||||
|
git_ref: 9703210
|
||||||
|
diff_cmd: git diff HEAD~1..HEAD
|
||||||
|
---
|
||||||
|
|
||||||
|
No code was produced in this turn — this is a design/planning
|
||||||
|
exchange for subint spawner support per issue #379. Captured
|
||||||
|
verbatim below are the two substantive AI responses, which
|
||||||
|
together settle the phased plan (A: modularize, B: subint, C:
|
||||||
|
harness) and the concrete Phase A file split before any
|
||||||
|
implementation begins.
|
||||||
|
|
||||||
|
## Turn 1 — initial triage + gating questions
|
||||||
|
|
||||||
|
> Read: issue #379, current `spawn/_spawn.py` (847 lines),
|
||||||
|
> and pyproject pins.
|
||||||
|
>
|
||||||
|
> ## Task summary
|
||||||
|
> Design/plan/implement subint backend per #379, plus
|
||||||
|
> modularize `spawn/_spawn.py` into per-backend submods, plus
|
||||||
|
> harness flag for test parametrization.
|
||||||
|
>
|
||||||
|
> ## Key finding
|
||||||
|
> `pyproject.toml:12` pins `requires-python = ">=3.12, <3.14"`
|
||||||
|
> — but `concurrent.interpreters` / PEP 734 **only lands in
|
||||||
|
> 3.14**. This is the load-bearing design question before
|
||||||
|
> coding.
|
||||||
|
>
|
||||||
|
> ## Proposed phasing (incremental commits)
|
||||||
|
> - **Phase A — modularize** `_spawn.py`: extract `trio_proc`
|
||||||
|
> → `spawn/_trio.py`, `mp_proc` → `spawn/_mp.py`, keep
|
||||||
|
> generic helpers (`hard_kill`/`soft_kill`/`new_proc`
|
||||||
|
> dispatcher/`SpawnMethodKey`) in a slim core. Tests green
|
||||||
|
> at each step.
|
||||||
|
> - **Phase B — `_subint` backend**: extend `SpawnMethodKey`
|
||||||
|
> with `'subint'`; reuse UDS transport by default; shm is
|
||||||
|
> escape-hatch follow-up.
|
||||||
|
> - **Phase C — harness**: add backend-parametrize flag, skip
|
||||||
|
> subint when Python < 3.14.
|
||||||
|
>
|
||||||
|
> ## Gating questions before I start
|
||||||
|
> 1. **Python pin**: bump `<3.15` and feature-gate subint
|
||||||
|
> tests (a), require 3.14 outright (b), or backport shim
|
||||||
|
> (c)? I'd recommend (a).
|
||||||
|
> 2. **PR strategy**: one branch, commits per phase (current
|
||||||
|
> `claude_bun_opus4.7_subint_spawner_backend`)? Or split
|
||||||
|
> Phase A as its own PR first?
|
||||||
|
> 3. **`fork()`-via-subint hack** from the issue's "Our own
|
||||||
|
> thoughts" section — v1 or follow-up? I'd defer to
|
||||||
|
> follow-up.
|
||||||
|
> 4. **Prompt file line 32 is truncated** — what's the
|
||||||
|
> existing harness flag name you had in mind ("using the
|
||||||
|
> existing ___")?
|
||||||
|
>
|
||||||
|
> Which of these are already decided vs. open?
|
||||||
|
|
||||||
|
## User answers (for the raw record)
|
||||||
|
|
||||||
|
1. bump `<3.15`, feature-gate — option (a).
|
||||||
|
2. split Phase A as its own PR first.
|
||||||
|
3. defer `fork()`-via-subint hack.
|
||||||
|
4. harness flag is `--spawn-backend` CLI → `start_method`
|
||||||
|
fixture, per `tractor._testing.pytest`. Prompt file
|
||||||
|
updated accordingly.
|
||||||
|
|
||||||
|
## Turn 2 — concrete Phase A plan
|
||||||
|
|
||||||
|
> Good — prompt file now ends cleanly; harness flag
|
||||||
|
> confirmed as `--spawn-backend` CLI → `start_method`
|
||||||
|
> fixture (via `_testing/pytest.py:193,355`). Also noted
|
||||||
|
> `pytest.py:345-349` hardcodes valid backends as a string
|
||||||
|
> tuple — `typing.get_args(SpawnMethodKey)` should drive
|
||||||
|
> that list.
|
||||||
|
>
|
||||||
|
> ## Phase A — concrete plan (separate PR)
|
||||||
|
>
|
||||||
|
> **File split** in `tractor/spawn/`:
|
||||||
|
> - `_spawn.py` stays as "core": keeps `SpawnMethodKey`,
|
||||||
|
> `_spawn_method`, `proc_waiter`, `try_set_start_method`,
|
||||||
|
> `exhaust_portal`, `cancel_on_completion`, `hard_kill`,
|
||||||
|
> `soft_kill`, `new_proc` dispatcher, `_methods`
|
||||||
|
> registry. Kept at this path so `_testing/pytest.py:228`
|
||||||
|
> and other external imports don't break.
|
||||||
|
> - `_trio.py` ← `trio_proc()` (lines 448–671, ~224 lines)
|
||||||
|
> - `_mp.py` ← `mp_proc()` (lines 672–842, ~171 lines)
|
||||||
|
> - `_methods` dict imports the two backend fns from the
|
||||||
|
> new modules.
|
||||||
|
>
|
||||||
|
> **Commits** (each green via `/run-tests`):
|
||||||
|
> 1. Drive `_testing/pytest.py` backend-tuple from
|
||||||
|
> `get_args(SpawnMethodKey)` — tiny prep.
|
||||||
|
> 2. Extract `trio_proc` → `spawn/_trio.py`; `_methods`
|
||||||
|
> imports it.
|
||||||
|
> 3. Extract `mp_proc` → `spawn/_mp.py`; `_methods`
|
||||||
|
> imports it.
|
||||||
|
> 4. Tidy `spawn/__init__.py` docstring to describe new
|
||||||
|
> layout.
|
||||||
|
>
|
||||||
|
> No pyproject pin bump in Phase A (that's Phase B).
|
||||||
|
>
|
||||||
|
> **Workflow**: I'd run this in a new worktree via
|
||||||
|
> `/open-wkt spawn_modularize` (branch
|
||||||
|
> `wkt/spawn_modularize`), then open a PR before returning
|
||||||
|
> to the current `claude_bun_opus4.7_subint_spawner_backend`
|
||||||
|
> branch for Phase B/C.
|
||||||
|
>
|
||||||
|
> Green-light to `/open-wkt` and start Phase A?
|
||||||
|
|
||||||
|
## User final response
|
||||||
|
|
||||||
|
> all this first then yes, green light.
|
||||||
|
|
||||||
|
— log this `/prompt-io` entry first, then proceed with
|
||||||
|
Phase A in a worktree.
|
||||||
|
|
@ -0,0 +1,86 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
session: subints-spawner-phase-a-impl
|
||||||
|
timestamp: 2026-04-17T03:58:05Z
|
||||||
|
git_ref: 61a73ba
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260417T035800Z_61a73ba_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Follow-up from the design session logged in
|
||||||
|
`20260417T034918Z_9703210_prompt_io.md`: the user
|
||||||
|
greenlit Phase A after `/prompt-io` tracking, so
|
||||||
|
execute the Phase A "modularize `spawn._spawn`" plan
|
||||||
|
end-to-end in a new worktree — four logical steps
|
||||||
|
(pytest harness prep, `trio_proc` extraction,
|
||||||
|
`mp_proc` extraction, docstring refresh), each green
|
||||||
|
via `/run-tests`.
|
||||||
|
|
||||||
|
User directives during execution:
|
||||||
|
- **Option 3** for the test cadence ("continue A.2–A.4
|
||||||
|
first, then run the full suite once at the end of
|
||||||
|
Phase A").
|
||||||
|
- **One commit** for the whole phase ("can't we just
|
||||||
|
commit the whole patch in one commit?") instead of
|
||||||
|
the 3/4-commit split I initially proposed.
|
||||||
|
- **Don't pre-draft** commit messages — wait for the
|
||||||
|
user to invoke `/commit-msg` (captured as feedback
|
||||||
|
memory `feedback_no_auto_draft_commit_msgs.md`).
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Produced the cohesive Phase A modularization patch,
|
||||||
|
landed as commit `61a73bae` (subject: "Mv
|
||||||
|
`trio_proc`/`mp_proc` to per-backend submods"). Five
|
||||||
|
files changed, +565 / -418 lines.
|
||||||
|
|
||||||
|
Key pieces of the patch (generated by claude,
|
||||||
|
reviewed by the human before commit):
|
||||||
|
- `tractor/spawn/_trio.py` — **new**; receives
|
||||||
|
`trio_proc()` verbatim from `_spawn.py`; imports
|
||||||
|
cross-backend helpers back from `._spawn`.
|
||||||
|
- `tractor/spawn/_mp.py` — **new**; receives
|
||||||
|
`mp_proc()` verbatim; uses `from . import _spawn`
|
||||||
|
for late-binding access to the mutable `_ctx` /
|
||||||
|
`_spawn_method` globals (design decision made
|
||||||
|
during impl, not the original plan).
|
||||||
|
- `tractor/spawn/_spawn.py` — shrunk 847 → 448 LOC;
|
||||||
|
import pruning; bottom-of-module late imports for
|
||||||
|
`trio_proc` / `mp_proc` with a one-line comment
|
||||||
|
explaining the circular-dep reason.
|
||||||
|
- `tractor/spawn/__init__.py` — docstring refresh
|
||||||
|
describing the new layout.
|
||||||
|
- `tractor/_testing/pytest.py` — the valid-backend
|
||||||
|
set now comes from `typing.get_args(SpawnMethodKey)`
|
||||||
|
so future additions (`'subint'`) don't need harness
|
||||||
|
edits.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
See `git diff 61a73ba~1..61a73ba --stat`:
|
||||||
|
|
||||||
|
```
|
||||||
|
tractor/_testing/pytest.py | 12 +-
|
||||||
|
tractor/spawn/__init__.py | 31 +++-
|
||||||
|
tractor/spawn/_mp.py | 235 ++++++++++++++++++++++++
|
||||||
|
tractor/spawn/_spawn.py | 413 +-------------------------------
|
||||||
|
tractor/spawn/_trio.py | 292 ++++++++++++++++++++++++++++
|
||||||
|
5 files changed, 565 insertions(+), 418 deletions(-)
|
||||||
|
```
|
||||||
|
|
||||||
|
Validation:
|
||||||
|
- import probe + `_methods` wiring check — OK
|
||||||
|
- spawn-relevant test subset — 37 passed, 1 skipped
|
||||||
|
- full suite — 350 passed, 14 skipped, 7 xfailed, 1
|
||||||
|
xpassed
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated by claude (no diff
|
||||||
|
between `.claude/git_commit_msg_LATEST.md` and the
|
||||||
|
committed body, as far as the assistant could
|
||||||
|
observe).
|
||||||
|
|
@ -0,0 +1,138 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-17T03:58:05Z
|
||||||
|
git_ref: 61a73ba
|
||||||
|
diff_cmd: git diff 61a73ba~1..61a73ba
|
||||||
|
---
|
||||||
|
|
||||||
|
Code generated in this turn was committed verbatim as
|
||||||
|
`61a73bae` ("Mv `trio_proc`/`mp_proc` to per-backend
|
||||||
|
submods"). Per diff-ref mode, per-file code is captured
|
||||||
|
via the pointers below, each followed by a prose
|
||||||
|
summary of what the AI generated. Non-code output
|
||||||
|
(sanity-check results, design rationale) is included
|
||||||
|
verbatim.
|
||||||
|
|
||||||
|
## Per-file generated content
|
||||||
|
|
||||||
|
### `tractor/spawn/_trio.py` (new, 292 lines)
|
||||||
|
|
||||||
|
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_trio.py`
|
||||||
|
|
||||||
|
Pure lift-and-shift of `trio_proc()` out of
|
||||||
|
`tractor/spawn/_spawn.py` (previously lines 448–670).
|
||||||
|
Added AGPL header + module docstring describing the
|
||||||
|
backend; imports include local `from ._spawn import
|
||||||
|
cancel_on_completion, hard_kill, soft_kill` which
|
||||||
|
necessitates the bottom-of-module late-import pattern in
|
||||||
|
the core file to avoid a cycle. All call sites,
|
||||||
|
log-format strings, and body logic are byte-identical
|
||||||
|
to the originals — no semantic change.
|
||||||
|
|
||||||
|
### `tractor/spawn/_mp.py` (new, 235 lines)
|
||||||
|
|
||||||
|
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_mp.py`
|
||||||
|
|
||||||
|
Pure lift-and-shift of `mp_proc()` out of
|
||||||
|
`tractor/spawn/_spawn.py` (previously lines 672–842).
|
||||||
|
Same AGPL header convention. Key difference from
|
||||||
|
`_trio.py`: uses `from . import _spawn` (module
|
||||||
|
import, not from-import) for `_ctx` and
|
||||||
|
`_spawn_method` references — these are mutated at
|
||||||
|
runtime by `try_set_start_method()`, so late binding
|
||||||
|
via `_spawn._ctx` / `_spawn._spawn_method` is required
|
||||||
|
for correctness. Also imports `cancel_on_completion`,
|
||||||
|
`soft_kill`, `proc_waiter` from `._spawn`.
|
||||||
|
|
||||||
|
### `tractor/spawn/_spawn.py` (modified, 847 → 448 LOC)
|
||||||
|
|
||||||
|
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/_spawn.py`
|
||||||
|
|
||||||
|
- removed `trio_proc()` body (moved to `_trio.py`)
|
||||||
|
- removed `mp_proc()` body (moved to `_mp.py`)
|
||||||
|
- pruned imports now unused in core: `sys`,
|
||||||
|
`is_root_process`, `current_actor`,
|
||||||
|
`is_main_process`, `_mp_main`, `ActorFailure`,
|
||||||
|
`pretty_struct`, `_pformat`
|
||||||
|
- added bottom-of-file late imports
|
||||||
|
`from ._trio import trio_proc` and
|
||||||
|
`from ._mp import mp_proc` with a one-line
|
||||||
|
comment explaining why (circular dep)
|
||||||
|
- `_methods` dict unchanged structurally; still binds
|
||||||
|
`'trio' → trio_proc`, `'mp_spawn' → mp_proc`,
|
||||||
|
`'mp_forkserver' → mp_proc`
|
||||||
|
|
||||||
|
### `tractor/spawn/__init__.py` (modified)
|
||||||
|
|
||||||
|
> `git diff 61a73ba~1..61a73ba -- tractor/spawn/__init__.py`
|
||||||
|
|
||||||
|
Docstring rewrite only — no code. New text describes
|
||||||
|
the per-backend `Layout` with `- ._spawn` (core),
|
||||||
|
`- ._trio` (default), `- ._mp` (two mp variants), and
|
||||||
|
reiterates the "no eager import" NOTE.
|
||||||
|
|
||||||
|
### `tractor/_testing/pytest.py` (modified)
|
||||||
|
|
||||||
|
> `git diff 61a73ba~1..61a73ba -- tractor/_testing/pytest.py`
|
||||||
|
|
||||||
|
Two-line logical change (plus a comment):
|
||||||
|
- `from typing import` grows `get_args`
|
||||||
|
- `pytest_generate_tests()` replaces the hardcoded
|
||||||
|
`('mp_spawn','mp_forkserver','trio')` tuple with
|
||||||
|
`get_args(SpawnMethodKey)` via lazy import
|
||||||
|
`from tractor.spawn._spawn import SpawnMethodKey`
|
||||||
|
inside the function body (matching the project's
|
||||||
|
existing lazy-import convention for the spawn
|
||||||
|
module)
|
||||||
|
|
||||||
|
## Non-code output (verbatim)
|
||||||
|
|
||||||
|
### Design rationale for the file split
|
||||||
|
|
||||||
|
From my Phase A plan turn (already captured in
|
||||||
|
`20260417T034918Z_9703210_prompt_io.md`):
|
||||||
|
- `_spawn.py` stays as the "core" rather than renamed
|
||||||
|
to `_core.py` so external imports
|
||||||
|
(`_testing/pytest.py:228` imports
|
||||||
|
`try_set_start_method` from `tractor.spawn._spawn`)
|
||||||
|
keep working without churn.
|
||||||
|
- Per-backend extraction chosen over alternatives
|
||||||
|
(e.g. splitting generic helpers further) because
|
||||||
|
the immediate motivation is hosting a 3rd
|
||||||
|
`_subint.py` sibling cleanly in Phase B.
|
||||||
|
|
||||||
|
### Sanity-check output (verbatim terminal excerpts)
|
||||||
|
|
||||||
|
Post-extraction import probe:
|
||||||
|
```
|
||||||
|
extraction OK
|
||||||
|
_methods: {'trio': 'tractor.spawn._trio.trio_proc',
|
||||||
|
'mp_spawn': 'tractor.spawn._mp.mp_proc',
|
||||||
|
'mp_forkserver': 'tractor.spawn._mp.mp_proc'}
|
||||||
|
```
|
||||||
|
|
||||||
|
Spawn-relevant test subset (`tests/test_local.py
|
||||||
|
test_rpc.py test_spawning.py test_multi_program.py
|
||||||
|
test_discovery.py`):
|
||||||
|
```
|
||||||
|
37 passed, 1 skipped, 14 warnings in 55.37s
|
||||||
|
```
|
||||||
|
|
||||||
|
Full suite:
|
||||||
|
```
|
||||||
|
350 passed, 14 skipped, 7 xfailed, 1 xpassed,
|
||||||
|
151 warnings in 437.73s (0:07:17)
|
||||||
|
```
|
||||||
|
|
||||||
|
No regressions vs. `main`. One transient `-x`
|
||||||
|
early-stop `ERROR` on
|
||||||
|
`test_close_channel_explicit_remote_registrar[trio-True]`
|
||||||
|
was flaky (passed solo, passed without `-x`), not
|
||||||
|
caused by this refactor.
|
||||||
|
|
||||||
|
### Commit message
|
||||||
|
|
||||||
|
Also AI-drafted (via `/commit-msg`) — the 40-line
|
||||||
|
message on commit `61a73bae` itself. Not reproduced
|
||||||
|
here; see `git log -1 61a73bae`.
|
||||||
|
|
@ -0,0 +1,97 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
session: subints-spawner-phase-b1-impl
|
||||||
|
timestamp: 2026-04-17T12:44:37Z
|
||||||
|
git_ref: 5cd6df5
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260417T124437Z_5cd6df5_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Continuing the Phase B work from the design sessions
|
||||||
|
logged in `20260417T034918Z_9703210_prompt_io.md` and
|
||||||
|
the Phase A impl in `20260417T035800Z_61a73ba_...`.
|
||||||
|
|
||||||
|
User direction at this step:
|
||||||
|
- "ok now let's continue with the subints part of the
|
||||||
|
plan ya?" → kick off Phase B.
|
||||||
|
- After I proposed a 4-sub-phase breakdown (B.1
|
||||||
|
scaffolding, B.2 min-viable spawn, B.3 lifecycle,
|
||||||
|
B.4 test enablement), user: "ok can we continue
|
||||||
|
with B" → approving the phasing and design
|
||||||
|
defaults (thread-per-subint, UDS default,
|
||||||
|
code-string bootstrap via `interp.exec()`).
|
||||||
|
- User switched the worktree to
|
||||||
|
`subint_spawner_backend` (instead of opening a
|
||||||
|
fresh worktree as I'd proposed): "i already
|
||||||
|
switched branches in the wkt, you should work off
|
||||||
|
`subint_spawner_backend` now".
|
||||||
|
|
||||||
|
Scope of this turn: **B.1 scaffolding only** —
|
||||||
|
wire the `'subint'` name through the spawn-method
|
||||||
|
registry + harness, add a feature-detected stub
|
||||||
|
module, bump pyproject. No actual spawn flow.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Produced the Phase B.1 scaffolding patch, landed as
|
||||||
|
commit `5cd6df58` (subject: "Add `'subint'` spawn
|
||||||
|
backend scaffold (#379)"). Four files changed,
|
||||||
|
+124 / -2 lines.
|
||||||
|
|
||||||
|
Key pieces (all generated by claude, reviewed by
|
||||||
|
human before commit):
|
||||||
|
- `tractor/spawn/_subint.py` — **new**; feature-
|
||||||
|
detects `concurrent.interpreters`; `subint_proc()`
|
||||||
|
stub raises `RuntimeError` on py<3.14 or
|
||||||
|
`NotImplementedError` with issue-#379 URL on
|
||||||
|
py≥3.14. Signature mirrors `trio_proc`/`mp_proc`
|
||||||
|
so B.2 can drop the impl in without touching
|
||||||
|
`_methods`.
|
||||||
|
- `tractor/spawn/_spawn.py` — adds `'subint'` to
|
||||||
|
`SpawnMethodKey`, grows a `case 'subint'` arm in
|
||||||
|
`try_set_start_method()` with feature-gate, re-
|
||||||
|
imports `sys` for the gate-error msg, adds late
|
||||||
|
`from ._subint import subint_proc` import and
|
||||||
|
`_methods` entry.
|
||||||
|
- `tractor/_testing/pytest.py` — converts the
|
||||||
|
gate-error into `pytest.UsageError` via a
|
||||||
|
`try/except` around `try_set_start_method()` so
|
||||||
|
`--spawn-backend=subint` on py<3.14 prints a
|
||||||
|
clean banner instead of a traceback.
|
||||||
|
- `pyproject.toml` — relax the `requires-python` pin `<3.14`
|
||||||
|
→ `<3.15`, add `3.14` trove classifier.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
See `git diff 5cd6df5~1..5cd6df5 --stat`:
|
||||||
|
|
||||||
|
```
|
||||||
|
pyproject.toml | 3 +-
|
||||||
|
tractor/_testing/pytest.py | 8 +++-
|
||||||
|
tractor/spawn/_spawn.py | 15 ++++++
|
||||||
|
tractor/spawn/_subint.py | 100 +++++++++++++++++++++++++
|
||||||
|
4 files changed, 124 insertions(+), 2 deletions(-)
|
||||||
|
```
|
||||||
|
|
||||||
|
Validation on py3.13:
|
||||||
|
- registry/gate probe — OK; `_has_subints = False`,
|
||||||
|
`subint_proc()` raises `RuntimeError` as expected.
|
||||||
|
- `try_set_start_method('subint')` — raises cleanly.
|
||||||
|
- `pytest --spawn-backend=subint` — exits with
|
||||||
|
`pytest.UsageError` banner.
|
||||||
|
- Spawn-relevant test subset — 69 passed, 1 skipped.
|
||||||
|
- Full suite on py3.13 deferred (scaffolding is
|
||||||
|
no-op on <3.14).
|
||||||
|
- Full-suite on py3.14 pending user setting up the
|
||||||
|
venv.
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated by claude. `uv.lock`
|
||||||
|
was intentionally left unstaged by the user (pin-
|
||||||
|
bump regenerated cp314 wheel entries but they were
|
||||||
|
deferred to a follow-up).
|
||||||
|
|
@ -0,0 +1,168 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-17T12:44:37Z
|
||||||
|
git_ref: 5cd6df5
|
||||||
|
diff_cmd: git diff 5cd6df5~1..5cd6df5
|
||||||
|
---
|
||||||
|
|
||||||
|
Code generated in this turn was committed verbatim as
|
||||||
|
`5cd6df58` ("Add `'subint'` spawn backend scaffold
|
||||||
|
(#379)"). Per diff-ref mode, per-file code is captured
|
||||||
|
via the pointers below, each followed by a prose
|
||||||
|
summary. Non-code output (sanity-check results,
|
||||||
|
design rationale) is included verbatim.
|
||||||
|
|
||||||
|
## Per-file generated content
|
||||||
|
|
||||||
|
### `tractor/spawn/_subint.py` (new, 100 lines)
|
||||||
|
|
||||||
|
> `git diff 5cd6df5~1..5cd6df5 -- tractor/spawn/_subint.py`
|
||||||
|
|
||||||
|
New scaffolding module for the PEP 734 subinterpreter
|
||||||
|
backend. Contents:
|
||||||
|
- AGPL header + module docstring (describes backend
|
||||||
|
intent, 3.14+ availability gate, and explicit
|
||||||
|
"SCAFFOLDING STUB" status pointing at issue #379).
|
||||||
|
- Top-level `try/except ImportError` wrapping
|
||||||
|
`from concurrent import interpreters as
|
||||||
|
_interpreters` → sets module-global
|
||||||
|
`_has_subints: bool`. This lets the registry stay
|
||||||
|
introspectable on py<3.14 while spawn-time still
|
||||||
|
fails cleanly.
|
||||||
|
- `subint_proc()` coroutine with signature matching
|
||||||
|
`trio_proc`/`mp_proc` exactly (same param names,
|
||||||
|
defaults, and `TaskStatus[Portal]` typing) —
|
||||||
|
intentional so Phase B.2 can drop the impl in
|
||||||
|
without touching `_methods` or changing call-site
|
||||||
|
binding.
|
||||||
|
- Body raises `RuntimeError` on py<3.14 (with
|
||||||
|
`sys.version` printed) or `NotImplementedError`
|
||||||
|
with issue-#379 URL on py≥3.14.
|
||||||
|
|
||||||
|
### `tractor/spawn/_spawn.py` (modified, +15 LOC)
|
||||||
|
|
||||||
|
> `git diff 5cd6df5~1..5cd6df5 -- tractor/spawn/_spawn.py`
|
||||||
|
|
||||||
|
- `import sys` re-added (pruned during Phase A, now
|
||||||
|
needed again for the py-version string in the
|
||||||
|
`'subint'` gate-error).
|
||||||
|
- `SpawnMethodKey = Literal[...]` grows `'subint'` as
|
||||||
|
the 4th member, with inline comment ``# py3.14+ via
|
||||||
|
`concurrent.interpreters` (PEP 734)``.
|
||||||
|
- `try_set_start_method()` match-block grows a new
|
||||||
|
`case 'subint':` arm that imports
|
||||||
|
`from ._subint import _has_subints` lazily and
|
||||||
|
raises `RuntimeError` with a multi-line gate msg
|
||||||
|
if unavailable.
|
||||||
|
- Bottom-of-module late-import section grows
|
||||||
|
`from ._subint import subint_proc` alongside the
|
||||||
|
existing `_trio` / `_mp` imports.
|
||||||
|
- `_methods` dict grows `'subint': subint_proc`.
|
||||||
|
|
||||||
|
### `tractor/_testing/pytest.py` (modified, +8 LOC)
|
||||||
|
|
||||||
|
> `git diff 5cd6df5~1..5cd6df5 -- tractor/_testing/pytest.py`
|
||||||
|
|
||||||
|
`pytest_configure()` wraps the
|
||||||
|
`try_set_start_method(backend)` call in a
|
||||||
|
`try/except RuntimeError` that re-raises as
|
||||||
|
`pytest.UsageError(str(err))`. Rationale: the gate
|
||||||
|
error on py<3.14 is legitimately a configuration
|
||||||
|
problem, not a test failure, so pytest's UsageError
|
||||||
|
path (exit code 4) gives a clean single-line banner
|
||||||
|
instead of a traceback.
|
||||||
|
|
||||||
|
### `pyproject.toml` (modified, +2 / -1)
|
||||||
|
|
||||||
|
> `git diff 5cd6df5~1..5cd6df5 -- pyproject.toml`
|
||||||
|
|
||||||
|
- `requires-python` pin relaxed `>=3.12, <3.14` →
|
||||||
|
`>=3.12, <3.15` to admit 3.14 as a valid target.
|
||||||
|
- Added `"Programming Language :: Python :: 3.14"`
|
||||||
|
to the trove classifiers.
|
||||||
|
|
||||||
|
## Non-code output (verbatim)
|
||||||
|
|
||||||
|
### Design choices captured in the prior turn
|
||||||
|
|
||||||
|
(Previously confirmed by the user on April 17 in this
|
||||||
|
session — relevant excerpts captured here for
|
||||||
|
provenance since they drove the shape of the impl):
|
||||||
|
|
||||||
|
1. **Python pin**: `<3.15` + feature-gate the subint
|
||||||
|
backend at spawn time — user answered option (a)
|
||||||
|
from the design triage.
|
||||||
|
2. **Deferred `fork()`-via-subint hack** from issue
|
||||||
|
#379's "Our own thoughts" section.
|
||||||
|
3. **Phase B phasing**: user approved the B.1 / B.2 /
|
||||||
|
B.3 / B.4 breakdown — this commit is strictly B.1
|
||||||
|
(scaffolding only, no spawn-flow impl).
|
||||||
|
4. **Option (B) worktree strategy**: new worktree
|
||||||
|
branched from `wkt/spawn_modularize`. *(Amended by
|
||||||
|
user at runtime: user switched the existing
|
||||||
|
`spawn_modularize` worktree to the
|
||||||
|
`subint_spawner_backend` branch instead.)*
|
||||||
|
|
||||||
|
### Sanity-check output (verbatim terminal excerpts)
|
||||||
|
|
||||||
|
Registry / feature-gate verification on py3.13:
|
||||||
|
```
|
||||||
|
SpawnMethodKey values: ('trio', 'mp_spawn',
|
||||||
|
'mp_forkserver', 'subint')
|
||||||
|
_methods keys: ['trio', 'mp_spawn',
|
||||||
|
'mp_forkserver', 'subint']
|
||||||
|
_has_subints: False (py version: (3, 13) )
|
||||||
|
[expected] RuntimeError: The 'subint' spawn backend
|
||||||
|
requires Python 3.14+ (stdlib
|
||||||
|
`concurrent.interpreters`, PEP 734).
|
||||||
|
```
|
||||||
|
|
||||||
|
`try_set_start_method('subint')` gate on py3.13:
|
||||||
|
```
|
||||||
|
[expected] RuntimeError: Spawn method 'subint'
|
||||||
|
requires Python 3.14+ (stdlib
|
||||||
|
`concurrent.interpreters`, PEP 734).
|
||||||
|
```
|
||||||
|
|
||||||
|
Pytest `--spawn-backend=subint` on py3.13 (the new
|
||||||
|
UsageError wrapper kicking in):
|
||||||
|
```
|
||||||
|
ERROR: Spawn method 'subint' requires Python 3.14+
|
||||||
|
(stdlib `concurrent.interpreters`, PEP 734).
|
||||||
|
Current runtime: 3.13.11 (main, Dec 5 2025,
|
||||||
|
16:06:33) [GCC 15.2.0]
|
||||||
|
```
|
||||||
|
|
||||||
|
Collection probe: `404 tests collected in 0.18s`
|
||||||
|
(no import errors from the new module).
|
||||||
|
|
||||||
|
Spawn-relevant test subset (`tests/test_local.py
|
||||||
|
test_rpc.py test_spawning.py test_multi_program.py
|
||||||
|
tests/discovery/`):
|
||||||
|
```
|
||||||
|
69 passed, 1 skipped, 10 warnings in 61.38s
|
||||||
|
```
|
||||||
|
|
||||||
|
Full suite was **not** run on py3.13 for this commit
|
||||||
|
— the scaffolding is no-op on <3.14 and full-suite
|
||||||
|
validation under py3.14 is pending that venv being
|
||||||
|
set up by the user.
|
||||||
|
|
||||||
|
### Commit message
|
||||||
|
|
||||||
|
Also AI-drafted (via `/commit-msg`, with the prose
|
||||||
|
rewrapped through `/home/goodboy/.claude/skills/pr-msg/
|
||||||
|
scripts/rewrap.py --width 67`) — the 33-line message
|
||||||
|
on commit `5cd6df58` itself. Not reproduced here; see
|
||||||
|
`git log -1 5cd6df58`.
|
||||||
|
|
||||||
|
### Known follow-ups flagged to user
|
||||||
|
|
||||||
|
- **`uv.lock` deferred**: pin-bump regenerated cp314
|
||||||
|
wheel entries in `uv.lock`, but the user chose to
|
||||||
|
not stage `uv.lock` for this commit. Warned
|
||||||
|
explicitly.
|
||||||
|
- **Phase B.2 needs py3.14 venv** — running the
|
||||||
|
actual subint impl requires it; user said they'd
|
||||||
|
set it up separately.
|
||||||
|
|
@ -0,0 +1,117 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
session: subints-phase-b2-destroy-race-fix
|
||||||
|
timestamp: 2026-04-18T04:25:26Z
|
||||||
|
git_ref: 26fb820
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260418T042526Z_26fb820_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Follow-up to Phase B.2 (built atop the B.1 scaffold, `5cd6df58`) after the user
|
||||||
|
observed intermittent mid-suite hangs when running
|
||||||
|
the tractor test suite under `--spawn-backend=subint`
|
||||||
|
on py3.14. The specific sequence of prompts over
|
||||||
|
several turns:
|
||||||
|
|
||||||
|
1. User pointed at the `test_context_stream_semantics.py`
|
||||||
|
suite as the first thing to make run clean under
|
||||||
|
`--spawn-backend=subint`.
|
||||||
|
2. After a series of `timeout`-terminated runs that
|
||||||
|
gave no diagnostic info, user nudged me to stop
|
||||||
|
relying on `timeout` and get actual runtime
|
||||||
|
diagnostics ("the suite hangs indefinitely, so i
|
||||||
|
don't think this `timeout 30` is helping you at
|
||||||
|
all.."). Switched to
|
||||||
|
`faulthandler.dump_traceback_later(...)` and a
|
||||||
|
resource-tracker fixture to rule out leaks.
|
||||||
|
3. Captured a stack pinning the hang on
|
||||||
|
`_interpreters.destroy(interp_id)` in the subint
|
||||||
|
teardown finally block.
|
||||||
|
4. Proposed dedicated-OS-thread fix. User greenlit.
|
||||||
|
5. Implemented + verified on-worktree; user needed
|
||||||
|
to be pointed at the *worktree*'s `./py313` venv
|
||||||
|
because bare `pytest` was picking up the main
|
||||||
|
repo's venv (running un-patched `_subint.py`) and
|
||||||
|
still hanging.
|
||||||
|
|
||||||
|
Running theme over the whole exchange: this patch
|
||||||
|
only closes the *destroy race*. The user and I also
|
||||||
|
traced through the deeper cancellation story — SIGINT
|
||||||
|
can't reach subints, legacy-mode shares the GIL,
|
||||||
|
portal-cancel dies when the IPC channel is already
|
||||||
|
broken — and agreed the next step is a bounded
|
||||||
|
hard-kill in `subint_proc`'s teardown plus a
|
||||||
|
dedicated cancellation test suite. Those land as
|
||||||
|
separate commits.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Produced the `tractor/spawn/_subint.py` patch landed
|
||||||
|
as commit `26fb8206` ("Fix subint destroy race via
|
||||||
|
dedicated OS thread"). One file, +110/-84 LOC.
|
||||||
|
|
||||||
|
Mechanism: swap `trio.to_thread.run_sync(_interpreters
|
||||||
|
.exec, ...)` for a plain `threading.Thread(target=...
|
||||||
|
, daemon=False)`. The trio thread cache recycles
|
||||||
|
workers — so the OS thread that ran `_interpreters
|
||||||
|
.exec()` remained alive in the cache holding a
|
||||||
|
stale subint tstate, blocking
|
||||||
|
`_interpreters.destroy()` in the finally indefinitely.
|
||||||
|
A dedicated one-shot thread exits naturally after
|
||||||
|
the sync target returns, releasing tstate and
|
||||||
|
unblocking destroy.
|
||||||
|
|
||||||
|
Coordination across the trio↔thread boundary:
|
||||||
|
- `trio.lowlevel.current_trio_token()` captured at
|
||||||
|
`subint_proc` entry
|
||||||
|
- driver thread signals `subint_exited.set()` back
|
||||||
|
to parent trio via `trio.from_thread.run_sync(...,
|
||||||
|
trio_token=token)` (synchronous from the thread's
|
||||||
|
POV; the call returns after trio has run `.set()`)
|
||||||
|
- `trio.RunFinishedError` swallowed in that path for
|
||||||
|
the process-teardown case where parent trio already
|
||||||
|
exited
|
||||||
|
- teardown `finally` off-loads the sync
|
||||||
|
`driver_thread.join()` via `to_thread.run_sync` (a
|
||||||
|
cache thread carries no subint tstate — safe)
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
See `git diff 26fb820~1..26fb820 --stat`:
|
||||||
|
|
||||||
|
```
|
||||||
|
tractor/spawn/_subint.py | 194 +++++++++++++++++++------------
|
||||||
|
1 file changed, 110 insertions(+), 84 deletions(-)
|
||||||
|
```
|
||||||
|
|
||||||
|
Validation:
|
||||||
|
- `test_parent_cancels[chk_ctx_result_before_exit=True-
|
||||||
|
cancel_method=ctx-child_returns_early=False]`
|
||||||
|
(the specific test that was hanging for the user)
|
||||||
|
— passed in 1.06s.
|
||||||
|
- Full `tests/test_context_stream_semantics.py` under
|
||||||
|
subint — 61 passed in 100.35s (clean-cache re-run:
|
||||||
|
100.82s).
|
||||||
|
- Trio backend regression subset — 69 passed / 1
|
||||||
|
skipped / 89.19s — no regressions from this change.
|
||||||
|
|
||||||
|
## Additional context
|
||||||
|
|
||||||
|
Beyond the `_subint.py` patch, the raw log also
|
||||||
|
records the cancellation-semantics research that
|
||||||
|
spanned this conversation but did not ship as code
|
||||||
|
in *this* commit. Preserving it inline under "Non-
|
||||||
|
code output" because it directly informs the
|
||||||
|
Phase B.3 hard-kill impl that will follow (and any
|
||||||
|
upstream CPython bug reports we end up filing).
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
None — committed as generated. The commit message
|
||||||
|
itself was also AI-drafted via `/commit-msg` and
|
||||||
|
rewrapped via the project's `rewrap.py --width 67`
|
||||||
|
tooling; user landed it without edits.
|
||||||
|
|
@ -0,0 +1,220 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-18T04:25:26Z
|
||||||
|
git_ref: 26fb820
|
||||||
|
diff_cmd: git diff 26fb820~1..26fb820
|
||||||
|
---
|
||||||
|
|
||||||
|
Code generated in this turn was committed as `26fb8206`
|
||||||
|
("Fix subint destroy race via dedicated OS thread").
|
||||||
|
Per diff-ref mode, per-file code is captured via the
|
||||||
|
pointer below; the rest is verbatim non-code output
|
||||||
|
— test-session diagnostics, CPython-internals
|
||||||
|
reasoning, and the design synthesis that's worth
|
||||||
|
keeping alongside the code since it informs the
|
||||||
|
still-pending Phase B.3 hard-kill work.
|
||||||
|
|
||||||
|
## Per-file generated content
|
||||||
|
|
||||||
|
### `tractor/spawn/_subint.py` (modified, +110/-84 LOC)
|
||||||
|
|
||||||
|
> `git diff 26fb820~1..26fb820 -- tractor/spawn/_subint.py`
|
||||||
|
|
||||||
|
Rewrites the subint driver-thread strategy:
|
||||||
|
- replaces `trio.to_thread.run_sync(_interpreters.exec, ...)`
|
||||||
|
with a plain `threading.Thread(target=_subint_target,
|
||||||
|
daemon=False)` so the OS thread truly exits after
|
||||||
|
`_interpreters.exec()` returns
|
||||||
|
- captures a `trio.lowlevel.current_trio_token()` at
|
||||||
|
`subint_proc` entry; the driver thread signals
|
||||||
|
completion back via `trio.from_thread.run_sync(
|
||||||
|
subint_exited.set, trio_token=...)`
|
||||||
|
- swallows `trio.RunFinishedError` in the signal path
|
||||||
|
for the case where the parent trio loop has already
|
||||||
|
exited (process teardown)
|
||||||
|
- in the teardown `finally` off-loads the sync
|
||||||
|
`driver_thread.join()` call to `trio.to_thread.run_sync`
|
||||||
|
(a cache thread w/ no subint tstate — so no cache
|
||||||
|
conflict) to wait for the driver thread to fully
|
||||||
|
exit before calling `_interpreters.destroy()`
|
||||||
|
|
||||||
|
## Non-code output (verbatim) — the CPython-internals research
|
||||||
|
|
||||||
|
### What went wrong before this commit
|
||||||
|
|
||||||
|
Under `--spawn-backend=subint` on py3.14, most single
|
||||||
|
tests passed but longer runs hung intermittently. The
|
||||||
|
position of the hang moved between runs (test #22 on
|
||||||
|
one run, test #53 on another) suggesting a timing-
|
||||||
|
dependent race rather than a deterministic bug.
|
||||||
|
|
||||||
|
`faulthandler.dump_traceback_later()` eventually
|
||||||
|
caught a stack with the main thread blocked in
|
||||||
|
`_interpreters.destroy(interp_id)` at `_subint.py:293`.
|
||||||
|
Only 2 threads were alive:
|
||||||
|
- main thread waiting in `_interpreters.destroy()`
|
||||||
|
- one idle trio thread-cache worker in
|
||||||
|
`trio._core._thread_cache._work`
|
||||||
|
|
||||||
|
No subint was still running (`_interpreters.list_all()`
|
||||||
|
showed only the main interp). A resource-tracker
|
||||||
|
pytest fixture confirmed threads/subints did NOT
|
||||||
|
accumulate across tests — this was not a leak but a
|
||||||
|
specific "destroy blocks on cached thread w/ stale
|
||||||
|
tstate" race.
|
||||||
|
|
||||||
|
### Why the race exists
|
||||||
|
|
||||||
|
`trio.to_thread.run_sync` uses a thread *cache* to
|
||||||
|
avoid OS-thread creation overhead. When the sync
|
||||||
|
callable returns, the OS thread is NOT terminated —
|
||||||
|
it's parked in `_thread_cache._work` waiting for the
|
||||||
|
next job. CPython's subinterpreter implementation
|
||||||
|
attaches a **tstate** (thread-state object) to each
|
||||||
|
OS thread that ever entered a subint via
|
||||||
|
`_interpreters.exec()`. That tstate is released
|
||||||
|
lazily — either when the thread picks up a new job
|
||||||
|
(which re-attaches a new tstate, evicting the old
|
||||||
|
one) or when the thread truly exits.
|
||||||
|
|
||||||
|
`_interpreters.destroy(interp_id)` waits for *all*
|
||||||
|
tstates associated w/ that subint to be released
|
||||||
|
before it can proceed. If the cached worker is idle
|
||||||
|
holding the stale tstate, destroy blocks indefinitely.
|
||||||
|
Whether the race manifests depends on timing — if
|
||||||
|
the cached thread happens to pick up another job
|
||||||
|
quickly, destroy unblocks; if it sits idle, we hang.
|
||||||
|
|
||||||
|
### Why a dedicated `threading.Thread` fixes it
|
||||||
|
|
||||||
|
A plain `threading.Thread(target=_subint_target,
|
||||||
|
daemon=False)` runs its target once and exits. When
|
||||||
|
the target returns, OS-thread teardown (`_bootstrap_inner`
|
||||||
|
→ `_bootstrap`) fires and CPython releases the
|
||||||
|
tstate for that thread. `_interpreters.destroy()`
|
||||||
|
then has no blocker.
|
||||||
|
|
||||||
|
### Diagnostic tactics that actually helped
|
||||||
|
|
||||||
|
1. `faulthandler.dump_traceback_later(n, repeat=False,
|
||||||
|
file=open(path, 'w'))` for captured stack dumps on
|
||||||
|
hang. Critically, write to a `file=` target, not stderr —
|
||||||
|
pytest captures stderr weirdly and the dump is
|
||||||
|
easy to miss.
|
||||||
|
2. A resource-tracker autouse fixture printing
|
||||||
|
per-test `threading.active_count()` +
|
||||||
|
`len(_interpreters.list_all())` deltas → ruled out
|
||||||
|
leak-accumulation theories quickly.
|
||||||
|
3. Running the hanging test *solo* vs in-suite —
|
||||||
|
when solo passes but in-suite hangs, you know
|
||||||
|
it's a cross-test state-transfer bug rather than
|
||||||
|
a test-internal bug.
|
||||||
|
|
||||||
|
### Design synthesis — SIGINT + subints + SC
|
||||||
|
|
||||||
|
The user and I walked through the cancellation
|
||||||
|
semantics of PEP 684/734 subinterpreters in detail.
|
||||||
|
Key findings we want to preserve:
|
||||||
|
|
||||||
|
**Signal delivery in subints (stdlib limitation).**
|
||||||
|
CPython's signal machinery only delivers signals
|
||||||
|
(SIGINT included) to the *main thread of the main
|
||||||
|
interpreter*. Subints cannot install signal handlers
|
||||||
|
that will ever fire. This is an intentional design
|
||||||
|
choice in PEP 684 and not expected to change. For
|
||||||
|
tractor's subint actors, this means:
|
||||||
|
|
||||||
|
- Ctrl-C never reaches a subint directly.
|
||||||
|
- `trio.run()` running on a worker thread (as we do
|
||||||
|
for subints) already skips SIGINT handler install
|
||||||
|
because `signal.signal()` raises on non-main
|
||||||
|
threads.
|
||||||
|
- The only cancellation surface into a subint is
|
||||||
|
our IPC `Portal.cancel_actor()`.
|
||||||
|
|
||||||
|
**Legacy-mode subints share the main GIL** (which
|
||||||
|
our impl uses since `msgspec` lacks PEP 684 support
|
||||||
|
per `jcrist/msgspec#563`). This means a stuck subint
|
||||||
|
thread can starve the parent's trio loop during
|
||||||
|
cancellation — the parent can't even *start* its
|
||||||
|
teardown handling until the subint yields the GIL.
|
||||||
|
|
||||||
|
**Failure modes identified for Phase B.3 audit:**
|
||||||
|
|
||||||
|
1. Portal cancel lands cleanly → subint unwinds →
|
||||||
|
thread exits → destroy succeeds. (Happy path.)
|
||||||
|
2. IPC channel is already broken when we try to
|
||||||
|
send cancel (e.g., `test_ipc_channel_break_*`)
|
||||||
|
→ cancel raises `BrokenResourceError` → subint
|
||||||
|
keeps running unaware → parent hangs waiting for
|
||||||
|
`subint_exited`. This is what breaks
|
||||||
|
`test_advanced_faults.py` under subint.
|
||||||
|
3. Subint is stuck in non-checkpointing Python code
|
||||||
|
→ portal-cancel msg queued but never processed.
|
||||||
|
4. Subint is in a shielded cancel scope when cancel
|
||||||
|
arrives → delay until shield exits.
|
||||||
|
|
||||||
|
**Current teardown has a shield-bug too:**
|
||||||
|
`trio.CancelScope(shield=True)` wrapping the `finally`
|
||||||
|
block absorbs Ctrl-C, so even when the user tries
|
||||||
|
to break out they can't. This is the reason
|
||||||
|
`test_ipc_channel_break_during_stream[break_parent-...
|
||||||
|
no_msgstream_aclose]` locks up unkillable.
|
||||||
|
|
||||||
|
**B.3 hard-kill fix plan (next commit):**
|
||||||
|
|
||||||
|
1. Bound `driver_thread.join()` with
|
||||||
|
`trio.move_on_after(HARD_KILL_TIMEOUT)`.
|
||||||
|
2. If it times out, log a warning naming the
|
||||||
|
`interp_id` and switch the driver thread to
|
||||||
|
`daemon=True` mode (not actually possible after
|
||||||
|
start — so instead create as daemon=True upfront
|
||||||
|
and accept the tradeoff of proc-exit not waiting
|
||||||
|
for a stuck subint).
|
||||||
|
3. Best-effort `_interpreters.destroy()`; catch the
|
||||||
|
`InterpreterError` if the subint is still running.
|
||||||
|
4. Document that the leak is real and the only
|
||||||
|
escape hatch we have without upstream cooperation.
|
||||||
|
|
||||||
|
**Test plan for Phase B.3:**
|
||||||
|
|
||||||
|
New `tests/test_subint_cancellation.py` covering:
|
||||||
|
- SIGINT at spawn
|
||||||
|
- SIGINT mid-portal-RPC
|
||||||
|
- SIGINT during shielded section in subint
|
||||||
|
- Dead-channel cancel (mirror of `test_ipc_channel_
|
||||||
|
break_during_stream` minimized)
|
||||||
|
- Non-checkpointing subint (tight `while True` in
|
||||||
|
user code)
|
||||||
|
- Per-test `pytest-timeout`-style bounds so the
|
||||||
|
tests visibly fail instead of wedging the runner
|
||||||
|
|
||||||
|
### Sanity-check output (verbatim terminal excerpts)
|
||||||
|
|
||||||
|
Post-fix single-test validation:
|
||||||
|
```
|
||||||
|
1 passed, 1 warning in 1.06s
|
||||||
|
```
|
||||||
|
(same test that was hanging pre-fix:
|
||||||
|
`test_parent_cancels[...cancel_method=ctx-...False]`)
|
||||||
|
|
||||||
|
Full `tests/test_context_stream_semantics.py`
|
||||||
|
under subint:
|
||||||
|
```
|
||||||
|
61 passed, 1 warning in 100.35s (0:01:40)
|
||||||
|
```
|
||||||
|
and a clean-cache re-run:
|
||||||
|
```
|
||||||
|
61 passed, 1 warning in 100.82s (0:01:40)
|
||||||
|
```
|
||||||
|
|
||||||
|
No regressions on trio backend (same subset):
|
||||||
|
```
|
||||||
|
69 passed, 1 skipped, 3 warnings in 89.19s
|
||||||
|
```
|
||||||
|
|
||||||
|
### Commit msg
|
||||||
|
|
||||||
|
Also AI-drafted via `/commit-msg` + `rewrap.py
|
||||||
|
--width 67`. See `git log -1 26fb820`.
|
||||||
|
|
@ -0,0 +1,111 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
session: subint-phase-b-hang-classification
|
||||||
|
timestamp: 2026-04-20T19:27:39Z
|
||||||
|
git_ref: HEAD (pre-commit; on branch `subint_spawner_backend`)
|
||||||
|
scope: docs
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260420T192739Z_5e8cd8b2_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Collab between user (`goodboy`) and `claude` to
|
||||||
|
disambiguate two distinct hang modes hit during
|
||||||
|
Phase B subint-spawn-backend bringup (issue #379).
|
||||||
|
The user ran the failing suites, captured `strace`
|
||||||
|
evidence on hung pytest pids, and set the framing:
|
||||||
|
|
||||||
|
> "These cannot be the same bug. Different fix
|
||||||
|
> paths. Write them up separately or we'll keep
|
||||||
|
> conflating them."
|
||||||
|
|
||||||
|
Follow-on asks:
|
||||||
|
- Cross-link each doc from its reproducer test so
|
||||||
|
a future triager lands on the analysis without
|
||||||
|
needing to grep `ai/conc-anal/`.
|
||||||
|
- On `test_stale_entry_is_deleted`: arm
|
||||||
|
`dump_on_hang(seconds=20, path=...)` so a
|
||||||
|
regression captures a stack dump (pytest stderr
|
||||||
|
capture otherwise eats `faulthandler` output).
|
||||||
|
- Keep `test_stale_entry_is_deleted` un-`skip`ped
|
||||||
|
so the dump file remains inspectable.
|
||||||
|
|
||||||
|
Defer the actual fix for the class-2 hang
|
||||||
|
(Ctrl-C-able, tractor-side bug) to a later commit
|
||||||
|
per the incremental-commits preference.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
Wrote two new `ai/conc-anal/` classification docs
|
||||||
|
and cross-linked them from the reproducer tests:
|
||||||
|
|
||||||
|
1. **`subint_sigint_starvation_issue.md`** —
|
||||||
|
class 1: abandoned-legacy-subint thread +
|
||||||
|
shared GIL starves main trio loop →
|
||||||
|
signal-wakeup-fd pipe fills → SIGINT silently
|
||||||
|
dropped (`write() = EAGAIN`). Pytest process
|
||||||
|
un-Ctrl-C-able. Structurally a CPython limit;
|
||||||
|
blocked on `msgspec` PEP 684 support
|
||||||
|
(jcrist/msgspec#563). Reproducer:
|
||||||
|
`test_stale_entry_is_deleted[subint]`.
|
||||||
|
|
||||||
|
2. **`subint_cancel_delivery_hang_issue.md`** —
|
||||||
|
class 2: parent-side trio task parks on an
|
||||||
|
orphaned IPC channel after subint teardown;
|
||||||
|
no clean EOF delivered to waiting receiver.
|
||||||
|
Ctrl-C-able (main trio loop iterating fine).
|
||||||
|
OUR bug to fix. Candidate fix: explicit
|
||||||
|
parent-side channel abort in `subint_proc`'s
|
||||||
|
hard-kill teardown. Reproducer:
|
||||||
|
`test_subint_non_checkpointing_child`.
|
||||||
|
|
||||||
|
Test-side cross-links:
|
||||||
|
- `tests/discovery/test_registrar.py`:
|
||||||
|
`test_stale_entry_is_deleted` → `trio.run(main)`
|
||||||
|
wrapped in `dump_on_hang(seconds=20,
|
||||||
|
path=<per-method-tmp>)`; long inline comment
|
||||||
|
summarizes `strace` evidence + root-cause chain
|
||||||
|
and points at both docs.
|
||||||
|
- `tests/test_subint_cancellation.py`:
|
||||||
|
`test_subint_non_checkpointing_child` docstring
|
||||||
|
extended with "KNOWN ISSUE (Ctrl-C-able hang)"
|
||||||
|
section pointing at the class-2 doc + noting
|
||||||
|
the class-1 doc is NOT what this test hits.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `ai/conc-anal/subint_sigint_starvation_issue.md`
|
||||||
|
— new, 205 LOC
|
||||||
|
- `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
||||||
|
— new, 161 LOC
|
||||||
|
- `tests/discovery/test_registrar.py` — +52/-1
|
||||||
|
(arm `dump_on_hang`, inline-comment cross-link)
|
||||||
|
- `tests/test_subint_cancellation.py` — +26
|
||||||
|
(docstring "KNOWN ISSUE" block)
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
Substantive collab — prose was jointly iterated:
|
||||||
|
|
||||||
|
- User framed the two-doc split, set the
|
||||||
|
classification criteria (Ctrl-C-able vs not),
|
||||||
|
and provided the `strace` evidence.
|
||||||
|
- User decided to keep `test_stale_entry_is_deleted`
|
||||||
|
un-`skip`ped (my initial suggestion was
|
||||||
|
`pytestmark.skipif(spawn_backend=='subint')`).
|
||||||
|
- User chose the candidate fix ordering for
|
||||||
|
class 2 and marked "explicit parent-side channel
|
||||||
|
abort" as the surgical preferred fix.
|
||||||
|
- User picked the file naming convention
|
||||||
|
(`subint_<hang-shape>_issue.md`) over my initial
|
||||||
|
`hang_class_{1,2}.md`.
|
||||||
|
- Assistant drafted the prose, aggregated prior-
|
||||||
|
session root-cause findings from Phase B.2/B.3
|
||||||
|
bringup, and wrote the test-side cross-linking
|
||||||
|
comments.
|
||||||
|
|
||||||
|
No further mechanical edits expected before
|
||||||
|
commit; user may still rewrap via
|
||||||
|
`scripts/rewrap.py` if preferred.
|
||||||
|
|
@ -0,0 +1,198 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-20T19:27:39Z
|
||||||
|
git_ref: HEAD (pre-commit; will land on branch `subint_spawner_backend`)
|
||||||
|
diff_cmd: git diff HEAD~1..HEAD
|
||||||
|
---
|
||||||
|
|
||||||
|
Collab between `goodboy` (user) and `claude` (this
|
||||||
|
assistant) spanning multiple test-run iterations on
|
||||||
|
branch `subint_spawner_backend`. The user ran the
|
||||||
|
failing suites, captured `strace` evidence on the
|
||||||
|
hung pytest pids, and set the direction ("these are
|
||||||
|
two different hangs — write them up separately so
|
||||||
|
we don't re-confuse ourselves later"). The assistant
|
||||||
|
aggregated prior-session findings (Phase B.2/B.3
|
||||||
|
bringup) into two classification docs + test-side
|
||||||
|
cross-links. All prose was jointly iterated; the
|
||||||
|
user had final say on framing and decided which
|
||||||
|
candidate fix directions to list.
|
||||||
|
|
||||||
|
## Per-file generated content
|
||||||
|
|
||||||
|
### `ai/conc-anal/subint_sigint_starvation_issue.md` (new, 205 LOC)
|
||||||
|
|
||||||
|
> `git diff HEAD~1..HEAD -- ai/conc-anal/subint_sigint_starvation_issue.md`
|
||||||
|
|
||||||
|
Writes up the "abandoned-legacy-subint thread wedges
|
||||||
|
the parent trio loop" class. Key sections:
|
||||||
|
|
||||||
|
- **Symptom** — `test_stale_entry_is_deleted[subint]`
|
||||||
|
hangs indefinitely AND is un-Ctrl-C-able.
|
||||||
|
- **Evidence** — annotated `strace` excerpt showing
|
||||||
|
SIGINT delivered to pytest, C-level signal handler
|
||||||
|
tries to write to the signal-wakeup-fd pipe, gets
|
||||||
|
`write() = -1 EAGAIN (Resource temporarily
|
||||||
|
unavailable)`. Pipe is full because main trio loop
|
||||||
|
isn't iterating often enough to drain it.
|
||||||
|
- **Root-cause chain** — our hard-kill abandons the
|
||||||
|
`daemon=True` driver OS thread after
|
||||||
|
`_HARD_KILL_TIMEOUT`; the subint *inside* that
|
||||||
|
thread is still running `trio.run()`;
|
||||||
|
`_interpreters.destroy()` cannot force-stop a
|
||||||
|
running subint (raises `InterpreterError`); legacy
|
||||||
|
subints share the main GIL → abandoned subint
|
||||||
|
starves main trio loop → wakeup-fd fills → SIGINT
|
||||||
|
silently dropped.
|
||||||
|
- **Why it's structurally a CPython limit** — no
|
||||||
|
public force-destroy primitive for a running
|
||||||
|
subint; the only escape is per-interpreter GIL
|
||||||
|
isolation, gated on msgspec PEP 684 adoption
|
||||||
|
(jcrist/msgspec#563).
|
||||||
|
- **Current escape hatch** — harness-side SIGINT
|
||||||
|
loop in the `daemon` fixture teardown that kills
|
||||||
|
the bg registrar subproc, eventually unblocking
|
||||||
|
a parent-side recv enough for the main loop to
|
||||||
|
drain the wakeup pipe.
|
||||||
|
|
||||||
|
### `ai/conc-anal/subint_cancel_delivery_hang_issue.md` (new, 161 LOC)
|
||||||
|
|
||||||
|
> `git diff HEAD~1..HEAD -- ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
||||||
|
|
||||||
|
Writes up the *sibling* hang class — same subint
|
||||||
|
backend, distinct root cause:
|
||||||
|
|
||||||
|
- **TL;DR** — Ctrl-C-able, so NOT the SIGINT-
|
||||||
|
starvation class; main trio loop iterates fine;
|
||||||
|
ours to fix.
|
||||||
|
- **Symptom** — `test_subint_non_checkpointing_child`
|
||||||
|
hangs past the expected `_HARD_KILL_TIMEOUT`
|
||||||
|
budget even after the subint is torn down.
|
||||||
|
- **Diagnosis** — a parent-side trio task (likely
|
||||||
|
a `chan.recv()` in `process_messages`) parks on
|
||||||
|
an orphaned IPC channel; channel was torn down
|
||||||
|
without emitting a clean EOF /
|
||||||
|
`BrokenResourceError` to the waiting receiver.
|
||||||
|
- **Candidate fix directions** — listed in rough
|
||||||
|
order of preference:
|
||||||
|
1. Explicit parent-side channel abort in
|
||||||
|
`subint_proc`'s hard-kill teardown (surgical;
|
||||||
|
most likely).
|
||||||
|
2. Audit `process_messages` to add a timeout or
|
||||||
|
cancel-scope protection that catches the
|
||||||
|
orphaned-recv state.
|
||||||
|
3. Wrap subint IPC channel construction in a
|
||||||
|
sentinel that can force-close from the parent
|
||||||
|
side regardless of subint liveness.
|
||||||
|
|
||||||
|
### `tests/discovery/test_registrar.py` (modified, +52/-1 LOC)
|
||||||
|
|
||||||
|
> `git diff HEAD~1..HEAD -- tests/discovery/test_registrar.py`
|
||||||
|
|
||||||
|
Wraps the `trio.run(main)` call at the bottom of
|
||||||
|
`test_stale_entry_is_deleted` in
|
||||||
|
`dump_on_hang(seconds=20, path=<per-method-tmp>)`.
|
||||||
|
Adds a long inline comment that:
|
||||||
|
- Enumerates variant-by-variant status
|
||||||
|
(`[trio]`/`[mp_*]` = clean; `[subint]` = hangs
|
||||||
|
+ un-Ctrl-C-able)
|
||||||
|
- Summarizes the `strace` evidence and root-cause
|
||||||
|
chain inline (so a future reader hitting this
|
||||||
|
test doesn't need to cross-ref the doc to
|
||||||
|
understand the hang shape)
|
||||||
|
- Points at
|
||||||
|
`ai/conc-anal/subint_sigint_starvation_issue.md`
|
||||||
|
for full analysis
|
||||||
|
- Cross-links to the *sibling*
|
||||||
|
`subint_cancel_delivery_hang_issue.md` so
|
||||||
|
readers can tell the two classes apart
|
||||||
|
- Explains why it's kept un-`skip`ped: the dump
|
||||||
|
file is useful if the hang ever returns after
|
||||||
|
a refactor. pytest stderr capture would
|
||||||
|
otherwise eat `faulthandler` output, hence the
|
||||||
|
file path.
|
||||||
|
|
||||||
|
### `tests/test_subint_cancellation.py` (modified, +26 LOC)
|
||||||
|
|
||||||
|
> `git diff HEAD~1..HEAD -- tests/test_subint_cancellation.py`
|
||||||
|
|
||||||
|
Extends the docstring of
|
||||||
|
`test_subint_non_checkpointing_child` with a
|
||||||
|
"KNOWN ISSUE (Ctrl-C-able hang)" block:
|
||||||
|
- Describes the current hang: parent-side orphaned
|
||||||
|
IPC recv after hard-kill; distinct from the
|
||||||
|
SIGINT-starvation sibling class.
|
||||||
|
- Cites `strace` distinguishing signal: wakeup-fd
|
||||||
|
`write() = 1` (not `EAGAIN`) — i.e. main loop
|
||||||
|
iterating.
|
||||||
|
- Points at
|
||||||
|
`ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
||||||
|
for full analysis + candidate fix directions.
|
||||||
|
- Clarifies that the *other* sibling doc
|
||||||
|
(SIGINT-starvation) is NOT what this test hits.
|
||||||
|
|
||||||
|
## Non-code output
|
||||||
|
|
||||||
|
### Classification reasoning (why two docs, not one)
|
||||||
|
|
||||||
|
The user and I converged on the two-doc split after
|
||||||
|
running the suites and noticing two *qualitatively
|
||||||
|
different* hang symptoms:
|
||||||
|
|
||||||
|
1. `test_stale_entry_is_deleted[subint]` — pytest
|
||||||
|
process un-Ctrl-C-able. Ctrl-C at the terminal
|
||||||
|
does nothing. Must `kill -9` from another shell.
|
||||||
|
2. `test_subint_non_checkpointing_child` — pytest
|
||||||
|
process Ctrl-C-able. One Ctrl-C at the prompt
|
||||||
|
unblocks cleanly and the test reports a hang
|
||||||
|
via pytest-timeout.
|
||||||
|
|
||||||
|
From the user: "These cannot be the same bug.
|
||||||
|
Different fix paths. Write them up separately or
|
||||||
|
we'll keep conflating them."
|
||||||
|
|
||||||
|
`strace` on the `[subint]` hang gave the decisive
|
||||||
|
signal for the first class:
|
||||||
|
|
||||||
|
```
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(5, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
||||||
|
```
|
||||||
|
|
||||||
|
fd 5 is Python's signal-wakeup-fd pipe. `EAGAIN`
|
||||||
|
on a `write()` of 1 byte to a pipe means the pipe
|
||||||
|
buffer is full → reader side (main Python thread
|
||||||
|
inside `trio.run()`) isn't consuming. That's the
|
||||||
|
GIL-hostage signature.
|
||||||
|
|
||||||
|
The second class's `strace` showed `write(5, "\2",
|
||||||
|
1) = 1` — the write succeeds, i.e. the pipe is being drained — so the main trio loop was
|
||||||
|
iterating and the hang had to be on the application
|
||||||
|
side of things, not the kernel-↔-Python signal
|
||||||
|
boundary.
|
||||||
|
|
||||||
|
### Why the candidate fix for class 2 is "explicit parent-side channel abort"
|
||||||
|
|
||||||
|
The second hang class has the trio loop alive. A
|
||||||
|
parked `chan.recv()` that will never get bytes is
|
||||||
|
fundamentally a tractor-side resource-lifetime bug
|
||||||
|
— the IPC channel was torn down (subint destroyed)
|
||||||
|
but no one explicitly raised
|
||||||
|
`BrokenResourceError` at the parent-side receiver.
|
||||||
|
The `subint_proc` hard-kill path is the natural
|
||||||
|
place to add that notification, because it already
|
||||||
|
knows the subint is unreachable at that point.
|
||||||
|
|
||||||
|
Alternative fix paths (blanket timeouts on
|
||||||
|
`process_messages`, sentinel-wrapped channels) are
|
||||||
|
less surgical and risk masking unrelated bugs —
|
||||||
|
hence the preference ordering in the doc.
|
||||||
|
|
||||||
|
### Why we're not just patching the code now
|
||||||
|
|
||||||
|
The user explicitly deferred the fix to a later
|
||||||
|
commit: "Document both classes now, land the fix
|
||||||
|
for class 2 separately so the diff reviews clean."
|
||||||
|
This matches the incremental-commits preference
|
||||||
|
from memory.
|
||||||
|
|
@ -0,0 +1,155 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
session: subints-phase-b-hardening-and-fork-block
|
||||||
|
timestamp: 2026-04-22T20:07:23Z
|
||||||
|
git_ref: 797f57c
|
||||||
|
scope: code
|
||||||
|
substantive: true
|
||||||
|
raw_file: 20260422T200723Z_797f57c_prompt_io.raw.md
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prompt
|
||||||
|
|
||||||
|
Session-spanning work on the Phase B `subint` spawn-backend.
|
||||||
|
Three distinct sub-phases in one log:
|
||||||
|
|
||||||
|
1. **Py3.13 gate tightening** — diagnose a reproducible hang
|
||||||
|
of subint spawn flow under py3.13 (works on py3.14), trace
|
||||||
|
to a private `_interpreters` module vintage issue, tighten
|
||||||
|
our feature gate from "`_interpreters` present" to "public
|
||||||
|
`concurrent.interpreters` present" (i.e. py3.14+).
|
||||||
|
2. **Test-harness hardening** — add `pytest-timeout` dep, put
|
||||||
|
`@pytest.mark.timeout(30, method='thread')` on the
|
||||||
|
three known-hanging subint tests cataloged in
|
||||||
|
`ai/conc-anal/subint_sigint_starvation_issue.md`. Separately,
|
||||||
|
code-review the user's in-flight `skipon_spawn_backend`
|
||||||
|
marker implementation; find four bugs; refactor to use
|
||||||
|
`item.iter_markers()`.
|
||||||
|
3. **`subint_fork` prototype → CPython-block finding** — draft
|
||||||
|
a WIP `subint_fork_proc` backend using a sub-interpreter as
|
||||||
|
a launchpad for `os.fork()` (to sidestep trio#1614). User
|
||||||
|
tests on py3.14, hits
|
||||||
|
`Fatal Python error: _PyInterpreterState_DeleteExceptMain:
|
||||||
|
not main interpreter`. Walk CPython sources (local clone at
|
||||||
|
`~/repos/cpython/`) to pinpoint the refusal
|
||||||
|
(`Modules/posixmodule.c:728` → `Python/pystate.c:1040`).
|
||||||
|
Revert implementation to a `NotImplementedError` stub in a
|
||||||
|
new `_subint_fork.py` submodule, document the finding in a
|
||||||
|
third `conc-anal/` doc with an upstream-report draft for
|
||||||
|
the CPython issue tracker. Finally, discuss user's proposed
|
||||||
|
workaround architecture (main-interp worker-thread
|
||||||
|
forkserver) and draft a standalone smoke-test script for
|
||||||
|
feasibility validation.
|
||||||
|
|
||||||
|
## Response summary
|
||||||
|
|
||||||
|
All three sub-phases landed concrete artifacts:
|
||||||
|
|
||||||
|
**Sub-phase 1** — `_subint.py` + `_spawn.py` gates + error
|
||||||
|
messages updated to require py3.14+ via the public
|
||||||
|
`concurrent.interpreters` module presence check. Module
|
||||||
|
docstring revised to explain the empirical reason
|
||||||
|
(py3.13's private `_interpreters` vintage wedges under
|
||||||
|
multi-trio-task usage even though minimal standalone
|
||||||
|
reproducers work fine there). Test-module
|
||||||
|
`pytest.importorskip` likewise switched.
|
||||||
|
|
||||||
|
**Sub-phase 2** — `pytest-timeout>=2.3` added to `testing`
|
||||||
|
dep group. `@pytest.mark.timeout(30, method='thread')`
|
||||||
|
applied on:
|
||||||
|
- `tests/discovery/test_registrar.py::test_stale_entry_is_deleted`
|
||||||
|
- `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep`
|
||||||
|
- `tests/test_cancellation.py::test_multierror_fast_nursery`
|
||||||
|
- `tests/test_subint_cancellation.py::test_subint_non_checkpointing_child`
|
||||||
|
|
||||||
|
`method='thread'` documented inline as load-bearing — the
|
||||||
|
GIL-starvation path that drops `SIGINT` would equally drop
|
||||||
|
`SIGALRM`, so only a watchdog-thread timeout can reliably
|
||||||
|
escape.
|
||||||
|
|
||||||
|
`skipon_spawn_backend` plugin refactored into a single
|
||||||
|
`iter_markers`-driven loop in `pytest_collection_modifyitems`
|
||||||
|
(~30 LOC replacing ~30 LOC of nested conditionals). Four
|
||||||
|
bugs dissolved: wrong `.get()` key, module-level `pytestmark`
|
||||||
|
suppressing per-test marks, unhandled `pytestmark = [list]`
|
||||||
|
form, `pytest.Makr` typo. Marker help text updated to
|
||||||
|
document the variadic backend-list + `reason=` kwarg
|
||||||
|
surface.
|
||||||
|
|
||||||
|
**Sub-phase 3** — Prototype drafted (then reverted):
|
||||||
|
|
||||||
|
- `tractor/spawn/_subint_fork.py` — new dedicated submodule
|
||||||
|
housing the `subint_fork_proc` stub. Module docstring +
|
||||||
|
fn docstring explain the attempt, the CPython-level
|
||||||
|
block, and the reason for keeping the stub in-tree
|
||||||
|
(documentation of the attempt + starting point if CPython
|
||||||
|
ever lifts the restriction).
|
||||||
|
- `tractor/spawn/_spawn.py` — `'subint_fork'` registered as a
|
||||||
|
`SpawnMethodKey` literal + in `_methods`, so
|
||||||
|
`--spawn-backend=subint_fork` routes to a clean
|
||||||
|
`NotImplementedError` pointing at the analysis doc rather
|
||||||
|
than an "invalid backend" error.
|
||||||
|
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` —
|
||||||
|
third sibling conc-anal doc. Full annotated CPython
|
||||||
|
source walkthrough from user-visible
|
||||||
|
`Fatal Python error` → `Modules/posixmodule.c:728
|
||||||
|
PyOS_AfterFork_Child()` → `Python/pystate.c:1040
|
||||||
|
_PyInterpreterState_DeleteExceptMain()` gate. Includes a
|
||||||
|
copy-paste-ready upstream-report draft for the CPython
|
||||||
|
issue tracker with a two-tier ask (ideally "make it work",
|
||||||
|
minimally "cleaner error than `Fatal Python error`
|
||||||
|
aborting the child").
|
||||||
|
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` —
|
||||||
|
standalone zero-tractor-import CPython-level smoke test
|
||||||
|
for the user's proposed workaround architecture
|
||||||
|
(forkserver on a main-interp worker thread). Four
|
||||||
|
argparse-driven scenarios: `control_subint_thread_fork`
|
||||||
|
(reproduces the known-broken case as a test-harness
|
||||||
|
sanity), `main_thread_fork` (baseline), `worker_thread_fork`
|
||||||
|
(architectural assertion), `full_architecture`
|
||||||
|
(end-to-end trio-in-subint in forked child). User will
|
||||||
|
run on py3.14 next.
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
See `git log 26fb820..HEAD --stat` for the canonical list.
|
||||||
|
New files this session:
|
||||||
|
- `tractor/spawn/_subint_fork.py`
|
||||||
|
- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
||||||
|
- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`
|
||||||
|
|
||||||
|
Modified (diff pointers in raw log):
|
||||||
|
- `tractor/spawn/_subint.py` (py3.14 gate)
|
||||||
|
- `tractor/spawn/_spawn.py` (`subint_fork` registration)
|
||||||
|
- `tractor/_testing/pytest.py` (`skipon_spawn_backend` refactor)
|
||||||
|
- `pyproject.toml` (`pytest-timeout` dep)
|
||||||
|
- `tests/discovery/test_registrar.py`,
|
||||||
|
`tests/test_cancellation.py`,
|
||||||
|
`tests/test_subint_cancellation.py` (timeout marks,
|
||||||
|
cross-refs to conc-anal docs)
|
||||||
|
|
||||||
|
## Human edits
|
||||||
|
|
||||||
|
Several back-and-forth iterations with user-driven
|
||||||
|
adjustments during the session:
|
||||||
|
|
||||||
|
- User corrected my initial mis-classification of
|
||||||
|
`test_cancel_while_childs_child_in_sync_sleep[subint-False]`
|
||||||
|
as Ctrl-C-able — second strace showed `EAGAIN`, putting
|
||||||
|
it squarely in class A (GIL-starvation). Re-analysis
|
||||||
|
preserved in the raw log.
|
||||||
|
- User independently fixed the `.get(reason)` → `.get('reason', reason)`
|
||||||
|
bug in the marker plugin before my review; preserved their
|
||||||
|
fix.
|
||||||
|
- User suggested moving the `subint_fork_proc` stub from
|
||||||
|
the bottom of `_subint.py` into its own
|
||||||
|
`_subint_fork.py` submodule — applied.
|
||||||
|
- User asked to keep the forkserver-architecture
|
||||||
|
discussion as background for the smoke-test rather than
|
||||||
|
committing to a tractor-side refactor until the smoke
|
||||||
|
test validates the CPython-level assumptions.
|
||||||
|
|
||||||
|
Commit messages in this range (b025c982 … 797f57c) were
|
||||||
|
drafted via `/commit-msg` + `rewrap.py --width 67`; user
|
||||||
|
landed them with the usual review.
|
||||||
|
|
@ -0,0 +1,343 @@
|
||||||
|
---
|
||||||
|
model: claude-opus-4-7[1m]
|
||||||
|
service: claude
|
||||||
|
timestamp: 2026-04-22T20:07:23Z
|
||||||
|
git_ref: 797f57c
|
||||||
|
diff_cmd: git log 26fb820..HEAD # all session commits since the destroy-race fix log
|
||||||
|
---
|
||||||
|
|
||||||
|
Session-spanning conversation covering the Phase B hardening
|
||||||
|
of the `subint` spawn-backend and an investigation into a
|
||||||
|
proposed `subint_fork` follow-up which turned out to be
|
||||||
|
blocked at the CPython level. This log is a narrative capture
|
||||||
|
of the substantive turns (not every message) and references
|
||||||
|
the concrete code + docs the session produced. Per diff-ref
|
||||||
|
mode the actual code diffs are pointed at via `git log` on
|
||||||
|
each ref rather than duplicated inline.
|
||||||
|
|
||||||
|
## Narrative of the substantive turns
|
||||||
|
|
||||||
|
### Py3.13 hang / gate tightening
|
||||||
|
|
||||||
|
Diagnosed a reproducible hang of the `subint` backend under
|
||||||
|
py3.13 (test_spawning tests wedge after root-actor bringup).
|
||||||
|
Root cause: py3.13's vintage of the private `_interpreters` C
|
||||||
|
module has a latent thread/subint-interaction issue where
|
||||||
|
`_interpreters.exec()` silently fails to progress under
|
||||||
|
tractor's multi-trio usage pattern — even though a minimal
|
||||||
|
standalone `threading.Thread` + `_interpreters.exec()`
|
||||||
|
reproducer works fine on the same Python. Empirically
|
||||||
|
py3.14 fixes it.
|
||||||
|
|
||||||
|
Fix (from this session): tighten the `_has_subints` gate in
|
||||||
|
`tractor.spawn._subint` from "private module importable" to
|
||||||
|
"public `concurrent.interpreters` present" — which is 3.14+
|
||||||
|
only. This leaves `subint_proc()` unchanged in behavior (we
|
||||||
|
still call the *private* `_interpreters.create('legacy')`
|
||||||
|
etc. under the hood) but refuses to engage on 3.13.
|
||||||
|
|
||||||
|
Also tightened the matching gate in
|
||||||
|
`tractor.spawn._spawn.try_set_start_method('subint')` and
|
||||||
|
rev'd the corresponding error messages from "3.13+" to
|
||||||
|
"3.14+" with a sentence explaining why. Test-module
|
||||||
|
`pytest.importorskip` switched from `_interpreters` →
|
||||||
|
`concurrent.interpreters` to match.
|
||||||
|
|
||||||
|
### `pytest-timeout` dep + `skipon_spawn_backend` marker plumbing
|
||||||
|
|
||||||
|
Added `pytest-timeout>=2.3` to the `testing` dep group with
|
||||||
|
an inline comment pointing at the `ai/conc-anal/*.md` docs.
|
||||||
|
Applied `@pytest.mark.timeout(30, method='thread')` (the
|
||||||
|
`method='thread'` is load-bearing — `signal`-method
|
||||||
|
`SIGALRM` suffers the same GIL-starvation path that drops
|
||||||
|
`SIGINT` in the class-A hang pattern) to the three known-
|
||||||
|
hanging subint tests cataloged in
|
||||||
|
`subint_sigint_starvation_issue.md`.
|
||||||
|
|
||||||
|
Separately code-reviewed the user's newly-staged
|
||||||
|
`skipon_spawn_backend` pytest marker implementation in
|
||||||
|
`tractor/_testing/pytest.py`. Found four bugs:
|
||||||
|
|
||||||
|
1. `modmark.kwargs.get(reason)` called `.get()` with the
|
||||||
|
*variable* `reason` as the dict key instead of the string
|
||||||
|
`'reason'` — user-supplied `reason=` was never picked up.
|
||||||
|
(User had already fixed this locally via `.get('reason',
|
||||||
|
reason)` by the time my review happened — preserved that
|
||||||
|
fix.)
|
||||||
|
2. The module-level `pytestmark` branch suppressed per-test
|
||||||
|
marker handling (the per-test branch was an `else:` rather than
|
||||||
|
independent iteration).
|
||||||
|
3. `mod_pytestmark.mark` assumed a single
|
||||||
|
`MarkDecorator` — broke on the valid-pytest `pytestmark =
|
||||||
|
[mark, mark]` list form.
|
||||||
|
4. Typo: `pytest.Makr` → `pytest.Mark`.
|
||||||
|
|
||||||
|
Refactored the hook to use `item.iter_markers(name=...)`
|
||||||
|
which walks function + class + module scopes uniformly and
|
||||||
|
handles both `pytestmark` forms natively. ~30 LOC replaced
|
||||||
|
the original ~30 LOC of nested conditionals, all four bugs
|
||||||
|
dissolved. Also updated the marker help string to reflect
|
||||||
|
the variadic `*start_methods` + `reason=` surface.
|
||||||
|
|
||||||
|
### `subint_fork_proc` prototype attempt
|
||||||
|
|
||||||
|
User's hypothesis: the known trio+`fork()` issues
|
||||||
|
(python-trio/trio#1614) could be sidestepped by using a
|
||||||
|
sub-interpreter purely as a launchpad — `os.fork()` from a
|
||||||
|
subint that has never imported trio → child is in a
|
||||||
|
trio-free context. In the child `execv()` back into
|
||||||
|
`python -m tractor._child` and the downstream handshake
|
||||||
|
matches `trio_proc()` identically.
|
||||||
|
|
||||||
|
Drafted the prototype at `tractor/spawn/_subint.py`'s bottom
|
||||||
|
(originally — later moved to its own submod, see below):
|
||||||
|
launchpad-subint creation, bootstrap code-string with
|
||||||
|
`os.fork()` + `execv()`, driver-thread orchestration,
|
||||||
|
parent-side `ipc_server.wait_for_peer()` dance. Registered
|
||||||
|
`'subint_fork'` as a new `SpawnMethodKey` literal, added
|
||||||
|
`case 'subint' | 'subint_fork':` feature-gate arm in
|
||||||
|
`try_set_start_method()`, added entry in `_methods` dict.
|
||||||
|
|
||||||
|
### CPython-level block discovered
|
||||||
|
|
||||||
|
User tested on py3.14 and saw:
|
||||||
|
|
||||||
|
```
|
||||||
|
Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter
|
||||||
|
Python runtime state: initialized
|
||||||
|
|
||||||
|
Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first):
|
||||||
|
File "<script>", line 2 in <module>
|
||||||
|
<script>:2: DeprecationWarning: This process (pid=802985) is multi-threaded, use of fork() may lead to deadlocks in the child.
|
||||||
|
```
|
||||||
|
|
||||||
|
Walked CPython sources (local clone at `~/repos/cpython/`):
|
||||||
|
|
||||||
|
- **`Modules/posixmodule.c:728` `PyOS_AfterFork_Child()`** —
|
||||||
|
post-fork child-side cleanup. Calls
|
||||||
|
`_PyInterpreterState_DeleteExceptMain(runtime)` with
|
||||||
|
`goto fatal_error` on non-zero status. Has the
|
||||||
|
`// Ideally we could guarantee tstate is running main.`
|
||||||
|
self-acknowledging-fragile comment directly above.
|
||||||
|
|
||||||
|
- **`Python/pystate.c:1040`
|
||||||
|
`_PyInterpreterState_DeleteExceptMain()`** — the
|
||||||
|
refusal. Hard `PyStatus_ERR("not main interpreter")` gate
|
||||||
|
when `tstate->interp != interpreters->main`. Docstring
|
||||||
|
formally declares the precondition ("If there is a
|
||||||
|
current interpreter state, it *must* be the main
|
||||||
|
interpreter"). `XXX` comments acknowledge further latent
|
||||||
|
issues within.
|
||||||
|
|
||||||
|
Definitive answer to "Open Question 1" of the prototype
|
||||||
|
docstring: **no, CPython does not support `os.fork()` from
|
||||||
|
a non-main sub-interpreter**. Not because the fork syscall
|
||||||
|
is blocked (it isn't — the parent returns a valid pid),
|
||||||
|
but because the child cannot survive CPython's post-fork
|
||||||
|
initialization. This is an enforced invariant, not an
|
||||||
|
incidental limitation.
|
||||||
|
|
||||||
|
### Revert: move to stub submod + doc the finding
|
||||||
|
|
||||||
|
Per user request:
|
||||||
|
|
||||||
|
1. Reverted the prototype `subint_fork_proc` body to a
|
||||||
|
`NotImplementedError` stub, MOVED to its own submod
|
||||||
|
`tractor/spawn/_subint_fork.py` (keeps `_subint.py`
|
||||||
|
focused on the working `subint_proc` backend).
|
||||||
|
2. Updated `_spawn.py` to import the stub from the new
|
||||||
|
submod path; kept `'subint_fork'` in `SpawnMethodKey` +
|
||||||
|
`_methods` so `--spawn-backend=subint_fork` routes to a
|
||||||
|
clean `NotImplementedError` with pointer to the analysis
|
||||||
|
doc rather than an "invalid backend" error.
|
||||||
|
3. Wrote
|
||||||
|
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
||||||
|
with the full annotated CPython walkthrough + an
|
||||||
|
upstream-report draft for the CPython issue tracker.
|
||||||
|
Draft has a two-tier ask: ideally "make it work"
|
||||||
|
(pre-fork tstate-swap hook or `DeleteExceptFor(interp)`
|
||||||
|
variant), minimally "give us a clean `RuntimeError` in
|
||||||
|
the parent instead of a `Fatal Python error` aborting
|
||||||
|
the child silently".
|
||||||
|
|
||||||
|
### Design discussion — main-interp-thread forkserver workaround
|
||||||
|
|
||||||
|
User proposed: set up a "subint forking server" that fork()s
|
||||||
|
on behalf of subint callers. Core insight: the CPython gate
|
||||||
|
is on `tstate->interp`, not thread identity, so **any thread
|
||||||
|
whose tstate is main-interp** can fork cleanly. A worker
|
||||||
|
thread attached to main-interp (never entering a subint)
|
||||||
|
satisfies the precondition.
|
||||||
|
|
||||||
|
Structurally this is `mp.forkserver` (which tractor already
|
||||||
|
has as `mp_forkserver`) but **in-process**: instead of a
|
||||||
|
separate Python subproc as the fork server, we'd put the
|
||||||
|
forkserver on a thread in the tractor parent process. Pros:
|
||||||
|
faster spawn (no IPC marshalling to external server + no
|
||||||
|
separate Python startup), inherits already-imported modules
|
||||||
|
for free. Cons: less crash isolation (forkserver failure
|
||||||
|
takes down the whole process).
|
||||||
|
|
||||||
|
Required tractor-side refactor: move the root actor's
|
||||||
|
`trio.run()` off main-interp-main-thread (so main-thread can
|
||||||
|
run the forkserver loop). Nontrivial; approximately the same
|
||||||
|
magnitude as "Phase C".
|
||||||
|
|
||||||
|
The design would also not fully resolve the class-A
|
||||||
|
GIL-starvation issue because child actors' trio still runs
|
||||||
|
inside subints (legacy config, msgspec PEP 684 pending).
|
||||||
|
Would mitigate SIGINT-starvation specifically if signal
|
||||||
|
handling moves to the forkserver thread.
|
||||||
|
|
||||||
|
Recommended pre-commitment: a standalone CPython-only smoke
|
||||||
|
test validating the four assumptions the arch rests on,
|
||||||
|
before any tractor-side work.
|
||||||
|
|
||||||
|
### Smoke-test script drafted
|
||||||
|
|
||||||
|
Wrote `ai/conc-anal/subint_fork_from_main_thread_smoketest.py`:
|
||||||
|
argparse-driven, four scenarios (`control_subint_thread_fork`
|
||||||
|
reproducing the known-broken case, `main_thread_fork`
|
||||||
|
baseline, `worker_thread_fork` the architectural assertion,
|
||||||
|
`full_architecture` end-to-end with trio in a subint in the
|
||||||
|
forked child). No `tractor` imports; pure CPython + `_interpreters`
|
||||||
|
+ `trio`. Bails cleanly on py<3.14. Pass/fail banners per
|
||||||
|
scenario.
|
||||||
|
|
||||||
|
User will validate on their py3.14 env next.
|
||||||
|
|
||||||
|
## Per-code-artifact provenance
|
||||||
|
|
||||||
|
### `tractor/spawn/_subint_fork.py` (new submod)
|
||||||
|
|
||||||
|
> `git show 797f57c -- tractor/spawn/_subint_fork.py`
|
||||||
|
|
||||||
|
NotImplementedError stub for the subint-fork backend. Module
|
||||||
|
docstring + fn docstring explain the attempt, the CPython
|
||||||
|
block, and why the stub is kept in-tree. No runtime behavior
|
||||||
|
beyond raising with a pointer at the conc-anal doc.
|
||||||
|
|
||||||
|
### `tractor/spawn/_spawn.py` (modified)
|
||||||
|
|
||||||
|
> `git log 26fb820..HEAD -- tractor/spawn/_spawn.py`
|
||||||
|
|
||||||
|
- Added `'subint_fork'` to `SpawnMethodKey` literal with a
|
||||||
|
block comment explaining the CPython-level block.
|
||||||
|
- Generalized the `case 'subint':` arm to `case 'subint' |
|
||||||
|
'subint_fork':` since both use the same py3.14+ gate.
|
||||||
|
- Registered `subint_fork_proc` in `_methods` with a
|
||||||
|
pointer-comment at the analysis doc.
|
||||||
|
|
||||||
|
### `tractor/spawn/_subint.py` (modified across session)
|
||||||
|
|
||||||
|
> `git log 26fb820..HEAD -- tractor/spawn/_subint.py`
|
||||||
|
|
||||||
|
- Tightened `_has_subints` gate: dual-requires public
|
||||||
|
`concurrent.interpreters` + private `_interpreters`
|
||||||
|
(tests for py3.14-or-newer on the public-API presence,
|
||||||
|
then uses the private one for legacy-config subints
|
||||||
|
because `msgspec` still blocks the public isolated mode
|
||||||
|
per jcrist/msgspec#563).
|
||||||
|
- Updated module docstring, `subint_proc()` docstring, and
|
||||||
|
gate-error messages to reflect the 3.14+ requirement and
|
||||||
|
the reason (py3.13 wedges under multi-trio usage even
|
||||||
|
though the private module exists there).
|
||||||
|
|
||||||
|
### `tractor/_testing/pytest.py` (modified)
|
||||||
|
|
||||||
|
> `git log 26fb820..HEAD -- tractor/_testing/pytest.py`
|
||||||
|
|
||||||
|
- New `skipon_spawn_backend(*start_methods, reason=...)`
|
||||||
|
pytest marker expanded into `pytest.mark.skip(reason=...)`
|
||||||
|
at collection time via
|
||||||
|
`pytest_collection_modifyitems()`.
|
||||||
|
- Implementation uses `item.iter_markers(name=...)` which
|
||||||
|
walks function + class + module scopes uniformly and
|
||||||
|
handles both `pytestmark = <single Mark>` and
|
||||||
|
`pytestmark = [mark, ...]` forms natively. ~30-LOC
|
||||||
|
single-loop refactor replacing a prior nested
|
||||||
|
conditional that had four bugs (see the marker-plumbing narrative
|
||||||
|
above).
|
||||||
|
- Added `pytest.Config` / `pytest.Function` /
|
||||||
|
`pytest.FixtureRequest` type annotations on fixture
|
||||||
|
signatures while touching the file.
|
||||||
|
|
||||||
|
### `pyproject.toml` (modified)
|
||||||
|
|
||||||
|
> `git log 26fb820..HEAD -- pyproject.toml`
|
||||||
|
|
||||||
|
Added `pytest-timeout>=2.3` to `testing` dep group with
|
||||||
|
comment pointing at the `ai/conc-anal/` docs.
|
||||||
|
|
||||||
|
### `tests/discovery/test_registrar.py`,
|
||||||
|
`tests/test_subint_cancellation.py`,
|
||||||
|
`tests/test_cancellation.py` (modified)
|
||||||
|
|
||||||
|
> `git log 26fb820..HEAD -- tests/`
|
||||||
|
|
||||||
|
Applied `@pytest.mark.timeout(30, method='thread')` on
|
||||||
|
known-hanging subint tests. Extended comments to cross-
|
||||||
|
reference the `ai/conc-anal/*.md` docs. `method='thread'`
|
||||||
|
is documented inline as load-bearing (`signal`-method
|
||||||
|
SIGALRM suffers the same GIL-starvation path that drops
|
||||||
|
SIGINT).
|
||||||
|
|
||||||
|
### `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` (new)
|
||||||
|
|
||||||
|
> `git show 797f57c -- ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
||||||
|
|
||||||
|
Third sibling doc under `conc-anal/`. Structure: TL;DR,
|
||||||
|
context ("what we tried"), symptom (the user's exact
|
||||||
|
`Fatal Python error` output), CPython source walkthrough
|
||||||
|
with excerpted snippets from `posixmodule.c` +
|
||||||
|
`pystate.c`, chain summary, definitive answer to Open
|
||||||
|
Question 1, `## Upstream-report draft (for CPython issue
|
||||||
|
tracker)` section with a two-tier ask, references.
|
||||||
|
|
||||||
|
### `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` (new, THIS turn)
|
||||||
|
|
||||||
|
Zero-tractor-import smoke test for the proposed workaround
|
||||||
|
architecture. Four argparse-driven scenarios covering the
|
||||||
|
control case + baseline + arch-critical case + end-to-end.
|
||||||
|
Pass/fail banners per scenario; clean `--help` output;
|
||||||
|
py3.13 early-exit.
|
||||||
|
|
||||||
|
## Non-code output (verbatim)
|
||||||
|
|
||||||
|
### The `strace` signature that kicked off the CPython
|
||||||
|
walkthrough
|
||||||
|
|
||||||
|
```
|
||||||
|
--- SIGINT {si_signo=SIGINT, si_code=SI_KERNEL} ---
|
||||||
|
write(16, "\2", 1) = -1 EAGAIN (Resource temporarily unavailable)
|
||||||
|
rt_sigreturn({mask=[WINCH]}) = 139801964688928
|
||||||
|
```
|
||||||
|
|
||||||
|
### Key user quotes framing the direction
|
||||||
|
|
||||||
|
> ok actually we get this [fatal error] ... see if you can
|
||||||
|
> take a look at what's going on, in particular wrt to
|
||||||
|
> cpython's sources. pretty sure there's a local copy at
|
||||||
|
> ~/repos/cpython/
|
||||||
|
|
||||||
|
(Drove the CPython walkthrough that produced the
|
||||||
|
definitive refusal chain.)
|
||||||
|
|
||||||
|
> is there any reason we can't just sidestep this "must fork
|
||||||
|
> from main thread in main subint" issue by simply ensuring
|
||||||
|
> a "subint forking server" is always setup prior to
|
||||||
|
> invoking trio in a non-main-thread subint ...
|
||||||
|
|
||||||
|
(Drove the main-interp-thread-forkserver architectural
|
||||||
|
discussion + smoke-test script design.)
|
||||||
|
|
||||||
|
### CPython source tags for quick jump-back
|
||||||
|
|
||||||
|
```
|
||||||
|
Modules/posixmodule.c:728 PyOS_AfterFork_Child()
|
||||||
|
Modules/posixmodule.c:753 // Ideally we could guarantee tstate is running main.
|
||||||
|
Modules/posixmodule.c:778 status = _PyInterpreterState_DeleteExceptMain(runtime);
|
||||||
|
|
||||||
|
Python/pystate.c:1040 _PyInterpreterState_DeleteExceptMain()
|
||||||
|
Python/pystate.c:1044-1047 tstate->interp != main → PyStatus_ERR("not main interpreter")
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
# AI Prompt I/O Log — claude
|
||||||
|
|
||||||
|
This directory tracks prompt inputs and model
|
||||||
|
outputs for AI-assisted development using
|
||||||
|
`claude` (Claude Code).
|
||||||
|
|
||||||
|
## Policy
|
||||||
|
|
||||||
|
Prompt logging follows the
|
||||||
|
[NLNet generative AI policy][nlnet-ai].
|
||||||
|
All substantive AI contributions are logged
|
||||||
|
with:
|
||||||
|
- Model name and version
|
||||||
|
- Timestamps
|
||||||
|
- The prompts that produced the output
|
||||||
|
- Unedited model output (`.raw.md` files)
|
||||||
|
|
||||||
|
[nlnet-ai]: https://nlnet.nl/foundation/policies/generativeAI/
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Entries are created by the `/prompt-io` skill
|
||||||
|
or automatically via `/commit-msg` integration.
|
||||||
|
|
||||||
|
Human contributors remain accountable for all
|
||||||
|
code decisions. AI-generated content is never
|
||||||
|
presented as human-authored work.
|
||||||
|
|
@ -0,0 +1,76 @@
|
||||||
|
ok now i want you to take a look at the most recent commit adding
|
||||||
|
a `tpt_bind_addrs` to `open_root_actor()` and extend the existing
|
||||||
|
tests/discovery/test_multiaddr* and friends to use this new param in
|
||||||
|
at least one suite with parametrizations over,
|
||||||
|
|
||||||
|
- `registry_addrs == tpt_bind_addrs`, as in both inputs are the same.
|
||||||
|
- `set(registry_addrs) >= set(tpt_bind_addrs)`, as in the registry
|
||||||
|
addrs include the bind set.
|
||||||
|
- `registry_addrs != tpt_bind_addrs`, where the reg set is disjoint from
|
||||||
|
the bind set in all possible combos you can imagine.
|
||||||
|
|
||||||
|
All of the ^above cases should further be parametrized over,
|
||||||
|
- the root being the registrar,
|
||||||
|
- a non-registrar root using our bg `daemon` fixture.
|
||||||
|
|
||||||
|
once we have a fairly thorough test suite and have flushed out all
|
||||||
|
bugs and edge cases we want to design a wrapping API which allows
|
||||||
|
declaring full trees of actors' tpt endpoints using multiaddrs such
|
||||||
|
that a `dict[str, list[str]]` of actor-name -> multiaddr can be used
|
||||||
|
to configure a tree of actors-as-services given such an input
|
||||||
|
"endpoints-table" can be matched with the number of appropriately
|
||||||
|
named subactor spawns in a `tractor` user-app.
|
||||||
|
|
||||||
|
Here is a small example from piker,
|
||||||
|
|
||||||
|
- in piker's root conf.toml we define a `[network]` section which can
|
||||||
|
define various actor-service-daemon names set to a maddr
|
||||||
|
(multiaddress str).
|
||||||
|
|
||||||
|
- each actor whether part of the `pikerd` tree (as a sub) or spawned
|
||||||
|
in other non-registrar rooted trees (such as `piker chart`) should be
|
||||||
|
configurable in terms of its `tractor` tpt bind addresses via
|
||||||
|
a simple service lookup table,
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[network]
|
||||||
|
pikerd = [
|
||||||
|
'/ip4/127.0.0.1/tcp/6116', # std localhost daemon-actor tree
|
||||||
|
'/uds/run/user/1000/piker/pikerd@6116.sock', # same but serving UDS
|
||||||
|
]
|
||||||
|
chart = [
|
||||||
|
'/ip4/127.0.0.1/tcp/3333', # std localhost daemon-actor tree
|
||||||
|
'/uds/run/user/1000/piker/chart@3333.sock',
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
We should take whatever common API is needed to support this and
|
||||||
|
distill it into a
|
||||||
|
```python
|
||||||
|
tractor.discovery.parse_endpoints(
|
||||||
|
) -> dict[
|
||||||
|
str,
|
||||||
|
list[Address]
|
||||||
|
|dict[str, list[Address]]
|
||||||
|
# ^recursive case, see below
|
||||||
|
]:
|
||||||
|
```
|
||||||
|
|
||||||
|
style API which can,
|
||||||
|
|
||||||
|
- be re-used easily across dependent projects.
|
||||||
|
- correctly raise tpt-backend support errors when a maddr specifying
|
||||||
|
an unsupported proto is passed.
|
||||||
|
- be used to handle "tunnelled" maddrs per
|
||||||
|
https://github.com/multiformats/py-multiaddr/#tunneling such that
|
||||||
|
for any such tunneled maddr-`str`-entry we deliver a data-structure
|
||||||
|
which can easily be passed to nested `@acm`s which consecutively
|
||||||
|
setup nested net bindspaces for binding the endpoint addrs using
|
||||||
|
a combo of our `.ipc.*` machinery and, say for example something like
|
||||||
|
https://github.com/svinota/pyroute2, more precisely say for
|
||||||
|
managing tunnelled wireguard eps within network-namespaces,
|
||||||
|
* https://docs.pyroute2.org/
|
||||||
|
* https://docs.pyroute2.org/netns.html
|
||||||
|
|
||||||
|
remember to include use of all default `.claude/skills` throughout
|
||||||
|
this work!
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
This is your first big boi, "from GH issue" design, plan and
|
||||||
|
implement task.
|
||||||
|
|
||||||
|
We need to try and add sub-interpreter (aka subint) support per the
|
||||||
|
issue,
|
||||||
|
|
||||||
|
https://github.com/goodboy/tractor/issues/379
|
||||||
|
|
||||||
|
Part of this work should include,
|
||||||
|
|
||||||
|
- modularizing and thus better organizing the `.spawn.*` subpkg by
|
||||||
|
breaking up various backends currently in `spawn._spawn` into
|
||||||
|
separate submods where it makes sense.
|
||||||
|
|
||||||
|
- add a new `._subint` backend which tries to keep as much of the
|
||||||
|
inter-process-isolation machinery in use as possible but with plans
|
||||||
|
to optimize for localhost only benefits as offered by python's
|
||||||
|
subints where possible.
|
||||||
|
|
||||||
|
* utilizing localhost-only tpts like UDS, shm-buffers for
|
||||||
|
performant IPC between subactors but also leveraging the benefits from
|
||||||
|
the traditional OS subprocs mem/storage-domain isolation, linux
|
||||||
|
namespaces where possible and as available/permitted by whatever
|
||||||
|
is happening under the hood with how cpython implements subints.
|
||||||
|
|
||||||
|
* default configuration should encourage state isolation as with
|
||||||
|
subprocs, but explicit public escape hatches to enable rigorously
|
||||||
|
managed shm channels for high performance apps.
|
||||||
|
|
||||||
|
- all tests should be (able to be) parameterized to use the new
|
||||||
|
`subints` backend and enabled by flag in the harness using the
|
||||||
|
existing `pytest --spawn-backend <spawn-backend>` support offered in
|
||||||
|
the `open_root_actor()` and `.testing._pytest` harness override
|
||||||
|
fixture.
|
||||||
|
|
@ -420,20 +420,17 @@ Check out our experimental system for `guest`_-mode controlled
|
||||||
|
|
||||||
|
|
||||||
async def aio_echo_server(
|
async def aio_echo_server(
|
||||||
to_trio: trio.MemorySendChannel,
|
chan: tractor.to_asyncio.LinkedTaskChannel,
|
||||||
from_trio: asyncio.Queue,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# a first message must be sent **from** this ``asyncio``
|
# a first message must be sent **from** this ``asyncio``
|
||||||
# task or the ``trio`` side will never unblock from
|
# task or the ``trio`` side will never unblock from
|
||||||
# ``tractor.to_asyncio.open_channel_from():``
|
# ``tractor.to_asyncio.open_channel_from():``
|
||||||
to_trio.send_nowait('start')
|
chan.started_nowait('start')
|
||||||
|
|
||||||
# XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
|
|
||||||
# should probably offer something better.
|
|
||||||
while True:
|
while True:
|
||||||
# echo the msg back
|
# echo the msg back
|
||||||
to_trio.send_nowait(await from_trio.get())
|
chan.send_nowait(await chan.get())
|
||||||
await asyncio.sleep(0)
|
await asyncio.sleep(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -445,7 +442,7 @@ Check out our experimental system for `guest`_-mode controlled
|
||||||
# message.
|
# message.
|
||||||
async with tractor.to_asyncio.open_channel_from(
|
async with tractor.to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (first, chan):
|
) as (chan, first):
|
||||||
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
await ctx.started(first)
|
await ctx.started(first)
|
||||||
|
|
@ -504,8 +501,10 @@ Yes, we spawn a python process, run ``asyncio``, start ``trio`` on the
|
||||||
``asyncio`` loop, then send commands to the ``trio`` scheduled tasks to
|
``asyncio`` loop, then send commands to the ``trio`` scheduled tasks to
|
||||||
tell ``asyncio`` tasks what to do XD
|
tell ``asyncio`` tasks what to do XD
|
||||||
|
|
||||||
We need help refining the `asyncio`-side channel API to be more
|
The ``asyncio``-side task receives a single
|
||||||
`trio`-like. Feel free to sling your opinion in `#273`_!
|
``chan: LinkedTaskChannel`` handle providing a ``trio``-like
|
||||||
|
API: ``.started_nowait()``, ``.send_nowait()``, ``.get()``
|
||||||
|
and more. Feel free to sling your opinion in `#273`_!
|
||||||
|
|
||||||
|
|
||||||
.. _#273: https://github.com/goodboy/tractor/issues/273
|
.. _#273: https://github.com/goodboy/tractor/issues/273
|
||||||
|
|
@ -641,13 +640,15 @@ Help us push toward the future of distributed `Python`.
|
||||||
- Typed capability-based (dialog) protocols ( see `#196
|
- Typed capability-based (dialog) protocols ( see `#196
|
||||||
<https://github.com/goodboy/tractor/issues/196>`_ with draft work
|
<https://github.com/goodboy/tractor/issues/196>`_ with draft work
|
||||||
started in `#311 <https://github.com/goodboy/tractor/pull/311>`_)
|
started in `#311 <https://github.com/goodboy/tractor/pull/311>`_)
|
||||||
- We **recently disabled CI-testing on windows** and need help getting
|
- **macOS is now officially supported** and tested in CI
|
||||||
it running again! (see `#327
|
alongside Linux!
|
||||||
<https://github.com/goodboy/tractor/pull/327>`_). **We do have windows
|
- We **recently disabled CI-testing on windows** and need
|
||||||
support** (and have for quite a while) but since no active hacker
|
help getting it running again! (see `#327
|
||||||
exists in the user-base to help test on that OS, for now we're not
|
<https://github.com/goodboy/tractor/pull/327>`_). **We do
|
||||||
actively maintaining testing due to the added hassle and general
|
have windows support** (and have for quite a while) but
|
||||||
latency..
|
since no active hacker exists in the user-base to help
|
||||||
|
test on that OS, for now we're not actively maintaining
|
||||||
|
testing due to the added hassle and general latency..
|
||||||
|
|
||||||
|
|
||||||
Feel like saying hi?
|
Feel like saying hi?
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ from tractor import (
|
||||||
MsgStream,
|
MsgStream,
|
||||||
_testing,
|
_testing,
|
||||||
trionics,
|
trionics,
|
||||||
|
TransportClosed,
|
||||||
)
|
)
|
||||||
import trio
|
import trio
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -208,11 +209,15 @@ async def main(
|
||||||
# TODO: is this needed or no?
|
# TODO: is this needed or no?
|
||||||
raise
|
raise
|
||||||
|
|
||||||
except trio.ClosedResourceError:
|
except (
|
||||||
|
trio.ClosedResourceError,
|
||||||
|
TransportClosed,
|
||||||
|
) as _tpt_err:
|
||||||
# NOTE: don't send if we already broke the
|
# NOTE: don't send if we already broke the
|
||||||
# connection to avoid raising a closed-error
|
# connection to avoid raising a closed-error
|
||||||
# such that we drop through to the ctl-c
|
# such that we drop through to the ctl-c
|
||||||
# mashing by user.
|
# mashing by user.
|
||||||
|
with trio.CancelScope(shield=True):
|
||||||
await trio.sleep(0.01)
|
await trio.sleep(0.01)
|
||||||
|
|
||||||
# timeout: int = 1
|
# timeout: int = 1
|
||||||
|
|
@ -247,6 +252,7 @@ async def main(
|
||||||
await stream.send(i)
|
await stream.send(i)
|
||||||
pytest.fail('stream not closed?')
|
pytest.fail('stream not closed?')
|
||||||
except (
|
except (
|
||||||
|
TransportClosed,
|
||||||
trio.ClosedResourceError,
|
trio.ClosedResourceError,
|
||||||
trio.EndOfChannel,
|
trio.EndOfChannel,
|
||||||
) as send_err:
|
) as send_err:
|
||||||
|
|
|
||||||
|
|
@ -18,15 +18,14 @@ async def aio_sleep_forever():
|
||||||
|
|
||||||
|
|
||||||
async def bp_then_error(
|
async def bp_then_error(
|
||||||
to_trio: trio.MemorySendChannel,
|
chan: to_asyncio.LinkedTaskChannel,
|
||||||
from_trio: asyncio.Queue,
|
|
||||||
|
|
||||||
raise_after_bp: bool = True,
|
raise_after_bp: bool = True,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# sync with `trio`-side (caller) task
|
# sync with `trio`-side (caller) task
|
||||||
to_trio.send_nowait('start')
|
chan.started_nowait('start')
|
||||||
|
|
||||||
# NOTE: what happens here inside the hook needs some refinement..
|
# NOTE: what happens here inside the hook needs some refinement..
|
||||||
# => seems like it's still `.debug._set_trace()` but
|
# => seems like it's still `.debug._set_trace()` but
|
||||||
|
|
@ -60,7 +59,7 @@ async def trio_ctx(
|
||||||
to_asyncio.open_channel_from(
|
to_asyncio.open_channel_from(
|
||||||
bp_then_error,
|
bp_then_error,
|
||||||
# raise_after_bp=not bp_before_started,
|
# raise_after_bp=not bp_before_started,
|
||||||
) as (first, chan),
|
) as (chan, first),
|
||||||
|
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
):
|
):
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ async def sleep(
|
||||||
|
|
||||||
|
|
||||||
async def open_ctx(
|
async def open_ctx(
|
||||||
n: tractor._supervise.ActorNursery
|
n: tractor.runtime._supervise.ActorNursery
|
||||||
):
|
):
|
||||||
|
|
||||||
# spawn both actors
|
# spawn both actors
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ Verify we can dump a `stackscope` tree on a hang.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
import os
|
import os
|
||||||
|
import platform
|
||||||
import signal
|
import signal
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
|
|
@ -31,13 +32,26 @@ async def main(
|
||||||
from_test: bool = False,
|
from_test: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
|
if platform.system() != 'Darwin':
|
||||||
|
tpt = 'uds'
|
||||||
|
else:
|
||||||
|
# XXX, precisely we can't use pytest's tmp-path generation
|
||||||
|
# for tests.. apparently because:
|
||||||
|
#
|
||||||
|
# > The OSError: AF_UNIX path too long in macOS Python occurs
|
||||||
|
# > because the path to the Unix domain socket exceeds the
|
||||||
|
# > operating system's maximum path length limit (around 104
|
||||||
|
#
|
||||||
|
# WHICH IS just, wtf hillarious XD
|
||||||
|
tpt = 'tcp'
|
||||||
|
|
||||||
async with (
|
async with (
|
||||||
tractor.open_nursery(
|
tractor.open_nursery(
|
||||||
debug_mode=True,
|
debug_mode=True,
|
||||||
enable_stack_on_sig=True,
|
enable_stack_on_sig=True,
|
||||||
# maybe_enable_greenback=False,
|
# maybe_enable_greenback=False,
|
||||||
loglevel='devx',
|
loglevel='devx',
|
||||||
enable_transports=['uds'],
|
enable_transports=[tpt],
|
||||||
) as an,
|
) as an,
|
||||||
):
|
):
|
||||||
ptl: tractor.Portal = await an.start_actor(
|
ptl: tractor.Portal = await an.start_actor(
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
import platform
|
||||||
|
|
||||||
import tractor
|
import tractor
|
||||||
import trio
|
import trio
|
||||||
|
|
||||||
|
|
@ -34,9 +36,22 @@ async def just_bp(
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
|
if platform.system() != 'Darwin':
|
||||||
|
tpt = 'uds'
|
||||||
|
else:
|
||||||
|
# XXX, precisely we can't use pytest's tmp-path generation
|
||||||
|
# for tests.. apparently because:
|
||||||
|
#
|
||||||
|
# > The OSError: AF_UNIX path too long in macOS Python occurs
|
||||||
|
# > because the path to the Unix domain socket exceeds the
|
||||||
|
# > operating system's maximum path length limit (around 104
|
||||||
|
#
|
||||||
|
# WHICH IS just, wtf hillarious XD
|
||||||
|
tpt = 'tcp'
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=True,
|
debug_mode=True,
|
||||||
enable_transports=['uds'],
|
enable_transports=[tpt],
|
||||||
loglevel='devx',
|
loglevel='devx',
|
||||||
) as n:
|
) as n:
|
||||||
p = await n.start_actor(
|
p = await n.start_actor(
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ async def main() -> list[int]:
|
||||||
# yes, a nursery which spawns `trio`-"actors" B)
|
# yes, a nursery which spawns `trio`-"actors" B)
|
||||||
an: ActorNursery
|
an: ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
loglevel='cancel',
|
loglevel='error',
|
||||||
# debug_mode=True,
|
# debug_mode=True,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
||||||
|
|
@ -118,8 +118,10 @@ async def main() -> list[int]:
|
||||||
cancelled: bool = await portal.cancel_actor()
|
cancelled: bool = await portal.cancel_actor()
|
||||||
assert cancelled
|
assert cancelled
|
||||||
|
|
||||||
print(f"STREAM TIME = {time.time() - start}")
|
print(
|
||||||
print(f"STREAM + SPAWN TIME = {time.time() - pre_start}")
|
f"STREAM TIME = {time.time() - start}\n"
|
||||||
|
f"STREAM + SPAWN TIME = {time.time() - pre_start}\n"
|
||||||
|
)
|
||||||
assert result_stream == list(range(seed))
|
assert result_stream == list(range(seed))
|
||||||
return result_stream
|
return result_stream
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,21 +11,17 @@ import tractor
|
||||||
|
|
||||||
|
|
||||||
async def aio_echo_server(
|
async def aio_echo_server(
|
||||||
to_trio: trio.MemorySendChannel,
|
chan: tractor.to_asyncio.LinkedTaskChannel,
|
||||||
from_trio: asyncio.Queue,
|
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# a first message must be sent **from** this ``asyncio``
|
# a first message must be sent **from** this ``asyncio``
|
||||||
# task or the ``trio`` side will never unblock from
|
# task or the ``trio`` side will never unblock from
|
||||||
# ``tractor.to_asyncio.open_channel_from():``
|
# ``tractor.to_asyncio.open_channel_from():``
|
||||||
to_trio.send_nowait('start')
|
chan.started_nowait('start')
|
||||||
|
|
||||||
# XXX: this uses an ``from_trio: asyncio.Queue`` currently but we
|
|
||||||
# should probably offer something better.
|
|
||||||
while True:
|
while True:
|
||||||
# echo the msg back
|
# echo the msg back
|
||||||
to_trio.send_nowait(await from_trio.get())
|
chan.send_nowait(await chan.get())
|
||||||
await asyncio.sleep(0)
|
await asyncio.sleep(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -37,7 +33,7 @@ async def trio_to_aio_echo_server(
|
||||||
# message.
|
# message.
|
||||||
async with tractor.to_asyncio.open_channel_from(
|
async with tractor.to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (first, chan):
|
) as (chan, first):
|
||||||
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
await ctx.started(first)
|
await ctx.started(first)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
async def child_fn() -> str:
    """
    Report success from the child, tagged with the pid of the
    process this coroutine runs in.

    """
    greeting: str = f"child OK pid={os.getpid()}"
    return greeting
|
||||||
|
|
@ -0,0 +1,50 @@
|
||||||
|
"""
|
||||||
|
Integration test: spawning tractor actors from an MPI process.
|
||||||
|
|
||||||
|
When a parent is launched via ``mpirun``, Open MPI sets ``OMPI_*`` env
|
||||||
|
vars that bind ``MPI_Init`` to the ``orted`` daemon. Tractor children
|
||||||
|
inherit those env vars, so if ``inherit_parent_main=True`` (the default)
|
||||||
|
the child re-executes ``__main__``, re-imports ``mpi4py``, and
|
||||||
|
``MPI_Init_thread`` fails because the child was never spawned by
|
||||||
|
``orted``::
|
||||||
|
|
||||||
|
getting local rank failed
|
||||||
|
--> Returned value No permission (-17) instead of ORTE_SUCCESS
|
||||||
|
|
||||||
|
Passing ``inherit_parent_main=False`` and placing RPC functions in a
|
||||||
|
separate importable module (``_child``) avoids the re-import entirely.
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
mpirun --allow-run-as-root -np 1 python -m \
|
||||||
|
examples.integration.mpi4py.inherit_parent_main
|
||||||
|
"""
|
||||||
|
|
||||||
|
from mpi4py import MPI
|
||||||
|
|
||||||
|
import os
|
||||||
|
import trio
|
||||||
|
import tractor
|
||||||
|
|
||||||
|
from ._child import child_fn
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """
    Print this MPI parent's rank + pid, start a single sub-actor,
    run ``child_fn`` in it, print the result and cancel the child.

    """
    rank = MPI.COMM_WORLD.Get_rank()
    print(
        f"[parent] rank={rank} pid={os.getpid()}",
        flush=True,
    )

    async with tractor.open_nursery(start_method='trio') as nursery:
        child_portal = await nursery.start_actor(
            'mpi-child',
            enable_modules=[child_fn.__module__],
            # Without this the child replays __main__, which
            # re-imports mpi4py and crashes on MPI_Init.
            inherit_parent_main=False,
        )

        result = await child_portal.run(child_fn)
        print(
            f"[parent] got: {result}",
            flush=True,
        )

        await child_portal.cancel_actor()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
trio.run(main)
|
||||||
|
|
@ -10,7 +10,7 @@ async def main(service_name):
|
||||||
await an.start_actor(service_name)
|
await an.start_actor(service_name)
|
||||||
|
|
||||||
async with tractor.get_registry() as portal:
|
async with tractor.get_registry() as portal:
|
||||||
print(f"Arbiter is listening on {portal.channel}")
|
print(f"Registrar is listening on {portal.channel}")
|
||||||
|
|
||||||
async with tractor.wait_for_actor(service_name) as sockaddr:
|
async with tractor.wait_for_actor(service_name) as sockaddr:
|
||||||
print(f"my_service is found at {sockaddr}")
|
print(f"my_service is found at {sockaddr}")
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
{
|
||||||
|
"nodes": {
|
||||||
|
"nixpkgs": {
|
||||||
|
"locked": {
|
||||||
|
"lastModified": 1769018530,
|
||||||
|
"narHash": "sha256-MJ27Cy2NtBEV5tsK+YraYr2g851f3Fl1LpNHDzDX15c=",
|
||||||
|
"owner": "nixos",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"rev": "88d3861acdd3d2f0e361767018218e51810df8a1",
|
||||||
|
"type": "github"
|
||||||
|
},
|
||||||
|
"original": {
|
||||||
|
"owner": "nixos",
|
||||||
|
"ref": "nixos-unstable",
|
||||||
|
"repo": "nixpkgs",
|
||||||
|
"type": "github"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": {
|
||||||
|
"inputs": {
|
||||||
|
"nixpkgs": "nixpkgs"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"root": "root",
|
||||||
|
"version": 7
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,70 @@
|
||||||
|
# An "impure" template thx to `pyproject.nix`,
|
||||||
|
# https://pyproject-nix.github.io/pyproject.nix/templates.html#impure
|
||||||
|
# https://github.com/pyproject-nix/pyproject.nix/blob/master/templates/impure/flake.nix
|
||||||
|
{
|
||||||
|
description = "An impure overlay (w dev-shell) using `uv`";
|
||||||
|
|
||||||
|
inputs = {
|
||||||
|
nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
|
||||||
|
};
|
||||||
|
|
||||||
|
outputs =
|
||||||
|
{ nixpkgs, ... }:
|
||||||
|
let
|
||||||
|
inherit (nixpkgs) lib;
|
||||||
|
forAllSystems = lib.genAttrs lib.systems.flakeExposed;
|
||||||
|
in
|
||||||
|
{
|
||||||
|
devShells = forAllSystems (
|
||||||
|
system:
|
||||||
|
let
|
||||||
|
pkgs = nixpkgs.legacyPackages.${system};
|
||||||
|
|
||||||
|
# XXX NOTE XXX, for now we overlay specific pkgs via
|
||||||
|
# a major-version-pinned-`cpython`
|
||||||
|
cpython = "python313";
|
||||||
|
venv_dir = "py313";
|
||||||
|
pypkgs = pkgs."${cpython}Packages";
|
||||||
|
in
|
||||||
|
{
|
||||||
|
default = pkgs.mkShell {
|
||||||
|
|
||||||
|
packages = [
|
||||||
|
# XXX, ensure sh completions activate!
|
||||||
|
pkgs.bashInteractive
|
||||||
|
pkgs.bash-completion
|
||||||
|
|
||||||
|
# XXX, on nix(os), use pkgs version to avoid
|
||||||
|
# build/sys-sh-integration issues
|
||||||
|
pkgs.ruff
|
||||||
|
|
||||||
|
pkgs.uv
|
||||||
|
pkgs.${cpython}# ?TODO^ how to set from `cpython` above?
|
||||||
|
];
|
||||||
|
|
||||||
|
shellHook = ''
|
||||||
|
# unmask to debug **this** dev-shell-hook
|
||||||
|
# set -e
|
||||||
|
|
||||||
|
# link-in c++ stdlib for various AOT-ext-pkgs (numpy, etc.)
|
||||||
|
LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib:$LD_LIBRARY_PATH"
|
||||||
|
|
||||||
|
export LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
# RUNTIME-SETTINGS
|
||||||
|
# ------ uv ------
|
||||||
|
# - always use the ./py313/ venv-subdir
|
||||||
|
# - sync env with all extras
|
||||||
|
export UV_PROJECT_ENVIRONMENT=${venv_dir}
|
||||||
|
uv sync --dev --all-extras
|
||||||
|
|
||||||
|
# ------ TIPS ------
|
||||||
|
# NOTE, to launch the py-venv installed `xonsh` (like @goodboy)
|
||||||
|
# run the `nix develop` cmd with,
|
||||||
|
# >> nix develop -c uv run xonsh
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
|
);
|
||||||
|
};
|
||||||
|
}
|
||||||
115
pyproject.toml
115
pyproject.toml
|
|
@ -9,7 +9,7 @@ name = "tractor"
|
||||||
version = "0.1.0a6dev0"
|
version = "0.1.0a6dev0"
|
||||||
description = 'structured concurrent `trio`-"actors"'
|
description = 'structured concurrent `trio`-"actors"'
|
||||||
authors = [{ name = "Tyler Goodlet", email = "goodboy_foss@protonmail.com" }]
|
authors = [{ name = "Tyler Goodlet", email = "goodboy_foss@protonmail.com" }]
|
||||||
requires-python = ">= 3.11"
|
requires-python = ">=3.13, <3.15"
|
||||||
readme = "docs/README.rst"
|
readme = "docs/README.rst"
|
||||||
license = "AGPL-3.0-or-later"
|
license = "AGPL-3.0-or-later"
|
||||||
keywords = [
|
keywords = [
|
||||||
|
|
@ -24,11 +24,14 @@ keywords = [
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Development Status :: 3 - Alpha",
|
"Development Status :: 3 - Alpha",
|
||||||
"Operating System :: POSIX :: Linux",
|
"Operating System :: POSIX :: Linux",
|
||||||
|
"Operating System :: MacOS",
|
||||||
"Framework :: Trio",
|
"Framework :: Trio",
|
||||||
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
|
"License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
|
||||||
"Programming Language :: Python :: Implementation :: CPython",
|
"Programming Language :: Python :: Implementation :: CPython",
|
||||||
"Programming Language :: Python :: 3 :: Only",
|
"Programming Language :: Python :: 3 :: Only",
|
||||||
"Programming Language :: Python :: 3.11",
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
|
"Programming Language :: Python :: 3.14",
|
||||||
"Topic :: System :: Distributed Computing",
|
"Topic :: System :: Distributed Computing",
|
||||||
]
|
]
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
|
@ -41,49 +44,99 @@ dependencies = [
|
||||||
"tricycle>=0.4.1,<0.5",
|
"tricycle>=0.4.1,<0.5",
|
||||||
"wrapt>=1.16.0,<2",
|
"wrapt>=1.16.0,<2",
|
||||||
"colorlog>=6.8.2,<7",
|
"colorlog>=6.8.2,<7",
|
||||||
|
|
||||||
# built-in multi-actor `pdb` REPL
|
# built-in multi-actor `pdb` REPL
|
||||||
"pdbp>=1.6,<2", # windows only (from `pdbp`)
|
"pdbp>=1.8.2,<2", # windows only (from `pdbp`)
|
||||||
|
|
||||||
# typed IPC msging
|
# typed IPC msging
|
||||||
"msgspec>=0.19.0",
|
"msgspec>=0.20.0",
|
||||||
"cffi>=1.17.1",
|
|
||||||
"bidict>=0.23.1",
|
"bidict>=0.23.1",
|
||||||
|
"multiaddr>=0.2.0",
|
||||||
|
"platformdirs>=4.4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
# ------ project ------
|
# ------ project ------
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
dev = [
|
dev = [
|
||||||
|
{include-group = 'devx'},
|
||||||
|
{include-group = 'testing'},
|
||||||
|
{include-group = 'repl'},
|
||||||
|
]
|
||||||
|
devx = [
|
||||||
|
# `tractor.devx` tooling
|
||||||
|
"stackscope>=0.2.2,<0.3",
|
||||||
|
# ^ requires this?
|
||||||
|
"typing-extensions>=4.14.1",
|
||||||
|
# {include-group = 'sync_pause'}, # XXX, no 3.14 yet!
|
||||||
|
]
|
||||||
|
sync_pause = [
|
||||||
|
"greenback>=1.2.1,<2", # TODO? 3.14 greenlet on nix?
|
||||||
|
]
|
||||||
|
testing = [
|
||||||
# test suite
|
# test suite
|
||||||
# TODO: maybe some of these layout choices?
|
# TODO: maybe some of these layout choices?
|
||||||
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
||||||
"pytest>=8.3.5",
|
"pytest>=8.3.5",
|
||||||
"pexpect>=4.9.0,<5",
|
"pexpect>=4.9.0,<5",
|
||||||
# `tractor.devx` tooling
|
# per-test wall-clock bound (used via
|
||||||
"greenback>=1.2.1,<2",
|
# `@pytest.mark.timeout(..., method='thread')` on the
|
||||||
"stackscope>=0.2.2,<0.3",
|
# known-hanging `subint`-backend audit tests; see
|
||||||
# ^ requires this?
|
# `ai/conc-anal/subint_*_issue.md`).
|
||||||
"typing-extensions>=4.14.1",
|
"pytest-timeout>=2.3",
|
||||||
|
]
|
||||||
|
repl = [
|
||||||
"pyperclip>=1.9.0",
|
"pyperclip>=1.9.0",
|
||||||
"prompt-toolkit>=3.0.50",
|
"prompt-toolkit>=3.0.50",
|
||||||
"xonsh>=0.19.2",
|
"xonsh>=0.22.8",
|
||||||
"psutil>=7.0.0",
|
"psutil>=7.0.0",
|
||||||
]
|
]
|
||||||
|
lint = [
|
||||||
|
"ruff>=0.9.6"
|
||||||
|
]
|
||||||
|
# XXX, used for linux-only hi perf eventfd+shm channels
|
||||||
|
# now mostly moved over to `hotbaud`.
|
||||||
|
eventfd = [
|
||||||
|
"cffi>=1.17.1",
|
||||||
|
]
|
||||||
|
subints = [
|
||||||
|
"msgspec>=0.21.0",
|
||||||
|
]
|
||||||
# TODO, add these with sane versions; were originally in
|
# TODO, add these with sane versions; were originally in
|
||||||
# `requirements-docs.txt`..
|
# `requirements-docs.txt`..
|
||||||
# docs = [
|
# docs = [
|
||||||
# "sphinx>="
|
# "sphinx>="
|
||||||
# "sphinx_book_theme>="
|
# "sphinx_book_theme>="
|
||||||
# ]
|
# ]
|
||||||
|
|
||||||
# ------ dependency-groups ------
|
# ------ dependency-groups ------
|
||||||
|
|
||||||
# ------ dependency-groups ------
|
[tool.uv.dependency-groups]
|
||||||
|
# for subints, we require 3.14+ due to 2 issues,
|
||||||
|
# - hanging behaviour for various multi-task teardown cases (see
|
||||||
|
# "Availability" section in the `tractor.spawn._subints` doc string).
|
||||||
|
# - `msgspec` support which is outstanding per PEP 684 upstream tracker:
|
||||||
|
# https://github.com/jcrist/msgspec/issues/563
|
||||||
|
#
|
||||||
|
# https://docs.astral.sh/uv/concepts/projects/dependencies/#group-requires-python
|
||||||
|
subints = {requires-python = ">=3.14"}
|
||||||
|
eventfd = {requires-python = ">=3.13, <3.14"}
|
||||||
|
sync_pause = {requires-python = ">=3.13, <3.14"}
|
||||||
|
|
||||||
[tool.uv.sources]
|
[tool.uv.sources]
|
||||||
# XXX NOTE, only for @goodboy's hacking on `pprint(sort_dicts=False)`
|
# XXX NOTE, only for @goodboy's hacking on `pprint(sort_dicts=False)`
|
||||||
# for the `pp` alias..
|
# for the `pp` alias..
|
||||||
# pdbp = { path = "../pdbp", editable = true }
|
# ------ gh upstream ------
|
||||||
|
# xonsh = { git = 'https://github.com/anki-code/xonsh.git', branch = 'prompt_next_suggestion' }
|
||||||
|
# ^ https://github.com/xonsh/xonsh/pull/6048
|
||||||
|
# xonsh = { git = 'https://github.com/xonsh/xonsh.git', branch = 'main' }
|
||||||
|
xonsh = { path = "../xonsh", editable = true }
|
||||||
|
|
||||||
|
# [tool.uv.sources.pdbp]
|
||||||
|
# XXX, in case we need to tmp patch again.
|
||||||
|
# git = "https://github.com/goodboy/pdbp.git"
|
||||||
|
# branch ="repair_stack_trace_frame_indexing"
|
||||||
|
# path = "../pdbp"
|
||||||
|
# editable = true
|
||||||
|
|
||||||
# ------ tool.uv.sources ------
|
# ------ tool.uv.sources ------
|
||||||
# TODO, distributed (multi-host) extensions
|
# TODO, distributed (multi-host) extensions
|
||||||
|
|
@ -145,6 +198,8 @@ all_bullets = true
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
minversion = '6.0'
|
minversion = '6.0'
|
||||||
|
timeout = 200 # per-test hard limit
|
||||||
|
# https://docs.pytest.org/en/stable/reference/reference.html#configuration-options
|
||||||
testpaths = [
|
testpaths = [
|
||||||
'tests'
|
'tests'
|
||||||
]
|
]
|
||||||
|
|
@ -155,10 +210,40 @@ addopts = [
|
||||||
'--import-mode=importlib',
|
'--import-mode=importlib',
|
||||||
# don't show frickin captured logs AGAIN in the report..
|
# don't show frickin captured logs AGAIN in the report..
|
||||||
'--show-capture=no',
|
'--show-capture=no',
|
||||||
|
|
||||||
|
# sys-level capture. REQUIRED for fork-based spawn
|
||||||
|
# backends (e.g. `subint_forkserver`): default
|
||||||
|
# `--capture=fd` redirects fd 1,2 to temp files, and fork
|
||||||
|
# children inherit those fds — opaque deadlocks happen in
|
||||||
|
# the pytest-capture-machinery ↔ fork-child stdio
|
||||||
|
# interaction. `--capture=sys` only redirects Python-level
|
||||||
|
# `sys.stdout`/`sys.stderr`, leaving fd 1,2 alone.
|
||||||
|
#
|
||||||
|
# Trade-off (vs. `--capture=fd`):
|
||||||
|
# - LOST: per-test attribution of subactor *raw-fd* output
|
||||||
|
# (C-ext writes, `os.write(2, ...)`, subproc stdout). Not
|
||||||
|
# zero — those go to the terminal, captured by CI's
|
||||||
|
# terminal-level capture, just not per-test-scoped in the
|
||||||
|
# pytest failure report.
|
||||||
|
# - KEPT: Python-level `print()` + `logging` capture per-
|
||||||
|
# test (tractor's logger uses `sys.stderr`, so tractor
|
||||||
|
# log output IS still attributed per-test).
|
||||||
|
# - KEPT: user `pytest -s` for debugging (unaffected).
|
||||||
|
#
|
||||||
|
# Full post-mortem in
|
||||||
|
# `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`.
|
||||||
|
'--capture=sys',
|
||||||
|
|
||||||
|
# disable `xonsh` plugin
|
||||||
|
# https://docs.pytest.org/en/stable/how-to/plugins.html#disabling-plugins-from-autoloading
|
||||||
|
# https://docs.pytest.org/en/stable/how-to/plugins.html#deactivating-unregistering-a-plugin-by-name
|
||||||
|
'-p no:xonsh'
|
||||||
]
|
]
|
||||||
log_cli = false
|
log_cli = false
|
||||||
# TODO: maybe some of these layout choices?
|
# TODO: maybe some of these layout choices?
|
||||||
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
# https://docs.pytest.org/en/8.0.x/explanation/goodpractices.html#choosing-a-test-layout-import-rules
|
||||||
# pythonpath = "src"
|
# pythonpath = "src"
|
||||||
|
|
||||||
|
# https://docs.pytest.org/en/stable/reference/reference.html#confval-console_output_style
|
||||||
|
console_output_style = 'progress'
|
||||||
# ------ tool.pytest ------
|
# ------ tool.pytest ------
|
||||||
|
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
# vim: ft=ini
|
|
||||||
# pytest.ini for tractor
|
|
||||||
|
|
||||||
[pytest]
|
|
||||||
# don't show frickin captured logs AGAIN in the report..
|
|
||||||
addopts = --show-capture='no'
|
|
||||||
log_cli = false
|
|
||||||
; minversion = 6.0
|
|
||||||
|
|
@ -35,8 +35,8 @@ exclude = [
|
||||||
line-length = 88
|
line-length = 88
|
||||||
indent-width = 4
|
indent-width = 4
|
||||||
|
|
||||||
# Assume Python 3.9
|
# assume latest minor cpython
|
||||||
target-version = "py311"
|
target-version = "py313"
|
||||||
|
|
||||||
[lint]
|
[lint]
|
||||||
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
|
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
|
||||||
|
|
|
||||||
|
|
@ -9,8 +9,11 @@ import os
|
||||||
import signal
|
import signal
|
||||||
import platform
|
import platform
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import tractor
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
examples_dir as examples_dir,
|
examples_dir as examples_dir,
|
||||||
tractor_test as tractor_test,
|
tractor_test as tractor_test,
|
||||||
|
|
@ -22,6 +25,8 @@ pytest_plugins: list[str] = [
|
||||||
'tractor._testing.pytest',
|
'tractor._testing.pytest',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
_ci_env: bool = os.environ.get('CI', False)
|
||||||
|
_non_linux: bool = platform.system() != 'Linux'
|
||||||
|
|
||||||
# Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
|
# Sending signal.SIGINT on subprocess fails on windows. Use CTRL_* alternatives
|
||||||
if platform.system() == 'Windows':
|
if platform.system() == 'Windows':
|
||||||
|
|
@ -34,9 +39,8 @@ else:
|
||||||
_INT_SIGNAL = signal.SIGINT
|
_INT_SIGNAL = signal.SIGINT
|
||||||
_INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
|
_INT_RETURN_CODE = 1 if sys.version_info < (3, 8) else -signal.SIGINT.value
|
||||||
_PROC_SPAWN_WAIT = (
|
_PROC_SPAWN_WAIT = (
|
||||||
0.6
|
2 if _ci_env
|
||||||
if sys.version_info < (3, 7)
|
else 1
|
||||||
else 0.4
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -44,6 +48,80 @@ no_windows = pytest.mark.skipif(
|
||||||
platform.system() == "Windows",
|
platform.system() == "Windows",
|
||||||
reason="Test is unsupported on windows",
|
reason="Test is unsupported on windows",
|
||||||
)
|
)
|
||||||
|
no_macos = pytest.mark.skipif(
|
||||||
|
platform.system() == "Darwin",
|
||||||
|
reason="Test is unsupported on MacOS",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cpu_state(
|
||||||
|
icpu: int = 0,
|
||||||
|
setting: Literal[
|
||||||
|
'scaling_governor',
|
||||||
|
'*_pstate_max_freq',
|
||||||
|
'scaling_max_freq',
|
||||||
|
# 'scaling_cur_freq',
|
||||||
|
] = '*_pstate_max_freq',
|
||||||
|
) -> tuple[
|
||||||
|
Path,
|
||||||
|
str|int,
|
||||||
|
]|None:
|
||||||
|
'''
|
||||||
|
Attempt to read the (first) CPU's setting according
|
||||||
|
to the set `setting` from under the file-sys,
|
||||||
|
|
||||||
|
/sys/devices/system/cpu/cpu0/cpufreq/{setting}
|
||||||
|
|
||||||
|
Useful to determine latency headroom for various perf affected
|
||||||
|
test suites.
|
||||||
|
|
||||||
|
'''
|
||||||
|
try:
|
||||||
|
# Read governor for core 0 (usually same for all)
|
||||||
|
setting_path: Path = list(
|
||||||
|
Path(f'/sys/devices/system/cpu/cpu{icpu}/cpufreq/')
|
||||||
|
.glob(f'{setting}')
|
||||||
|
)[0] # <- XXX must be single match!
|
||||||
|
with open(
|
||||||
|
setting_path,
|
||||||
|
'r',
|
||||||
|
) as f:
|
||||||
|
return (
|
||||||
|
setting_path,
|
||||||
|
f.read().strip(),
|
||||||
|
)
|
||||||
|
except (FileNotFoundError, IndexError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def cpu_scaling_factor() -> float:
    '''
    Return a latency-headroom multiplier (>= 1.0) reflecting how
    much to inflate time-limits when CPU-freq scaling is active on
    linux.

    The multiplier is derived from the ratio of the values read by
    `get_cpu_state(setting='scaling_max_freq')` and the default
    `get_cpu_state()` (the `*_pstate_max_freq` entry).

    When no usable scaling info is available (non-linux, missing
    sysfs, non-numeric or zero readings), returns 1.0 (i.e. no
    headroom adjustment needed).

    '''
    if _non_linux:
        return 1.

    mx = get_cpu_state()
    cur = get_cpu_state(setting='scaling_max_freq')
    if mx is None or cur is None:
        return 1.

    _mx_pth, max_freq = mx
    _cur_pth, cur_freq = cur
    try:
        cpu_scaled: float = int(cur_freq) / int(max_freq)
    except (ValueError, ZeroDivisionError):
        # unparsable or zero max-freq reading, no basis to scale on.
        return 1.

    if (
        cpu_scaled <= 0
        or
        cpu_scaled == 1.
    ):
        return 1.

    # never *shrink* a time-limit: without the clamp any ratio
    # above 0.5 would yield a multiplier below the documented 1.0
    # floor.
    return max(
        1.,
        1. / (
            cpu_scaled * 2  # <- bc likely "dual threaded"
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
def pytest_addoption(
|
def pytest_addoption(
|
||||||
|
|
@ -61,16 +139,54 @@ def pytest_addoption(
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='session', autouse=True)
|
@pytest.fixture(scope='session', autouse=True)
|
||||||
def loglevel(request):
|
def loglevel(
|
||||||
|
request: pytest.FixtureRequest,
|
||||||
|
) -> str:
|
||||||
import tractor
|
import tractor
|
||||||
orig = tractor.log._default_loglevel
|
orig = tractor.log._default_loglevel
|
||||||
level = tractor.log._default_loglevel = request.config.option.loglevel
|
level = tractor.log._default_loglevel = request.config.option.loglevel
|
||||||
tractor.log.get_console_log(level)
|
log = tractor.log.get_console_log(
|
||||||
|
level=level,
|
||||||
|
name='tractor', # <- enable root logger
|
||||||
|
)
|
||||||
|
log.info(
|
||||||
|
f'Test-harness set runtime loglevel: {level!r}\n'
|
||||||
|
)
|
||||||
yield level
|
yield level
|
||||||
tractor.log._default_loglevel = orig
|
tractor.log._default_loglevel = orig
|
||||||
|
|
||||||
|
|
||||||
_ci_env: bool = os.environ.get('CI', False)
|
@pytest.fixture(scope='function')
def test_log(
    request: pytest.FixtureRequest,
    loglevel: str,
) -> tractor.log.StackLevelAdapter:
    '''
    Yield a console-enabled logger named after the module of the
    requesting test function, for reporting from within test
    bodies and fixtures.

    Handy for example to report error cases from exception
    handlers via `test_log.exception()`.

    '''
    modname: str = request.function.__module__

    # logger keyed by the test's module name
    # (`pkg_name='tests'` intentionally left unset for now)
    log = tractor.log.get_logger(name=modname)

    # attach console output at the session's `loglevel`
    console_log = tractor.log.get_console_log(
        level=loglevel,
        logger=log,
        name=modname,
    )

    report: str = (
        f'In-test-logging requested\n'
        f'test_log.name: {log.name!r}\n'
        f'level: {loglevel!r}\n'
    )
    console_log.debug(report)

    yield console_log
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope='session')
|
@pytest.fixture(scope='session')
|
||||||
|
|
@ -85,15 +201,39 @@ def ci_env() -> bool:
|
||||||
def sig_prog(
|
def sig_prog(
|
||||||
proc: subprocess.Popen,
|
proc: subprocess.Popen,
|
||||||
sig: int,
|
sig: int,
|
||||||
canc_timeout: float = 0.1,
|
canc_timeout: float = 0.2,
|
||||||
|
tries: int = 3,
|
||||||
) -> int:
|
) -> int:
|
||||||
"Kill the actor-process with ``sig``."
|
'''
|
||||||
|
Kill the actor-process with `sig`.
|
||||||
|
|
||||||
|
Prefer to kill with the provided signal and
|
||||||
|
failing a `canc_timeout`, send a `SIGKILL`-like
|
||||||
|
to ensure termination.
|
||||||
|
|
||||||
|
'''
|
||||||
|
for i in range(tries):
|
||||||
proc.send_signal(sig)
|
proc.send_signal(sig)
|
||||||
|
if proc.poll() is None:
|
||||||
|
print(
|
||||||
|
f'WARNING, proc still alive after,\n'
|
||||||
|
f'canc_timeout={canc_timeout!r}\n'
|
||||||
|
f'sig={sig!r}\n'
|
||||||
|
f'\n'
|
||||||
|
f'{proc.args!r}\n'
|
||||||
|
)
|
||||||
time.sleep(canc_timeout)
|
time.sleep(canc_timeout)
|
||||||
if not proc.poll():
|
else:
|
||||||
# TODO: why sometimes does SIGINT not work on teardown?
|
# TODO: why sometimes does SIGINT not work on teardown?
|
||||||
# seems to happen only when trace logging enabled?
|
# seems to happen only when trace logging enabled?
|
||||||
|
if proc.poll() is None:
|
||||||
|
print(
|
||||||
|
f'XXX WARNING KILLING PROG WITH SIGINT XXX\n'
|
||||||
|
f'canc_timeout={canc_timeout!r}\n'
|
||||||
|
f'{proc.args!r}\n'
|
||||||
|
)
|
||||||
proc.send_signal(_KILL_SIGNAL)
|
proc.send_signal(_KILL_SIGNAL)
|
||||||
|
|
||||||
ret: int = proc.wait()
|
ret: int = proc.wait()
|
||||||
assert ret
|
assert ret
|
||||||
|
|
||||||
|
|
@ -106,6 +246,8 @@ def daemon(
|
||||||
testdir: pytest.Pytester,
|
testdir: pytest.Pytester,
|
||||||
reg_addr: tuple[str, int],
|
reg_addr: tuple[str, int],
|
||||||
tpt_proto: str,
|
tpt_proto: str,
|
||||||
|
ci_env: bool,
|
||||||
|
test_log: tractor.log.StackLevelAdapter,
|
||||||
|
|
||||||
) -> subprocess.Popen:
|
) -> subprocess.Popen:
|
||||||
'''
|
'''
|
||||||
|
|
@ -121,10 +263,12 @@ def daemon(
|
||||||
"import tractor; "
|
"import tractor; "
|
||||||
"tractor.run_daemon([], "
|
"tractor.run_daemon([], "
|
||||||
"registry_addrs={reg_addrs}, "
|
"registry_addrs={reg_addrs}, "
|
||||||
|
"enable_transports={enable_tpts}, "
|
||||||
"debug_mode={debug_mode}, "
|
"debug_mode={debug_mode}, "
|
||||||
"loglevel={ll})"
|
"loglevel={ll})"
|
||||||
).format(
|
).format(
|
||||||
reg_addrs=str([reg_addr]),
|
reg_addrs=str([reg_addr]),
|
||||||
|
enable_tpts=str([tpt_proto]),
|
||||||
ll="'{}'".format(loglevel) if loglevel else None,
|
ll="'{}'".format(loglevel) if loglevel else None,
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
)
|
)
|
||||||
|
|
@ -143,13 +287,25 @@ def daemon(
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# TODO! we should poll for the registry socket-bind to take place
|
||||||
|
# and only once that's done yield to the requester!
|
||||||
|
# -[ ] TCP: use the `._root.open_root_actor()`::`ping_tpt_socket()`
|
||||||
|
# closure!
|
||||||
|
# -[ ] UDS: can we do something similar for "pinging" the
|
||||||
|
# file-socket?
|
||||||
|
#
|
||||||
|
global _PROC_SPAWN_WAIT
|
||||||
# UDS sockets are **really** fast to bind()/listen()/connect()
|
# UDS sockets are **really** fast to bind()/listen()/connect()
|
||||||
# so it's often required that we delay a bit more starting
|
# so it's often required that we delay a bit more starting
|
||||||
# the first actor-tree..
|
# the first actor-tree..
|
||||||
if tpt_proto == 'uds':
|
if tpt_proto == 'uds':
|
||||||
global _PROC_SPAWN_WAIT
|
_PROC_SPAWN_WAIT += 1.6
|
||||||
_PROC_SPAWN_WAIT = 0.6
|
|
||||||
|
|
||||||
|
if _non_linux and ci_env:
|
||||||
|
_PROC_SPAWN_WAIT += 1
|
||||||
|
|
||||||
|
# XXX, allow time for the sub-py-proc to boot up.
|
||||||
|
# !TODO, see ping-polling ideas above!
|
||||||
time.sleep(_PROC_SPAWN_WAIT)
|
time.sleep(_PROC_SPAWN_WAIT)
|
||||||
|
|
||||||
assert not proc.returncode
|
assert not proc.returncode
|
||||||
|
|
@ -159,18 +315,30 @@ def daemon(
|
||||||
# XXX! yeah.. just be reaaal careful with this bc sometimes it
|
# XXX! yeah.. just be reaaal careful with this bc sometimes it
|
||||||
# can lock up on the `_io.BufferedReader` and hang..
|
# can lock up on the `_io.BufferedReader` and hang..
|
||||||
stderr: str = proc.stderr.read().decode()
|
stderr: str = proc.stderr.read().decode()
|
||||||
if stderr:
|
stdout: str = proc.stdout.read().decode()
|
||||||
|
if (
|
||||||
|
stderr
|
||||||
|
or
|
||||||
|
stdout
|
||||||
|
):
|
||||||
print(
|
print(
|
||||||
f'Daemon actor tree produced STDERR:\n'
|
f'Daemon actor tree produced output:\n'
|
||||||
f'{proc.args}\n'
|
f'{proc.args}\n'
|
||||||
f'\n'
|
f'\n'
|
||||||
f'{stderr}\n'
|
f'stderr: {stderr!r}\n'
|
||||||
|
f'stdout: {stdout!r}\n'
|
||||||
)
|
)
|
||||||
if proc.returncode != -2:
|
|
||||||
raise RuntimeError(
|
if (rc := proc.returncode) != -2:
|
||||||
'Daemon actor tree failed !?\n'
|
msg: str = (
|
||||||
f'{proc.args}\n'
|
f'Daemon actor tree was not cancelled !?\n'
|
||||||
|
f'proc.args: {proc.args!r}\n'
|
||||||
|
f'proc.returncode: {rc!r}\n'
|
||||||
)
|
)
|
||||||
|
if rc < 0:
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
|
||||||
|
test_log.error(msg)
|
||||||
|
|
||||||
|
|
||||||
# @pytest.fixture(autouse=True)
|
# @pytest.fixture(autouse=True)
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@
|
||||||
|
|
||||||
'''
|
'''
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import platform
|
||||||
|
import signal
|
||||||
import time
|
import time
|
||||||
from typing import (
|
from typing import (
|
||||||
Callable,
|
Callable,
|
||||||
|
|
@ -32,9 +34,23 @@ if TYPE_CHECKING:
|
||||||
from pexpect import pty_spawn
|
from pexpect import pty_spawn
|
||||||
|
|
||||||
|
|
||||||
|
_non_linux: bool = platform.system() != 'Linux'
|
||||||
|
|
||||||
|
|
||||||
|
def pytest_configure(config):
    '''
    Register this suite's custom marker with `pytest` so that its
    use does not trigger unknown-mark warnings, see
    https://docs.pytest.org/en/stable/how-to/writing_plugins.html#registering-custom-markers

    '''
    marker_spec: str = (
        'ctlcs_bish: test will (likely) not behave under SIGINT..'
    )
    config.addinivalue_line('markers', marker_spec)
|
||||||
|
|
||||||
# a fn that sub-instantiates a `pexpect.spawn()`
|
# a fn that sub-instantiates a `pexpect.spawn()`
|
||||||
# and returns it.
|
# and returns it.
|
||||||
type PexpectSpawner = Callable[[str], pty_spawn.spawn]
|
type PexpectSpawner = Callable[
|
||||||
|
[str],
|
||||||
|
pty_spawn.spawn,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
@ -64,26 +80,65 @@ def spawn(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
import os
|
import os
|
||||||
|
# disable colored tbs
|
||||||
os.environ['PYTHON_COLORS'] = '0'
|
os.environ['PYTHON_COLORS'] = '0'
|
||||||
|
# disable all ANSI color output
|
||||||
|
# os.environ['NO_COLOR'] = '1'
|
||||||
|
|
||||||
|
spawned: PexpectSpawner|None = None
|
||||||
|
|
||||||
def _spawn(
|
def _spawn(
|
||||||
cmd: str,
|
cmd: str,
|
||||||
|
expect_timeout: float = 4,
|
||||||
**mkcmd_kwargs,
|
**mkcmd_kwargs,
|
||||||
) -> pty_spawn.spawn:
|
) -> pty_spawn.spawn:
|
||||||
|
nonlocal spawned
|
||||||
unset_colors()
|
unset_colors()
|
||||||
return testdir.spawn(
|
spawned = testdir.spawn(
|
||||||
cmd=mk_cmd(
|
cmd=mk_cmd(
|
||||||
cmd,
|
cmd,
|
||||||
**mkcmd_kwargs,
|
**mkcmd_kwargs,
|
||||||
),
|
),
|
||||||
expect_timeout=3,
|
expect_timeout=(timeout:=(
|
||||||
|
expect_timeout + 6
|
||||||
|
if _non_linux and _ci_env
|
||||||
|
else expect_timeout
|
||||||
|
)),
|
||||||
# preexec_fn=unset_colors,
|
# preexec_fn=unset_colors,
|
||||||
# ^TODO? get `pytest` core to expose underlying
|
# ^TODO? get `pytest` core to expose underlying
|
||||||
# `pexpect.spawn()` stuff?
|
# `pexpect.spawn()` stuff?
|
||||||
)
|
)
|
||||||
|
# sanity
|
||||||
|
assert spawned.timeout == timeout
|
||||||
|
return spawned
|
||||||
|
|
||||||
# such that test-dep can pass input script name.
|
# such that test-dep can pass input script name.
|
||||||
return _spawn # the `PexpectSpawner`, type alias.
|
yield _spawn # the `PexpectSpawner`, type alias.
|
||||||
|
|
||||||
|
if (
|
||||||
|
spawned
|
||||||
|
and
|
||||||
|
(ptyproc := spawned.ptyproc)
|
||||||
|
):
|
||||||
|
start: float = time.time()
|
||||||
|
timeout: float = 5
|
||||||
|
while (
|
||||||
|
ptyproc.isalive()
|
||||||
|
and
|
||||||
|
(
|
||||||
|
(_time_took := (time.time() - start))
|
||||||
|
<
|
||||||
|
timeout
|
||||||
|
)
|
||||||
|
):
|
||||||
|
ptyproc.kill(signal.SIGINT)
|
||||||
|
time.sleep(0.01)
|
||||||
|
|
||||||
|
if ptyproc.isalive():
|
||||||
|
ptyproc.kill(signal.SIGKILL)
|
||||||
|
|
||||||
|
# TODO? ensure we've cleaned up any UDS-paths?
|
||||||
|
# breakpoint()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(
|
@pytest.fixture(
|
||||||
|
|
@ -91,13 +146,12 @@ def spawn(
|
||||||
ids='ctl-c={}'.format,
|
ids='ctl-c={}'.format,
|
||||||
)
|
)
|
||||||
def ctlc(
|
def ctlc(
|
||||||
request,
|
request: pytest.FixtureRequest,
|
||||||
ci_env: bool,
|
ci_env: bool,
|
||||||
|
|
||||||
) -> bool:
|
) -> bool:
|
||||||
|
|
||||||
use_ctlc = request.param
|
use_ctlc: bool = request.param
|
||||||
|
|
||||||
node = request.node
|
node = request.node
|
||||||
markers = node.own_markers
|
markers = node.own_markers
|
||||||
for mark in markers:
|
for mark in markers:
|
||||||
|
|
@ -109,7 +163,13 @@ def ctlc(
|
||||||
'https://github.com/goodboy/tractor/issues/320'
|
'https://github.com/goodboy/tractor/issues/320'
|
||||||
)
|
)
|
||||||
|
|
||||||
if mark.name == 'ctlcs_bish':
|
if (
|
||||||
|
mark.name == 'ctlcs_bish'
|
||||||
|
and
|
||||||
|
use_ctlc
|
||||||
|
and
|
||||||
|
all(mark.args)
|
||||||
|
):
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
f'Test {node} prolly uses something from the stdlib (namely `asyncio`..)\n'
|
f'Test {node} prolly uses something from the stdlib (namely `asyncio`..)\n'
|
||||||
f'The test and/or underlying example script can *sometimes* run fine '
|
f'The test and/or underlying example script can *sometimes* run fine '
|
||||||
|
|
@ -214,12 +274,13 @@ def assert_before(
|
||||||
err_on_false=True,
|
err_on_false=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
return str(child.before.decode())
|
||||||
|
|
||||||
|
|
||||||
def do_ctlc(
|
def do_ctlc(
|
||||||
child,
|
child,
|
||||||
count: int = 3,
|
count: int = 3,
|
||||||
delay: float = 0.1,
|
delay: float|None = None,
|
||||||
patt: str|None = None,
|
patt: str|None = None,
|
||||||
|
|
||||||
# expect repl UX to reprint the prompt after every
|
# expect repl UX to reprint the prompt after every
|
||||||
|
|
@ -231,6 +292,7 @@ def do_ctlc(
|
||||||
) -> str|None:
|
) -> str|None:
|
||||||
|
|
||||||
before: str|None = None
|
before: str|None = None
|
||||||
|
delay = delay or 0.1
|
||||||
|
|
||||||
# make sure ctl-c sends don't do anything but repeat output
|
# make sure ctl-c sends don't do anything but repeat output
|
||||||
for _ in range(count):
|
for _ in range(count):
|
||||||
|
|
@ -241,7 +303,10 @@ def do_ctlc(
|
||||||
# if you run this test manually it works just fine..
|
# if you run this test manually it works just fine..
|
||||||
if expect_prompt:
|
if expect_prompt:
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
child.expect(PROMPT)
|
child.expect(
|
||||||
|
PROMPT,
|
||||||
|
timeout=(child.timeout * 2) if _ci_env else child.timeout,
|
||||||
|
)
|
||||||
before = str(child.before.decode())
|
before = str(child.before.decode())
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,9 @@ from .conftest import (
|
||||||
in_prompt_msg,
|
in_prompt_msg,
|
||||||
assert_before,
|
assert_before,
|
||||||
)
|
)
|
||||||
|
from ..conftest import (
|
||||||
|
_ci_env,
|
||||||
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from ..conftest import PexpectSpawner
|
from ..conftest import PexpectSpawner
|
||||||
|
|
@ -51,13 +54,14 @@ if TYPE_CHECKING:
|
||||||
# - recurrent root errors
|
# - recurrent root errors
|
||||||
|
|
||||||
|
|
||||||
|
_non_linux: bool = platform.system() != 'Linux'
|
||||||
|
|
||||||
if platform.system() == 'Windows':
|
if platform.system() == 'Windows':
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
'Debugger tests have no windows support (yet)',
|
'Debugger tests have no windows support (yet)',
|
||||||
allow_module_level=True,
|
allow_module_level=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# TODO: was trying to this xfail style but some weird bug i see in CI
|
# TODO: was trying to this xfail style but some weird bug i see in CI
|
||||||
# that's happening at collect time.. pretty soon gonna dump actions i'm
|
# that's happening at collect time.. pretty soon gonna dump actions i'm
|
||||||
# thinkin...
|
# thinkin...
|
||||||
|
|
@ -193,6 +197,11 @@ def test_root_actor_bp_forever(
|
||||||
child.expect(EOF)
|
child.expect(EOF)
|
||||||
|
|
||||||
|
|
||||||
|
# skip on non-Linux CI
|
||||||
|
@pytest.mark.ctlcs_bish(
|
||||||
|
_non_linux,
|
||||||
|
_ci_env,
|
||||||
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'do_next',
|
'do_next',
|
||||||
(True, False),
|
(True, False),
|
||||||
|
|
@ -258,6 +267,11 @@ def test_subactor_error(
|
||||||
child.expect(EOF)
|
child.expect(EOF)
|
||||||
|
|
||||||
|
|
||||||
|
# skip on non-Linux CI
|
||||||
|
@pytest.mark.ctlcs_bish(
|
||||||
|
_non_linux,
|
||||||
|
_ci_env,
|
||||||
|
)
|
||||||
def test_subactor_breakpoint(
|
def test_subactor_breakpoint(
|
||||||
spawn,
|
spawn,
|
||||||
ctlc: bool,
|
ctlc: bool,
|
||||||
|
|
@ -480,8 +494,24 @@ def test_multi_daemon_subactors(
|
||||||
stream.
|
stream.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
child = spawn('multi_daemon_subactors')
|
non_linux = _non_linux
|
||||||
|
if non_linux and ctlc:
|
||||||
|
pytest.skip(
|
||||||
|
'Ctl-c + MacOS is too unreliable/racy for this test..\n'
|
||||||
|
)
|
||||||
|
# !TODO, if someone with more patience then i wants to muck
|
||||||
|
# with the timings on this please feel free to see all the
|
||||||
|
# `non_linux` branching logic i added on my first attempt
|
||||||
|
# below!
|
||||||
|
#
|
||||||
|
# my conclusion was that if i were to run the script
|
||||||
|
# manually, and thus as slowly as a human would, the test
|
||||||
|
# would and should pass as described in this test fn, however
|
||||||
|
# after fighting with it for >= 1hr. i decided more then
|
||||||
|
# likely the more extensive `linux` testing should cover most
|
||||||
|
# regressions.
|
||||||
|
|
||||||
|
child = spawn('multi_daemon_subactors')
|
||||||
child.expect(PROMPT)
|
child.expect(PROMPT)
|
||||||
|
|
||||||
# there can be a race for which subactor will acquire
|
# there can be a race for which subactor will acquire
|
||||||
|
|
@ -511,8 +541,19 @@ def test_multi_daemon_subactors(
|
||||||
else:
|
else:
|
||||||
raise ValueError('Neither log msg was found !?')
|
raise ValueError('Neither log msg was found !?')
|
||||||
|
|
||||||
|
non_linux_delay: float = 0.3
|
||||||
if ctlc:
|
if ctlc:
|
||||||
do_ctlc(child)
|
do_ctlc(
|
||||||
|
child,
|
||||||
|
delay=(
|
||||||
|
non_linux_delay
|
||||||
|
if non_linux
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
if non_linux:
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
# NOTE: previously since we did not have clobber prevention
|
# NOTE: previously since we did not have clobber prevention
|
||||||
# in the root actor this final resume could result in the debugger
|
# in the root actor this final resume could result in the debugger
|
||||||
|
|
@ -543,33 +584,66 @@ def test_multi_daemon_subactors(
|
||||||
# assert "in use by child ('bp_forever'," in before
|
# assert "in use by child ('bp_forever'," in before
|
||||||
|
|
||||||
if ctlc:
|
if ctlc:
|
||||||
do_ctlc(child)
|
do_ctlc(
|
||||||
|
child,
|
||||||
|
delay=(
|
||||||
|
non_linux_delay
|
||||||
|
if non_linux
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
if non_linux:
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
# expect another breakpoint actor entry
|
# expect another breakpoint actor entry
|
||||||
child.sendline('c')
|
child.sendline('c')
|
||||||
child.expect(PROMPT)
|
child.expect(PROMPT)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
assert_before(
|
before: str = assert_before(
|
||||||
child,
|
child,
|
||||||
bp_forev_parts,
|
bp_forev_parts,
|
||||||
)
|
)
|
||||||
except AssertionError:
|
except AssertionError:
|
||||||
assert_before(
|
before: str = assert_before(
|
||||||
child,
|
child,
|
||||||
name_error_parts,
|
name_error_parts,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if ctlc:
|
if ctlc:
|
||||||
do_ctlc(child)
|
before: str = do_ctlc(
|
||||||
|
child,
|
||||||
|
delay=(
|
||||||
|
non_linux_delay
|
||||||
|
if non_linux
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
if non_linux:
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
# should crash with the 2nd name error (simulates
|
# should crash with the 2nd name error (simulates
|
||||||
# a retry) and then the root eventually (boxed) errors
|
# a retry) and then the root eventually (boxed) errors
|
||||||
# after 1 or more further bp actor entries.
|
# after 1 or more further bp actor entries.
|
||||||
|
|
||||||
child.sendline('c')
|
child.sendline('c')
|
||||||
child.expect(PROMPT)
|
try:
|
||||||
|
child.expect(
|
||||||
|
PROMPT,
|
||||||
|
timeout=3,
|
||||||
|
)
|
||||||
|
except EOF:
|
||||||
|
before: str = child.before.decode()
|
||||||
|
print(
|
||||||
|
f'\n'
|
||||||
|
f'??? NEVER RXED `pdb` PROMPT ???\n'
|
||||||
|
f'\n'
|
||||||
|
f'{before}\n'
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
name_error_parts,
|
name_error_parts,
|
||||||
|
|
@ -689,7 +763,8 @@ def test_multi_subactors_root_errors(
|
||||||
|
|
||||||
@has_nested_actors
|
@has_nested_actors
|
||||||
def test_multi_nested_subactors_error_through_nurseries(
|
def test_multi_nested_subactors_error_through_nurseries(
|
||||||
spawn,
|
ci_env: bool,
|
||||||
|
spawn: PexpectSpawner,
|
||||||
|
|
||||||
# TODO: address debugger issue for nested tree:
|
# TODO: address debugger issue for nested tree:
|
||||||
# https://github.com/goodboy/tractor/issues/320
|
# https://github.com/goodboy/tractor/issues/320
|
||||||
|
|
@ -710,9 +785,29 @@ def test_multi_nested_subactors_error_through_nurseries(
|
||||||
|
|
||||||
# timed_out_early: bool = False
|
# timed_out_early: bool = False
|
||||||
|
|
||||||
for send_char in itertools.cycle(['c', 'q']):
|
for (
|
||||||
|
i,
|
||||||
|
send_char,
|
||||||
|
) in enumerate(itertools.cycle(['c', 'q'])):
|
||||||
|
|
||||||
|
timeout: float = -1
|
||||||
|
if (
|
||||||
|
_non_linux
|
||||||
|
and
|
||||||
|
ci_env
|
||||||
|
):
|
||||||
|
timeout: float = 6
|
||||||
|
|
||||||
|
# XXX linux but the first crash sequence
|
||||||
|
# can take longer to arrive at a prompt.
|
||||||
|
elif i == 0:
|
||||||
|
timeout = 5
|
||||||
|
|
||||||
try:
|
try:
|
||||||
child.expect(PROMPT)
|
child.expect(
|
||||||
|
PROMPT,
|
||||||
|
timeout=timeout,
|
||||||
|
)
|
||||||
child.sendline(send_char)
|
child.sendline(send_char)
|
||||||
time.sleep(0.01)
|
time.sleep(0.01)
|
||||||
|
|
||||||
|
|
@ -889,6 +984,11 @@ def test_different_debug_mode_per_actor(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# skip on non-Linux CI
|
||||||
|
@pytest.mark.ctlcs_bish(
|
||||||
|
_non_linux,
|
||||||
|
_ci_env,
|
||||||
|
)
|
||||||
def test_post_mortem_api(
|
def test_post_mortem_api(
|
||||||
spawn,
|
spawn,
|
||||||
ctlc: bool,
|
ctlc: bool,
|
||||||
|
|
@ -1133,12 +1233,21 @@ def test_ctxep_pauses_n_maybe_ipc_breaks(
|
||||||
# closed so verify we see error reporting as well as
|
# closed so verify we see error reporting as well as
|
||||||
# a failed crash-REPL request msg and can CTL-c our way
|
# a failed crash-REPL request msg and can CTL-c our way
|
||||||
# out.
|
# out.
|
||||||
|
|
||||||
|
# ?TODO, match depending on `tpt_proto(s)`?
|
||||||
|
# - [ ] how can we pass it into the script tho?
|
||||||
|
tpt: str = 'UDS'
|
||||||
|
if _non_linux:
|
||||||
|
tpt: str = 'TCP'
|
||||||
|
|
||||||
assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
['peer IPC channel closed abruptly?',
|
['peer IPC channel closed abruptly?',
|
||||||
'another task closed this fd',
|
'another task closed this fd',
|
||||||
'Debug lock request was CANCELLED?',
|
'Debug lock request was CANCELLED?',
|
||||||
"TransportClosed: 'MsgpackUDSStream' was already closed locally ?",]
|
f"'Msgpack{tpt}Stream' was already closed locally?",
|
||||||
|
f"TransportClosed: 'Msgpack{tpt}Stream' was already closed 'by peer'?",
|
||||||
|
]
|
||||||
|
|
||||||
# XXX races on whether these show/hit?
|
# XXX races on whether these show/hit?
|
||||||
# 'Failed to REPl via `_pause()` You called `tractor.pause()` from an already cancelled scope!',
|
# 'Failed to REPl via `_pause()` You called `tractor.pause()` from an already cancelled scope!',
|
||||||
|
|
|
||||||
|
|
@ -63,6 +63,9 @@ def test_pause_from_sync(
|
||||||
`examples/debugging/sync_bp.py`
|
`examples/debugging/sync_bp.py`
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
# XXX required for `breakpoint()` overload and
|
||||||
|
# thus`tractor.devx.pause_from_sync()`.
|
||||||
|
pytest.importorskip('greenback')
|
||||||
child = spawn('sync_bp')
|
child = spawn('sync_bp')
|
||||||
|
|
||||||
# first `sync_pause()` after nurseries open
|
# first `sync_pause()` after nurseries open
|
||||||
|
|
@ -260,6 +263,9 @@ def test_sync_pause_from_aio_task(
|
||||||
`examples/debugging/asycio_bp.py`
|
`examples/debugging/asycio_bp.py`
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
# XXX required for `breakpoint()` overload and
|
||||||
|
# thus`tractor.devx.pause_from_sync()`.
|
||||||
|
pytest.importorskip('greenback')
|
||||||
child = spawn('asyncio_bp')
|
child = spawn('asyncio_bp')
|
||||||
|
|
||||||
# RACE on whether trio/asyncio task bps first
|
# RACE on whether trio/asyncio task bps first
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,9 @@ from .conftest import (
|
||||||
PROMPT,
|
PROMPT,
|
||||||
_pause_msg,
|
_pause_msg,
|
||||||
)
|
)
|
||||||
|
from ..conftest import (
|
||||||
|
no_macos,
|
||||||
|
)
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from pexpect.exceptions import (
|
from pexpect.exceptions import (
|
||||||
|
|
@ -42,6 +45,7 @@ if TYPE_CHECKING:
|
||||||
from ..conftest import PexpectSpawner
|
from ..conftest import PexpectSpawner
|
||||||
|
|
||||||
|
|
||||||
|
@no_macos
|
||||||
def test_shield_pause(
|
def test_shield_pause(
|
||||||
spawn: PexpectSpawner,
|
spawn: PexpectSpawner,
|
||||||
):
|
):
|
||||||
|
|
@ -57,6 +61,7 @@ def test_shield_pause(
|
||||||
expect(
|
expect(
|
||||||
child,
|
child,
|
||||||
'Yo my child hanging..?',
|
'Yo my child hanging..?',
|
||||||
|
timeout=3,
|
||||||
)
|
)
|
||||||
assert_before(
|
assert_before(
|
||||||
child,
|
child,
|
||||||
|
|
@ -121,7 +126,7 @@ def test_shield_pause(
|
||||||
child.pid,
|
child.pid,
|
||||||
signal.SIGINT,
|
signal.SIGINT,
|
||||||
)
|
)
|
||||||
from tractor._supervise import _shutdown_msg
|
from tractor.runtime._supervise import _shutdown_msg
|
||||||
expect(
|
expect(
|
||||||
child,
|
child,
|
||||||
# 'Shutting down actor runtime',
|
# 'Shutting down actor runtime',
|
||||||
|
|
@ -151,8 +156,10 @@ def test_breakpoint_hook_restored(
|
||||||
calls used.
|
calls used.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
# XXX required for `breakpoint()` overload and
|
||||||
|
# thus`tractor.devx.pause_from_sync()`.
|
||||||
|
pytest.importorskip('greenback')
|
||||||
child = spawn('restore_builtin_breakpoint')
|
child = spawn('restore_builtin_breakpoint')
|
||||||
|
|
||||||
child.expect(PROMPT)
|
child.expect(PROMPT)
|
||||||
try:
|
try:
|
||||||
assert_before(
|
assert_before(
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,376 @@
|
||||||
|
'''
|
||||||
|
Multiaddr construction, parsing, and round-trip tests for
|
||||||
|
`tractor.discovery._multiaddr.mk_maddr()` and
|
||||||
|
`tractor.discovery._multiaddr.parse_maddr()`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from multiaddr import Multiaddr
|
||||||
|
|
||||||
|
from tractor.ipc._tcp import TCPAddress
|
||||||
|
from tractor.ipc._uds import UDSAddress
|
||||||
|
from tractor.discovery._multiaddr import (
|
||||||
|
mk_maddr,
|
||||||
|
parse_maddr,
|
||||||
|
parse_endpoints,
|
||||||
|
_tpt_proto_to_maddr,
|
||||||
|
_maddr_to_tpt_proto,
|
||||||
|
)
|
||||||
|
from tractor.discovery._addr import wrap_address
|
||||||
|
|
||||||
|
|
||||||
|
def test_tpt_proto_to_maddr_mapping():
|
||||||
|
'''
|
||||||
|
`_tpt_proto_to_maddr` maps all supported `proto_key`
|
||||||
|
values to their correct multiaddr protocol names.
|
||||||
|
|
||||||
|
'''
|
||||||
|
assert _tpt_proto_to_maddr['tcp'] == 'tcp'
|
||||||
|
assert _tpt_proto_to_maddr['uds'] == 'unix'
|
||||||
|
assert len(_tpt_proto_to_maddr) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_mk_maddr_tcp_ipv4():
|
||||||
|
'''
|
||||||
|
`mk_maddr()` on a `TCPAddress` with an IPv4 host
|
||||||
|
produces the correct `/ip4/<host>/tcp/<port>` multiaddr.
|
||||||
|
|
||||||
|
'''
|
||||||
|
addr = TCPAddress('127.0.0.1', 1234)
|
||||||
|
result: Multiaddr = mk_maddr(addr)
|
||||||
|
|
||||||
|
assert isinstance(result, Multiaddr)
|
||||||
|
assert str(result) == '/ip4/127.0.0.1/tcp/1234'
|
||||||
|
|
||||||
|
protos = result.protocols()
|
||||||
|
assert protos[0].name == 'ip4'
|
||||||
|
assert protos[1].name == 'tcp'
|
||||||
|
|
||||||
|
assert result.value_for_protocol('ip4') == '127.0.0.1'
|
||||||
|
assert result.value_for_protocol('tcp') == '1234'
|
||||||
|
|
||||||
|
|
||||||
|
def test_mk_maddr_tcp_ipv6():
|
||||||
|
'''
|
||||||
|
`mk_maddr()` on a `TCPAddress` with an IPv6 host
|
||||||
|
produces the correct `/ip6/<host>/tcp/<port>` multiaddr.
|
||||||
|
|
||||||
|
'''
|
||||||
|
addr = TCPAddress('::1', 5678)
|
||||||
|
result: Multiaddr = mk_maddr(addr)
|
||||||
|
|
||||||
|
assert str(result) == '/ip6/::1/tcp/5678'
|
||||||
|
|
||||||
|
protos = result.protocols()
|
||||||
|
assert protos[0].name == 'ip6'
|
||||||
|
assert protos[1].name == 'tcp'
|
||||||
|
|
||||||
|
|
||||||
|
def test_mk_maddr_uds():
|
||||||
|
'''
|
||||||
|
`mk_maddr()` on a `UDSAddress` produces a `/unix/<path>`
|
||||||
|
multiaddr containing the full socket path.
|
||||||
|
|
||||||
|
'''
|
||||||
|
# NOTE, use an absolute `filedir` to match real runtime
|
||||||
|
# UDS paths; `mk_maddr()` strips the leading `/` to avoid
|
||||||
|
# the double-slash `/unix//run/..` that py-multiaddr
|
||||||
|
# rejects as "empty protocol path".
|
||||||
|
filedir = '/tmp/tractor_test'
|
||||||
|
filename = 'test_sock.sock'
|
||||||
|
addr = UDSAddress(
|
||||||
|
filedir=filedir,
|
||||||
|
filename=filename,
|
||||||
|
)
|
||||||
|
result: Multiaddr = mk_maddr(addr)
|
||||||
|
|
||||||
|
assert isinstance(result, Multiaddr)
|
||||||
|
|
||||||
|
result_str: str = str(result)
|
||||||
|
assert result_str.startswith('/unix/')
|
||||||
|
# verify the leading `/` was stripped to avoid double-slash
|
||||||
|
assert '/unix/tmp/tractor_test/' in result_str
|
||||||
|
|
||||||
|
sockpath_rel: str = str(
|
||||||
|
Path(filedir) / filename
|
||||||
|
).lstrip('/')
|
||||||
|
unix_val: str = result.value_for_protocol('unix')
|
||||||
|
assert unix_val.endswith(sockpath_rel)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mk_maddr_unsupported_proto_key():
|
||||||
|
'''
|
||||||
|
`mk_maddr()` raises `ValueError` for an unsupported
|
||||||
|
`proto_key`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
fake_addr = SimpleNamespace(proto_key='quic')
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError,
|
||||||
|
match='Unsupported proto_key',
|
||||||
|
):
|
||||||
|
mk_maddr(fake_addr)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'addr',
|
||||||
|
[
|
||||||
|
pytest.param(
|
||||||
|
TCPAddress('127.0.0.1', 9999),
|
||||||
|
id='tcp-ipv4',
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
UDSAddress(
|
||||||
|
filedir='/tmp/tractor_rt',
|
||||||
|
filename='roundtrip.sock',
|
||||||
|
),
|
||||||
|
id='uds',
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_mk_maddr_roundtrip(addr):
|
||||||
|
'''
|
||||||
|
`mk_maddr()` output is valid multiaddr syntax that the
|
||||||
|
library can re-parse back into an equivalent `Multiaddr`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
maddr: Multiaddr = mk_maddr(addr)
|
||||||
|
reparsed = Multiaddr(str(maddr))
|
||||||
|
|
||||||
|
assert reparsed == maddr
|
||||||
|
assert str(reparsed) == str(maddr)
|
||||||
|
|
||||||
|
|
||||||
|
# ------ parse_maddr() tests ------
|
||||||
|
|
||||||
|
def test_maddr_to_tpt_proto_mapping():
|
||||||
|
'''
|
||||||
|
`_maddr_to_tpt_proto` is the exact inverse of
|
||||||
|
`_tpt_proto_to_maddr`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
assert _maddr_to_tpt_proto == {
|
||||||
|
'tcp': 'tcp',
|
||||||
|
'unix': 'uds',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_maddr_tcp_ipv4():
|
||||||
|
'''
|
||||||
|
`parse_maddr()` on an IPv4 TCP multiaddr string
|
||||||
|
produce a `TCPAddress` with the correct host and port.
|
||||||
|
|
||||||
|
'''
|
||||||
|
result = parse_maddr('/ip4/127.0.0.1/tcp/1234')
|
||||||
|
|
||||||
|
assert isinstance(result, TCPAddress)
|
||||||
|
assert result.unwrap() == ('127.0.0.1', 1234)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_maddr_tcp_ipv6():
|
||||||
|
'''
|
||||||
|
`parse_maddr()` on an IPv6 TCP multiaddr string
|
||||||
|
produce a `TCPAddress` with the correct host and port.
|
||||||
|
|
||||||
|
'''
|
||||||
|
result = parse_maddr('/ip6/::1/tcp/5678')
|
||||||
|
|
||||||
|
assert isinstance(result, TCPAddress)
|
||||||
|
assert result.unwrap() == ('::1', 5678)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_maddr_uds():
|
||||||
|
'''
|
||||||
|
`parse_maddr()` on a `/unix/...` multiaddr string
|
||||||
|
produce a `UDSAddress` with the correct dir and filename,
|
||||||
|
preserving absolute path semantics.
|
||||||
|
|
||||||
|
'''
|
||||||
|
result = parse_maddr('/unix/tmp/tractor_test/test.sock')
|
||||||
|
|
||||||
|
assert isinstance(result, UDSAddress)
|
||||||
|
filedir, filename = result.unwrap()
|
||||||
|
assert filename == 'test.sock'
|
||||||
|
assert str(filedir) == '/tmp/tractor_test'
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_maddr_unsupported():
|
||||||
|
'''
|
||||||
|
`parse_maddr()` raise `ValueError` for an unsupported
|
||||||
|
protocol combination like UDP.
|
||||||
|
|
||||||
|
'''
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError,
|
||||||
|
match='Unsupported multiaddr protocol combo',
|
||||||
|
):
|
||||||
|
parse_maddr('/ip4/127.0.0.1/udp/1234')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'addr',
|
||||||
|
[
|
||||||
|
pytest.param(
|
||||||
|
TCPAddress('127.0.0.1', 9999),
|
||||||
|
id='tcp-ipv4',
|
||||||
|
),
|
||||||
|
pytest.param(
|
||||||
|
UDSAddress(
|
||||||
|
filedir='/tmp/tractor_rt',
|
||||||
|
filename='roundtrip.sock',
|
||||||
|
),
|
||||||
|
id='uds',
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_parse_maddr_roundtrip(addr):
|
||||||
|
'''
|
||||||
|
Full round-trip: `addr -> mk_maddr -> str -> parse_maddr`
|
||||||
|
produce an `Address` whose `.unwrap()` matches the original.
|
||||||
|
|
||||||
|
'''
|
||||||
|
maddr: Multiaddr = mk_maddr(addr)
|
||||||
|
maddr_str: str = str(maddr)
|
||||||
|
parsed = parse_maddr(maddr_str)
|
||||||
|
|
||||||
|
assert type(parsed) is type(addr)
|
||||||
|
assert parsed.unwrap() == addr.unwrap()
|
||||||
|
|
||||||
|
|
||||||
|
def test_wrap_address_maddr_str():
|
||||||
|
'''
|
||||||
|
`wrap_address()` accept a multiaddr-format string and
|
||||||
|
return the correct `Address` type.
|
||||||
|
|
||||||
|
'''
|
||||||
|
result = wrap_address('/ip4/127.0.0.1/tcp/9999')
|
||||||
|
|
||||||
|
assert isinstance(result, TCPAddress)
|
||||||
|
assert result.unwrap() == ('127.0.0.1', 9999)
|
||||||
|
|
||||||
|
|
||||||
|
# ------ parse_endpoints() tests ------
|
||||||
|
|
||||||
|
def test_parse_endpoints_tcp_only():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` with a single TCP maddr per actor
|
||||||
|
produce the correct `TCPAddress` instances.
|
||||||
|
|
||||||
|
'''
|
||||||
|
table = {
|
||||||
|
'registry': ['/ip4/127.0.0.1/tcp/1616'],
|
||||||
|
'data_feed': ['/ip4/0.0.0.0/tcp/5555'],
|
||||||
|
}
|
||||||
|
result = parse_endpoints(table)
|
||||||
|
|
||||||
|
assert set(result.keys()) == {'registry', 'data_feed'}
|
||||||
|
|
||||||
|
reg_addr = result['registry'][0]
|
||||||
|
assert isinstance(reg_addr, TCPAddress)
|
||||||
|
assert reg_addr.unwrap() == ('127.0.0.1', 1616)
|
||||||
|
|
||||||
|
feed_addr = result['data_feed'][0]
|
||||||
|
assert isinstance(feed_addr, TCPAddress)
|
||||||
|
assert feed_addr.unwrap() == ('0.0.0.0', 5555)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_endpoints_mixed_tpts():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` with both TCP and UDS maddrs for
|
||||||
|
the same actor produce the correct mixed `Address` list.
|
||||||
|
|
||||||
|
'''
|
||||||
|
table = {
|
||||||
|
'broker': [
|
||||||
|
'/ip4/127.0.0.1/tcp/4040',
|
||||||
|
'/unix/tmp/tractor/broker.sock',
|
||||||
|
],
|
||||||
|
}
|
||||||
|
result = parse_endpoints(table)
|
||||||
|
addrs = result['broker']
|
||||||
|
|
||||||
|
assert len(addrs) == 2
|
||||||
|
assert isinstance(addrs[0], TCPAddress)
|
||||||
|
assert addrs[0].unwrap() == ('127.0.0.1', 4040)
|
||||||
|
|
||||||
|
assert isinstance(addrs[1], UDSAddress)
|
||||||
|
filedir, filename = addrs[1].unwrap()
|
||||||
|
assert filename == 'broker.sock'
|
||||||
|
assert str(filedir) == '/tmp/tractor'
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_endpoints_unwrapped_tuples():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` accept raw `(host, port)` tuples
|
||||||
|
and wrap them as `TCPAddress`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
table = {
|
||||||
|
'ems': [('127.0.0.1', 6666)],
|
||||||
|
}
|
||||||
|
result = parse_endpoints(table)
|
||||||
|
|
||||||
|
addr = result['ems'][0]
|
||||||
|
assert isinstance(addr, TCPAddress)
|
||||||
|
assert addr.unwrap() == ('127.0.0.1', 6666)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_endpoints_mixed_str_and_tuple():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` accept a mix of maddr strings and
|
||||||
|
raw tuples in the same actor entry list.
|
||||||
|
|
||||||
|
'''
|
||||||
|
table = {
|
||||||
|
'quoter': [
|
||||||
|
'/ip4/127.0.0.1/tcp/7777',
|
||||||
|
('127.0.0.1', 8888),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
result = parse_endpoints(table)
|
||||||
|
addrs = result['quoter']
|
||||||
|
|
||||||
|
assert len(addrs) == 2
|
||||||
|
assert isinstance(addrs[0], TCPAddress)
|
||||||
|
assert addrs[0].unwrap() == ('127.0.0.1', 7777)
|
||||||
|
|
||||||
|
assert isinstance(addrs[1], TCPAddress)
|
||||||
|
assert addrs[1].unwrap() == ('127.0.0.1', 8888)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_endpoints_unsupported_proto():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` raise `ValueError` when a maddr
|
||||||
|
string uses an unsupported protocol like `/udp/`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
table = {
|
||||||
|
'bad_actor': ['/ip4/127.0.0.1/udp/9999'],
|
||||||
|
}
|
||||||
|
with pytest.raises(
|
||||||
|
ValueError,
|
||||||
|
match='Unsupported multiaddr protocol combo',
|
||||||
|
):
|
||||||
|
parse_endpoints(table)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_endpoints_empty_table():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` on an empty table return an empty
|
||||||
|
dict.
|
||||||
|
|
||||||
|
'''
|
||||||
|
assert parse_endpoints({}) == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_endpoints_empty_actor_list():
|
||||||
|
'''
|
||||||
|
`parse_endpoints()` with an actor mapped to an empty
|
||||||
|
list preserve the key with an empty list value.
|
||||||
|
|
||||||
|
'''
|
||||||
|
result = parse_endpoints({'x': []})
|
||||||
|
assert result == {'x': []}
|
||||||
|
|
@ -1,25 +1,33 @@
|
||||||
"""
|
'''
|
||||||
Actor "discovery" testing
|
Discovery subsystem via a "registrar" actor scenarios.
|
||||||
"""
|
|
||||||
|
'''
|
||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
import platform
|
import platform
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import itertools
|
import itertools
|
||||||
|
import time
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
import pytest
|
import pytest
|
||||||
import subprocess
|
import subprocess
|
||||||
import tractor
|
import tractor
|
||||||
|
from tractor.devx import dump_on_hang
|
||||||
from tractor.trionics import collapse_eg
|
from tractor.trionics import collapse_eg
|
||||||
from tractor._testing import tractor_test
|
from tractor._testing import tractor_test
|
||||||
|
from tractor.discovery._addr import wrap_address
|
||||||
|
from tractor.discovery._multiaddr import mk_maddr
|
||||||
import trio
|
import trio
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_reg_then_unreg(reg_addr):
|
async def test_reg_then_unreg(
|
||||||
|
reg_addr: tuple,
|
||||||
|
):
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_arbiter
|
assert actor.is_registrar
|
||||||
assert len(actor._registry) == 1 # only self is registered
|
assert len(actor._registry) == 1 # only self is registered
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
|
@ -27,10 +35,10 @@ async def test_reg_then_unreg(reg_addr):
|
||||||
) as n:
|
) as n:
|
||||||
|
|
||||||
portal = await n.start_actor('actor', enable_modules=[__name__])
|
portal = await n.start_actor('actor', enable_modules=[__name__])
|
||||||
uid = portal.channel.uid
|
uid = portal.channel.aid.uid
|
||||||
|
|
||||||
async with tractor.get_registry(reg_addr) as aportal:
|
async with tractor.get_registry(reg_addr) as aportal:
|
||||||
# this local actor should be the arbiter
|
# this local actor should be the registrar
|
||||||
assert actor is aportal.actor
|
assert actor is aportal.actor
|
||||||
|
|
||||||
async with tractor.wait_for_actor('actor'):
|
async with tractor.wait_for_actor('actor'):
|
||||||
|
|
@ -48,6 +56,49 @@ async def test_reg_then_unreg(reg_addr):
|
||||||
assert not sockaddrs
|
assert not sockaddrs
|
||||||
|
|
||||||
|
|
||||||
|
@tractor_test
|
||||||
|
async def test_reg_then_unreg_maddr(
|
||||||
|
reg_addr: tuple,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Same as `test_reg_then_unreg` but pass the registry
|
||||||
|
address as a multiaddr string to verify `wrap_address()`
|
||||||
|
multiaddr parsing end-to-end through the runtime.
|
||||||
|
|
||||||
|
'''
|
||||||
|
# tuple -> Address -> multiaddr string
|
||||||
|
addr_obj = wrap_address(reg_addr)
|
||||||
|
maddr_str: str = str(mk_maddr(addr_obj))
|
||||||
|
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
assert actor.is_registrar
|
||||||
|
|
||||||
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[maddr_str],
|
||||||
|
) as n:
|
||||||
|
|
||||||
|
portal = await n.start_actor(
|
||||||
|
'actor_maddr',
|
||||||
|
enable_modules=[__name__],
|
||||||
|
)
|
||||||
|
uid = portal.channel.aid.uid
|
||||||
|
|
||||||
|
async with tractor.get_registry(maddr_str) as aportal:
|
||||||
|
assert actor is aportal.actor
|
||||||
|
|
||||||
|
async with tractor.wait_for_actor('actor_maddr'):
|
||||||
|
assert uid in aportal.actor._registry
|
||||||
|
sockaddrs = actor._registry[uid]
|
||||||
|
assert sockaddrs
|
||||||
|
|
||||||
|
await n.cancel()
|
||||||
|
|
||||||
|
await trio.sleep(0.1)
|
||||||
|
assert uid not in aportal.actor._registry
|
||||||
|
sockaddrs = actor._registry.get(uid)
|
||||||
|
assert not sockaddrs
|
||||||
|
|
||||||
|
|
||||||
the_line = 'Hi my name is {}'
|
the_line = 'Hi my name is {}'
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -81,12 +132,20 @@ async def say_hello_use_wait(
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(
|
||||||
|
7,
|
||||||
|
method='thread',
|
||||||
|
)
|
||||||
@tractor_test
|
@tractor_test
|
||||||
@pytest.mark.parametrize('func', [say_hello, say_hello_use_wait])
|
@pytest.mark.parametrize(
|
||||||
|
'func',
|
||||||
|
[say_hello,
|
||||||
|
say_hello_use_wait]
|
||||||
|
)
|
||||||
async def test_trynamic_trio(
|
async def test_trynamic_trio(
|
||||||
func,
|
func: Callable,
|
||||||
start_method,
|
start_method: str,
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Root actor acting as the "director" and running one-shot-task-actors
|
Root actor acting as the "director" and running one-shot-task-actors
|
||||||
|
|
@ -119,7 +178,10 @@ async def stream_forever():
|
||||||
await trio.sleep(0.01)
|
await trio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
async def cancel(use_signal, delay=0):
|
async def cancel(
|
||||||
|
use_signal: bool,
|
||||||
|
delay: float = 0,
|
||||||
|
):
|
||||||
# hold on there sally
|
# hold on there sally
|
||||||
await trio.sleep(delay)
|
await trio.sleep(delay)
|
||||||
|
|
||||||
|
|
@ -132,15 +194,17 @@ async def cancel(use_signal, delay=0):
|
||||||
raise KeyboardInterrupt
|
raise KeyboardInterrupt
|
||||||
|
|
||||||
|
|
||||||
async def stream_from(portal):
|
async def stream_from(portal: tractor.Portal):
|
||||||
async with portal.open_stream_from(stream_forever) as stream:
|
async with portal.open_stream_from(stream_forever) as stream:
|
||||||
async for value in stream:
|
async for value in stream:
|
||||||
print(value)
|
print(value)
|
||||||
|
|
||||||
|
|
||||||
async def unpack_reg(actor_or_portal):
|
async def unpack_reg(
|
||||||
|
actor_or_portal: tractor.Portal|tractor.Actor,
|
||||||
|
):
|
||||||
'''
|
'''
|
||||||
Get and unpack a "registry" RPC request from the "arbiter" registry
|
Get and unpack a "registry" RPC request from the registrar
|
||||||
system.
|
system.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
@ -149,7 +213,10 @@ async def unpack_reg(actor_or_portal):
|
||||||
else:
|
else:
|
||||||
msg = await actor_or_portal.run_from_ns('self', 'get_registry')
|
msg = await actor_or_portal.run_from_ns('self', 'get_registry')
|
||||||
|
|
||||||
return {tuple(key.split('.')): val for key, val in msg.items()}
|
return {
|
||||||
|
tuple(key.split('.')): val
|
||||||
|
for key, val in msg.items()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
async def spawn_and_check_registry(
|
async def spawn_and_check_registry(
|
||||||
|
|
@ -173,24 +240,26 @@ async def spawn_and_check_registry(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
):
|
):
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(
|
||||||
|
addr=reg_addr,
|
||||||
|
) as portal:
|
||||||
# runtime needs to be up to call this
|
# runtime needs to be up to call this
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
|
|
||||||
if remote_arbiter:
|
if remote_arbiter:
|
||||||
assert not actor.is_arbiter
|
assert not actor.is_registrar
|
||||||
|
|
||||||
if actor.is_arbiter:
|
if actor.is_registrar:
|
||||||
extra = 1 # arbiter is local root actor
|
extra = 1 # registrar is local root actor
|
||||||
get_reg = partial(unpack_reg, actor)
|
get_reg = partial(unpack_reg, actor)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
get_reg = partial(unpack_reg, portal)
|
get_reg = partial(unpack_reg, portal)
|
||||||
extra = 2 # local root actor + remote arbiter
|
extra = 2 # local root actor + remote registrar
|
||||||
|
|
||||||
# ensure current actor is registered
|
# ensure current actor is registered
|
||||||
registry: dict = await get_reg()
|
registry: dict = await get_reg()
|
||||||
assert actor.uid in registry
|
assert actor.aid.uid in registry
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery() as an:
|
||||||
|
|
@ -238,22 +307,35 @@ async def spawn_and_check_registry(
|
||||||
|
|
||||||
# all subactors should have de-registered
|
# all subactors should have de-registered
|
||||||
registry = await get_reg()
|
registry = await get_reg()
|
||||||
|
start: float = time.time()
|
||||||
|
while (
|
||||||
|
not (len(registry) == extra)
|
||||||
|
and
|
||||||
|
(time.time() - start) < 5
|
||||||
|
):
|
||||||
|
print(
|
||||||
|
f'Waiting for remaining subs to dereg..\n'
|
||||||
|
f'{registry!r}\n'
|
||||||
|
)
|
||||||
|
await trio.sleep(0.3)
|
||||||
|
else:
|
||||||
assert len(registry) == extra
|
assert len(registry) == extra
|
||||||
assert actor.uid in registry
|
|
||||||
|
assert actor.aid.uid in registry
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('use_signal', [False, True])
|
@pytest.mark.parametrize('use_signal', [False, True])
|
||||||
@pytest.mark.parametrize('with_streaming', [False, True])
|
@pytest.mark.parametrize('with_streaming', [False, True])
|
||||||
def test_subactors_unregister_on_cancel(
|
def test_subactors_unregister_on_cancel(
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
start_method,
|
start_method: str,
|
||||||
use_signal,
|
use_signal: bool,
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
with_streaming,
|
with_streaming: bool,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Verify that cancelling a nursery results in all subactors
|
Verify that cancelling a nursery results in all subactors
|
||||||
deregistering themselves with the arbiter.
|
deregistering themselves with the registrar.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
|
|
@ -274,15 +356,17 @@ def test_subactors_unregister_on_cancel(
|
||||||
def test_subactors_unregister_on_cancel_remote_daemon(
|
def test_subactors_unregister_on_cancel_remote_daemon(
|
||||||
daemon: subprocess.Popen,
|
daemon: subprocess.Popen,
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
start_method,
|
start_method: str,
|
||||||
use_signal,
|
use_signal: bool,
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
with_streaming,
|
with_streaming: bool,
|
||||||
):
|
):
|
||||||
"""Verify that cancelling a nursery results in all subactors
|
'''
|
||||||
deregistering themselves with a **remote** (not in the local process
|
Verify that cancelling a nursery results in all subactors
|
||||||
tree) arbiter.
|
deregistering themselves with a **remote** (not in the local
|
||||||
"""
|
process tree) registrar.
|
||||||
|
|
||||||
|
'''
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
|
|
@ -325,20 +409,24 @@ async def close_chans_before_nursery(
|
||||||
try:
|
try:
|
||||||
get_reg = partial(unpack_reg, aportal)
|
get_reg = partial(unpack_reg, aportal)
|
||||||
|
|
||||||
async with tractor.open_nursery() as tn:
|
async with tractor.open_nursery() as an:
|
||||||
portal1 = await tn.start_actor(
|
portal1 = await an.start_actor(
|
||||||
name='consumer1', enable_modules=[__name__])
|
name='consumer1',
|
||||||
portal2 = await tn.start_actor(
|
enable_modules=[__name__],
|
||||||
'consumer2', enable_modules=[__name__])
|
)
|
||||||
|
portal2 = await an.start_actor(
|
||||||
|
'consumer2',
|
||||||
|
enable_modules=[__name__],
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: compact this back as was in last commit once
|
async with (
|
||||||
# 3.9+, see https://github.com/goodboy/tractor/issues/207
|
portal1.open_stream_from(
|
||||||
async with portal1.open_stream_from(
|
|
||||||
stream_forever
|
stream_forever
|
||||||
) as agen1:
|
) as agen1,
|
||||||
async with portal2.open_stream_from(
|
portal2.open_stream_from(
|
||||||
stream_forever
|
stream_forever
|
||||||
) as agen2:
|
) as agen2,
|
||||||
|
):
|
||||||
async with (
|
async with (
|
||||||
collapse_eg(),
|
collapse_eg(),
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
|
|
@ -349,7 +437,7 @@ async def close_chans_before_nursery(
|
||||||
await streamer(agen2)
|
await streamer(agen2)
|
||||||
finally:
|
finally:
|
||||||
# Kill the root nursery thus resulting in
|
# Kill the root nursery thus resulting in
|
||||||
# normal arbiter channel ops to fail during
|
# normal registrar channel ops to fail during
|
||||||
# teardown. It doesn't seem like this is
|
# teardown. It doesn't seem like this is
|
||||||
# reliably triggered by an external SIGINT.
|
# reliably triggered by an external SIGINT.
|
||||||
# tractor.current_actor()._root_nursery.cancel_scope.cancel()
|
# tractor.current_actor()._root_nursery.cancel_scope.cancel()
|
||||||
|
|
@ -361,27 +449,30 @@ async def close_chans_before_nursery(
|
||||||
# also kill off channels cuz why not
|
# also kill off channels cuz why not
|
||||||
await agen1.aclose()
|
await agen1.aclose()
|
||||||
await agen2.aclose()
|
await agen2.aclose()
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
with trio.CancelScope(shield=True):
|
with trio.CancelScope(shield=True):
|
||||||
await trio.sleep(1)
|
await trio.sleep(1)
|
||||||
|
|
||||||
# all subactors should have de-registered
|
# all subactors should have de-registered
|
||||||
registry = await get_reg()
|
registry = await get_reg()
|
||||||
assert portal1.channel.uid not in registry
|
assert portal1.channel.aid.uid not in registry
|
||||||
assert portal2.channel.uid not in registry
|
assert portal2.channel.aid.uid not in registry
|
||||||
assert len(registry) == entries_at_end
|
assert len(registry) == entries_at_end
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('use_signal', [False, True])
|
@pytest.mark.parametrize('use_signal', [False, True])
|
||||||
def test_close_channel_explicit(
|
def test_close_channel_explicit(
|
||||||
start_method,
|
start_method: str,
|
||||||
use_signal,
|
use_signal: bool,
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
):
|
):
|
||||||
"""Verify that closing a stream explicitly and killing the actor's
|
'''
|
||||||
|
Verify that closing a stream explicitly and killing the actor's
|
||||||
"root nursery" **before** the containing nursery tears down also
|
"root nursery" **before** the containing nursery tears down also
|
||||||
results in subactor(s) deregistering from the arbiter.
|
results in subactor(s) deregistering from the registrar.
|
||||||
"""
|
|
||||||
|
'''
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
|
|
@ -394,16 +485,18 @@ def test_close_channel_explicit(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('use_signal', [False, True])
|
@pytest.mark.parametrize('use_signal', [False, True])
|
||||||
def test_close_channel_explicit_remote_arbiter(
|
def test_close_channel_explicit_remote_registrar(
|
||||||
daemon: subprocess.Popen,
|
daemon: subprocess.Popen,
|
||||||
start_method,
|
start_method: str,
|
||||||
use_signal,
|
use_signal: bool,
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
):
|
):
|
||||||
"""Verify that closing a stream explicitly and killing the actor's
|
'''
|
||||||
|
Verify that closing a stream explicitly and killing the actor's
|
||||||
"root nursery" **before** the containing nursery tears down also
|
"root nursery" **before** the containing nursery tears down also
|
||||||
results in subactor(s) deregistering from the arbiter.
|
results in subactor(s) deregistering from the registrar.
|
||||||
"""
|
|
||||||
|
'''
|
||||||
with pytest.raises(KeyboardInterrupt):
|
with pytest.raises(KeyboardInterrupt):
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
|
|
@ -413,3 +506,138 @@ def test_close_channel_explicit_remote_arbiter(
|
||||||
remote_arbiter=True,
|
remote_arbiter=True,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@tractor.context
|
||||||
|
async def kill_transport(
|
||||||
|
ctx: tractor.Context,
|
||||||
|
) -> None:
|
||||||
|
|
||||||
|
await ctx.started()
|
||||||
|
actor: tractor.Actor = tractor.current_actor()
|
||||||
|
actor.ipc_server.cancel()
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Wall-clock bound via `pytest-timeout` (`method='thread'`).
|
||||||
|
# Under `--spawn-backend=subint` this test can wedge in an
|
||||||
|
# un-Ctrl-C-able state (abandoned-subint + shared-GIL
|
||||||
|
# starvation → signal-wakeup-fd pipe fills → SIGINT silently
|
||||||
|
# dropped; see `ai/conc-anal/subint_sigint_starvation_issue.md`).
|
||||||
|
# `method='thread'` is specifically required because `signal`-
|
||||||
|
# method SIGALRM suffers the same GIL-starvation path and
|
||||||
|
# wouldn't fire the Python-level handler.
|
||||||
|
# At timeout the plugin hard-kills the pytest process — that's
|
||||||
|
# the intended behavior here; the alternative is an unattended
|
||||||
|
# suite run that never returns.
|
||||||
|
@pytest.mark.timeout(
|
||||||
|
3, # NOTE should be a 2.1s happy path.
|
||||||
|
method='thread',
|
||||||
|
)
|
||||||
|
@pytest.mark.skipon_spawn_backend(
|
||||||
|
'subint',
|
||||||
|
reason=(
|
||||||
|
'XXX SUBINT HANGING TEST XXX\n'
|
||||||
|
'See oustanding issue(s)\n'
|
||||||
|
# TODO, put issue link!
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# @pytest.mark.parametrize('use_signal', [False, True])
|
||||||
|
#
|
||||||
|
def test_stale_entry_is_deleted(
|
||||||
|
debug_mode: bool,
|
||||||
|
daemon: subprocess.Popen,
|
||||||
|
start_method: str,
|
||||||
|
reg_addr: tuple,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Ensure that when a stale entry is detected in the registrar's
|
||||||
|
table that the `find_actor()` API takes care of deleting the
|
||||||
|
stale entry and not delivering a bad portal.
|
||||||
|
|
||||||
|
'''
|
||||||
|
async def main():
|
||||||
|
|
||||||
|
name: str = 'transport_fails_actor'
|
||||||
|
_reg_ptl: tractor.Portal
|
||||||
|
an: tractor.ActorNursery
|
||||||
|
async with (
|
||||||
|
tractor.open_nursery(
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as an,
|
||||||
|
tractor.get_registry(reg_addr) as _reg_ptl,
|
||||||
|
):
|
||||||
|
ptl: tractor.Portal = await an.start_actor(
|
||||||
|
name,
|
||||||
|
enable_modules=[__name__],
|
||||||
|
)
|
||||||
|
async with ptl.open_context(
|
||||||
|
kill_transport,
|
||||||
|
) as (first, ctx):
|
||||||
|
async with tractor.find_actor(
|
||||||
|
name,
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as maybe_portal:
|
||||||
|
# because the transitive
|
||||||
|
# `._api.maybe_open_portal()` call should
|
||||||
|
# fail and implicitly call `.delete_addr()`
|
||||||
|
assert maybe_portal is None
|
||||||
|
registry: dict = await unpack_reg(_reg_ptl)
|
||||||
|
assert ptl.chan.aid.uid not in registry
|
||||||
|
|
||||||
|
# should fail since we knocked out the IPC tpt XD
|
||||||
|
await ptl.cancel_actor()
|
||||||
|
await an.cancel()
|
||||||
|
|
||||||
|
# TODO, remove once the `[subint]` variant no longer hangs.
|
||||||
|
#
|
||||||
|
# Status (as of Phase B hard-kill landing):
|
||||||
|
#
|
||||||
|
# - `[trio]`/`[mp_*]` variants: completes normally; `dump_on_hang`
|
||||||
|
# is a no-op safety net here.
|
||||||
|
#
|
||||||
|
# - `[subint]` variant: hangs indefinitely AND is un-Ctrl-C-able.
|
||||||
|
# `strace -p <pytest_pid>` while in the hang reveals a silently-
|
||||||
|
# dropped SIGINT — the C signal handler tries to write the
|
||||||
|
# signum byte to Python's signal-wakeup fd and gets `EAGAIN`,
|
||||||
|
# meaning the pipe is full (nobody's draining it).
|
||||||
|
#
|
||||||
|
# Root-cause chain: our hard-kill in `spawn._subint` abandoned
|
||||||
|
# the driver OS-thread (which is `daemon=True`) after the soft-
|
||||||
|
# kill timeout, but the *sub-interpreter* inside that thread is
|
||||||
|
# still running `trio.run()` — `_interpreters.destroy()` can't
|
||||||
|
# force-stop a running subint (raises `InterpreterError`), and
|
||||||
|
# legacy-config subints share the main GIL. The abandoned subint
|
||||||
|
# starves the parent's trio event loop from iterating often
|
||||||
|
# enough to drain its wakeup pipe → SIGINT silently drops.
|
||||||
|
#
|
||||||
|
# This is structurally a CPython-level limitation: there's no
|
||||||
|
# public force-destroy primitive for a running subint. We
|
||||||
|
# escape on the harness side via a SIGINT-loop in the `daemon`
|
||||||
|
# fixture teardown (killing the bg registrar subproc closes its
|
||||||
|
# end of the IPC, which eventually unblocks a recv in main trio,
|
||||||
|
# which lets the loop drain the wakeup pipe). Long-term fix path:
|
||||||
|
# msgspec PEP 684 support (jcrist/msgspec#563) → isolated-mode
|
||||||
|
# subints with per-interp GIL.
|
||||||
|
#
|
||||||
|
# Full analysis:
|
||||||
|
# `ai/conc-anal/subint_sigint_starvation_issue.md`
|
||||||
|
#
|
||||||
|
# See also the *sibling* hang class documented in
|
||||||
|
# `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — same
|
||||||
|
# subint backend, different root cause (Ctrl-C-able hang, main
|
||||||
|
# trio loop iterating fine; ours to fix, not CPython's).
|
||||||
|
# Reproduced by `tests/test_subint_cancellation.py
|
||||||
|
# ::test_subint_non_checkpointing_child`.
|
||||||
|
#
|
||||||
|
# Kept here (and not behind a `pytestmark.skip`) so we can still
|
||||||
|
# inspect the dump file if the hang ever returns after a refactor.
|
||||||
|
# `pytest`'s stderr capture eats `faulthandler` output otherwise,
|
||||||
|
# so we route `dump_on_hang` to a file.
|
||||||
|
with dump_on_hang(
|
||||||
|
seconds=20,
|
||||||
|
path=f'/tmp/test_stale_entry_is_deleted_{start_method}.dump',
|
||||||
|
):
|
||||||
|
trio.run(main)
|
||||||
|
|
@ -0,0 +1,345 @@
|
||||||
|
'''
|
||||||
|
`open_root_actor(tpt_bind_addrs=...)` test suite.
|
||||||
|
|
||||||
|
Verify all three runtime code paths for explicit IPC-server
|
||||||
|
bind-address selection in `_root.py`:
|
||||||
|
|
||||||
|
1. Non-registrar, no explicit bind -> random addrs from registry proto
|
||||||
|
2. Registrar, no explicit bind -> binds to registry_addrs
|
||||||
|
3. Explicit bind given -> wraps via `wrap_address()` and uses them
|
||||||
|
|
||||||
|
'''
|
||||||
|
import pytest
|
||||||
|
import trio
|
||||||
|
import tractor
|
||||||
|
from tractor.discovery._addr import (
|
||||||
|
wrap_address,
|
||||||
|
)
|
||||||
|
from tractor.discovery._multiaddr import mk_maddr
|
||||||
|
from tractor._testing.addr import get_rando_addr
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# helpers
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
def _bound_bindspaces(
|
||||||
|
actor: tractor.Actor,
|
||||||
|
) -> set[str]:
|
||||||
|
'''
|
||||||
|
Collect the set of bindspace strings from the actor's
|
||||||
|
currently bound IPC-server accept addresses.
|
||||||
|
|
||||||
|
'''
|
||||||
|
return {
|
||||||
|
wrap_address(a).bindspace
|
||||||
|
for a in actor.accept_addrs
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _bound_wrapped(
|
||||||
|
actor: tractor.Actor,
|
||||||
|
) -> list:
|
||||||
|
'''
|
||||||
|
Return the actor's accept addrs as wrapped `Address` objects.
|
||||||
|
|
||||||
|
'''
|
||||||
|
return [
|
||||||
|
wrap_address(a)
|
||||||
|
for a in actor.accept_addrs
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 1) Registrar + explicit tpt_bind_addrs
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'addr_combo',
|
||||||
|
[
|
||||||
|
'bind-eq-reg',
|
||||||
|
'bind-subset-reg',
|
||||||
|
'bind-disjoint-reg',
|
||||||
|
],
|
||||||
|
ids=lambda v: v,
|
||||||
|
)
|
||||||
|
def test_registrar_root_tpt_bind_addrs(
|
||||||
|
reg_addr: tuple,
|
||||||
|
tpt_proto: str,
|
||||||
|
debug_mode: bool,
|
||||||
|
addr_combo: str,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Registrar root-actor with explicit `tpt_bind_addrs`:
|
||||||
|
bound set must include all registry + all bind addr bindspaces
|
||||||
|
(merge behavior).
|
||||||
|
|
||||||
|
'''
|
||||||
|
reg_wrapped = wrap_address(reg_addr)
|
||||||
|
|
||||||
|
if addr_combo == 'bind-eq-reg':
|
||||||
|
bind_addrs = [reg_addr]
|
||||||
|
# extra secondary reg addr for subset test
|
||||||
|
extra_reg = []
|
||||||
|
|
||||||
|
elif addr_combo == 'bind-subset-reg':
|
||||||
|
second_reg = get_rando_addr(tpt_proto)
|
||||||
|
bind_addrs = [reg_addr]
|
||||||
|
extra_reg = [second_reg]
|
||||||
|
|
||||||
|
elif addr_combo == 'bind-disjoint-reg':
|
||||||
|
# port=0 on same host -> completely different addr
|
||||||
|
rando = wrap_address(reg_addr).get_random(
|
||||||
|
bindspace=reg_wrapped.bindspace,
|
||||||
|
)
|
||||||
|
bind_addrs = [rando.unwrap()]
|
||||||
|
extra_reg = []
|
||||||
|
|
||||||
|
all_reg = [reg_addr] + extra_reg
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_root_actor(
|
||||||
|
registry_addrs=all_reg,
|
||||||
|
tpt_bind_addrs=bind_addrs,
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
):
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
assert actor.is_registrar
|
||||||
|
|
||||||
|
bound = actor.accept_addrs
|
||||||
|
bound_bs = _bound_bindspaces(actor)
|
||||||
|
|
||||||
|
# all registry bindspaces must appear in bound set
|
||||||
|
for ra in all_reg:
|
||||||
|
assert wrap_address(ra).bindspace in bound_bs
|
||||||
|
|
||||||
|
# all bind-addr bindspaces must appear
|
||||||
|
for ba in bind_addrs:
|
||||||
|
assert wrap_address(ba).bindspace in bound_bs
|
||||||
|
|
||||||
|
# registry addr must appear verbatim in bound
|
||||||
|
# (after wrapping both sides for comparison)
|
||||||
|
bound_w = _bound_wrapped(actor)
|
||||||
|
assert reg_wrapped in bound_w
|
||||||
|
|
||||||
|
if addr_combo == 'bind-disjoint-reg':
|
||||||
|
assert len(bound) >= 2
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'addr_combo',
|
||||||
|
[
|
||||||
|
'bind-same-bindspace',
|
||||||
|
'bind-disjoint',
|
||||||
|
],
|
||||||
|
ids=lambda v: v,
|
||||||
|
)
|
||||||
|
def test_non_registrar_root_tpt_bind_addrs(
|
||||||
|
daemon,
|
||||||
|
reg_addr: tuple,
|
||||||
|
tpt_proto: str,
|
||||||
|
debug_mode: bool,
|
||||||
|
addr_combo: str,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Non-registrar root with explicit `tpt_bind_addrs`:
|
||||||
|
bound set must exactly match the requested bind addrs
|
||||||
|
(no merge with registry).
|
||||||
|
|
||||||
|
'''
|
||||||
|
reg_wrapped = wrap_address(reg_addr)
|
||||||
|
|
||||||
|
if addr_combo == 'bind-same-bindspace':
|
||||||
|
# same bindspace as reg but port=0 so we get a random port
|
||||||
|
rando = reg_wrapped.get_random(
|
||||||
|
bindspace=reg_wrapped.bindspace,
|
||||||
|
)
|
||||||
|
bind_addrs = [rando.unwrap()]
|
||||||
|
|
||||||
|
elif addr_combo == 'bind-disjoint':
|
||||||
|
rando = reg_wrapped.get_random(
|
||||||
|
bindspace=reg_wrapped.bindspace,
|
||||||
|
)
|
||||||
|
bind_addrs = [rando.unwrap()]
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
tpt_bind_addrs=bind_addrs,
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
):
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
assert not actor.is_registrar
|
||||||
|
|
||||||
|
bound = actor.accept_addrs
|
||||||
|
assert len(bound) == len(bind_addrs)
|
||||||
|
|
||||||
|
# bindspaces must match
|
||||||
|
bound_bs = _bound_bindspaces(actor)
|
||||||
|
for ba in bind_addrs:
|
||||||
|
assert wrap_address(ba).bindspace in bound_bs
|
||||||
|
|
||||||
|
# TCP port=0 should resolve to a real port
|
||||||
|
for uw_addr in bound:
|
||||||
|
w = wrap_address(uw_addr)
|
||||||
|
if w.proto_key == 'tcp':
|
||||||
|
_host, port = uw_addr
|
||||||
|
assert port > 0
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 3) Non-registrar, default random bind (baseline)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
def test_non_registrar_default_random_bind(
|
||||||
|
daemon,
|
||||||
|
reg_addr: tuple,
|
||||||
|
debug_mode: bool,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Baseline: no `tpt_bind_addrs`, daemon running.
|
||||||
|
Bound bindspace matches registry bindspace,
|
||||||
|
but bound addr differs from reg_addr (random).
|
||||||
|
|
||||||
|
'''
|
||||||
|
reg_wrapped = wrap_address(reg_addr)
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
):
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
assert not actor.is_registrar
|
||||||
|
|
||||||
|
bound_bs = _bound_bindspaces(actor)
|
||||||
|
assert reg_wrapped.bindspace in bound_bs
|
||||||
|
|
||||||
|
# bound addr should differ from the registry addr
|
||||||
|
# (the runtime picks a random port/path)
|
||||||
|
bound_w = _bound_wrapped(actor)
|
||||||
|
assert reg_wrapped not in bound_w
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 4) Multiaddr string input
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
def test_tpt_bind_addrs_as_maddr_str(
|
||||||
|
reg_addr: tuple,
|
||||||
|
debug_mode: bool,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Pass multiaddr strings as `tpt_bind_addrs`.
|
||||||
|
Runtime should parse and bind successfully.
|
||||||
|
|
||||||
|
'''
|
||||||
|
reg_wrapped = wrap_address(reg_addr)
|
||||||
|
# build a port-0 / random maddr string for binding
|
||||||
|
rando = reg_wrapped.get_random(
|
||||||
|
bindspace=reg_wrapped.bindspace,
|
||||||
|
)
|
||||||
|
maddr_str: str = str(mk_maddr(rando))
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
tpt_bind_addrs=[maddr_str],
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
):
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
assert actor.is_registrar
|
||||||
|
|
||||||
|
for uw_addr in actor.accept_addrs:
|
||||||
|
w = wrap_address(uw_addr)
|
||||||
|
if w.proto_key == 'tcp':
|
||||||
|
_host, port = uw_addr
|
||||||
|
assert port > 0
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 5) Registrar merge produces union of binds
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
def test_registrar_merge_binds_union(
|
||||||
|
tpt_proto: str,
|
||||||
|
debug_mode: bool,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Registrar + disjoint bind addr: bound set must include
|
||||||
|
both registry and explicit bind addresses.
|
||||||
|
|
||||||
|
'''
|
||||||
|
reg_addr = get_rando_addr(tpt_proto)
|
||||||
|
reg_wrapped = wrap_address(reg_addr)
|
||||||
|
|
||||||
|
rando = reg_wrapped.get_random(
|
||||||
|
bindspace=reg_wrapped.bindspace,
|
||||||
|
)
|
||||||
|
bind_addrs = [rando.unwrap()]
|
||||||
|
|
||||||
|
# NOTE: for UDS, `get_random()` produces the same
|
||||||
|
# filename for the same pid+actor-state, so the
|
||||||
|
# "disjoint" premise only holds when the addrs
|
||||||
|
# actually differ (always true for TCP, may
|
||||||
|
# collide for UDS).
|
||||||
|
expect_disjoint: bool = (
|
||||||
|
tuple(reg_addr) != rando.unwrap()
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
tpt_bind_addrs=bind_addrs,
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
):
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
assert actor.is_registrar
|
||||||
|
|
||||||
|
bound = actor.accept_addrs
|
||||||
|
bound_w = _bound_wrapped(actor)
|
||||||
|
|
||||||
|
if expect_disjoint:
|
||||||
|
# must have at least 2 (registry + bind)
|
||||||
|
assert len(bound) >= 2
|
||||||
|
|
||||||
|
# registry addr must appear in bound set
|
||||||
|
assert reg_wrapped in bound_w
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# 6) open_nursery forwards tpt_bind_addrs
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
def test_open_nursery_forwards_tpt_bind_addrs(
|
||||||
|
reg_addr: tuple,
|
||||||
|
debug_mode: bool,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
`open_nursery(tpt_bind_addrs=...)` forwards through
|
||||||
|
`**kwargs` to `open_root_actor()`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
reg_wrapped = wrap_address(reg_addr)
|
||||||
|
rando = reg_wrapped.get_random(
|
||||||
|
bindspace=reg_wrapped.bindspace,
|
||||||
|
)
|
||||||
|
bind_addrs = [rando.unwrap()]
|
||||||
|
|
||||||
|
async def _main():
|
||||||
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
tpt_bind_addrs=bind_addrs,
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
):
|
||||||
|
actor = tractor.current_actor()
|
||||||
|
bound_bs = _bound_bindspaces(actor)
|
||||||
|
|
||||||
|
for ba in bind_addrs:
|
||||||
|
assert wrap_address(ba).bindspace in bound_bs
|
||||||
|
|
||||||
|
trio.run(_main)
|
||||||
|
|
@ -8,17 +8,16 @@ from pathlib import Path
|
||||||
import pytest
|
import pytest
|
||||||
import trio
|
import trio
|
||||||
import tractor
|
import tractor
|
||||||
from tractor import (
|
from tractor import Actor
|
||||||
Actor,
|
from tractor.runtime import _state
|
||||||
_state,
|
from tractor.discovery import _addr
|
||||||
_addr,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def bindspace_dir_str() -> str:
|
def bindspace_dir_str() -> str:
|
||||||
|
|
||||||
rt_dir: Path = tractor._state.get_rt_dir()
|
from tractor.runtime._state import get_rt_dir
|
||||||
|
rt_dir: Path = get_rt_dir()
|
||||||
bs_dir: Path = rt_dir / 'doggy'
|
bs_dir: Path = rt_dir / 'doggy'
|
||||||
bs_dir_str: str = str(bs_dir)
|
bs_dir_str: str = str(bs_dir)
|
||||||
assert not bs_dir.is_dir()
|
assert not bs_dir.is_dir()
|
||||||
|
|
|
||||||
|
|
@ -13,9 +13,9 @@ from tractor import (
|
||||||
Portal,
|
Portal,
|
||||||
ipc,
|
ipc,
|
||||||
msg,
|
msg,
|
||||||
_state,
|
|
||||||
_addr,
|
|
||||||
)
|
)
|
||||||
|
from tractor.runtime import _state
|
||||||
|
from tractor.discovery import _addr
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def chk_tpts(
|
async def chk_tpts(
|
||||||
|
|
@ -62,6 +62,13 @@ def test_root_passes_tpt_to_sub(
|
||||||
reg_addr: tuple,
|
reg_addr: tuple,
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
):
|
):
|
||||||
|
# XXX NOTE, the `reg_addr` addr won't be the same type as what the
|
||||||
|
# `tpt_proto_key` would deliver here unless you pass `--tpt-proto
|
||||||
|
# <tpt_proto_key>` on the CLI.
|
||||||
|
#
|
||||||
|
# if tpt_proto_key == 'uds':
|
||||||
|
# breakpoint()
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
enable_transports=[tpt_proto_key],
|
enable_transports=[tpt_proto_key],
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
'''
|
||||||
|
`tractor.msg.*` sub-sys test suite.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
'''
|
||||||
|
`tractor.msg.*` test sub-pkg conf.
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
@ -61,7 +61,7 @@ async def maybe_expect_raises(
|
||||||
Async wrapper for ensuring errors propagate from the inner scope.
|
Async wrapper for ensuring errors propagate from the inner scope.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
if tractor._state.debug_mode():
|
if tractor.debug_mode():
|
||||||
timeout += 999
|
timeout += 999
|
||||||
|
|
||||||
with trio.fail_after(timeout):
|
with trio.fail_after(timeout):
|
||||||
|
|
@ -0,0 +1,240 @@
|
||||||
|
'''
|
||||||
|
Unit tests for `tractor.msg.pretty_struct`
|
||||||
|
private-field filtering in `pformat()`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tractor.msg.pretty_struct import (
|
||||||
|
Struct,
|
||||||
|
pformat,
|
||||||
|
iter_struct_ppfmt_lines,
|
||||||
|
)
|
||||||
|
from tractor.msg._codec import (
|
||||||
|
MsgDec,
|
||||||
|
mk_dec,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ------ test struct definitions ------ #
|
||||||
|
|
||||||
|
class PublicOnly(Struct):
|
||||||
|
'''
|
||||||
|
All-public fields for baseline testing.
|
||||||
|
|
||||||
|
'''
|
||||||
|
name: str = 'alice'
|
||||||
|
age: int = 30
|
||||||
|
|
||||||
|
|
||||||
|
class PrivateOnly(Struct):
|
||||||
|
'''
|
||||||
|
Only underscore-prefixed (private) fields.
|
||||||
|
|
||||||
|
'''
|
||||||
|
_secret: str = 'hidden'
|
||||||
|
_internal: int = 99
|
||||||
|
|
||||||
|
|
||||||
|
class MixedFields(Struct):
|
||||||
|
'''
|
||||||
|
Mix of public and private fields.
|
||||||
|
|
||||||
|
'''
|
||||||
|
name: str = 'bob'
|
||||||
|
_hidden: int = 42
|
||||||
|
value: float = 3.14
|
||||||
|
_meta: str = 'internal'
|
||||||
|
|
||||||
|
|
||||||
|
class Inner(
|
||||||
|
Struct,
|
||||||
|
frozen=True,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Frozen inner struct with a private field,
|
||||||
|
for nesting tests.
|
||||||
|
|
||||||
|
'''
|
||||||
|
x: int = 1
|
||||||
|
_secret: str = 'nope'
|
||||||
|
|
||||||
|
|
||||||
|
class Outer(Struct):
|
||||||
|
'''
|
||||||
|
Outer struct nesting an `Inner`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
label: str = 'outer'
|
||||||
|
inner: Inner = Inner()
|
||||||
|
|
||||||
|
|
||||||
|
class EmptyStruct(Struct):
|
||||||
|
'''
|
||||||
|
Struct with zero fields.
|
||||||
|
|
||||||
|
'''
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# ------ tests ------ #
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'struct_and_expected',
|
||||||
|
[
|
||||||
|
(
|
||||||
|
PublicOnly(),
|
||||||
|
{
|
||||||
|
'shown': ['name', 'age'],
|
||||||
|
'hidden': [],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
MixedFields(),
|
||||||
|
{
|
||||||
|
'shown': ['name', 'value'],
|
||||||
|
'hidden': ['_hidden', '_meta'],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
PrivateOnly(),
|
||||||
|
{
|
||||||
|
'shown': [],
|
||||||
|
'hidden': ['_secret', '_internal'],
|
||||||
|
},
|
||||||
|
),
|
||||||
|
],
|
||||||
|
ids=[
|
||||||
|
'all-public',
|
||||||
|
'mixed-pub-priv',
|
||||||
|
'all-private',
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_field_visibility_in_pformat(
|
||||||
|
struct_and_expected: tuple[
|
||||||
|
Struct,
|
||||||
|
dict[str, list[str]],
|
||||||
|
],
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Verify `pformat()` shows public fields
|
||||||
|
and hides `_`-prefixed private fields.
|
||||||
|
|
||||||
|
'''
|
||||||
|
(
|
||||||
|
struct,
|
||||||
|
expected,
|
||||||
|
) = struct_and_expected
|
||||||
|
output: str = pformat(struct)
|
||||||
|
|
||||||
|
for field_name in expected['shown']:
|
||||||
|
assert field_name in output, (
|
||||||
|
f'{field_name!r} should appear in:\n'
|
||||||
|
f'{output}'
|
||||||
|
)
|
||||||
|
|
||||||
|
for field_name in expected['hidden']:
|
||||||
|
assert field_name not in output, (
|
||||||
|
f'{field_name!r} should NOT appear in:\n'
|
||||||
|
f'{output}'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_iter_ppfmt_lines_skips_private():
|
||||||
|
'''
|
||||||
|
Directly verify `iter_struct_ppfmt_lines()`
|
||||||
|
never yields tuples with `_`-prefixed field
|
||||||
|
names.
|
||||||
|
|
||||||
|
'''
|
||||||
|
struct = MixedFields()
|
||||||
|
lines: list[tuple[str, str]] = list(
|
||||||
|
iter_struct_ppfmt_lines(
|
||||||
|
struct,
|
||||||
|
field_indent=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# should have lines for public fields only
|
||||||
|
assert len(lines) == 2
|
||||||
|
|
||||||
|
for _prefix, line_content in lines:
|
||||||
|
field_name: str = (
|
||||||
|
line_content.split(':')[0].strip()
|
||||||
|
)
|
||||||
|
assert not field_name.startswith('_'), (
|
||||||
|
f'private field leaked: {field_name!r}'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nested_struct_filters_inner_private():
|
||||||
|
'''
|
||||||
|
Verify that nested struct's private fields
|
||||||
|
are also filtered out during recursion.
|
||||||
|
|
||||||
|
'''
|
||||||
|
outer = Outer()
|
||||||
|
output: str = pformat(outer)
|
||||||
|
|
||||||
|
# outer's public field
|
||||||
|
assert 'label' in output
|
||||||
|
|
||||||
|
# inner's public field (recursed into)
|
||||||
|
assert 'x' in output
|
||||||
|
|
||||||
|
# inner's private field must be hidden
|
||||||
|
assert '_secret' not in output
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_struct_pformat():
|
||||||
|
'''
|
||||||
|
An empty struct should produce a valid
|
||||||
|
`pformat()` result with no field lines.
|
||||||
|
|
||||||
|
'''
|
||||||
|
output: str = pformat(EmptyStruct())
|
||||||
|
assert 'EmptyStruct(' in output
|
||||||
|
assert output.rstrip().endswith(')')
|
||||||
|
|
||||||
|
# no field lines => only struct header+footer
|
||||||
|
lines: list[tuple[str, str]] = list(
|
||||||
|
iter_struct_ppfmt_lines(
|
||||||
|
EmptyStruct(),
|
||||||
|
field_indent=2,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert lines == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_real_msgdec_pformat_hides_private():
|
||||||
|
'''
|
||||||
|
Verify `pformat()` on a real `MsgDec`
|
||||||
|
hides the `_dec` internal field.
|
||||||
|
|
||||||
|
NOTE: `MsgDec.__repr__` is custom and does
|
||||||
|
NOT call `pformat()`, so we call it directly.
|
||||||
|
|
||||||
|
'''
|
||||||
|
dec: MsgDec = mk_dec(spec=int)
|
||||||
|
output: str = pformat(dec)
|
||||||
|
|
||||||
|
# the private `_dec` field should be filtered
|
||||||
|
assert '_dec' not in output
|
||||||
|
|
||||||
|
# but the struct type name should be present
|
||||||
|
assert 'MsgDec(' in output
|
||||||
|
|
||||||
|
|
||||||
|
def test_pformat_repr_integration():
|
||||||
|
'''
|
||||||
|
Verify that `Struct.__repr__()` (which calls
|
||||||
|
`pformat()`) also hides private fields for
|
||||||
|
custom structs that do NOT override `__repr__`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
mixed = MixedFields()
|
||||||
|
output: str = repr(mixed)
|
||||||
|
|
||||||
|
assert 'name' in output
|
||||||
|
assert 'value' in output
|
||||||
|
assert '_hidden' not in output
|
||||||
|
assert '_meta' not in output
|
||||||
|
|
@ -0,0 +1,245 @@
|
||||||
|
'''
|
||||||
|
Cancellation + hard-kill semantics audit for the `subint` spawn
|
||||||
|
backend.
|
||||||
|
|
||||||
|
Exercises the escape-hatch machinery added to
|
||||||
|
`tractor.spawn._subint` (module-level `_HARD_KILL_TIMEOUT`,
|
||||||
|
bounded shields around the soft-kill / thread-join sites, daemon
|
||||||
|
driver-thread abandonment) so that future stdlib regressions or
|
||||||
|
our own refactors don't silently re-introduce the hangs first
|
||||||
|
diagnosed during the Phase B.2/B.3 bringup (issue #379).
|
||||||
|
|
||||||
|
Every test in this module:
|
||||||
|
- is wrapped in `trio.fail_after()` for a deterministic per-test
|
||||||
|
wall-clock ceiling (the whole point of these tests is to fail
|
||||||
|
fast when our escape hatches regress; an unbounded test would
|
||||||
|
defeat itself),
|
||||||
|
- arms `tractor.devx.dump_on_hang()` to capture a stack dump on
|
||||||
|
failure — without it, a hang here is opaque because pytest's
|
||||||
|
stderr capture swallows `faulthandler` output by default
|
||||||
|
(hard-won lesson from the original diagnosis),
|
||||||
|
- skips on py<3.14 (no `concurrent.interpreters`) and on any
|
||||||
|
`--spawn-backend` other than `'subint'` (these tests are
|
||||||
|
subint-specific by design — they'd be nonsense under `trio` or
|
||||||
|
`mp_*`).
|
||||||
|
|
||||||
|
'''
|
||||||
|
from __future__ import annotations
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import trio
|
||||||
|
import tractor
|
||||||
|
from tractor.devx import dump_on_hang
|
||||||
|
|
||||||
|
|
||||||
|
# Gate: the `subint` backend requires py3.14+. Check the
|
||||||
|
# public stdlib wrapper's presence (added in 3.14) rather than
|
||||||
|
# the private `_interpreters` module (which exists on 3.13 but
|
||||||
|
# wedges under tractor's usage — see `tractor.spawn._subint`).
|
||||||
|
pytest.importorskip('concurrent.interpreters')
|
||||||
|
|
||||||
|
# Subint-only: read the spawn method that `pytest_configure`
|
||||||
|
# committed via `try_set_start_method()`. By the time this module
|
||||||
|
# imports, the CLI backend choice has been applied.
|
||||||
|
from tractor.spawn._spawn import _spawn_method # noqa: E402
|
||||||
|
|
||||||
|
if _spawn_method != 'subint':
|
||||||
|
pytestmark = pytest.mark.skip(
|
||||||
|
reason=(
|
||||||
|
"subint-specific cancellation audit — "
|
||||||
|
"pass `--spawn-backend=subint` to run."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# child-side task bodies (run inside the spawned subint)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _trivial_rpc() -> str:
|
||||||
|
'''
|
||||||
|
Minimal RPC body for the baseline happy-teardown test.
|
||||||
|
'''
|
||||||
|
return 'hello from subint'
|
||||||
|
|
||||||
|
|
||||||
|
async def _spin_without_trio_checkpoints() -> None:
|
||||||
|
'''
|
||||||
|
Block the main task with NO trio-visible checkpoints so any
|
||||||
|
`Portal.cancel_actor()` arriving over IPC has nothing to hand
|
||||||
|
off to.
|
||||||
|
|
||||||
|
`threading.Event.wait(timeout)` releases the GIL (so other
|
||||||
|
threads — including trio's IO/RPC tasks — can progress) but
|
||||||
|
does NOT insert a trio checkpoint, so the subactor's main
|
||||||
|
task never notices cancellation.
|
||||||
|
|
||||||
|
This is the exact "stuck subint" scenario the hard-kill
|
||||||
|
shields exist to survive.
|
||||||
|
'''
|
||||||
|
import threading
|
||||||
|
never_set = threading.Event()
|
||||||
|
while not never_set.is_set():
|
||||||
|
# 1s re-check granularity; low enough not to waste CPU,
|
||||||
|
# high enough that even a pathologically slow
|
||||||
|
# `_HARD_KILL_TIMEOUT` won't accidentally align with a
|
||||||
|
# wake.
|
||||||
|
never_set.wait(timeout=1.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# parent-side harnesses (driven inside `trio.run(...)`)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _happy_path(
|
||||||
|
reg_addr: tuple[str, int|str],
|
||||||
|
deadline: float,
|
||||||
|
) -> None:
|
||||||
|
with trio.fail_after(deadline):
|
||||||
|
async with (
|
||||||
|
tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
),
|
||||||
|
tractor.open_nursery() as an,
|
||||||
|
):
|
||||||
|
portal: tractor.Portal = await an.run_in_actor(
|
||||||
|
_trivial_rpc,
|
||||||
|
name='subint-happy',
|
||||||
|
)
|
||||||
|
result: str = await portal.wait_for_result()
|
||||||
|
assert result == 'hello from subint'
|
||||||
|
|
||||||
|
|
||||||
|
async def _spawn_stuck_then_cancel(
|
||||||
|
reg_addr: tuple[str, int|str],
|
||||||
|
deadline: float,
|
||||||
|
) -> None:
|
||||||
|
with trio.fail_after(deadline):
|
||||||
|
async with (
|
||||||
|
tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
),
|
||||||
|
tractor.open_nursery() as an,
|
||||||
|
):
|
||||||
|
await an.run_in_actor(
|
||||||
|
_spin_without_trio_checkpoints,
|
||||||
|
name='subint-stuck',
|
||||||
|
)
|
||||||
|
# Give the child time to reach its non-checkpointing
|
||||||
|
# loop before we cancel; the precise value doesn't
|
||||||
|
# matter as long as it's a handful of trio schedule
|
||||||
|
# ticks.
|
||||||
|
await trio.sleep(0.5)
|
||||||
|
an.cancel_scope.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# tests
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_subint_happy_teardown(
|
||||||
|
reg_addr: tuple[str, int|str],
|
||||||
|
) -> None:
|
||||||
|
'''
|
||||||
|
Baseline: spawn a subactor, do one portal RPC, close nursery
|
||||||
|
cleanly. No cancel, no faults.
|
||||||
|
|
||||||
|
If this regresses we know something's wrong at the
|
||||||
|
spawn/teardown layer unrelated to the hard-kill escape
|
||||||
|
hatches.
|
||||||
|
|
||||||
|
'''
|
||||||
|
deadline: float = 10.0
|
||||||
|
with dump_on_hang(
|
||||||
|
seconds=deadline,
|
||||||
|
path='/tmp/subint_cancellation_happy.dump',
|
||||||
|
):
|
||||||
|
trio.run(partial(_happy_path, reg_addr, deadline))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipon_spawn_backend(
|
||||||
|
'subint',
|
||||||
|
reason=(
|
||||||
|
'XXX SUBINT HANGING TEST XXX\n'
|
||||||
|
'See oustanding issue(s)\n'
|
||||||
|
# TODO, put issue link!
|
||||||
|
)
|
||||||
|
)
|
||||||
|
# Wall-clock bound via `pytest-timeout` (`method='thread'`)
|
||||||
|
# as defense-in-depth over the inner `trio.fail_after(15)`.
|
||||||
|
# Under the orphaned-channel hang class described in
|
||||||
|
# `ai/conc-anal/subint_cancel_delivery_hang_issue.md`, SIGINT
|
||||||
|
# is still deliverable and this test *should* be unwedgeable
|
||||||
|
# by the inner trio timeout — but sibling subint-backend
|
||||||
|
# tests in this repo have also exhibited the
|
||||||
|
# `subint_sigint_starvation_issue.md` GIL-starvation flavor,
|
||||||
|
# so `method='thread'` keeps us safe in case ordering or
|
||||||
|
# load shifts the failure mode.
|
||||||
|
@pytest.mark.timeout(
|
||||||
|
3, # NOTE never passes pre-3.14+ subints support.
|
||||||
|
method='thread',
|
||||||
|
)
|
||||||
|
def test_subint_non_checkpointing_child(
|
||||||
|
reg_addr: tuple[str, int|str],
|
||||||
|
) -> None:
|
||||||
|
'''
|
||||||
|
Cancel a subactor whose main task is stuck in a non-
|
||||||
|
checkpointing Python loop.
|
||||||
|
|
||||||
|
`Portal.cancel_actor()` may be delivered over IPC but the
|
||||||
|
main task never checkpoints to observe the Cancelled —
|
||||||
|
so the subint's `trio.run()` can't exit gracefully.
|
||||||
|
|
||||||
|
The parent `subint_proc` bounded-shield + daemon-driver-
|
||||||
|
thread combo should abandon the thread after
|
||||||
|
`_HARD_KILL_TIMEOUT` and let the parent return cleanly.
|
||||||
|
|
||||||
|
Wall-clock budget:
|
||||||
|
- ~0.5s: settle time for child to enter the stuck loop
|
||||||
|
- ~3s: `_HARD_KILL_TIMEOUT` (soft-kill wait)
|
||||||
|
- ~3s: `_HARD_KILL_TIMEOUT` (thread-join wait)
|
||||||
|
- margin
|
||||||
|
|
||||||
|
KNOWN ISSUE (Ctrl-C-able hang):
|
||||||
|
-------------------------------
|
||||||
|
This test currently hangs past the hard-kill timeout for
|
||||||
|
reasons unrelated to the subint teardown itself — after
|
||||||
|
the subint is destroyed, a parent-side trio task appears
|
||||||
|
to park on an orphaned IPC channel (no clean EOF
|
||||||
|
delivered to a waiting receive). Unlike the
|
||||||
|
SIGINT-starvation sibling case in
|
||||||
|
`test_stale_entry_is_deleted`, this hang IS Ctrl-C-able
|
||||||
|
(`strace` shows SIGINT wakeup-fd `write() = 1`, not
|
||||||
|
`EAGAIN`) — i.e. the main trio loop is still iterating
|
||||||
|
normally. That makes this *our* bug to fix, not a
|
||||||
|
CPython-level limitation.
|
||||||
|
|
||||||
|
See `ai/conc-anal/subint_cancel_delivery_hang_issue.md`
|
||||||
|
for the full analysis + candidate fix directions
|
||||||
|
(explicit parent-side channel abort in `subint_proc`
|
||||||
|
teardown being the most likely surgical fix).
|
||||||
|
|
||||||
|
The sibling `ai/conc-anal/subint_sigint_starvation_issue.md`
|
||||||
|
documents the *other* hang class (abandoned-legacy-subint
|
||||||
|
thread + shared-GIL starvation → signal-wakeup-fd pipe
|
||||||
|
fills → SIGINT silently dropped) — that one is
|
||||||
|
structurally blocked on msgspec PEP 684 adoption and is
|
||||||
|
NOT what this test is hitting.
|
||||||
|
|
||||||
|
'''
|
||||||
|
deadline: float = 15.0
|
||||||
|
with dump_on_hang(
|
||||||
|
seconds=deadline,
|
||||||
|
path='/tmp/subint_cancellation_stuck.dump',
|
||||||
|
):
|
||||||
|
trio.run(
|
||||||
|
partial(
|
||||||
|
_spawn_stuck_then_cancel,
|
||||||
|
reg_addr,
|
||||||
|
deadline,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,603 @@
|
||||||
|
'''
|
||||||
|
Integration exercises for the `tractor.spawn._subint_forkserver`
|
||||||
|
submodule at three tiers:
|
||||||
|
|
||||||
|
1+2. the low-level primitives
|
||||||
|
(`fork_from_worker_thread()` +
|
||||||
|
`run_subint_in_worker_thread()`) driven from inside a real
|
||||||
|
`trio.run()` in the parent process,
|
||||||
|
|
||||||
|
3. the full `subint_forkserver_proc` spawn backend wired
|
||||||
|
through tractor's normal actor-nursery + portal-RPC
|
||||||
|
machinery — i.e. `open_root_actor` + `open_nursery` +
|
||||||
|
`run_in_actor` against a subactor spawned via fork from a
|
||||||
|
main-interp worker thread.
|
||||||
|
|
||||||
|
Background
|
||||||
|
----------
|
||||||
|
`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`
|
||||||
|
establishes that `os.fork()` from a non-main sub-interpreter
|
||||||
|
aborts the child at the CPython level. The sibling
|
||||||
|
`subint_fork_from_main_thread_smoketest.py` proves the escape
|
||||||
|
hatch: fork from a main-interp *worker thread* (one that has
|
||||||
|
never entered a subint) works, and the forked child can then
|
||||||
|
host its own `trio.run()` inside a fresh subint.
|
||||||
|
|
||||||
|
Those smoke-test scenarios are standalone — no trio runtime
|
||||||
|
in the *parent*. Tiers (1)+(2) here cover the primitives
|
||||||
|
driven from inside `trio.run()` in the parent, and tier (3)
|
||||||
|
(the `*_spawn_basic` test) drives the registered
|
||||||
|
`subint_forkserver` spawn backend end-to-end against the
|
||||||
|
tractor runtime.
|
||||||
|
|
||||||
|
Gating
|
||||||
|
------
|
||||||
|
- py3.14+ (via `concurrent.interpreters` presence)
|
||||||
|
- no `--spawn-backend` restriction — the backend-level test
|
||||||
|
flips `tractor.spawn._spawn._spawn_method` programmatically
|
||||||
|
(via `try_set_start_method('subint_forkserver')`) and
|
||||||
|
restores it on teardown, so these tests are independent of
|
||||||
|
the session-level CLI backend choice.
|
||||||
|
|
||||||
|
'''
|
||||||
|
from __future__ import annotations
|
||||||
|
from functools import partial
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
import platform
|
||||||
|
import select
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import trio
|
||||||
|
|
||||||
|
import tractor
|
||||||
|
from tractor.devx import dump_on_hang
|
||||||
|
|
||||||
|
|
||||||
|
# Gate: subint forkserver primitives require py3.14+. Check
|
||||||
|
# the public stdlib wrapper's presence (added in 3.14) rather
|
||||||
|
# than `_interpreters` directly — see
|
||||||
|
# `tractor.spawn._subint` for why.
|
||||||
|
pytest.importorskip('concurrent.interpreters')
|
||||||
|
|
||||||
|
from tractor.spawn._subint_forkserver import ( # noqa: E402
|
||||||
|
fork_from_worker_thread,
|
||||||
|
run_subint_in_worker_thread,
|
||||||
|
wait_child,
|
||||||
|
)
|
||||||
|
from tractor.spawn import _spawn as _spawn_mod # noqa: E402
|
||||||
|
from tractor.spawn._spawn import try_set_start_method # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# child-side callables (passed via `child_target=` across fork)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
_CHILD_TRIO_BOOTSTRAP: str = (
|
||||||
|
'import trio\n'
|
||||||
|
'async def _main():\n'
|
||||||
|
' await trio.sleep(0.05)\n'
|
||||||
|
' return 42\n'
|
||||||
|
'result = trio.run(_main)\n'
|
||||||
|
'assert result == 42, f"trio.run returned {result}"\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _child_trio_in_subint() -> int:
|
||||||
|
'''
|
||||||
|
`child_target` for the trio-in-child scenario: drive a
|
||||||
|
trivial `trio.run()` inside a fresh legacy-config subint
|
||||||
|
on a worker thread.
|
||||||
|
|
||||||
|
Returns an exit code suitable for `os._exit()`:
|
||||||
|
- 0: subint-hosted `trio.run()` succeeded
|
||||||
|
- 3: driver thread hang (timeout inside `run_subint_in_worker_thread`)
|
||||||
|
- 4: subint bootstrap raised some other exception
|
||||||
|
|
||||||
|
'''
|
||||||
|
try:
|
||||||
|
run_subint_in_worker_thread(
|
||||||
|
_CHILD_TRIO_BOOTSTRAP,
|
||||||
|
thread_name='child-subint-trio-thread',
|
||||||
|
)
|
||||||
|
except RuntimeError:
|
||||||
|
# timeout / thread-never-returned
|
||||||
|
return 3
|
||||||
|
except BaseException:
|
||||||
|
return 4
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# parent-side harnesses (run inside `trio.run()`)
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def run_fork_in_non_trio_thread(
|
||||||
|
deadline: float,
|
||||||
|
*,
|
||||||
|
child_target=None,
|
||||||
|
) -> int:
|
||||||
|
'''
|
||||||
|
From inside a parent `trio.run()`, off-load the
|
||||||
|
forkserver primitive to a main-interp worker thread via
|
||||||
|
`trio.to_thread.run_sync()` and return the forked child's
|
||||||
|
pid.
|
||||||
|
|
||||||
|
Then `wait_child()` on that pid (also off-loaded so we
|
||||||
|
don't block trio's event loop on `waitpid()`) and assert
|
||||||
|
the child exited cleanly.
|
||||||
|
|
||||||
|
'''
|
||||||
|
with trio.fail_after(deadline):
|
||||||
|
# NOTE: `fork_from_worker_thread` internally spawns its
|
||||||
|
# own dedicated `threading.Thread` (not from trio's
|
||||||
|
# cache) and joins it before returning — so we can
|
||||||
|
# safely off-load via `to_thread.run_sync` without
|
||||||
|
# worrying about the trio-thread-cache recycling the
|
||||||
|
# runner. Pass `abandon_on_cancel=False` for the
|
||||||
|
# same "bounded + clean" rationale we use in
|
||||||
|
# `_subint.subint_proc`.
|
||||||
|
pid: int = await trio.to_thread.run_sync(
|
||||||
|
partial(
|
||||||
|
fork_from_worker_thread,
|
||||||
|
child_target,
|
||||||
|
thread_name='test-subint-forkserver',
|
||||||
|
),
|
||||||
|
abandon_on_cancel=False,
|
||||||
|
)
|
||||||
|
assert pid > 0
|
||||||
|
|
||||||
|
ok, status_str = await trio.to_thread.run_sync(
|
||||||
|
partial(
|
||||||
|
wait_child,
|
||||||
|
pid,
|
||||||
|
expect_exit_ok=True,
|
||||||
|
),
|
||||||
|
abandon_on_cancel=False,
|
||||||
|
)
|
||||||
|
assert ok, (
|
||||||
|
f'forked child did not exit cleanly: '
|
||||||
|
f'{status_str}'
|
||||||
|
)
|
||||||
|
return pid
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# tests
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Bounded wall-clock via `pytest-timeout` (`method='thread'`)
|
||||||
|
# for the usual GIL-hostage safety reason documented in the
|
||||||
|
# sibling `test_subint_cancellation.py` / the class-A
|
||||||
|
# `subint_sigint_starvation_issue.md`. Each test also has an
|
||||||
|
# inner `trio.fail_after()` so assertion failures fire fast
|
||||||
|
# under normal conditions.
|
||||||
|
@pytest.mark.timeout(30, method='thread')
|
||||||
|
def test_fork_from_worker_thread_via_trio(
|
||||||
|
) -> None:
|
||||||
|
'''
|
||||||
|
Baseline: inside `trio.run()`, call
|
||||||
|
`fork_from_worker_thread()` via `trio.to_thread.run_sync()`,
|
||||||
|
get a child pid back, reap the child cleanly.
|
||||||
|
|
||||||
|
No trio-in-child. If this regresses we know the parent-
|
||||||
|
side trio↔worker-thread plumbing is broken independent
|
||||||
|
of any child-side subint machinery.
|
||||||
|
|
||||||
|
'''
|
||||||
|
deadline: float = 10.0
|
||||||
|
with dump_on_hang(
|
||||||
|
seconds=deadline,
|
||||||
|
path='/tmp/subint_forkserver_baseline.dump',
|
||||||
|
):
|
||||||
|
pid: int = trio.run(
|
||||||
|
partial(run_fork_in_non_trio_thread, deadline),
|
||||||
|
)
|
||||||
|
# parent-side sanity — we got a real pid back.
|
||||||
|
assert isinstance(pid, int) and pid > 0
|
||||||
|
# by now the child has been waited on; it shouldn't be
|
||||||
|
# reap-able again.
|
||||||
|
with pytest.raises((ChildProcessError, OSError)):
|
||||||
|
os.waitpid(pid, os.WNOHANG)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(30, method='thread')
|
||||||
|
def test_fork_and_run_trio_in_child() -> None:
|
||||||
|
'''
|
||||||
|
End-to-end: inside the parent's `trio.run()`, off-load
|
||||||
|
`fork_from_worker_thread()` to a worker thread, have the
|
||||||
|
forked child then create a fresh subint and run
|
||||||
|
`trio.run()` inside it on yet another worker thread.
|
||||||
|
|
||||||
|
This is the full "forkserver + trio-in-subint-in-child"
|
||||||
|
pattern the proposed `subint_forkserver` spawn backend
|
||||||
|
would rest on.
|
||||||
|
|
||||||
|
'''
|
||||||
|
deadline: float = 15.0
|
||||||
|
with dump_on_hang(
|
||||||
|
seconds=deadline,
|
||||||
|
path='/tmp/subint_forkserver_trio_in_child.dump',
|
||||||
|
):
|
||||||
|
pid: int = trio.run(
|
||||||
|
partial(
|
||||||
|
run_fork_in_non_trio_thread,
|
||||||
|
deadline,
|
||||||
|
child_target=_child_trio_in_subint,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
assert isinstance(pid, int) and pid > 0
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# tier-3 backend test: drive the registered `subint_forkserver`
|
||||||
|
# spawn backend end-to-end through tractor's actor-nursery +
|
||||||
|
# portal-RPC machinery.
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _trivial_rpc() -> str:
|
||||||
|
'''
|
||||||
|
Minimal subactor-side RPC body: just return a sentinel
|
||||||
|
string the parent can assert on.
|
||||||
|
|
||||||
|
'''
|
||||||
|
return 'hello from subint-forkserver child'
|
||||||
|
|
||||||
|
|
||||||
|
async def _happy_path_forkserver(
|
||||||
|
reg_addr: tuple[str, int | str],
|
||||||
|
deadline: float,
|
||||||
|
) -> None:
|
||||||
|
'''
|
||||||
|
Parent-side harness: stand up a root actor, open an actor
|
||||||
|
nursery, spawn one subactor via the currently-selected
|
||||||
|
spawn backend (which this test will have flipped to
|
||||||
|
`subint_forkserver`), run a trivial RPC through its
|
||||||
|
portal, assert the round-trip result.
|
||||||
|
|
||||||
|
'''
|
||||||
|
with trio.fail_after(deadline):
|
||||||
|
async with (
|
||||||
|
tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
),
|
||||||
|
tractor.open_nursery() as an,
|
||||||
|
):
|
||||||
|
portal: tractor.Portal = await an.run_in_actor(
|
||||||
|
_trivial_rpc,
|
||||||
|
name='subint-forkserver-child',
|
||||||
|
)
|
||||||
|
result: str = await portal.wait_for_result()
|
||||||
|
assert result == 'hello from subint-forkserver child'
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def forkserver_spawn_method():
|
||||||
|
'''
|
||||||
|
Flip `tractor.spawn._spawn._spawn_method` to
|
||||||
|
`'subint_forkserver'` for the duration of a test, then
|
||||||
|
restore whatever was in place before (usually the
|
||||||
|
session-level CLI choice, typically `'trio'`).
|
||||||
|
|
||||||
|
Without this, other tests in the same session would
|
||||||
|
observe the global flip and start spawning via fork —
|
||||||
|
which is almost certainly NOT what their assertions were
|
||||||
|
written against.
|
||||||
|
|
||||||
|
'''
|
||||||
|
prev_method: str = _spawn_mod._spawn_method
|
||||||
|
prev_ctx = _spawn_mod._ctx
|
||||||
|
try_set_start_method('subint_forkserver')
|
||||||
|
try:
|
||||||
|
yield
|
||||||
|
finally:
|
||||||
|
_spawn_mod._spawn_method = prev_method
|
||||||
|
_spawn_mod._ctx = prev_ctx
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.timeout(60, method='thread')
|
||||||
|
def test_subint_forkserver_spawn_basic(
|
||||||
|
reg_addr: tuple[str, int | str],
|
||||||
|
forkserver_spawn_method,
|
||||||
|
) -> None:
|
||||||
|
'''
|
||||||
|
Happy-path: spawn ONE subactor via the
|
||||||
|
`subint_forkserver` backend (parent-side fork from a
|
||||||
|
main-interp worker thread), do a trivial portal-RPC
|
||||||
|
round-trip, tear the nursery down cleanly.
|
||||||
|
|
||||||
|
If this passes, the "forkserver + tractor runtime" arch
|
||||||
|
is proven end-to-end: the registered
|
||||||
|
`subint_forkserver_proc` spawn target successfully
|
||||||
|
forks a child, the child runs `_actor_child_main()` +
|
||||||
|
completes IPC handshake + serves an RPC, and the parent
|
||||||
|
reaps via `_ForkedProc.wait()` without regressing any of
|
||||||
|
the normal nursery teardown invariants.
|
||||||
|
|
||||||
|
'''
|
||||||
|
deadline: float = 20.0
|
||||||
|
with dump_on_hang(
|
||||||
|
seconds=deadline,
|
||||||
|
path='/tmp/subint_forkserver_spawn_basic.dump',
|
||||||
|
):
|
||||||
|
trio.run(
|
||||||
|
partial(
|
||||||
|
_happy_path_forkserver,
|
||||||
|
reg_addr,
|
||||||
|
deadline,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# tier-4 DRAFT: orphaned-subactor SIGINT survivability
|
||||||
|
#
|
||||||
|
# Motivating question: with `subint_forkserver`, the child's
|
||||||
|
# `trio.run()` lives on the fork-inherited worker thread which
|
||||||
|
# is NOT `threading.main_thread()` — so trio cannot install its
|
||||||
|
# `signal.set_wakeup_fd`-based SIGINT handler. If the parent
|
||||||
|
# goes away via `SIGKILL` (no IPC `Portal.cancel_actor()`
|
||||||
|
# possible), does SIGINT on the orphan child cleanly tear it
|
||||||
|
# down via CPython's default `KeyboardInterrupt` delivery, or
|
||||||
|
# does it hang?
|
||||||
|
#
|
||||||
|
# Working hypothesis (unverified pre-this-test): post-fork the
|
||||||
|
# child is effectively single-threaded (only the fork-worker
|
||||||
|
# tstate survived), so SIGINT → default handler → raises
|
||||||
|
# `KeyboardInterrupt` on the only thread — which happens to be
|
||||||
|
# the one driving trio's event loop — so trio observes it at
|
||||||
|
# the next checkpoint. If so, we're "fine" on this backend
|
||||||
|
# despite the missing trio SIGINT handler.
|
||||||
|
#
|
||||||
|
# Cross-backend generalization (decide after this passes):
|
||||||
|
# - applicable to any backend whose subactors are separate OS
|
||||||
|
# processes: `trio`, `mp_spawn`, `mp_forkserver`,
|
||||||
|
# `subint_forkserver`.
|
||||||
|
# - NOT applicable to plain `subint` (subactors are in-process
|
||||||
|
# subinterpreters, no orphan child process to SIGINT).
|
||||||
|
# - move path: lift the harness script into
|
||||||
|
# `tests/_orphan_harness.py`, parametrize on the session's
|
||||||
|
# `_spawn_method`, add `skipif _spawn_method == 'subint'`.
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# Source text of the throwaway harness program; the orphan test
# writes it to `tmp_path / '_orphan_harness.py'` and runs it with
# `sys.executable` as a subprocess.
#
# argv contract: `<backend> <host> <port>`
# - `backend` is handed to `try_set_start_method()`,
# - `(host, port)` is used as the root actor's registry addr.
#
# stdout markers consumed by the test (one per line):
# - `CHILD_PID=<pid>`: printed from inside the subactor task,
# - `PARENT_READY=<pid>`: printed by the root actor once
#   `run_in_actor()` has returned.
#
# NOTE(review): the two markers are printed by different
# processes, so presumably their relative order on the shared
# pipe is not guaranteed — confirm against `run_in_actor()`.
_ORPHAN_HARNESS_SCRIPT: str = '''
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import trio
|
||||||
|
import tractor
|
||||||
|
from tractor.spawn._spawn import try_set_start_method
|
||||||
|
|
||||||
|
async def _sleep_forever() -> None:
|
||||||
|
print(f"CHILD_PID={os.getpid()}", flush=True)
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
async def _main(reg_addr):
|
||||||
|
async with (
|
||||||
|
tractor.open_root_actor(registry_addrs=[reg_addr]),
|
||||||
|
tractor.open_nursery() as an,
|
||||||
|
):
|
||||||
|
portal = await an.run_in_actor(
|
||||||
|
_sleep_forever,
|
||||||
|
name="orphan-test-child",
|
||||||
|
)
|
||||||
|
print(f"PARENT_READY={os.getpid()}", flush=True)
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
backend = sys.argv[1]
|
||||||
|
host = sys.argv[2]
|
||||||
|
port = int(sys.argv[3])
|
||||||
|
try_set_start_method(backend)
|
||||||
|
trio.run(_main, (host, port))
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def _read_marker(
    proc: subprocess.Popen,
    marker: str,
    timeout: float,
    _buf: dict,
) -> str:
    '''
    Block until `<marker>=<value>\\n` appears on `proc.stdout`
    and return `<value>` (decoded and whitespace-stripped).

    `_buf` is a per-proc scratch dict carried across calls:

    - `_buf['remainder']`: bytes of a trailing partial line,
    - `_buf['skipped']`: complete lines already read which did
      not match the marker being requested at that time.

    Skipped lines are re-scanned first on every call, so the
    result does not depend on whether markers are *written* in
    the same order in which they are *requested*.

    Raises `TimeoutError` if the marker is not observed within
    `timeout` seconds or if the stream hits EOF first.

    '''
    prefix: bytes = f'{marker}='.encode()
    skipped: list[bytes] = _buf.setdefault('skipped', [])

    # a previous call (asking for a different marker) may have
    # already consumed the line we are looking for.
    for idx, line in enumerate(skipped):
        if line.startswith(prefix):
            del skipped[idx]
            return line[len(prefix):].decode().strip()

    deadline: float = time.monotonic() + timeout
    remainder: bytes = _buf.get('remainder', b'')
    hit_eof: bool = False
    while True:
        # drain any complete lines already buffered; done
        # BEFORE the deadline check so a chunk read just ahead
        # of expiry is still inspected.
        while b'\n' in remainder:
            line, remainder = remainder.split(b'\n', 1)
            if line.startswith(prefix):
                _buf['remainder'] = remainder
                return line[len(prefix):].decode().strip()
            # keep it around for a later call instead of
            # silently dropping it.
            skipped.append(line)

        remaining: float = deadline - time.monotonic()
        if remaining <= 0:
            break

        ready, _, _ = select.select(
            [proc.stdout],
            [],
            [],
            min(0.2, remaining),
        )
        if not ready:
            continue

        # read the raw fd directly: `select()` only reports on
        # the fd, not on any python-level read buffer.
        chunk: bytes = os.read(proc.stdout.fileno(), 4096)
        if not chunk:
            hit_eof = True
            break

        remainder += chunk

    _buf['remainder'] = remainder
    if hit_eof:
        raise TimeoutError(
            f'Never observed marker {marker!r} on harness stdout: '
            f'stream hit EOF before the {timeout}s timeout'
        )
    raise TimeoutError(
        f'Never observed marker {marker!r} on harness stdout '
        f'within {timeout}s'
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _process_alive(pid: int) -> bool:
    '''
    Liveness probe for a pid we do NOT parent (post-orphan).

    Sends the null signal via `os.kill(pid, 0)`, which performs
    the existence/permission checks without delivering anything.
    Returns `False` only when the pid is no longer in the
    process table; a not-yet-reaped zombie still counts as
    alive.

    '''
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM: the pid exists but is owned by another user,
        # so it is alive — we just aren't allowed to signal it.
        return True
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# Flakey under session-level env pollution (leftover
|
||||||
|
# subactor PIDs from earlier tests competing for ports /
|
||||||
|
# inheriting the harness subprocess's FDs). Passes
|
||||||
|
# cleanly in isolation, fails in suite; `strict=False`
|
||||||
|
# so either outcome is tolerated until the env isolation
|
||||||
|
# is improved. Tracker:
|
||||||
|
# `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.
|
||||||
|
@pytest.mark.xfail(
    strict=False,
    reason=(
        'Env-pollution sensitive. Passes in isolation, '
        'flakey in full-suite runs; orphan subactor may '
        'take longer than the poll timeout to exit when '
        'competing for resources with leftover state from '
        'earlier tests.'
    ),
)
@pytest.mark.timeout(
    30,
    method='thread',
)
def test_orphaned_subactor_sigint_cleanup_DRAFT(
    reg_addr: tuple[str, int | str],
    tmp_path: Path,
) -> None:
    '''
    DRAFT — orphaned-subactor SIGINT survivability under the
    `subint_forkserver` backend.

    Sequence:
    1. Spawn a harness subprocess that brings up a root
       actor + one `sleep_forever` subactor via
       `subint_forkserver`.
    2. Read the harness's stdout for `PARENT_READY=<pid>`
       and `CHILD_PID=<pid>` markers (confirms the
       parent→child IPC handshake completed).
    3. `SIGKILL` the parent (no IPC cancel possible — the
       whole point of this test).
    4. `SIGINT` the orphan child.
    5. Poll `os.kill(child_pid, 0)` for up to `timeout`
       seconds (see the local of that name) — assert the
       child exits.

    Empirical result (2026-04, py3.14): currently **FAILS** —
    SIGINT on the orphan child doesn't unwind the trio loop,
    despite trio's `KIManager` handler being correctly
    installed in the subactor (the post-fork thread IS
    `threading.main_thread()` on py3.14). `faulthandler` dump
    shows the subactor wedged in `trio/_core/_io_epoll.py::
    get_events` — the signal's supposed wakeup of the event
    loop isn't firing. Full analysis + diagnostic evidence
    in `ai/conc-anal/
    subint_forkserver_orphan_sigint_hang_issue.md`.

    The runtime's *intentional* "KBI-as-OS-cancel" path at
    `tractor/spawn/_entry.py::_trio_main:164` is therefore
    unreachable under this backend+config. Closing the gap is
    aligned with existing design intent (make the already-
    designed behavior actually fire), not a new feature.
    Marked `xfail(strict=False)`: the outcome is
    env-sensitive, so both XFAIL and XPASS are tolerated for
    now; drop the mark once the gap is closed and the result
    is stable in full-suite runs.

    '''
    if platform.system() != 'Linux':
        pytest.skip(
            'orphan-reparenting semantics only exercised on Linux'
        )

    script_path = tmp_path / '_orphan_harness.py'
    script_path.write_text(_ORPHAN_HARNESS_SCRIPT)

    # Offset the port so we don't race the session reg_addr with
    # any concurrently-running backend test's listener.
    host: str = reg_addr[0]
    port: int = int(reg_addr[1]) + 17

    proc: subprocess.Popen = subprocess.Popen(
        [
            sys.executable,
            str(script_path),
            'subint_forkserver',
            host,
            str(port),
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    parent_pid: int | None = None
    child_pid: int | None = None
    buf: dict = {}
    try:
        child_pid = int(_read_marker(proc, 'CHILD_PID', 15.0, buf))
        parent_pid = int(_read_marker(proc, 'PARENT_READY', 15.0, buf))

        # sanity: both alive before we start killing stuff
        assert _process_alive(parent_pid), (
            f'harness parent pid={parent_pid} gone before '
            f'SIGKILL — test premise broken'
        )
        assert _process_alive(child_pid), (
            f'orphan-candidate child pid={child_pid} gone '
            f'before test started'
        )

        # step 3: kill parent — no IPC cancel arrives at child.
        # `proc.wait()` reaps the zombie so it truly disappears
        # from the process table (otherwise `os.kill(pid, 0)`
        # keeps reporting it as alive).
        os.kill(parent_pid, signal.SIGKILL)
        try:
            proc.wait(timeout=3.0)
        except subprocess.TimeoutExpired:
            pytest.fail(
                f'harness parent pid={parent_pid} did not die '
                f'after SIGKILL — test premise broken'
            )
        assert _process_alive(child_pid), (
            f'child pid={child_pid} died along with parent — '
            f'did the parent reap it before SIGKILL took? '
            f'test premise requires an orphan.'
        )

        # step 4+5: SIGINT the orphan, poll for exit.
        os.kill(child_pid, signal.SIGINT)
        timeout: float = 6.0
        cleanup_deadline: float = time.monotonic() + timeout
        while time.monotonic() < cleanup_deadline:
            if not _process_alive(child_pid):
                return  # <- success path
            time.sleep(0.1)

        # report the window actually polled (not a hard-coded
        # figure) and point at the tracker rather than assert a
        # root cause in the failure text.
        pytest.fail(
            f'Orphan subactor (pid={child_pid}) did NOT exit '
            f'within {timeout}s of SIGINT under `subint_forkserver`; '
            f'see `ai/conc-anal/'
            f'subint_forkserver_orphan_sigint_hang_issue.md`.'
        )
    finally:
        # best-effort cleanup to avoid leaking orphans across
        # the test session regardless of outcome.
        for pid in (parent_pid, child_pid):
            if pid is None:
                continue
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        try:
            proc.kill()
        except OSError:
            pass
        try:
            proc.wait(timeout=2.0)
        except subprocess.TimeoutExpired:
            pass
        # release our read end of the harness pipe; otherwise the
        # fd stays open until `proc` is garbage collected.
        if proc.stdout is not None:
            proc.stdout.close()
|
||||||
|
|
@ -1,7 +1,12 @@
|
||||||
"""
|
'''
|
||||||
Bidirectional streaming.
|
Audit the simplest inter-actor bidirectional (streaming)
|
||||||
|
msg patterns.
|
||||||
|
|
||||||
"""
|
'''
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import (
|
||||||
|
Callable,
|
||||||
|
)
|
||||||
import pytest
|
import pytest
|
||||||
import trio
|
import trio
|
||||||
import tractor
|
import tractor
|
||||||
|
|
@ -9,10 +14,8 @@ import tractor
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def simple_rpc(
|
async def simple_rpc(
|
||||||
|
|
||||||
ctx: tractor.Context,
|
ctx: tractor.Context,
|
||||||
data: int,
|
data: int,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
Test a small ping-pong server.
|
Test a small ping-pong server.
|
||||||
|
|
@ -39,15 +42,13 @@ async def simple_rpc(
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def simple_rpc_with_forloop(
|
async def simple_rpc_with_forloop(
|
||||||
|
|
||||||
ctx: tractor.Context,
|
ctx: tractor.Context,
|
||||||
data: int,
|
data: int,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Same as previous test but using ``async for`` syntax/api.
|
'''
|
||||||
|
Same as previous test but using `async for` syntax/api.
|
||||||
"""
|
|
||||||
|
|
||||||
|
'''
|
||||||
# signal to parent that we're up
|
# signal to parent that we're up
|
||||||
await ctx.started(data + 1)
|
await ctx.started(data + 1)
|
||||||
|
|
||||||
|
|
@ -68,21 +69,37 @@ async def simple_rpc_with_forloop(
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'use_async_for',
|
'use_async_for',
|
||||||
[True, False],
|
[
|
||||||
|
True,
|
||||||
|
False,
|
||||||
|
],
|
||||||
|
ids='use_async_for={}'.format,
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'server_func',
|
'server_func',
|
||||||
[simple_rpc, simple_rpc_with_forloop],
|
[
|
||||||
|
simple_rpc,
|
||||||
|
simple_rpc_with_forloop,
|
||||||
|
],
|
||||||
|
ids='server_func={}'.format,
|
||||||
)
|
)
|
||||||
def test_simple_rpc(server_func, use_async_for):
|
def test_simple_rpc(
|
||||||
|
server_func: Callable,
|
||||||
|
use_async_for: bool,
|
||||||
|
loglevel: str,
|
||||||
|
debug_mode: bool,
|
||||||
|
):
|
||||||
'''
|
'''
|
||||||
The simplest request response pattern.
|
The simplest request response pattern.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery() as n:
|
with trio.fail_after(6):
|
||||||
|
async with tractor.open_nursery(
|
||||||
portal = await n.start_actor(
|
loglevel=loglevel,
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
) as an:
|
||||||
|
portal: tractor.Portal = await an.start_actor(
|
||||||
'rpc_server',
|
'rpc_server',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -98,7 +98,8 @@ def test_ipc_channel_break_during_stream(
|
||||||
expect_final_exc = TransportClosed
|
expect_final_exc = TransportClosed
|
||||||
|
|
||||||
mod: ModuleType = import_path(
|
mod: ModuleType = import_path(
|
||||||
examples_dir() / 'advanced_faults'
|
examples_dir()
|
||||||
|
/ 'advanced_faults'
|
||||||
/ 'ipc_failure_during_stream.py',
|
/ 'ipc_failure_during_stream.py',
|
||||||
root=examples_dir(),
|
root=examples_dir(),
|
||||||
consider_namespace_packages=False,
|
consider_namespace_packages=False,
|
||||||
|
|
@ -113,8 +114,9 @@ def test_ipc_channel_break_during_stream(
|
||||||
if (
|
if (
|
||||||
# only expect EoC if trans is broken on the child side,
|
# only expect EoC if trans is broken on the child side,
|
||||||
ipc_break['break_child_ipc_after'] is not False
|
ipc_break['break_child_ipc_after'] is not False
|
||||||
|
and
|
||||||
# AND we tell the child to call `MsgStream.aclose()`.
|
# AND we tell the child to call `MsgStream.aclose()`.
|
||||||
and pre_aclose_msgstream
|
pre_aclose_msgstream
|
||||||
):
|
):
|
||||||
# expect_final_exc = trio.EndOfChannel
|
# expect_final_exc = trio.EndOfChannel
|
||||||
# ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this
|
# ^XXX NOPE! XXX^ since now `.open_stream()` absorbs this
|
||||||
|
|
@ -144,9 +146,6 @@ def test_ipc_channel_break_during_stream(
|
||||||
# a user sending ctl-c by raising a KBI.
|
# a user sending ctl-c by raising a KBI.
|
||||||
if pre_aclose_msgstream:
|
if pre_aclose_msgstream:
|
||||||
expect_final_exc = KeyboardInterrupt
|
expect_final_exc = KeyboardInterrupt
|
||||||
if tpt_proto == 'uds':
|
|
||||||
expect_final_exc = TransportClosed
|
|
||||||
expect_final_cause = trio.BrokenResourceError
|
|
||||||
|
|
||||||
# XXX OLD XXX
|
# XXX OLD XXX
|
||||||
# if child calls `MsgStream.aclose()` then expect EoC.
|
# if child calls `MsgStream.aclose()` then expect EoC.
|
||||||
|
|
@ -160,16 +159,13 @@ def test_ipc_channel_break_during_stream(
|
||||||
ipc_break['break_child_ipc_after'] is not False
|
ipc_break['break_child_ipc_after'] is not False
|
||||||
and (
|
and (
|
||||||
ipc_break['break_parent_ipc_after']
|
ipc_break['break_parent_ipc_after']
|
||||||
> ipc_break['break_child_ipc_after']
|
>
|
||||||
|
ipc_break['break_child_ipc_after']
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
if pre_aclose_msgstream:
|
if pre_aclose_msgstream:
|
||||||
expect_final_exc = KeyboardInterrupt
|
expect_final_exc = KeyboardInterrupt
|
||||||
|
|
||||||
if tpt_proto == 'uds':
|
|
||||||
expect_final_exc = TransportClosed
|
|
||||||
expect_final_cause = trio.BrokenResourceError
|
|
||||||
|
|
||||||
# NOTE when the parent IPC side dies (even if the child does as well
|
# NOTE when the parent IPC side dies (even if the child does as well
|
||||||
# but the child fails BEFORE the parent) we always expect the
|
# but the child fails BEFORE the parent) we always expect the
|
||||||
# IPC layer to raise a closed-resource, NEVER do we expect
|
# IPC layer to raise a closed-resource, NEVER do we expect
|
||||||
|
|
@ -248,8 +244,15 @@ def test_ipc_channel_break_during_stream(
|
||||||
# get raw instance from pytest wrapper
|
# get raw instance from pytest wrapper
|
||||||
value = excinfo.value
|
value = excinfo.value
|
||||||
if isinstance(value, ExceptionGroup):
|
if isinstance(value, ExceptionGroup):
|
||||||
excs = value.exceptions
|
excs: tuple[Exception] = value.exceptions
|
||||||
assert len(excs) == 1
|
assert (
|
||||||
|
len(excs) <= 2
|
||||||
|
and
|
||||||
|
all(
|
||||||
|
isinstance(exc, TransportClosed)
|
||||||
|
for exc in excs
|
||||||
|
)
|
||||||
|
)
|
||||||
final_exc = excs[0]
|
final_exc = excs[0]
|
||||||
assert isinstance(final_exc, expect_final_exc)
|
assert isinstance(final_exc, expect_final_exc)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,8 +17,18 @@ from tractor._testing import (
|
||||||
from .conftest import no_windows
|
from .conftest import no_windows
|
||||||
|
|
||||||
|
|
||||||
def is_win():
|
_non_linux: bool = platform.system() != 'Linux'
|
||||||
return platform.system() == 'Windows'
|
_friggin_windows: bool = platform.system() == 'Windows'
|
||||||
|
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.skipon_spawn_backend(
|
||||||
|
'subint',
|
||||||
|
reason=(
|
||||||
|
'XXX SUBINT HANGING TEST XXX\n'
|
||||||
|
'See oustanding issue(s)\n'
|
||||||
|
# TODO, put issue link!
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def assert_err(delay=0):
|
async def assert_err(delay=0):
|
||||||
|
|
@ -110,8 +120,17 @@ def test_remote_error(reg_addr, args_err):
|
||||||
assert exc.boxed_type == errtype
|
assert exc.boxed_type == errtype
|
||||||
|
|
||||||
|
|
||||||
|
# @pytest.mark.skipon_spawn_backend(
|
||||||
|
# 'subint',
|
||||||
|
# reason=(
|
||||||
|
# 'XXX SUBINT HANGING TEST XXX\n'
|
||||||
|
# 'See oustanding issue(s)\n'
|
||||||
|
# # TODO, put issue link!
|
||||||
|
# )
|
||||||
|
# )
|
||||||
def test_multierror(
|
def test_multierror(
|
||||||
reg_addr: tuple[str, int],
|
reg_addr: tuple[str, int],
|
||||||
|
start_method: str,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
||||||
|
|
@ -141,15 +160,28 @@ def test_multierror(
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('delay', (0, 0.5))
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'num_subactors', range(25, 26),
|
'delay',
|
||||||
|
(0, 0.5),
|
||||||
|
ids='delays={}'.format,
|
||||||
)
|
)
|
||||||
def test_multierror_fast_nursery(reg_addr, start_method, num_subactors, delay):
|
@pytest.mark.parametrize(
|
||||||
"""Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
'num_subactors',
|
||||||
|
range(25, 26),
|
||||||
|
ids= 'num_subs={}'.format,
|
||||||
|
)
|
||||||
|
def test_multierror_fast_nursery(
|
||||||
|
reg_addr: tuple,
|
||||||
|
start_method: str,
|
||||||
|
num_subactors: int,
|
||||||
|
delay: float,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Verify we raise a ``BaseExceptionGroup`` out of a nursery where
|
||||||
more then one actor errors and also with a delay before failure
|
more then one actor errors and also with a delay before failure
|
||||||
to test failure during an ongoing spawning.
|
to test failure during an ongoing spawning.
|
||||||
"""
|
|
||||||
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
|
|
@ -189,8 +221,15 @@ async def do_nothing():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('mechanism', ['nursery_cancel', KeyboardInterrupt])
|
@pytest.mark.parametrize(
|
||||||
def test_cancel_single_subactor(reg_addr, mechanism):
|
'mechanism', [
|
||||||
|
'nursery_cancel',
|
||||||
|
KeyboardInterrupt,
|
||||||
|
])
|
||||||
|
def test_cancel_single_subactor(
|
||||||
|
reg_addr: tuple,
|
||||||
|
mechanism: str|KeyboardInterrupt,
|
||||||
|
):
|
||||||
'''
|
'''
|
||||||
Ensure a ``ActorNursery.start_actor()`` spawned subactor
|
Ensure a ``ActorNursery.start_actor()`` spawned subactor
|
||||||
cancels when the nursery is cancelled.
|
cancels when the nursery is cancelled.
|
||||||
|
|
@ -232,9 +271,13 @@ async def stream_forever():
|
||||||
await trio.sleep(0.01)
|
await trio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test(
|
||||||
async def test_cancel_infinite_streamer(start_method):
|
timeout=6,
|
||||||
|
)
|
||||||
|
async def test_cancel_infinite_streamer(
|
||||||
|
reg_addr: tuple,
|
||||||
|
start_method: str,
|
||||||
|
):
|
||||||
# stream for at most 1 seconds
|
# stream for at most 1 seconds
|
||||||
with (
|
with (
|
||||||
trio.fail_after(4),
|
trio.fail_after(4),
|
||||||
|
|
@ -257,6 +300,14 @@ async def test_cancel_infinite_streamer(start_method):
|
||||||
assert n.cancelled
|
assert n.cancelled
|
||||||
|
|
||||||
|
|
||||||
|
# @pytest.mark.skipon_spawn_backend(
|
||||||
|
# 'subint',
|
||||||
|
# reason=(
|
||||||
|
# 'XXX SUBINT HANGING TEST XXX\n'
|
||||||
|
# 'See oustanding issue(s)\n'
|
||||||
|
# # TODO, put issue link!
|
||||||
|
# )
|
||||||
|
# )
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'num_actors_and_errs',
|
'num_actors_and_errs',
|
||||||
[
|
[
|
||||||
|
|
@ -286,9 +337,12 @@ async def test_cancel_infinite_streamer(start_method):
|
||||||
'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail',
|
'no_daemon_actors_fail_all_run_in_actors_sleep_then_fail',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@tractor_test
|
@tractor_test(
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
async def test_some_cancels_all(
|
async def test_some_cancels_all(
|
||||||
num_actors_and_errs: tuple,
|
num_actors_and_errs: tuple,
|
||||||
|
reg_addr: tuple,
|
||||||
start_method: str,
|
start_method: str,
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
):
|
):
|
||||||
|
|
@ -370,7 +424,10 @@ async def test_some_cancels_all(
|
||||||
pytest.fail("Should have gotten a remote assertion error?")
|
pytest.fail("Should have gotten a remote assertion error?")
|
||||||
|
|
||||||
|
|
||||||
async def spawn_and_error(breadth, depth) -> None:
|
async def spawn_and_error(
|
||||||
|
breadth: int,
|
||||||
|
depth: int,
|
||||||
|
) -> None:
|
||||||
name = tractor.current_actor().name
|
name = tractor.current_actor().name
|
||||||
async with tractor.open_nursery() as nursery:
|
async with tractor.open_nursery() as nursery:
|
||||||
for i in range(breadth):
|
for i in range(breadth):
|
||||||
|
|
@ -395,8 +452,18 @@ async def spawn_and_error(breadth, depth) -> None:
|
||||||
await nursery.run_in_actor(*args, **kwargs)
|
await nursery.run_in_actor(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE: subint_forkserver skip handled by file-level `pytestmark`
|
||||||
|
# above (same pytest-capture-fd hang class as siblings).
|
||||||
|
@pytest.mark.timeout(
|
||||||
|
10,
|
||||||
|
method='thread',
|
||||||
|
)
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_nested_multierrors(loglevel, start_method):
|
async def test_nested_multierrors(
|
||||||
|
reg_addr: tuple,
|
||||||
|
loglevel: str,
|
||||||
|
start_method: str,
|
||||||
|
):
|
||||||
'''
|
'''
|
||||||
Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
|
Test that failed actor sets are wrapped in `BaseExceptionGroup`s. This
|
||||||
test goes only 2 nurseries deep but we should eventually have tests
|
test goes only 2 nurseries deep but we should eventually have tests
|
||||||
|
|
@ -431,7 +498,7 @@ async def test_nested_multierrors(loglevel, start_method):
|
||||||
for subexc in err.exceptions:
|
for subexc in err.exceptions:
|
||||||
|
|
||||||
# verify first level actor errors are wrapped as remote
|
# verify first level actor errors are wrapped as remote
|
||||||
if is_win():
|
if _friggin_windows:
|
||||||
|
|
||||||
# windows is often too slow and cancellation seems
|
# windows is often too slow and cancellation seems
|
||||||
# to happen before an actor is spawned
|
# to happen before an actor is spawned
|
||||||
|
|
@ -464,7 +531,7 @@ async def test_nested_multierrors(loglevel, start_method):
|
||||||
# XXX not sure what's up with this..
|
# XXX not sure what's up with this..
|
||||||
# on windows sometimes spawning is just too slow and
|
# on windows sometimes spawning is just too slow and
|
||||||
# we get back the (sent) cancel signal instead
|
# we get back the (sent) cancel signal instead
|
||||||
if is_win():
|
if _friggin_windows:
|
||||||
if isinstance(subexc, tractor.RemoteActorError):
|
if isinstance(subexc, tractor.RemoteActorError):
|
||||||
assert subexc.boxed_type in (
|
assert subexc.boxed_type in (
|
||||||
BaseExceptionGroup,
|
BaseExceptionGroup,
|
||||||
|
|
@ -483,20 +550,24 @@ async def test_nested_multierrors(loglevel, start_method):
|
||||||
|
|
||||||
@no_windows
|
@no_windows
|
||||||
def test_cancel_via_SIGINT(
|
def test_cancel_via_SIGINT(
|
||||||
loglevel,
|
reg_addr: tuple,
|
||||||
start_method,
|
loglevel: str,
|
||||||
spawn_backend,
|
start_method: str,
|
||||||
):
|
):
|
||||||
"""Ensure that a control-C (SIGINT) signal cancels both the parent and
|
'''
|
||||||
|
Ensure that a control-C (SIGINT) signal cancels both the parent and
|
||||||
child processes in trionic fashion
|
child processes in trionic fashion
|
||||||
"""
|
|
||||||
pid = os.getpid()
|
'''
|
||||||
|
pid: int = os.getpid()
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(2):
|
with trio.fail_after(2):
|
||||||
async with tractor.open_nursery() as tn:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as tn:
|
||||||
await tn.start_actor('sucka')
|
await tn.start_actor('sucka')
|
||||||
if 'mp' in spawn_backend:
|
if 'mp' in start_method:
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
os.kill(pid, signal.SIGINT)
|
os.kill(pid, signal.SIGINT)
|
||||||
await trio.sleep_forever()
|
await trio.sleep_forever()
|
||||||
|
|
@ -507,23 +578,38 @@ def test_cancel_via_SIGINT(
|
||||||
|
|
||||||
@no_windows
|
@no_windows
|
||||||
def test_cancel_via_SIGINT_other_task(
|
def test_cancel_via_SIGINT_other_task(
|
||||||
loglevel,
|
reg_addr: tuple,
|
||||||
start_method,
|
loglevel: str,
|
||||||
spawn_backend,
|
start_method: str,
|
||||||
|
spawn_backend: str,
|
||||||
):
|
):
|
||||||
"""Ensure that a control-C (SIGINT) signal cancels both the parent
|
'''
|
||||||
and child processes in trionic fashion even a subprocess is started
|
Ensure that a control-C (SIGINT) signal cancels both the parent
|
||||||
from a seperate ``trio`` child task.
|
and child processes in trionic fashion even a subprocess is
|
||||||
"""
|
started from a seperate ``trio`` child task.
|
||||||
pid = os.getpid()
|
|
||||||
timeout: float = 2
|
'''
|
||||||
if is_win(): # smh
|
from .conftest import cpu_scaling_factor
|
||||||
|
|
||||||
|
pid: int = os.getpid()
|
||||||
|
timeout: float = (
|
||||||
|
4 if _non_linux
|
||||||
|
else 2
|
||||||
|
)
|
||||||
|
if _friggin_windows: # smh
|
||||||
timeout += 1
|
timeout += 1
|
||||||
|
|
||||||
|
# add latency headroom for CPU freq scaling (auto-cpufreq et al.)
|
||||||
|
headroom: float = cpu_scaling_factor()
|
||||||
|
if headroom != 1.:
|
||||||
|
timeout *= headroom
|
||||||
|
|
||||||
async def spawn_and_sleep_forever(
|
async def spawn_and_sleep_forever(
|
||||||
task_status=trio.TASK_STATUS_IGNORED
|
task_status=trio.TASK_STATUS_IGNORED
|
||||||
):
|
):
|
||||||
async with tractor.open_nursery() as tn:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as tn:
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
sleep_forever,
|
sleep_forever,
|
||||||
|
|
@ -568,6 +654,14 @@ async def spawn_sub_with_sync_blocking_task():
|
||||||
print('exiting first subactor layer..\n')
|
print('exiting first subactor layer..\n')
|
||||||
|
|
||||||
|
|
||||||
|
# @pytest.mark.skipon_spawn_backend(
|
||||||
|
# 'subint',
|
||||||
|
# reason=(
|
||||||
|
# 'XXX SUBINT HANGING TEST XXX\n'
|
||||||
|
# 'See oustanding issue(s)\n'
|
||||||
|
# # TODO, put issue link!
|
||||||
|
# )
|
||||||
|
# )
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'man_cancel_outer',
|
'man_cancel_outer',
|
||||||
[
|
[
|
||||||
|
|
@ -644,7 +738,11 @@ def test_cancel_while_childs_child_in_sync_sleep(
|
||||||
#
|
#
|
||||||
# delay = 1 # no AssertionError in eg, TooSlowError raised.
|
# delay = 1 # no AssertionError in eg, TooSlowError raised.
|
||||||
# delay = 2 # is AssertionError in eg AND no TooSlowError !?
|
# delay = 2 # is AssertionError in eg AND no TooSlowError !?
|
||||||
delay = 4 # is AssertionError in eg AND no _cs cancellation.
|
# is AssertionError in eg AND no _cs cancellation.
|
||||||
|
delay = (
|
||||||
|
6 if _non_linux
|
||||||
|
else 4
|
||||||
|
)
|
||||||
|
|
||||||
with trio.fail_after(delay) as _cs:
|
with trio.fail_after(delay) as _cs:
|
||||||
# with trio.CancelScope() as cs:
|
# with trio.CancelScope() as cs:
|
||||||
|
|
@ -678,7 +776,7 @@ def test_cancel_while_childs_child_in_sync_sleep(
|
||||||
|
|
||||||
|
|
||||||
def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
|
def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
|
||||||
start_method,
|
start_method: str,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
This is a very subtle test which demonstrates how cancellation
|
This is a very subtle test which demonstrates how cancellation
|
||||||
|
|
@ -696,7 +794,7 @@ def test_fast_graceful_cancel_when_spawn_task_in_soft_proc_wait_for_daemon(
|
||||||
kbi_delay = 0.5
|
kbi_delay = 0.5
|
||||||
timeout: float = 2.9
|
timeout: float = 2.9
|
||||||
|
|
||||||
if is_win(): # smh
|
if _friggin_windows: # smh
|
||||||
timeout += 1
|
timeout += 1
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|
|
||||||
|
|
@ -18,16 +18,15 @@ from tractor import RemoteActorError
|
||||||
|
|
||||||
|
|
||||||
async def aio_streamer(
|
async def aio_streamer(
|
||||||
from_trio: asyncio.Queue,
|
chan: tractor.to_asyncio.LinkedTaskChannel,
|
||||||
to_trio: trio.abc.SendChannel,
|
|
||||||
) -> trio.abc.ReceiveChannel:
|
) -> trio.abc.ReceiveChannel:
|
||||||
|
|
||||||
# required first msg to sync caller
|
# required first msg to sync caller
|
||||||
to_trio.send_nowait(None)
|
chan.started_nowait(None)
|
||||||
|
|
||||||
from itertools import cycle
|
from itertools import cycle
|
||||||
for i in cycle(range(10)):
|
for i in cycle(range(10)):
|
||||||
to_trio.send_nowait(i)
|
chan.send_nowait(i)
|
||||||
await asyncio.sleep(0.01)
|
await asyncio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -69,7 +68,7 @@ async def wrapper_mngr(
|
||||||
else:
|
else:
|
||||||
async with tractor.to_asyncio.open_channel_from(
|
async with tractor.to_asyncio.open_channel_from(
|
||||||
aio_streamer,
|
aio_streamer,
|
||||||
) as (first, from_aio):
|
) as (from_aio, first):
|
||||||
assert not first
|
assert not first
|
||||||
|
|
||||||
# cache it so next task uses broadcast receiver
|
# cache it so next task uses broadcast receiver
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,19 @@ from tractor._testing import tractor_test
|
||||||
MESSAGE = 'tractoring at full speed'
|
MESSAGE = 'tractoring at full speed'
|
||||||
|
|
||||||
|
|
||||||
def test_empty_mngrs_input_raises() -> None:
|
def test_empty_mngrs_input_raises(
|
||||||
|
tpt_proto: str,
|
||||||
|
) -> None:
|
||||||
|
# TODO, the `open_actor_cluster()` teardown hangs
|
||||||
|
# intermittently on UDS when `gather_contexts(mngrs=())`
|
||||||
|
# raises `ValueError` mid-setup; likely a race in the
|
||||||
|
# actor-nursery cleanup vs UDS socket shutdown. Needs
|
||||||
|
# a deeper look at `._clustering`/`._supervise` teardown
|
||||||
|
# paths with the UDS transport.
|
||||||
|
if tpt_proto == 'uds':
|
||||||
|
pytest.skip(
|
||||||
|
'actor-cluster teardown hangs intermittently on UDS'
|
||||||
|
)
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(3):
|
with trio.fail_after(3):
|
||||||
|
|
@ -56,13 +68,27 @@ async def worker(
|
||||||
print(msg)
|
print(msg)
|
||||||
assert msg == MESSAGE
|
assert msg == MESSAGE
|
||||||
|
|
||||||
# TODO: does this ever cause a hang
|
# ?TODO, does this ever cause a hang?
|
||||||
# assert 0
|
# assert 0
|
||||||
|
|
||||||
|
|
||||||
|
# ?TODO, but needs a fn-scoped tpt_proto fixture..
|
||||||
|
# @pytest.mark.no_tpt('uds')
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_streaming_to_actor_cluster() -> None:
|
async def test_streaming_to_actor_cluster(
|
||||||
|
tpt_proto: str,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Open an actor "cluster" using the (experimental) `._clustering`
|
||||||
|
API and conduct standard inter-task-ctx streaming.
|
||||||
|
|
||||||
|
'''
|
||||||
|
if tpt_proto == 'uds':
|
||||||
|
pytest.skip(
|
||||||
|
f'Test currently fails with tpt-proto={tpt_proto!r}\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
with trio.fail_after(6):
|
||||||
async with (
|
async with (
|
||||||
open_actor_cluster(modules=[__name__]) as portals,
|
open_actor_cluster(modules=[__name__]) as portals,
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ from itertools import count
|
||||||
import math
|
import math
|
||||||
import platform
|
import platform
|
||||||
from pprint import pformat
|
from pprint import pformat
|
||||||
|
import sys
|
||||||
from typing import (
|
from typing import (
|
||||||
Callable,
|
Callable,
|
||||||
)
|
)
|
||||||
|
|
@ -25,7 +26,7 @@ from tractor._exceptions import (
|
||||||
StreamOverrun,
|
StreamOverrun,
|
||||||
ContextCancelled,
|
ContextCancelled,
|
||||||
)
|
)
|
||||||
from tractor._state import current_ipc_ctx
|
from tractor.runtime._state import current_ipc_ctx
|
||||||
|
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
tractor_test,
|
tractor_test,
|
||||||
|
|
@ -938,9 +939,14 @@ def test_one_end_stream_not_opened(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
overrunner, buf_size_increase, entrypoint = overrun_by
|
overrunner, buf_size_increase, entrypoint = overrun_by
|
||||||
from tractor._runtime import Actor
|
from tractor.runtime._runtime import Actor
|
||||||
buf_size = buf_size_increase + Actor.msg_buffer_size
|
buf_size = buf_size_increase + Actor.msg_buffer_size
|
||||||
|
|
||||||
|
timeout: float = (
|
||||||
|
1 if sys.platform == 'linux'
|
||||||
|
else 3
|
||||||
|
)
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
|
|
@ -950,7 +956,7 @@ def test_one_end_stream_not_opened(
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
||||||
with trio.fail_after(1):
|
with trio.fail_after(timeout):
|
||||||
async with portal.open_context(
|
async with portal.open_context(
|
||||||
entrypoint,
|
entrypoint,
|
||||||
) as (ctx, sent):
|
) as (ctx, sent):
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,17 @@ import sys
|
||||||
import subprocess
|
import subprocess
|
||||||
import platform
|
import platform
|
||||||
import shutil
|
import shutil
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import tractor
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
examples_dir,
|
examples_dir,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
_non_linux: bool = platform.system() != 'Linux'
|
||||||
|
_friggin_macos: bool = platform.system() == 'Darwin'
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def run_example_in_subproc(
|
def run_example_in_subproc(
|
||||||
|
|
@ -89,8 +94,10 @@ def run_example_in_subproc(
|
||||||
for f in p[2]
|
for f in p[2]
|
||||||
|
|
||||||
if (
|
if (
|
||||||
'__' not in f
|
'__' not in f # ignore any pkg-mods
|
||||||
and f[0] != '_'
|
# ignore any `__pycache__` subdir
|
||||||
|
and '__pycache__' not in str(p[0])
|
||||||
|
and f[0] != '_' # ignore any WIP "example mods"
|
||||||
and 'debugging' not in p[0]
|
and 'debugging' not in p[0]
|
||||||
and 'integration' not in p[0]
|
and 'integration' not in p[0]
|
||||||
and 'advanced_faults' not in p[0]
|
and 'advanced_faults' not in p[0]
|
||||||
|
|
@ -101,8 +108,10 @@ def run_example_in_subproc(
|
||||||
ids=lambda t: t[1],
|
ids=lambda t: t[1],
|
||||||
)
|
)
|
||||||
def test_example(
|
def test_example(
|
||||||
run_example_in_subproc,
|
run_example_in_subproc: Callable,
|
||||||
example_script,
|
example_script: str,
|
||||||
|
test_log: tractor.log.StackLevelAdapter,
|
||||||
|
ci_env: bool,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Load and run scripts from this repo's ``examples/`` dir as a user
|
Load and run scripts from this repo's ``examples/`` dir as a user
|
||||||
|
|
@ -116,9 +125,39 @@ def test_example(
|
||||||
'''
|
'''
|
||||||
ex_file: str = os.path.join(*example_script)
|
ex_file: str = os.path.join(*example_script)
|
||||||
|
|
||||||
if 'rpc_bidir_streaming' in ex_file and sys.version_info < (3, 9):
|
if (
|
||||||
|
'rpc_bidir_streaming' in ex_file
|
||||||
|
and
|
||||||
|
sys.version_info < (3, 9)
|
||||||
|
):
|
||||||
pytest.skip("2-way streaming example requires py3.9 async with syntax")
|
pytest.skip("2-way streaming example requires py3.9 async with syntax")
|
||||||
|
|
||||||
|
if (
|
||||||
|
'full_fledged_streaming_service' in ex_file
|
||||||
|
and
|
||||||
|
_friggin_macos
|
||||||
|
and
|
||||||
|
ci_env
|
||||||
|
):
|
||||||
|
pytest.skip(
|
||||||
|
'Streaming example is too flaky in CI\n'
|
||||||
|
'AND their competitor runs this CI service..\n'
|
||||||
|
'This test does run just fine "in person" however..'
|
||||||
|
)
|
||||||
|
|
||||||
|
from .conftest import cpu_scaling_factor
|
||||||
|
|
||||||
|
timeout: float = (
|
||||||
|
60
|
||||||
|
if ci_env and _non_linux
|
||||||
|
else 16
|
||||||
|
)
|
||||||
|
|
||||||
|
# add latency headroom for CPU freq scaling (auto-cpufreq et al.)
|
||||||
|
headroom: float = cpu_scaling_factor()
|
||||||
|
if headroom != 1.:
|
||||||
|
timeout *= headroom
|
||||||
|
|
||||||
with open(ex_file, 'r') as ex:
|
with open(ex_file, 'r') as ex:
|
||||||
code = ex.read()
|
code = ex.read()
|
||||||
|
|
||||||
|
|
@ -126,9 +165,12 @@ def test_example(
|
||||||
err = None
|
err = None
|
||||||
try:
|
try:
|
||||||
if not proc.poll():
|
if not proc.poll():
|
||||||
_, err = proc.communicate(timeout=15)
|
_, err = proc.communicate(timeout=timeout)
|
||||||
|
|
||||||
except subprocess.TimeoutExpired as e:
|
except subprocess.TimeoutExpired as e:
|
||||||
|
test_log.exception(
|
||||||
|
f'Example failed to finish within {timeout}s ??\n'
|
||||||
|
)
|
||||||
proc.kill()
|
proc.kill()
|
||||||
err = e.stderr
|
err = e.stderr
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,8 +26,8 @@ from tractor import (
|
||||||
to_asyncio,
|
to_asyncio,
|
||||||
RemoteActorError,
|
RemoteActorError,
|
||||||
ContextCancelled,
|
ContextCancelled,
|
||||||
_state,
|
|
||||||
)
|
)
|
||||||
|
from tractor.runtime import _state
|
||||||
from tractor.trionics import BroadcastReceiver
|
from tractor.trionics import BroadcastReceiver
|
||||||
from tractor._testing import expect_ctxc
|
from tractor._testing import expect_ctxc
|
||||||
|
|
||||||
|
|
@ -47,12 +47,11 @@ async def sleep_and_err(
|
||||||
|
|
||||||
# just signature placeholders for compat with
|
# just signature placeholders for compat with
|
||||||
# ``to_asyncio.open_channel_from()``
|
# ``to_asyncio.open_channel_from()``
|
||||||
to_trio: trio.MemorySendChannel|None = None,
|
chan: to_asyncio.LinkedTaskChannel|None = None,
|
||||||
from_trio: asyncio.Queue|None = None,
|
|
||||||
|
|
||||||
):
|
):
|
||||||
if to_trio:
|
if chan:
|
||||||
to_trio.send_nowait('start')
|
chan.started_nowait('start')
|
||||||
|
|
||||||
await asyncio.sleep(sleep_for)
|
await asyncio.sleep(sleep_for)
|
||||||
assert 0
|
assert 0
|
||||||
|
|
@ -184,6 +183,7 @@ def test_tractor_cancels_aio(
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
) as an:
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
asyncio_actor,
|
asyncio_actor,
|
||||||
|
|
@ -206,11 +206,11 @@ def test_trio_cancels_aio(
|
||||||
|
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
with trio.move_on_after(1):
|
|
||||||
# cancel the nursery shortly after boot
|
# cancel the nursery shortly after boot
|
||||||
|
with trio.move_on_after(1):
|
||||||
async with tractor.open_nursery() as tn:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as tn:
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
asyncio_actor,
|
asyncio_actor,
|
||||||
target='aio_sleep_forever',
|
target='aio_sleep_forever',
|
||||||
|
|
@ -238,7 +238,7 @@ async def trio_ctx(
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
tractor.to_asyncio.open_channel_from(
|
tractor.to_asyncio.open_channel_from(
|
||||||
sleep_and_err,
|
sleep_and_err,
|
||||||
) as (first, chan),
|
) as (chan, first),
|
||||||
):
|
):
|
||||||
|
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
|
|
@ -278,7 +278,9 @@ def test_context_spawns_aio_task_that_errors(
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(1 + delay):
|
with trio.fail_after(1 + delay):
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as an:
|
||||||
p = await an.start_actor(
|
p = await an.start_actor(
|
||||||
'aio_daemon',
|
'aio_daemon',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
|
|
@ -361,7 +363,9 @@ def test_aio_cancelled_from_aio_causes_trio_cancelled(
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
an: tractor.ActorNursery
|
an: tractor.ActorNursery
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as an:
|
||||||
p: tractor.Portal = await an.run_in_actor(
|
p: tractor.Portal = await an.run_in_actor(
|
||||||
asyncio_actor,
|
asyncio_actor,
|
||||||
target='aio_cancel',
|
target='aio_cancel',
|
||||||
|
|
@ -399,7 +403,7 @@ async def no_to_trio_in_args():
|
||||||
|
|
||||||
async def push_from_aio_task(
|
async def push_from_aio_task(
|
||||||
sequence: Iterable,
|
sequence: Iterable,
|
||||||
to_trio: trio.abc.SendChannel,
|
chan: to_asyncio.LinkedTaskChannel,
|
||||||
expect_cancel: False,
|
expect_cancel: False,
|
||||||
fail_early: bool,
|
fail_early: bool,
|
||||||
exit_early: bool,
|
exit_early: bool,
|
||||||
|
|
@ -407,15 +411,12 @@ async def push_from_aio_task(
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# print('trying breakpoint')
|
|
||||||
# breakpoint()
|
|
||||||
|
|
||||||
# sync caller ctx manager
|
# sync caller ctx manager
|
||||||
to_trio.send_nowait(True)
|
chan.started_nowait(True)
|
||||||
|
|
||||||
for i in sequence:
|
for i in sequence:
|
||||||
print(f'asyncio sending {i}')
|
print(f'asyncio sending {i}')
|
||||||
to_trio.send_nowait(i)
|
chan.send_nowait(i)
|
||||||
await asyncio.sleep(0.001)
|
await asyncio.sleep(0.001)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
|
@ -478,7 +479,7 @@ async def stream_from_aio(
|
||||||
trio_exit_early
|
trio_exit_early
|
||||||
))
|
))
|
||||||
|
|
||||||
) as (first, chan):
|
) as (chan, first):
|
||||||
|
|
||||||
assert first is True
|
assert first is True
|
||||||
|
|
||||||
|
|
@ -573,7 +574,9 @@ def test_basic_interloop_channel_stream(
|
||||||
async def main():
|
async def main():
|
||||||
# TODO, figure out min timeout here!
|
# TODO, figure out min timeout here!
|
||||||
with trio.fail_after(6):
|
with trio.fail_after(6):
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
stream_from_aio,
|
stream_from_aio,
|
||||||
infect_asyncio=True,
|
infect_asyncio=True,
|
||||||
|
|
@ -586,9 +589,13 @@ def test_basic_interloop_channel_stream(
|
||||||
|
|
||||||
|
|
||||||
# TODO: parametrize the above test and avoid the duplication here?
|
# TODO: parametrize the above test and avoid the duplication here?
|
||||||
def test_trio_error_cancels_intertask_chan(reg_addr):
|
def test_trio_error_cancels_intertask_chan(
|
||||||
|
reg_addr: tuple[str, int],
|
||||||
|
):
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery() as an:
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
stream_from_aio,
|
stream_from_aio,
|
||||||
trio_raise_err=True,
|
trio_raise_err=True,
|
||||||
|
|
@ -623,6 +630,7 @@ def test_trio_closes_early_causes_aio_checkpoint_raise(
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
# enable_stack_on_sig=True,
|
# enable_stack_on_sig=True,
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
) as an:
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
stream_from_aio,
|
stream_from_aio,
|
||||||
|
|
@ -671,6 +679,7 @@ def test_aio_exits_early_relays_AsyncioTaskExited(
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(1 + delay):
|
with trio.fail_after(1 + delay):
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
# enable_stack_on_sig=True,
|
# enable_stack_on_sig=True,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
@ -711,6 +720,7 @@ def test_aio_errors_and_channel_propagates_and_closes(
|
||||||
):
|
):
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as an:
|
) as an:
|
||||||
portal = await an.run_in_actor(
|
portal = await an.run_in_actor(
|
||||||
|
|
@ -732,15 +742,21 @@ def test_aio_errors_and_channel_propagates_and_closes(
|
||||||
|
|
||||||
|
|
||||||
async def aio_echo_server(
|
async def aio_echo_server(
|
||||||
to_trio: trio.MemorySendChannel,
|
chan: to_asyncio.LinkedTaskChannel,
|
||||||
from_trio: asyncio.Queue,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
|
'''
|
||||||
|
An IPC-msg "echo server" with msgs received and relayed by
|
||||||
|
a parent `trio.Task` into a child `asyncio.Task`
|
||||||
|
and then repeated back to that local parent (`trio.Task`)
|
||||||
|
and sent again back to the original calling remote actor.
|
||||||
|
|
||||||
to_trio.send_nowait('start')
|
'''
|
||||||
|
# same semantics as `trio.TaskStatus.started()`
|
||||||
|
chan.started_nowait('start')
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
msg = await from_trio.get()
|
msg = await chan.get()
|
||||||
except to_asyncio.TrioTaskExited:
|
except to_asyncio.TrioTaskExited:
|
||||||
print(
|
print(
|
||||||
'breaking aio echo loop due to `trio` exit!'
|
'breaking aio echo loop due to `trio` exit!'
|
||||||
|
|
@ -748,7 +764,7 @@ async def aio_echo_server(
|
||||||
break
|
break
|
||||||
|
|
||||||
# echo the msg back
|
# echo the msg back
|
||||||
to_trio.send_nowait(msg)
|
chan.send_nowait(msg)
|
||||||
|
|
||||||
# if we get the terminate sentinel
|
# if we get the terminate sentinel
|
||||||
# break the echo loop
|
# break the echo loop
|
||||||
|
|
@ -765,7 +781,10 @@ async def trio_to_aio_echo_server(
|
||||||
):
|
):
|
||||||
async with to_asyncio.open_channel_from(
|
async with to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (first, chan):
|
) as (
|
||||||
|
chan,
|
||||||
|
first, # value from `chan.started_nowait()` above
|
||||||
|
):
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
|
|
||||||
await ctx.started(first)
|
await ctx.started(first)
|
||||||
|
|
@ -776,7 +795,8 @@ async def trio_to_aio_echo_server(
|
||||||
await chan.send(msg)
|
await chan.send(msg)
|
||||||
|
|
||||||
out = await chan.receive()
|
out = await chan.receive()
|
||||||
# echo back to parent actor-task
|
|
||||||
|
# echo back to parent-actor's remote parent-ctx-task!
|
||||||
await stream.send(out)
|
await stream.send(out)
|
||||||
|
|
||||||
if out is None:
|
if out is None:
|
||||||
|
|
@ -800,6 +820,7 @@ def test_echoserver_detailed_mechanics(
|
||||||
):
|
):
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as an:
|
) as an:
|
||||||
p = await an.start_actor(
|
p = await an.start_actor(
|
||||||
|
|
@ -978,7 +999,7 @@ async def manage_file(
|
||||||
],
|
],
|
||||||
ids=[
|
ids=[
|
||||||
'bg_aio_task',
|
'bg_aio_task',
|
||||||
'just_trio_slee',
|
'just_trio_sleep',
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|
@ -994,11 +1015,14 @@ async def manage_file(
|
||||||
)
|
)
|
||||||
def test_sigint_closes_lifetime_stack(
|
def test_sigint_closes_lifetime_stack(
|
||||||
tmp_path: Path,
|
tmp_path: Path,
|
||||||
|
reg_addr: tuple,
|
||||||
|
debug_mode: bool,
|
||||||
|
|
||||||
wait_for_ctx: bool,
|
wait_for_ctx: bool,
|
||||||
bg_aio_task: bool,
|
bg_aio_task: bool,
|
||||||
trio_side_is_shielded: bool,
|
trio_side_is_shielded: bool,
|
||||||
debug_mode: bool,
|
|
||||||
send_sigint_to: str,
|
send_sigint_to: str,
|
||||||
|
start_method: str,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Ensure that an infected child can use the `Actor.lifetime_stack`
|
Ensure that an infected child can use the `Actor.lifetime_stack`
|
||||||
|
|
@ -1008,12 +1032,22 @@ def test_sigint_closes_lifetime_stack(
|
||||||
'''
|
'''
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
delay = 999 if tractor.debug_mode() else 1
|
delay: float = (
|
||||||
|
999
|
||||||
|
if debug_mode
|
||||||
|
else 1
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
an: tractor.ActorNursery
|
an: tractor.ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
||||||
|
# sanity
|
||||||
|
if debug_mode:
|
||||||
|
assert tractor.debug_mode()
|
||||||
|
|
||||||
p: tractor.Portal = await an.start_actor(
|
p: tractor.Portal = await an.start_actor(
|
||||||
'file_mngr',
|
'file_mngr',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
|
|
@ -1048,6 +1082,10 @@ def test_sigint_closes_lifetime_stack(
|
||||||
cpid if send_sigint_to == 'child'
|
cpid if send_sigint_to == 'child'
|
||||||
else os.getpid()
|
else os.getpid()
|
||||||
)
|
)
|
||||||
|
print(
|
||||||
|
f'Sending SIGINT to {send_sigint_to!r}\n'
|
||||||
|
f'pid: {pid!r}\n'
|
||||||
|
)
|
||||||
os.kill(
|
os.kill(
|
||||||
pid,
|
pid,
|
||||||
signal.SIGINT,
|
signal.SIGINT,
|
||||||
|
|
@ -1058,13 +1096,37 @@ def test_sigint_closes_lifetime_stack(
|
||||||
# timeout should trigger!
|
# timeout should trigger!
|
||||||
if wait_for_ctx:
|
if wait_for_ctx:
|
||||||
print('waiting for ctx outcome in parent..')
|
print('waiting for ctx outcome in parent..')
|
||||||
|
|
||||||
|
if debug_mode:
|
||||||
|
assert delay == 999
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with trio.fail_after(1 + delay):
|
with trio.fail_after(
|
||||||
|
1 + delay
|
||||||
|
):
|
||||||
await ctx.wait_for_result()
|
await ctx.wait_for_result()
|
||||||
except tractor.ContextCancelled as ctxc:
|
except tractor.ContextCancelled as ctxc:
|
||||||
assert ctxc.canceller == ctx.chan.uid
|
assert ctxc.canceller == ctx.chan.uid
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
except trio.TooSlowError:
|
||||||
|
if (
|
||||||
|
send_sigint_to == 'child'
|
||||||
|
and
|
||||||
|
start_method == 'subint_forkserver'
|
||||||
|
):
|
||||||
|
pytest.xfail(
|
||||||
|
reason=(
|
||||||
|
'SIGINT delivery to fork-child subactor is known '
|
||||||
|
'to NOT SUCCEED, precisely bc we have not wired up a'
|
||||||
|
'"trio SIGINT mode" in the child pre-fork.\n'
|
||||||
|
'Also see `test_orphaned_subactor_sigint_cleanup_DRAFT` for'
|
||||||
|
'a dedicated suite demonstrating this expected limitation as '
|
||||||
|
'well as the detailed doc:\n'
|
||||||
|
'`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md`.\n'
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# XXX CASE 2: this seems to be the source of the
|
# XXX CASE 2: this seems to be the source of the
|
||||||
# original issue which exhibited BEFORE we put
|
# original issue which exhibited BEFORE we put
|
||||||
# a `Actor.cancel_soon()` inside
|
# a `Actor.cancel_soon()` inside
|
||||||
|
|
@ -1090,24 +1152,21 @@ def test_sigint_closes_lifetime_stack(
|
||||||
|
|
||||||
|
|
||||||
# ?TODO asyncio.Task fn-deco?
|
# ?TODO asyncio.Task fn-deco?
|
||||||
# -[ ] do sig checkingat import time like @context?
|
|
||||||
# -[ ] maybe name it @aio_task ??
|
|
||||||
# -[ ] chan: to_asyncio.InterloopChannel ??
|
# -[ ] chan: to_asyncio.InterloopChannel ??
|
||||||
|
# -[ ] do fn-sig checking at import time like @context?
|
||||||
|
# |_[ ] maybe name it @a(sync)io_task ??
|
||||||
|
# @asyncio_task <- not bad ??
|
||||||
async def raise_before_started(
|
async def raise_before_started(
|
||||||
# from_trio: asyncio.Queue,
|
|
||||||
# to_trio: trio.abc.SendChannel,
|
|
||||||
chan: to_asyncio.LinkedTaskChannel,
|
chan: to_asyncio.LinkedTaskChannel,
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
`asyncio.Task` entry point which RTEs before calling
|
`asyncio.Task` entry point which RTEs before calling
|
||||||
`to_trio.send_nowait()`.
|
`chan.started_nowait()`.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
await asyncio.sleep(0.2)
|
await asyncio.sleep(0.2)
|
||||||
raise RuntimeError('Some shite went wrong before `.send_nowait()`!!')
|
raise RuntimeError('Some shite went wrong before `.send_nowait()`!!')
|
||||||
|
|
||||||
# to_trio.send_nowait('Uhh we shouldve RTE-d ^^ ??')
|
|
||||||
chan.started_nowait('Uhh we shouldve RTE-d ^^ ??')
|
chan.started_nowait('Uhh we shouldve RTE-d ^^ ??')
|
||||||
await asyncio.sleep(float('inf'))
|
await asyncio.sleep(float('inf'))
|
||||||
|
|
||||||
|
|
@ -1167,6 +1226,7 @@ def test_aio_side_raises_before_started(
|
||||||
with trio.fail_after(3):
|
with trio.fail_after(3):
|
||||||
an: tractor.ActorNursery
|
an: tractor.ActorNursery
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
loglevel=loglevel,
|
loglevel=loglevel,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
|
||||||
|
|
@ -11,18 +11,30 @@ import trio
|
||||||
import tractor
|
import tractor
|
||||||
from tractor import ( # typing
|
from tractor import ( # typing
|
||||||
Actor,
|
Actor,
|
||||||
current_actor,
|
|
||||||
open_nursery,
|
|
||||||
Portal,
|
|
||||||
Context,
|
Context,
|
||||||
ContextCancelled,
|
ContextCancelled,
|
||||||
|
MsgStream,
|
||||||
|
Portal,
|
||||||
RemoteActorError,
|
RemoteActorError,
|
||||||
|
current_actor,
|
||||||
|
open_nursery,
|
||||||
)
|
)
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
# tractor_test,
|
# tractor_test,
|
||||||
expect_ctxc,
|
expect_ctxc,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .conftest import cpu_scaling_factor
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.skipon_spawn_backend(
|
||||||
|
'subint',
|
||||||
|
reason=(
|
||||||
|
'XXX SUBINT GIL-CONTENTION HANGING TEST XXX\n'
|
||||||
|
'See oustanding issue(s)\n'
|
||||||
|
# TODO, put issue link!
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# XXX TODO cases:
|
# XXX TODO cases:
|
||||||
# - [x] WE cancelled the peer and thus should not see any raised
|
# - [x] WE cancelled the peer and thus should not see any raised
|
||||||
# `ContextCancelled` as it should be reaped silently?
|
# `ContextCancelled` as it should be reaped silently?
|
||||||
|
|
@ -200,7 +212,7 @@ async def stream_from_peer(
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
# sanity
|
# sanity
|
||||||
assert tractor._state.debug_mode() == debug_mode
|
assert tractor.debug_mode() == debug_mode
|
||||||
|
|
||||||
peer: Portal
|
peer: Portal
|
||||||
try:
|
try:
|
||||||
|
|
@ -580,7 +592,7 @@ def test_peer_canceller(
|
||||||
assert (
|
assert (
|
||||||
re.canceller
|
re.canceller
|
||||||
==
|
==
|
||||||
root.uid
|
root.aid.uid
|
||||||
)
|
)
|
||||||
|
|
||||||
else: # the other 2 ctxs
|
else: # the other 2 ctxs
|
||||||
|
|
@ -589,7 +601,7 @@ def test_peer_canceller(
|
||||||
and (
|
and (
|
||||||
re.canceller
|
re.canceller
|
||||||
==
|
==
|
||||||
canceller.channel.uid
|
canceller.channel.aid.uid
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -744,7 +756,7 @@ def test_peer_canceller(
|
||||||
# -> each context should have received
|
# -> each context should have received
|
||||||
# a silently absorbed context cancellation
|
# a silently absorbed context cancellation
|
||||||
# in its remote nursery scope.
|
# in its remote nursery scope.
|
||||||
# assert ctx.chan.uid == ctx.canceller
|
# assert ctx.chan.aid.uid == ctx.canceller
|
||||||
|
|
||||||
# NOTE: when an inter-peer cancellation
|
# NOTE: when an inter-peer cancellation
|
||||||
# occurred, we DO NOT expect this
|
# occurred, we DO NOT expect this
|
||||||
|
|
@ -796,12 +808,12 @@ async def basic_echo_server(
|
||||||
|
|
||||||
) -> None:
|
) -> None:
|
||||||
'''
|
'''
|
||||||
Just the simplest `MsgStream` echo server which resays what
|
Just the simplest `MsgStream` echo server which resays what you
|
||||||
you told it but with its uid in front ;)
|
told it but with its uid in front ;)
|
||||||
|
|
||||||
'''
|
'''
|
||||||
actor: Actor = tractor.current_actor()
|
actor: Actor = tractor.current_actor()
|
||||||
uid: tuple = actor.uid
|
uid: tuple = actor.aid.uid
|
||||||
await ctx.started(uid)
|
await ctx.started(uid)
|
||||||
async with ctx.open_stream() as ipc:
|
async with ctx.open_stream() as ipc:
|
||||||
async for msg in ipc:
|
async for msg in ipc:
|
||||||
|
|
@ -840,7 +852,7 @@ async def serve_subactors(
|
||||||
async with open_nursery() as an:
|
async with open_nursery() as an:
|
||||||
|
|
||||||
# sanity
|
# sanity
|
||||||
assert tractor._state.debug_mode() == debug_mode
|
assert tractor.debug_mode() == debug_mode
|
||||||
|
|
||||||
await ctx.started(peer_name)
|
await ctx.started(peer_name)
|
||||||
async with ctx.open_stream() as ipc:
|
async with ctx.open_stream() as ipc:
|
||||||
|
|
@ -856,7 +868,7 @@ async def serve_subactors(
|
||||||
f'|_{peer}\n'
|
f'|_{peer}\n'
|
||||||
)
|
)
|
||||||
await ipc.send((
|
await ipc.send((
|
||||||
peer.chan.uid,
|
peer.chan.aid.uid,
|
||||||
peer.chan.raddr.unwrap(),
|
peer.chan.raddr.unwrap(),
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
@ -879,7 +891,7 @@ async def client_req_subactor(
|
||||||
) -> None:
|
) -> None:
|
||||||
# sanity
|
# sanity
|
||||||
if debug_mode:
|
if debug_mode:
|
||||||
assert tractor._state.debug_mode()
|
assert tractor.debug_mode()
|
||||||
|
|
||||||
# TODO: other cases to do with sub lifetimes:
|
# TODO: other cases to do with sub lifetimes:
|
||||||
# -[ ] test that we can have the server spawn a sub
|
# -[ ] test that we can have the server spawn a sub
|
||||||
|
|
@ -966,9 +978,14 @@ async def tell_little_bro(
|
||||||
|
|
||||||
caller: str = '',
|
caller: str = '',
|
||||||
err_after: float|None = None,
|
err_after: float|None = None,
|
||||||
rng_seed: int = 50,
|
rng_seed: int = 100,
|
||||||
|
# NOTE, ensure ^ is large enough (on fast hw anyway)
|
||||||
|
# to ensure the peer cancel req arrives before the
|
||||||
|
# echoing dialog completes by itself Bp
|
||||||
):
|
):
|
||||||
# contact target actor, do a stream dialog.
|
# contact target actor, do a stream dialog.
|
||||||
|
lb: Portal
|
||||||
|
echo_ipc: MsgStream
|
||||||
async with (
|
async with (
|
||||||
tractor.wait_for_actor(
|
tractor.wait_for_actor(
|
||||||
name=actor_name
|
name=actor_name
|
||||||
|
|
@ -983,17 +1000,17 @@ async def tell_little_bro(
|
||||||
else None
|
else None
|
||||||
),
|
),
|
||||||
) as (sub_ctx, first),
|
) as (sub_ctx, first),
|
||||||
|
|
||||||
sub_ctx.open_stream() as echo_ipc,
|
sub_ctx.open_stream() as echo_ipc,
|
||||||
):
|
):
|
||||||
actor: Actor = current_actor()
|
actor: Actor = current_actor()
|
||||||
uid: tuple = actor.uid
|
uid: tuple = actor.aid.uid
|
||||||
for i in range(rng_seed):
|
for i in range(rng_seed):
|
||||||
msg: tuple = (
|
msg: tuple = (
|
||||||
uid,
|
uid,
|
||||||
i,
|
i,
|
||||||
)
|
)
|
||||||
await echo_ipc.send(msg)
|
await echo_ipc.send(msg)
|
||||||
|
await trio.sleep(0.001)
|
||||||
resp = await echo_ipc.receive()
|
resp = await echo_ipc.receive()
|
||||||
print(
|
print(
|
||||||
f'{caller} => {actor_name}: {msg}\n'
|
f'{caller} => {actor_name}: {msg}\n'
|
||||||
|
|
@ -1006,6 +1023,9 @@ async def tell_little_bro(
|
||||||
assert sub_uid != uid
|
assert sub_uid != uid
|
||||||
assert _i == i
|
assert _i == i
|
||||||
|
|
||||||
|
# XXX, usually should never get here!
|
||||||
|
# await tractor.pause()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'raise_client_error',
|
'raise_client_error',
|
||||||
|
|
@ -1020,6 +1040,10 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
raise_client_error: str,
|
raise_client_error: str,
|
||||||
reg_addr: tuple[str, int],
|
reg_addr: tuple[str, int],
|
||||||
raise_sub_spawn_error_after: float|None,
|
raise_sub_spawn_error_after: float|None,
|
||||||
|
loglevel: str,
|
||||||
|
test_log: tractor.log.StackLevelAdapter,
|
||||||
|
# ^XXX, set to 'warning' to see masked-exc warnings
|
||||||
|
# that may transpire during actor-nursery teardown.
|
||||||
):
|
):
|
||||||
# NOTE: this tests for the modden `mod wks open piker` bug
|
# NOTE: this tests for the modden `mod wks open piker` bug
|
||||||
# discovered as part of implementing workspace ctx
|
# discovered as part of implementing workspace ctx
|
||||||
|
|
@ -1049,6 +1073,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
# NOTE: to halt the peer tasks on ctxc, uncomment this.
|
# NOTE: to halt the peer tasks on ctxc, uncomment this.
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
|
loglevel=loglevel,
|
||||||
) as an:
|
) as an:
|
||||||
server: Portal = await an.start_actor(
|
server: Portal = await an.start_actor(
|
||||||
(server_name := 'spawn_server'),
|
(server_name := 'spawn_server'),
|
||||||
|
|
@ -1084,7 +1109,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
) as (client_ctx, client_says),
|
) as (client_ctx, client_says),
|
||||||
):
|
):
|
||||||
root: Actor = current_actor()
|
root: Actor = current_actor()
|
||||||
spawner_uid: tuple = spawn_ctx.chan.uid
|
spawner_uid: tuple = spawn_ctx.chan.aid.uid
|
||||||
print(
|
print(
|
||||||
f'Server says: {first}\n'
|
f'Server says: {first}\n'
|
||||||
f'Client says: {client_says}\n'
|
f'Client says: {client_says}\n'
|
||||||
|
|
@ -1103,7 +1128,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
print(
|
print(
|
||||||
'Sub-spawn came online\n'
|
'Sub-spawn came online\n'
|
||||||
f'portal: {sub}\n'
|
f'portal: {sub}\n'
|
||||||
f'.uid: {sub.actor.uid}\n'
|
f'.uid: {sub.actor.aid.uid}\n'
|
||||||
f'chan.raddr: {sub.chan.raddr}\n'
|
f'chan.raddr: {sub.chan.raddr}\n'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -1137,7 +1162,7 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
|
|
||||||
assert isinstance(res, ContextCancelled)
|
assert isinstance(res, ContextCancelled)
|
||||||
assert client_ctx.cancel_acked
|
assert client_ctx.cancel_acked
|
||||||
assert res.canceller == root.uid
|
assert res.canceller == root.aid.uid
|
||||||
assert not raise_sub_spawn_error_after
|
assert not raise_sub_spawn_error_after
|
||||||
|
|
||||||
# cancelling the spawner sub should
|
# cancelling the spawner sub should
|
||||||
|
|
@ -1171,8 +1196,8 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
# little_bro: a `RuntimeError`.
|
# little_bro: a `RuntimeError`.
|
||||||
#
|
#
|
||||||
check_inner_rte(rae)
|
check_inner_rte(rae)
|
||||||
assert rae.relay_uid == client.chan.uid
|
assert rae.relay_uid == client.chan.aid.uid
|
||||||
assert rae.src_uid == sub.chan.uid
|
assert rae.src_uid == sub.chan.aid.uid
|
||||||
|
|
||||||
assert not client_ctx.cancel_acked
|
assert not client_ctx.cancel_acked
|
||||||
assert (
|
assert (
|
||||||
|
|
@ -1201,12 +1226,12 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
except ContextCancelled as ctxc:
|
except ContextCancelled as ctxc:
|
||||||
_ctxc = ctxc
|
_ctxc = ctxc
|
||||||
print(
|
print(
|
||||||
f'{root.uid} caught ctxc from ctx with {client_ctx.chan.uid}\n'
|
f'{root.aid.uid} caught ctxc from ctx with {client_ctx.chan.aid.uid}\n'
|
||||||
f'{repr(ctxc)}\n'
|
f'{repr(ctxc)}\n'
|
||||||
)
|
)
|
||||||
|
|
||||||
if not raise_sub_spawn_error_after:
|
if not raise_sub_spawn_error_after:
|
||||||
assert ctxc.canceller == root.uid
|
assert ctxc.canceller == root.aid.uid
|
||||||
else:
|
else:
|
||||||
assert ctxc.canceller == spawner_uid
|
assert ctxc.canceller == spawner_uid
|
||||||
|
|
||||||
|
|
@ -1237,9 +1262,20 @@ def test_peer_spawns_and_cancels_service_subactor(
|
||||||
|
|
||||||
# assert spawn_ctx.cancelled_caught
|
# assert spawn_ctx.cancelled_caught
|
||||||
|
|
||||||
|
|
||||||
async def _main():
|
async def _main():
|
||||||
|
headroom: float = cpu_scaling_factor()
|
||||||
|
this_fast_on_linux: float = 3
|
||||||
|
this_fast = this_fast_on_linux * headroom
|
||||||
|
if headroom != 1.:
|
||||||
|
test_log.warning(
|
||||||
|
f'Adding latency headroom on linux bc CPU scaling,\n'
|
||||||
|
f'headroom: {headroom}\n'
|
||||||
|
f'this_fast_on_linux: {this_fast_on_linux} -> {this_fast}\n'
|
||||||
|
)
|
||||||
with trio.fail_after(
|
with trio.fail_after(
|
||||||
3 if not debug_mode
|
this_fast
|
||||||
|
if not debug_mode
|
||||||
else 999
|
else 999
|
||||||
):
|
):
|
||||||
await main()
|
await main()
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
"""
|
"""
|
||||||
Streaming via async gen api
|
Streaming via the, now legacy, "async-gen API".
|
||||||
|
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
from functools import partial
|
from functools import partial
|
||||||
import platform
|
import platform
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
import tractor
|
import tractor
|
||||||
|
|
@ -19,7 +21,11 @@ def test_must_define_ctx():
|
||||||
async def no_ctx():
|
async def no_ctx():
|
||||||
pass
|
pass
|
||||||
|
|
||||||
assert "no_ctx must be `ctx: tractor.Context" in str(err.value)
|
assert (
|
||||||
|
"no_ctx must be `ctx: tractor.Context"
|
||||||
|
in
|
||||||
|
str(err.value)
|
||||||
|
)
|
||||||
|
|
||||||
@tractor.stream
|
@tractor.stream
|
||||||
async def has_ctx(ctx):
|
async def has_ctx(ctx):
|
||||||
|
|
@ -69,14 +75,14 @@ async def stream_from_single_subactor(
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
start_method=start_method,
|
start_method=start_method,
|
||||||
) as nursery:
|
) as an:
|
||||||
|
|
||||||
async with tractor.find_actor('streamerd') as portals:
|
async with tractor.find_actor('streamerd') as portals:
|
||||||
|
|
||||||
if not portals:
|
if not portals:
|
||||||
|
|
||||||
# no brokerd actor found
|
# no brokerd actor found
|
||||||
portal = await nursery.start_actor(
|
portal = await an.start_actor(
|
||||||
'streamerd',
|
'streamerd',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
@ -116,11 +122,22 @@ async def stream_from_single_subactor(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'stream_func', [async_gen_stream, context_stream]
|
'stream_func',
|
||||||
|
[
|
||||||
|
async_gen_stream,
|
||||||
|
context_stream,
|
||||||
|
],
|
||||||
|
ids='stream_func={}'.format
|
||||||
)
|
)
|
||||||
def test_stream_from_single_subactor(reg_addr, start_method, stream_func):
|
def test_stream_from_single_subactor(
|
||||||
"""Verify streaming from a spawned async generator.
|
reg_addr: tuple,
|
||||||
"""
|
start_method: str,
|
||||||
|
stream_func: Callable,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Verify streaming from a spawned async generator.
|
||||||
|
|
||||||
|
'''
|
||||||
trio.run(
|
trio.run(
|
||||||
partial(
|
partial(
|
||||||
stream_from_single_subactor,
|
stream_from_single_subactor,
|
||||||
|
|
@ -132,10 +149,9 @@ def test_stream_from_single_subactor(reg_addr, start_method, stream_func):
|
||||||
|
|
||||||
|
|
||||||
# this is the first 2 actors, streamer_1 and streamer_2
|
# this is the first 2 actors, streamer_1 and streamer_2
|
||||||
async def stream_data(seed):
|
async def stream_data(seed: int):
|
||||||
|
|
||||||
for i in range(seed):
|
for i in range(seed):
|
||||||
|
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
# trigger scheduler to simulate practical usage
|
# trigger scheduler to simulate practical usage
|
||||||
|
|
@ -143,15 +159,17 @@ async def stream_data(seed):
|
||||||
|
|
||||||
|
|
||||||
# this is the third actor; the aggregator
|
# this is the third actor; the aggregator
|
||||||
async def aggregate(seed):
|
async def aggregate(seed: int):
|
||||||
"""Ensure that the two streams we receive match but only stream
|
'''
|
||||||
|
Ensure that the two streams we receive match but only stream
|
||||||
a single set of values to the parent.
|
a single set of values to the parent.
|
||||||
"""
|
|
||||||
async with tractor.open_nursery() as nursery:
|
'''
|
||||||
|
async with tractor.open_nursery() as an:
|
||||||
portals = []
|
portals = []
|
||||||
for i in range(1, 3):
|
for i in range(1, 3):
|
||||||
# fork point
|
# fork point
|
||||||
portal = await nursery.start_actor(
|
portal = await an.start_actor(
|
||||||
name=f'streamer_{i}',
|
name=f'streamer_{i}',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
@ -164,20 +182,28 @@ async def aggregate(seed):
|
||||||
async with send_chan:
|
async with send_chan:
|
||||||
|
|
||||||
async with portal.open_stream_from(
|
async with portal.open_stream_from(
|
||||||
stream_data, seed=seed,
|
stream_data,
|
||||||
|
seed=seed,
|
||||||
) as stream:
|
) as stream:
|
||||||
|
|
||||||
async for value in stream:
|
async for value in stream:
|
||||||
# leverage trio's built-in backpressure
|
# leverage trio's built-in backpressure
|
||||||
await send_chan.send(value)
|
await send_chan.send(value)
|
||||||
|
|
||||||
print(f"FINISHED ITERATING {portal.channel.uid}")
|
print(
|
||||||
|
f'FINISHED ITERATING!\n'
|
||||||
|
f'peer: {portal.channel.aid.uid}'
|
||||||
|
)
|
||||||
|
|
||||||
# spawn 2 trio tasks to collect streams and push to a local queue
|
# spawn 2 trio tasks to collect streams and push to a local queue
|
||||||
async with trio.open_nursery() as n:
|
async with trio.open_nursery() as tn:
|
||||||
|
|
||||||
for portal in portals:
|
for portal in portals:
|
||||||
n.start_soon(push_to_chan, portal, send_chan.clone())
|
tn.start_soon(
|
||||||
|
push_to_chan,
|
||||||
|
portal,
|
||||||
|
send_chan.clone(),
|
||||||
|
)
|
||||||
|
|
||||||
# close this local task's reference to send side
|
# close this local task's reference to send side
|
||||||
await send_chan.aclose()
|
await send_chan.aclose()
|
||||||
|
|
@ -194,20 +220,21 @@ async def aggregate(seed):
|
||||||
|
|
||||||
print("FINISHED ITERATING in aggregator")
|
print("FINISHED ITERATING in aggregator")
|
||||||
|
|
||||||
await nursery.cancel()
|
await an.cancel()
|
||||||
print("WAITING on `ActorNursery` to finish")
|
print("WAITING on `ActorNursery` to finish")
|
||||||
print("AGGREGATOR COMPLETE!")
|
print("AGGREGATOR COMPLETE!")
|
||||||
|
|
||||||
|
|
||||||
# this is the main actor and *arbiter*
|
async def a_quadruple_example() -> list[int]:
|
||||||
async def a_quadruple_example():
|
'''
|
||||||
# a nursery which spawns "actors"
|
Open the root-actor which is also a "registrar".
|
||||||
async with tractor.open_nursery() as nursery:
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
async with tractor.open_nursery() as an:
|
||||||
seed = int(1e3)
|
seed = int(1e3)
|
||||||
pre_start = time.time()
|
pre_start = time.time()
|
||||||
|
|
||||||
portal = await nursery.start_actor(
|
portal = await an.start_actor(
|
||||||
name='aggregator',
|
name='aggregator',
|
||||||
enable_modules=[__name__],
|
enable_modules=[__name__],
|
||||||
)
|
)
|
||||||
|
|
@ -228,8 +255,14 @@ async def a_quadruple_example():
|
||||||
return result_stream
|
return result_stream
|
||||||
|
|
||||||
|
|
||||||
async def cancel_after(wait, reg_addr):
|
async def cancel_after(
|
||||||
async with tractor.open_root_actor(registry_addrs=[reg_addr]):
|
wait: float,
|
||||||
|
reg_addr: tuple,
|
||||||
|
) -> list[int]:
|
||||||
|
|
||||||
|
async with tractor.open_root_actor(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
):
|
||||||
with trio.move_on_after(wait):
|
with trio.move_on_after(wait):
|
||||||
return await a_quadruple_example()
|
return await a_quadruple_example()
|
||||||
|
|
||||||
|
|
@ -240,6 +273,10 @@ def time_quad_ex(
|
||||||
ci_env: bool,
|
ci_env: bool,
|
||||||
spawn_backend: str,
|
spawn_backend: str,
|
||||||
):
|
):
|
||||||
|
non_linux: bool = (_sys := platform.system()) != 'Linux'
|
||||||
|
if ci_env and non_linux:
|
||||||
|
pytest.skip(f'Test is too flaky on {_sys!r} in CI')
|
||||||
|
|
||||||
if spawn_backend == 'mp':
|
if spawn_backend == 'mp':
|
||||||
'''
|
'''
|
||||||
no idea but the mp *nix runs are flaking out here often...
|
no idea but the mp *nix runs are flaking out here often...
|
||||||
|
|
@ -247,32 +284,59 @@ def time_quad_ex(
|
||||||
'''
|
'''
|
||||||
pytest.skip("Test is too flaky on mp in CI")
|
pytest.skip("Test is too flaky on mp in CI")
|
||||||
|
|
||||||
timeout = 7 if platform.system() in ('Windows', 'Darwin') else 4
|
timeout = 7 if non_linux else 4
|
||||||
start = time.time()
|
start = time.time()
|
||||||
results = trio.run(cancel_after, timeout, reg_addr)
|
results: list[int] = trio.run(
|
||||||
diff = time.time() - start
|
cancel_after,
|
||||||
|
timeout,
|
||||||
|
reg_addr,
|
||||||
|
)
|
||||||
|
diff: float = time.time() - start
|
||||||
assert results
|
assert results
|
||||||
return results, diff
|
return results, diff
|
||||||
|
|
||||||
|
|
||||||
def test_a_quadruple_example(
|
def test_a_quadruple_example(
|
||||||
time_quad_ex: tuple,
|
time_quad_ex: tuple[list[int], float],
|
||||||
ci_env: bool,
|
ci_env: bool,
|
||||||
spawn_backend: str,
|
spawn_backend: str,
|
||||||
|
test_log: tractor.log.StackLevelAdapter,
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
This also serves as a kind of "we'd like to be this fast test".
|
This also serves as a "we'd like to be this fast" smoke test
|
||||||
|
given past empirical eval of this suite.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
non_linux: bool = (_sys := platform.system()) != 'Linux'
|
||||||
|
|
||||||
|
this_fast_on_linux: float = 3
|
||||||
|
this_fast = (
|
||||||
|
6 if non_linux
|
||||||
|
else this_fast_on_linux
|
||||||
|
)
|
||||||
|
# ^ XXX NOTE,
|
||||||
|
# i've noticed that tweaking the CPU governor setting
|
||||||
|
# to not "always" enable "turbo" mode can result in latency
|
||||||
|
# which causes this limit to be too little. Not sure if it'd
|
||||||
|
# be worth it to adjust the linux value based on reading the
|
||||||
|
# CPU conf from the sys?
|
||||||
|
#
|
||||||
|
# For ex, see the `auto-cpufreq` docs on such settings,
|
||||||
|
# https://github.com/AdnanHodzic/auto-cpufreq?tab=readme-ov-file#example-config-file-contents
|
||||||
|
#
|
||||||
|
# HENCE this below latency-headroom compensation logic..
|
||||||
|
from .conftest import cpu_scaling_factor
|
||||||
|
headroom: float = cpu_scaling_factor()
|
||||||
|
if headroom != 1.:
|
||||||
|
this_fast = this_fast_on_linux * headroom
|
||||||
|
test_log.warning(
|
||||||
|
f'Adding latency headroom on linux bc CPU scaling,\n'
|
||||||
|
f'headroom: {headroom}\n'
|
||||||
|
f'this_fast_on_linux: {this_fast_on_linux} -> {this_fast}\n'
|
||||||
|
)
|
||||||
|
|
||||||
results, diff = time_quad_ex
|
results, diff = time_quad_ex
|
||||||
assert results
|
assert results
|
||||||
this_fast = (
|
|
||||||
6 if platform.system() in (
|
|
||||||
'Windows',
|
|
||||||
'Darwin',
|
|
||||||
)
|
|
||||||
else 3
|
|
||||||
)
|
|
||||||
assert diff < this_fast
|
assert diff < this_fast
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -281,43 +345,58 @@ def test_a_quadruple_example(
|
||||||
list(map(lambda i: i/10, range(3, 9)))
|
list(map(lambda i: i/10, range(3, 9)))
|
||||||
)
|
)
|
||||||
def test_not_fast_enough_quad(
|
def test_not_fast_enough_quad(
|
||||||
reg_addr, time_quad_ex, cancel_delay, ci_env, spawn_backend
|
reg_addr: tuple,
|
||||||
|
time_quad_ex: tuple[list[int], float],
|
||||||
|
cancel_delay: float,
|
||||||
|
ci_env: bool,
|
||||||
|
spawn_backend: str,
|
||||||
):
|
):
|
||||||
"""Verify we can cancel midway through the quad example and all actors
|
'''
|
||||||
cancel gracefully.
|
Verify we can cancel midway through the quad example and all
|
||||||
"""
|
actors cancel gracefully.
|
||||||
|
|
||||||
|
'''
|
||||||
results, diff = time_quad_ex
|
results, diff = time_quad_ex
|
||||||
delay = max(diff - cancel_delay, 0)
|
delay = max(diff - cancel_delay, 0)
|
||||||
results = trio.run(cancel_after, delay, reg_addr)
|
results = trio.run(
|
||||||
system = platform.system()
|
cancel_after,
|
||||||
if system in ('Windows', 'Darwin') and results is not None:
|
delay,
|
||||||
|
reg_addr,
|
||||||
|
)
|
||||||
|
system: str = platform.system()
|
||||||
|
if (
|
||||||
|
system in ('Windows', 'Darwin')
|
||||||
|
and
|
||||||
|
results is not None
|
||||||
|
):
|
||||||
# In CI environments it seems later runs are quicker than the first
|
# In CI environments it seems later runs are quicker than the first
|
||||||
# so just ignore these
|
# so just ignore these
|
||||||
print(f"Woa there {system} caught your breath eh?")
|
print(f'Woa there {system} caught your breath eh?')
|
||||||
else:
|
else:
|
||||||
# should be cancelled mid-streaming
|
# should be cancelled mid-streaming
|
||||||
assert results is None
|
assert results is None
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test(timeout=20)
|
||||||
async def test_respawn_consumer_task(
|
async def test_respawn_consumer_task(
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
spawn_backend,
|
spawn_backend: str,
|
||||||
loglevel,
|
loglevel: str,
|
||||||
):
|
):
|
||||||
"""Verify that ``._portal.ReceiveStream.shield()``
|
'''
|
||||||
|
Verify that ``._portal.ReceiveStream.shield()``
|
||||||
successfully protects the underlying IPC channel from being closed
|
successfully protects the underlying IPC channel from being closed
|
||||||
when cancelling and respawning a consumer task.
|
when cancelling and respawning a consumer task.
|
||||||
|
|
||||||
This also serves to verify that all values from the stream can be
|
This also serves to verify that all values from the stream can be
|
||||||
received despite the respawns.
|
received despite the respawns.
|
||||||
|
|
||||||
"""
|
'''
|
||||||
stream = None
|
stream = None
|
||||||
|
|
||||||
async with tractor.open_nursery() as n:
|
async with tractor.open_nursery() as an:
|
||||||
|
|
||||||
portal = await n.start_actor(
|
portal = await an.start_actor(
|
||||||
name='streamer',
|
name='streamer',
|
||||||
enable_modules=[__name__]
|
enable_modules=[__name__]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
"""
|
"""
|
||||||
Arbiter and "local" actor api
|
Registrar and "local" actor api
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
@ -12,11 +12,11 @@ from tractor._testing import tractor_test
|
||||||
|
|
||||||
@pytest.mark.trio
|
@pytest.mark.trio
|
||||||
async def test_no_runtime():
|
async def test_no_runtime():
|
||||||
"""An arbitter must be established before any nurseries
|
"""A registrar must be established before any nurseries
|
||||||
can be created.
|
can be created.
|
||||||
|
|
||||||
(In other words ``tractor.open_root_actor()`` must be engaged at
|
(In other words ``tractor.open_root_actor()`` must be
|
||||||
some point?)
|
engaged at some point?)
|
||||||
"""
|
"""
|
||||||
with pytest.raises(RuntimeError) :
|
with pytest.raises(RuntimeError) :
|
||||||
async with tractor.find_actor('doggy'):
|
async with tractor.find_actor('doggy'):
|
||||||
|
|
@ -25,9 +25,9 @@ async def test_no_runtime():
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_self_is_registered(reg_addr):
|
async def test_self_is_registered(reg_addr):
|
||||||
"Verify waiting on the arbiter to register itself using the standard api."
|
"Verify waiting on the registrar to register itself using the standard api."
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_arbiter
|
assert actor.is_registrar
|
||||||
with trio.fail_after(0.2):
|
with trio.fail_after(0.2):
|
||||||
async with tractor.wait_for_actor('root') as portal:
|
async with tractor.wait_for_actor('root') as portal:
|
||||||
assert portal.channel.uid[0] == 'root'
|
assert portal.channel.uid[0] == 'root'
|
||||||
|
|
@ -35,11 +35,11 @@ async def test_self_is_registered(reg_addr):
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_self_is_registered_localportal(reg_addr):
|
async def test_self_is_registered_localportal(reg_addr):
|
||||||
"Verify waiting on the arbiter to register itself using a local portal."
|
"Verify waiting on the registrar to register itself using a local portal."
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_arbiter
|
assert actor.is_registrar
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
assert isinstance(portal, tractor._portal.LocalPortal)
|
assert isinstance(portal, tractor.runtime._portal.LocalPortal)
|
||||||
|
|
||||||
with trio.fail_after(0.2):
|
with trio.fail_after(0.2):
|
||||||
sockaddr = await portal.run_from_ns(
|
sockaddr = await portal.run_from_ns(
|
||||||
|
|
@ -57,8 +57,8 @@ def test_local_actor_async_func(reg_addr):
|
||||||
async with tractor.open_root_actor(
|
async with tractor.open_root_actor(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
):
|
):
|
||||||
# arbiter is started in-proc if dne
|
# registrar is started in-proc if dne
|
||||||
assert tractor.current_actor().is_arbiter
|
assert tractor.current_actor().is_registrar
|
||||||
|
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
nums.append(i)
|
nums.append(i)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,185 @@
|
||||||
|
'''
|
||||||
|
`tractor.log`-wrapping unit tests.
|
||||||
|
|
||||||
|
'''
|
||||||
|
from pathlib import Path
|
||||||
|
import shutil
|
||||||
|
from types import ModuleType
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import tractor
|
||||||
|
from tractor import (
|
||||||
|
_code_load,
|
||||||
|
log,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_root_pkg_not_duplicated_in_logger_name():
|
||||||
|
'''
|
||||||
|
When both `pkg_name` and `name` are passed and they have
|
||||||
|
a common `<root_name>.< >` prefix, ensure that it is not
|
||||||
|
duplicated in the child's `StackLevelAdapter.name: str`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
project_name: str = 'pylib'
|
||||||
|
pkg_path: str = 'pylib.subpkg.mod'
|
||||||
|
|
||||||
|
assert not tractor.current_actor(
|
||||||
|
err_on_no_runtime=False,
|
||||||
|
)
|
||||||
|
proj_log = log.get_logger(
|
||||||
|
pkg_name=project_name,
|
||||||
|
mk_sublog=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
sublog = log.get_logger(
|
||||||
|
pkg_name=project_name,
|
||||||
|
name=pkg_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert proj_log is not sublog
|
||||||
|
assert sublog.name.count(proj_log.name) == 1
|
||||||
|
assert 'mod' not in sublog.name
|
||||||
|
|
||||||
|
|
||||||
|
def test_implicit_mod_name_applied_for_child(
|
||||||
|
testdir: pytest.Pytester,
|
||||||
|
loglevel: str,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Verify that when `.log.get_logger(pkg_name='pylib')` is called
|
||||||
|
from a given sub-mod from within the `pylib` pkg-path, we
|
||||||
|
implicitly set the equiv of `name=__name__` from the caller's
|
||||||
|
module.
|
||||||
|
|
||||||
|
'''
|
||||||
|
# tractor.log.get_console_log(level=loglevel)
|
||||||
|
proj_name: str = 'snakelib'
|
||||||
|
mod_code: str = (
|
||||||
|
f'import tractor\n'
|
||||||
|
f'\n'
|
||||||
|
# if you need to trace `testdir` stuff @ import-time..
|
||||||
|
# f'breakpoint()\n'
|
||||||
|
f'log = tractor.log.get_logger(pkg_name="{proj_name}")\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
# create a sub-module for each pkg layer
|
||||||
|
_lib = testdir.mkpydir(proj_name)
|
||||||
|
pkg: Path = Path(_lib)
|
||||||
|
pkg_init_mod: Path = pkg / "__init__.py"
|
||||||
|
pkg_init_mod.write_text(mod_code)
|
||||||
|
|
||||||
|
subpkg: Path = pkg / 'subpkg'
|
||||||
|
subpkg.mkdir()
|
||||||
|
subpkgmod: Path = subpkg / "__init__.py"
|
||||||
|
subpkgmod.touch()
|
||||||
|
subpkgmod.write_text(mod_code)
|
||||||
|
|
||||||
|
_submod: Path = testdir.makepyfile(
|
||||||
|
_mod=mod_code,
|
||||||
|
)
|
||||||
|
|
||||||
|
pkg_submod = pkg / 'mod.py'
|
||||||
|
pkg_subpkg_submod = subpkg / 'submod.py'
|
||||||
|
shutil.copyfile(
|
||||||
|
_submod,
|
||||||
|
pkg_submod,
|
||||||
|
)
|
||||||
|
shutil.copyfile(
|
||||||
|
_submod,
|
||||||
|
pkg_subpkg_submod,
|
||||||
|
)
|
||||||
|
testdir.chdir()
|
||||||
|
# NOTE, to introspect the py-file-module-layout use (in .xsh
|
||||||
|
# syntax): `ranger @str(testdir)`
|
||||||
|
|
||||||
|
# XXX NOTE, once the "top level" pkg mod has been
|
||||||
|
# imported, we can then use `import` syntax to
|
||||||
|
# import its sub-pkgs and modules.
|
||||||
|
subpkgmod: ModuleType = _code_load.load_module_from_path(
|
||||||
|
Path(pkg / '__init__.py'),
|
||||||
|
module_name=proj_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
pkg_root_log = log.get_logger(
|
||||||
|
pkg_name=proj_name,
|
||||||
|
mk_sublog=False,
|
||||||
|
)
|
||||||
|
# the top level pkg-mod, created just now,
|
||||||
|
# by above API call.
|
||||||
|
assert pkg_root_log.name == proj_name
|
||||||
|
assert not pkg_root_log.logger.getChildren()
|
||||||
|
#
|
||||||
|
# ^TODO! test this same output but created via a `get_logger()`
|
||||||
|
# call in the `snakelib/__init__.py`!!
|
||||||
|
|
||||||
|
# NOTE, the pkg-level "init mod" should of course
|
||||||
|
# have the same name as the package ns-path.
|
||||||
|
import snakelib as init_mod
|
||||||
|
assert init_mod.log.name == proj_name
|
||||||
|
|
||||||
|
# NOTE, a first-pkg-level sub-module should only
|
||||||
|
# use the package-name since the leaf-node-module
|
||||||
|
# will be included in log headers by default.
|
||||||
|
from snakelib import mod
|
||||||
|
assert mod.log.name == proj_name
|
||||||
|
|
||||||
|
from snakelib import subpkg
|
||||||
|
assert (
|
||||||
|
subpkg.log.name
|
||||||
|
==
|
||||||
|
subpkg.__package__
|
||||||
|
==
|
||||||
|
f'{proj_name}.subpkg'
|
||||||
|
)
|
||||||
|
|
||||||
|
from snakelib.subpkg import submod
|
||||||
|
assert (
|
||||||
|
submod.log.name
|
||||||
|
==
|
||||||
|
submod.__package__
|
||||||
|
==
|
||||||
|
f'{proj_name}.subpkg'
|
||||||
|
)
|
||||||
|
|
||||||
|
sub_logs = pkg_root_log.logger.getChildren()
|
||||||
|
assert len(sub_logs) == 1 # only one nested sub-pkg module
|
||||||
|
assert submod.log.logger in sub_logs
|
||||||
|
|
||||||
|
|
||||||
|
# TODO, moar tests against existing feats:
|
||||||
|
# ------ - ------
|
||||||
|
# - [ ] color settings?
|
||||||
|
# - [ ] header contents like,
|
||||||
|
# - actor + thread + task names from various conc-primitives,
|
||||||
|
# - [ ] `StackLevelAdapter` extensions,
|
||||||
|
# - our custom levels/methods: `transport|runtime|cancel|pdb|devx`
|
||||||
|
# - [ ] custom-headers support?
|
||||||
|
#
|
||||||
|
|
||||||
|
# TODO, test driven dev of new-ideas/long-wanted feats,
|
||||||
|
# ------ - ------
|
||||||
|
# - [ ] https://github.com/goodboy/tractor/issues/244
|
||||||
|
# - [ ] @catern mentioned using a sync / deterministic sys
|
||||||
|
# and in particular `svlogd`?
|
||||||
|
# |_ https://smarden.org/runit/svlogd.8
|
||||||
|
|
||||||
|
# - [ ] using adapter vs. filters?
|
||||||
|
# - https://stackoverflow.com/questions/60691759/add-information-to-every-log-message-in-python-logging/61830838#61830838
|
||||||
|
|
||||||
|
# - [ ] `.at_least_level()` optimization which short circuits wtv
|
||||||
|
# `logging` is doing behind the scenes when the level filters
|
||||||
|
# the emission..?
|
||||||
|
|
||||||
|
# - [ ] use of `.log.get_console_log()` in subactors and the
|
||||||
|
# subtleties of ensuring it actually emits from a subproc.
|
||||||
|
|
||||||
|
# - [ ] this idea of activating per-subsys emissions with some
|
||||||
|
# kind of `.name` filter passed to the runtime or maybe configured
|
||||||
|
# via the root `StackLevelAdapter`?
|
||||||
|
|
||||||
|
# - [ ] use of `logging.dict.dictConfig()` to simplify the impl
|
||||||
|
# of any of ^^ ??
|
||||||
|
# - https://stackoverflow.com/questions/7507825/where-is-a-complete-example-of-logging-config-dictconfig
|
||||||
|
# - https://docs.python.org/3/library/logging.config.html#configuration-dictionary-schema
|
||||||
|
# - https://docs.python.org/3/library/logging.config.html#logging.config.dictConfig
|
||||||
|
|
@ -1,8 +1,13 @@
|
||||||
"""
|
"""
|
||||||
Multiple python programs invoking the runtime.
|
Multiple python programs invoking the runtime.
|
||||||
"""
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
import platform
|
import platform
|
||||||
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
|
)
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import trio
|
import trio
|
||||||
|
|
@ -10,14 +15,32 @@ import tractor
|
||||||
from tractor._testing import (
|
from tractor._testing import (
|
||||||
tractor_test,
|
tractor_test,
|
||||||
)
|
)
|
||||||
|
from tractor import (
|
||||||
|
current_actor,
|
||||||
|
Actor,
|
||||||
|
Context,
|
||||||
|
Portal,
|
||||||
|
)
|
||||||
|
from tractor.runtime import _state
|
||||||
from .conftest import (
|
from .conftest import (
|
||||||
sig_prog,
|
sig_prog,
|
||||||
_INT_SIGNAL,
|
_INT_SIGNAL,
|
||||||
_INT_RETURN_CODE,
|
_INT_RETURN_CODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from tractor.msg import Aid
|
||||||
|
from tractor.discovery._addr import (
|
||||||
|
UnwrappedAddress,
|
||||||
|
)
|
||||||
|
|
||||||
def test_abort_on_sigint(daemon):
|
|
||||||
|
_non_linux: bool = platform.system() != 'Linux'
|
||||||
|
|
||||||
|
|
||||||
|
def test_abort_on_sigint(
|
||||||
|
daemon: subprocess.Popen,
|
||||||
|
):
|
||||||
assert daemon.returncode is None
|
assert daemon.returncode is None
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
sig_prog(daemon, _INT_SIGNAL)
|
sig_prog(daemon, _INT_SIGNAL)
|
||||||
|
|
@ -30,39 +53,131 @@ def test_abort_on_sigint(daemon):
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_cancel_remote_arbiter(daemon, reg_addr):
|
async def test_cancel_remote_registrar(
|
||||||
assert not tractor.current_actor().is_arbiter
|
daemon: subprocess.Popen,
|
||||||
|
reg_addr: UnwrappedAddress,
|
||||||
|
):
|
||||||
|
assert not current_actor().is_registrar
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
await portal.cancel_actor()
|
await portal.cancel_actor()
|
||||||
|
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
# the arbiter channel server is cancelled but not its main task
|
# the registrar channel server is cancelled but not its main task
|
||||||
assert daemon.returncode is None
|
assert daemon.returncode is None
|
||||||
|
|
||||||
# no arbiter socket should exist
|
# no registrar socket should exist
|
||||||
with pytest.raises(OSError):
|
with pytest.raises(OSError):
|
||||||
async with tractor.get_registry(reg_addr) as portal:
|
async with tractor.get_registry(reg_addr) as portal:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def test_register_duplicate_name(daemon, reg_addr):
|
def test_register_duplicate_name(
|
||||||
|
daemon: subprocess.Popen,
|
||||||
|
reg_addr: UnwrappedAddress,
|
||||||
|
):
|
||||||
async def main():
|
async def main():
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
registry_addrs=[reg_addr],
|
registry_addrs=[reg_addr],
|
||||||
) as n:
|
) as an:
|
||||||
|
|
||||||
assert not tractor.current_actor().is_arbiter
|
assert not current_actor().is_registrar
|
||||||
|
|
||||||
p1 = await n.start_actor('doggy')
|
p1 = await an.start_actor('doggy')
|
||||||
p2 = await n.start_actor('doggy')
|
p2 = await an.start_actor('doggy')
|
||||||
|
|
||||||
async with tractor.wait_for_actor('doggy') as portal:
|
async with tractor.wait_for_actor('doggy') as portal:
|
||||||
assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)
|
assert portal.channel.uid in (p2.channel.uid, p1.channel.uid)
|
||||||
|
|
||||||
await n.cancel()
|
await an.cancel()
|
||||||
|
|
||||||
# run it manually since we want to start **after**
|
# XXX, run manually since we want to start this root **after**
|
||||||
# the other "daemon" program
|
# the other "daemon" program with its own root.
|
||||||
|
trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
|
@tractor.context
|
||||||
|
async def get_root_portal(
|
||||||
|
ctx: Context,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Connect back to the root actor manually (using `._discovery` API)
|
||||||
|
and ensure its contact info is the same as our immediate parent.
|
||||||
|
|
||||||
|
'''
|
||||||
|
sub: Actor = current_actor()
|
||||||
|
rtvs: dict = _state._runtime_vars
|
||||||
|
raddrs: list[UnwrappedAddress] = rtvs['_root_addrs']
|
||||||
|
|
||||||
|
# await tractor.pause()
|
||||||
|
# XXX, in case the sub->root discovery breaks you might need
|
||||||
|
# this (i know i did Xp)!!
|
||||||
|
# from tractor.devx import mk_pdb
|
||||||
|
# mk_pdb().set_trace()
|
||||||
|
|
||||||
|
assert (
|
||||||
|
len(raddrs) == 1
|
||||||
|
and
|
||||||
|
list(sub._parent_chan.raddr.unwrap()) in raddrs
|
||||||
|
)
|
||||||
|
|
||||||
|
# connect back to our immediate parent which should also
|
||||||
|
# be the actor-tree's root.
|
||||||
|
from tractor.discovery._api import get_root
|
||||||
|
ptl: Portal
|
||||||
|
async with get_root() as ptl:
|
||||||
|
root_aid: Aid = ptl.chan.aid
|
||||||
|
parent_ptl: Portal = current_actor().get_parent()
|
||||||
|
assert (
|
||||||
|
root_aid.name == 'root'
|
||||||
|
and
|
||||||
|
parent_ptl.chan.aid == root_aid
|
||||||
|
)
|
||||||
|
await ctx.started()
|
||||||
|
|
||||||
|
|
||||||
|
def test_non_registrar_spawns_child(
|
||||||
|
daemon: subprocess.Popen,
|
||||||
|
reg_addr: UnwrappedAddress,
|
||||||
|
loglevel: str,
|
||||||
|
debug_mode: bool,
|
||||||
|
ci_env: bool,
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Ensure a non-registrar (serving) root actor can spawn a sub and
|
||||||
|
that sub can connect back (manually) to its parent that is the
|
||||||
|
root without issue.
|
||||||
|
|
||||||
|
More or less this audits the global contact info in
|
||||||
|
`._state._runtime_vars`.
|
||||||
|
|
||||||
|
'''
|
||||||
|
async def main():
|
||||||
|
|
||||||
|
# XXX, since apparently on macos in GH's CI it can be a race
|
||||||
|
# with the `daemon` registrar on grabbing the socket-addr..
|
||||||
|
if ci_env and _non_linux:
|
||||||
|
await trio.sleep(.5)
|
||||||
|
|
||||||
|
async with tractor.open_nursery(
|
||||||
|
registry_addrs=[reg_addr],
|
||||||
|
loglevel=loglevel,
|
||||||
|
debug_mode=debug_mode,
|
||||||
|
) as an:
|
||||||
|
|
||||||
|
actor: Actor = tractor.current_actor()
|
||||||
|
assert not actor.is_registrar
|
||||||
|
sub_ptl: Portal = await an.start_actor(
|
||||||
|
name='sub',
|
||||||
|
enable_modules=[__name__],
|
||||||
|
)
|
||||||
|
|
||||||
|
async with sub_ptl.open_context(
|
||||||
|
get_root_portal,
|
||||||
|
) as (ctx, _):
|
||||||
|
print('Waiting for `sub` to connect back to us..')
|
||||||
|
|
||||||
|
await an.cancel()
|
||||||
|
|
||||||
|
# XXX, run manually since we want to start this root **after**
|
||||||
|
# the other "daemon" program with it's own root.
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,14 @@ import tractor
|
||||||
from tractor.experimental import msgpub
|
from tractor.experimental import msgpub
|
||||||
from tractor._testing import tractor_test
|
from tractor._testing import tractor_test
|
||||||
|
|
||||||
|
# Module-wide mark: skip every test in this module when the suite is
# run with the 'subint' spawn backend.
# NOTE(review): `skipon_spawn_backend` is not a builtin `pytest` mark;
# presumably it is registered by the project's `conftest.py` - confirm.
pytestmark = pytest.mark.skipon_spawn_backend(
    'subint',
    reason=(
        'XXX SUBINT HANGING TEST XXX\n'
        'See oustanding issue(s)\n'
        # TODO, put issue link!
    )
)
|
||||||
|
|
||||||
def test_type_checks():
|
def test_type_checks():
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,333 @@
|
||||||
|
'''
|
||||||
|
Verify that externally registered remote actor error
|
||||||
|
types are correctly relayed, boxed, and re-raised across
|
||||||
|
IPC actor hops via `reg_err_types()`.
|
||||||
|
|
||||||
|
Also ensure that when custom error types are NOT registered
|
||||||
|
the framework indicates the lookup failure to the user.
|
||||||
|
|
||||||
|
'''
|
||||||
|
import pytest
|
||||||
|
import trio
|
||||||
|
import tractor
|
||||||
|
from tractor import (
|
||||||
|
Context,
|
||||||
|
Portal,
|
||||||
|
RemoteActorError,
|
||||||
|
)
|
||||||
|
from tractor._exceptions import (
|
||||||
|
get_err_type,
|
||||||
|
reg_err_types,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# -- custom app-level errors for testing --
|
||||||
|
class CustomAppError(Exception):
    '''
    A hypothetical user-app error that should be
    boxed+relayed by `tractor` IPC when registered.

    Raised by the `raise_custom_err()` remote endpoint and
    registered (via `reg_err_types()`) by the tests in this module
    which expect the type to be resolvable on the receiving side.

    '''
|
||||||
|
|
||||||
|
|
||||||
|
class AnotherAppError(Exception):
    '''
    A second custom error for multi-type registration.

    Raised by the `raise_another_err()` remote endpoint.

    '''
|
||||||
|
|
||||||
|
|
||||||
|
class UnregisteredAppError(Exception):
    '''
    A custom error that is intentionally NEVER
    registered via `reg_err_types()` so we can
    verify the framework's failure indication.

    Raised by the `raise_unreg_err()` remote endpoint; do NOT pass
    this type to `reg_err_types()` anywhere in this module.

    '''
|
||||||
|
|
||||||
|
|
||||||
|
# -- remote-task endpoints --
|
||||||
|
@tractor.context
async def raise_custom_err(
    ctx: Context,
) -> None:
    '''
    Remote ep that raises a `CustomAppError`
    after sync-ing with the caller.

    `ctx.started()` is called with no value, so the caller's
    `Portal.open_context()` entry receives a falsy "first" value
    before the error is raised.

    '''
    await ctx.started()
    raise CustomAppError(
        'the app exploded remotely'
    )
|
||||||
|
|
||||||
|
|
||||||
|
@tractor.context
async def raise_another_err(
    ctx: Context,
) -> None:
    '''
    Remote ep that raises `AnotherAppError`.

    Syncs with the caller via a value-less `ctx.started()` first,
    then raises.

    '''
    await ctx.started()
    raise AnotherAppError(
        'another app-level kaboom'
    )
|
||||||
|
|
||||||
|
|
||||||
|
@tractor.context
async def raise_unreg_err(
    ctx: Context,
) -> None:
    '''
    Remote ep that raises an `UnregisteredAppError`
    which has NOT been `reg_err_types()`-registered.

    Syncs with the caller via a value-less `ctx.started()` first,
    then raises.

    '''
    await ctx.started()
    raise UnregisteredAppError(
        'this error type is unknown to tractor'
    )
|
||||||
|
|
||||||
|
|
||||||
|
# -- unit tests for the type-registry plumbing --
|
||||||
|
|
||||||
|
class TestRegErrTypesPlumbing:
    '''
    Low-level checks on `reg_err_types()` and
    `get_err_type()` without requiring IPC.

    '''

    def test_unregistered_type_returns_none(self):
        '''
        An unregistered custom error name should yield
        `None` from `get_err_type()`.

        '''
        # NOTE: probe with the type that is *never* registered
        # (instead of `CustomAppError`, which sibling tests in this
        # module do register) so this check does not depend on
        # test execution order within the process.
        result = get_err_type('UnregisteredAppError')
        assert result is None

    def test_register_and_lookup(self):
        '''
        After `reg_err_types()`, the custom type should
        be discoverable via `get_err_type()`.

        '''
        reg_err_types([CustomAppError])
        result = get_err_type('CustomAppError')
        assert result is CustomAppError

    def test_register_multiple_types(self):
        '''
        Registering a list of types should make each
        one individually resolvable.

        '''
        reg_err_types([
            CustomAppError,
            AnotherAppError,
        ])
        assert (
            get_err_type('CustomAppError')
            is CustomAppError
        )
        assert (
            get_err_type('AnotherAppError')
            is AnotherAppError
        )

    def test_builtin_types_always_resolve(self):
        '''
        Builtin error types like `RuntimeError` and
        `ValueError` should always be found without
        any prior registration.

        '''
        assert (
            get_err_type('RuntimeError')
            is RuntimeError
        )
        assert (
            get_err_type('ValueError')
            is ValueError
        )

    def test_tractor_native_types_resolve(self):
        '''
        `tractor`-internal exc types (e.g.
        `ContextCancelled`) should always resolve.

        '''
        assert (
            get_err_type('ContextCancelled')
            is tractor.ContextCancelled
        )

    def test_boxed_type_str_without_ipc_msg(self):
        '''
        When a `RemoteActorError` is constructed
        without an IPC msg (and no resolvable type),
        `.boxed_type_str` should return `'<unknown>'`.

        '''
        rae = RemoteActorError('test')
        assert rae.boxed_type_str == '<unknown>'
|
||||||
|
|
||||||
|
|
||||||
|
# -- IPC-level integration tests --
|
||||||
|
|
||||||
|
def test_registered_custom_err_relayed(
    debug_mode: bool,
    tpt_proto: str,
):
    '''
    When a custom error type is registered via
    `reg_err_types()` on BOTH sides of an IPC dialog,
    the parent should receive a `RemoteActorError`
    whose `.boxed_type` matches the original custom
    error type.

    The boxed error is checked twice: once in-task right where
    `ctx.wait_for_result()` raises, and again on the exception
    which propagates out of `trio.run()`.

    '''
    reg_err_types([CustomAppError])

    async def main():
        async with tractor.open_nursery(
            debug_mode=debug_mode,
            enable_transports=[tpt_proto],
        ) as an:
            ptl: Portal = await an.start_actor(
                'custom-err-raiser',
                enable_modules=[__name__],
            )
            async with ptl.open_context(
                raise_custom_err,
            ) as (ctx, sent):
                # the remote ep calls `ctx.started()` with no value.
                assert not sent
                try:
                    await ctx.wait_for_result()
                except RemoteActorError as rae:
                    assert rae.boxed_type is CustomAppError
                    assert rae.src_type is CustomAppError
                    assert 'the app exploded remotely' in str(
                        rae.tb_str
                    )
                    # re-raise so the outer `pytest.raises()`
                    # also sees the relayed error.
                    raise

    with pytest.raises(RemoteActorError) as excinfo:
        trio.run(main)

    rae = excinfo.value
    assert rae.boxed_type is CustomAppError
|
||||||
|
|
||||||
|
|
||||||
|
def test_registered_another_err_relayed(
    debug_mode: bool,
    tpt_proto: str,
):
    '''
    Same as `test_registered_custom_err_relayed()` but for
    a different custom error type to verify multi-type
    registration works end-to-end over IPC.

    '''
    reg_err_types([AnotherAppError])

    async def main():
        async with tractor.open_nursery(
            debug_mode=debug_mode,
            enable_transports=[tpt_proto],
        ) as an:
            ptl: Portal = await an.start_actor(
                'another-err-raiser',
                enable_modules=[__name__],
            )
            async with ptl.open_context(
                raise_another_err,
            ) as (ctx, sent):
                # the remote ep calls `ctx.started()` with no value.
                assert not sent
                try:
                    await ctx.wait_for_result()
                except RemoteActorError as rae:
                    assert (
                        rae.boxed_type
                        is AnotherAppError
                    )
                    # re-raise so the outer `pytest.raises()`
                    # also sees the relayed error.
                    raise

            await an.cancel()

    with pytest.raises(RemoteActorError) as excinfo:
        trio.run(main)

    rae = excinfo.value
    assert rae.boxed_type is AnotherAppError
|
||||||
|
|
||||||
|
|
||||||
|
def test_unregistered_err_still_relayed(
    debug_mode: bool,
    tpt_proto: str,
):
    '''
    Verify that even when a custom error type is NOT registered via
    `reg_err_types()`, the remote error is still relayed as
    a `RemoteActorError` with all string-level info preserved
    (traceback, type name, source actor uid).

    The `.boxed_type` will be `None` (type obj can't be resolved) but
    `.boxed_type_str` and `.src_type_str` still report the original
    type name from the IPC msg.

    This documents the expected limitation: without `reg_err_types()`
    the `.boxed_type` property can NOT resolve to the original Python
    type.

    '''
    # NOTE: intentionally do NOT call
    # `reg_err_types([UnregisteredAppError])`

    async def main():
        async with tractor.open_nursery(
            debug_mode=debug_mode,
            enable_transports=[tpt_proto],
        ) as an:
            ptl: Portal = await an.start_actor(
                'unreg-err-raiser',
                enable_modules=[__name__],
            )
            async with ptl.open_context(
                raise_unreg_err,
            ) as (ctx, sent):
                # the remote ep calls `ctx.started()` with no value.
                assert not sent
                await ctx.wait_for_result()

            await an.cancel()

    with pytest.raises(RemoteActorError) as excinfo:
        trio.run(main)

    rae = excinfo.value

    # the error IS relayed even without
    # registration; type obj is unresolvable but
    # all string-level info is preserved.
    assert rae.boxed_type is None  # NOT `UnregisteredAppError`
    assert rae.src_type is None

    # string names survive the IPC round-trip
    # via the `Error` msg fields.
    assert (
        rae.src_type_str
        ==
        'UnregisteredAppError'
    )
    assert (
        rae.boxed_type_str
        ==
        'UnregisteredAppError'
    )

    # original traceback content is preserved
    assert 'this error type is unknown' in rae.tb_str
    assert 'UnregisteredAppError' in rae.tb_str
|
||||||
|
|
@ -12,14 +12,14 @@ import trio
|
||||||
import tractor
|
import tractor
|
||||||
from tractor.trionics import (
|
from tractor.trionics import (
|
||||||
maybe_open_context,
|
maybe_open_context,
|
||||||
|
collapse_eg,
|
||||||
)
|
)
|
||||||
from tractor.log import (
|
from tractor.log import (
|
||||||
get_console_log,
|
get_console_log,
|
||||||
get_logger,
|
get_logger,
|
||||||
)
|
)
|
||||||
log = get_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
|
log = get_logger()
|
||||||
|
|
||||||
_resource: int = 0
|
_resource: int = 0
|
||||||
|
|
||||||
|
|
@ -213,9 +213,12 @@ def test_open_local_sub_to_stream(
|
||||||
N local tasks using `trionics.maybe_open_context()`.
|
N local tasks using `trionics.maybe_open_context()`.
|
||||||
|
|
||||||
'''
|
'''
|
||||||
timeout: float = 3.6
|
from .conftest import cpu_scaling_factor
|
||||||
if platform.system() == "Windows":
|
timeout: float = (
|
||||||
timeout: float = 10
|
4
|
||||||
|
if not platform.system() == "Windows"
|
||||||
|
else 10
|
||||||
|
) * cpu_scaling_factor()
|
||||||
|
|
||||||
if debug_mode:
|
if debug_mode:
|
||||||
timeout = 999
|
timeout = 999
|
||||||
|
|
@ -319,7 +322,7 @@ def test_open_local_sub_to_stream(
|
||||||
|
|
||||||
|
|
||||||
@acm
|
@acm
|
||||||
async def cancel_outer_cs(
|
async def maybe_cancel_outer_cs(
|
||||||
cs: trio.CancelScope|None = None,
|
cs: trio.CancelScope|None = None,
|
||||||
delay: float = 0,
|
delay: float = 0,
|
||||||
):
|
):
|
||||||
|
|
@ -333,12 +336,31 @@ async def cancel_outer_cs(
|
||||||
if cs:
|
if cs:
|
||||||
log.info('task calling cs.cancel()')
|
log.info('task calling cs.cancel()')
|
||||||
cs.cancel()
|
cs.cancel()
|
||||||
trio.lowlevel.checkpoint()
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
if cs:
|
||||||
await trio.sleep_forever()
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
# XXX, if not cancelled we'll leak this inf-blocking
|
||||||
|
# subtask to the actor's service tn..
|
||||||
|
else:
|
||||||
|
await trio.lowlevel.checkpoint()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'delay',
|
||||||
|
[0.05, 0.5, 1],
|
||||||
|
ids="pre_sleep_delay={}".format,
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'cancel_by_cs',
|
||||||
|
[True, False],
|
||||||
|
ids="cancel_by_cs={}".format,
|
||||||
|
)
|
||||||
def test_lock_not_corrupted_on_fast_cancel(
|
def test_lock_not_corrupted_on_fast_cancel(
|
||||||
|
delay: float,
|
||||||
|
cancel_by_cs: bool,
|
||||||
debug_mode: bool,
|
debug_mode: bool,
|
||||||
loglevel: str,
|
loglevel: str,
|
||||||
):
|
):
|
||||||
|
|
@ -355,17 +377,14 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
due to it having erronously exited without calling
|
due to it having erronously exited without calling
|
||||||
`lock.release()`.
|
`lock.release()`.
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
delay: float = 1.
|
|
||||||
|
|
||||||
async def use_moc(
|
async def use_moc(
|
||||||
cs: trio.CancelScope|None,
|
|
||||||
delay: float,
|
delay: float,
|
||||||
|
cs: trio.CancelScope|None = None,
|
||||||
):
|
):
|
||||||
log.info('task entering moc')
|
log.info('task entering moc')
|
||||||
async with maybe_open_context(
|
async with maybe_open_context(
|
||||||
cancel_outer_cs,
|
maybe_cancel_outer_cs,
|
||||||
kwargs={
|
kwargs={
|
||||||
'cs': cs,
|
'cs': cs,
|
||||||
'delay': delay,
|
'delay': delay,
|
||||||
|
|
@ -376,8 +395,14 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
else:
|
else:
|
||||||
log.info('1st task entered')
|
log.info('1st task entered')
|
||||||
|
|
||||||
|
if cs:
|
||||||
await trio.sleep_forever()
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
else:
|
||||||
|
await trio.sleep(delay)
|
||||||
|
|
||||||
|
# ^END, exit shared ctx.
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
with trio.fail_after(delay + 2):
|
with trio.fail_after(delay + 2):
|
||||||
async with (
|
async with (
|
||||||
|
|
@ -385,6 +410,7 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
debug_mode=debug_mode,
|
debug_mode=debug_mode,
|
||||||
loglevel=loglevel,
|
loglevel=loglevel,
|
||||||
),
|
),
|
||||||
|
# ?TODO, pass this as the parent tn?
|
||||||
trio.open_nursery() as tn,
|
trio.open_nursery() as tn,
|
||||||
):
|
):
|
||||||
get_console_log('info')
|
get_console_log('info')
|
||||||
|
|
@ -392,15 +418,206 @@ def test_lock_not_corrupted_on_fast_cancel(
|
||||||
cs = tn.cancel_scope
|
cs = tn.cancel_scope
|
||||||
tn.start_soon(
|
tn.start_soon(
|
||||||
use_moc,
|
use_moc,
|
||||||
cs,
|
|
||||||
delay,
|
delay,
|
||||||
|
cs if cancel_by_cs else None,
|
||||||
name='child',
|
name='child',
|
||||||
)
|
)
|
||||||
with trio.CancelScope() as rent_cs:
|
with trio.CancelScope() as rent_cs:
|
||||||
await use_moc(
|
await use_moc(
|
||||||
cs=rent_cs,
|
|
||||||
delay=delay,
|
delay=delay,
|
||||||
|
cs=rent_cs if cancel_by_cs else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
|
@acm
async def acm_with_resource(resource_id: str):
    '''
    Yield `resource_id` as the cached value.

    Used to verify per-`ctx_key` isolation when the same
    `acm_func` is called with different kwargs.

    There is no setup or teardown work; the value handed in is
    exactly the value yielded to the `async with` body.

    '''
    yield resource_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_per_ctx_key_resource_lifecycle(
    debug_mode: bool,
    loglevel: str,
):
    '''
    Verify that `maybe_open_context()` correctly isolates resource
    lifecycle **per `ctx_key`** when the same `acm_func` is called
    with different kwargs.

    Previously `_Cache.users` was a single global `int` and
    `_Cache.locks` was keyed on `fid` (function ID), so calling
    the same `acm_func` with different kwargs (producing different
    `ctx_key`s) meant:

    - teardown for one key was skipped bc the *other* key's users
      kept the global count > 0,
    - and re-entry could hit the old
      `assert not resources.get(ctx_key)` crash during the
      teardown window.

    This was the root cause of a long-standing bug in piker's
    `brokerd.kraken` backend.

    '''
    # effectively disable the deadline when a human may be sitting
    # in a debugger REPL.
    timeout: float = 6
    if debug_mode:
        timeout = 999

    async def main():
        a_ready = trio.Event()
        a_exit = trio.Event()

        async def hold_resource_a():
            '''
            Open resource 'a' and keep it alive until signalled.

            '''
            async with maybe_open_context(
                acm_with_resource,
                kwargs={'resource_id': 'a'},
            ) as (cache_hit, value):
                assert not cache_hit
                assert value == 'a'
                log.info("resource 'a' entered (holding)")
                a_ready.set()
                await a_exit.wait()
                log.info("resource 'a' exiting")

        with trio.fail_after(timeout):
            async with (
                tractor.open_root_actor(
                    debug_mode=debug_mode,
                    loglevel=loglevel,
                ),
                trio.open_nursery() as tn,
            ):
                # Phase 1: bg task holds resource 'a' open.
                tn.start_soon(hold_resource_a)
                await a_ready.wait()

                # Phase 2: open resource 'b' (different kwargs,
                # same acm_func) then exit it while 'a' is still
                # alive.
                async with maybe_open_context(
                    acm_with_resource,
                    kwargs={'resource_id': 'b'},
                ) as (cache_hit, value):
                    assert not cache_hit
                    assert value == 'b'
                    log.info("resource 'b' entered")

                log.info("resource 'b' exited, waiting for teardown")
                await trio.lowlevel.checkpoint()

                # Phase 3: re-open 'b'; must be a fresh cache MISS
                # proving 'b' was torn down independently of 'a'.
                #
                # With the old global `_Cache.users` counter this
                # would be a stale cache HIT (leaked resource) or
                # trigger `assert not resources.get(ctx_key)`.
                async with maybe_open_context(
                    acm_with_resource,
                    kwargs={'resource_id': 'b'},
                ) as (cache_hit, value):
                    assert not cache_hit, (
                        "resource 'b' was NOT torn down despite "
                        "having zero users! (global user count bug)"
                    )
                    assert value == 'b'
                    log.info(
                        "resource 'b' re-entered "
                        "(cache miss, correct)"
                    )

                # Phase 4: let 'a' exit, clean shutdown.
                a_exit.set()

    trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
|
def test_moc_reentry_during_teardown(
    debug_mode: bool,
    loglevel: str,
):
    '''
    Reproduce the piker `open_cached_client('kraken')` race:

    - same `acm_func`, NO kwargs (identical `ctx_key`)
    - multiple tasks share the cached resource
    - all users exit -> teardown starts
    - a NEW task enters during `_Cache.run_ctx.__aexit__`
    - `values[ctx_key]` is gone (popped in inner finally)
      but `resources[ctx_key]` still exists (outer finally
      hasn't run yet bc the acm cleanup has checkpoints)
    - old code: `assert not resources.get(ctx_key)` FIRES

    This models the real-world scenario where `brokerd.kraken`
    tasks concurrently call `open_cached_client('kraken')`
    (same `acm_func`, empty kwargs, shared `ctx_key`) and
    the teardown/re-entry race triggers intermittently.

    '''
    # same convention as the sibling tests in this module:
    # effectively disable the deadline when a human may be sitting
    # in a debugger REPL.
    timeout: float = 5
    if debug_mode:
        timeout = 999

    async def main():
        in_aexit = trio.Event()

        @acm
        async def cached_client():
            '''
            Simulates `kraken.api.get_client()`:
            - no params (all callers share one `ctx_key`)
            - slow-ish cleanup to widen the race window
              between `values.pop()` and `resources.pop()`
              inside `_Cache.run_ctx`.

            '''
            yield 'the-client'
            # Signal that we're in __aexit__ — at this
            # point `values` has already been popped by
            # `run_ctx`'s inner finally, but `resources`
            # is still alive (outer finally hasn't run).
            in_aexit.set()
            # NOTE(review): this sleep is longer than the default
            # `fail_after()` budget below, so the cleanup is
            # presumably expected to be cancelled rather than
            # awaited to completion - confirm against
            # `maybe_open_context()`'s teardown semantics.
            await trio.sleep(10)

        first_done = trio.Event()

        async def use_and_exit():
            async with maybe_open_context(
                cached_client,
            ) as (cache_hit, value):
                assert value == 'the-client'
                # NOTE(review): nothing in this test waits on
                # `first_done`; it is set purely as a marker.
                first_done.set()

        async def reenter_during_teardown():
            '''
            Wait for the acm's `__aexit__` to start (meaning
            `values` is popped but `resources` still exists),
            then re-enter — triggering the assert.

            '''
            await in_aexit.wait()
            async with maybe_open_context(
                cached_client,
            ) as (cache_hit, value):
                assert value == 'the-client'

        with trio.fail_after(timeout):
            async with (
                tractor.open_root_actor(
                    debug_mode=debug_mode,
                    loglevel=loglevel,
                ),
                collapse_eg(),
                trio.open_nursery() as tn,
            ):
                tn.start_soon(use_and_exit)
                tn.start_soon(reenter_during_teardown)

    trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,10 @@ import trio
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import tractor
|
import tractor
|
||||||
|
|
||||||
|
# XXX `cffi` dun build on py3.14 yet..
|
||||||
|
cffi = pytest.importorskip("cffi")
|
||||||
|
|
||||||
from tractor.ipc._ringbuf import (
|
from tractor.ipc._ringbuf import (
|
||||||
open_ringbuf,
|
open_ringbuf,
|
||||||
RBToken,
|
RBToken,
|
||||||
|
|
@ -14,7 +18,7 @@ from tractor._testing.samples import (
|
||||||
generate_sample_messages,
|
generate_sample_messages,
|
||||||
)
|
)
|
||||||
|
|
||||||
# in case you don't want to melt your cores, uncomment dis!
|
# XXX, in case you want to melt your cores, comment this skip line XD
|
||||||
pytestmark = pytest.mark.skip
|
pytestmark = pytest.mark.skip
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ def test_infected_root_actor(
|
||||||
),
|
),
|
||||||
to_asyncio.open_channel_from(
|
to_asyncio.open_channel_from(
|
||||||
aio_echo_server,
|
aio_echo_server,
|
||||||
) as (first, chan),
|
) as (chan, first),
|
||||||
):
|
):
|
||||||
assert first == 'start'
|
assert first == 'start'
|
||||||
|
|
||||||
|
|
@ -91,13 +91,12 @@ def test_infected_root_actor(
|
||||||
async def sync_and_err(
|
async def sync_and_err(
|
||||||
# just signature placeholders for compat with
|
# just signature placeholders for compat with
|
||||||
# ``to_asyncio.open_channel_from()``
|
# ``to_asyncio.open_channel_from()``
|
||||||
to_trio: trio.MemorySendChannel,
|
chan: tractor.to_asyncio.LinkedTaskChannel,
|
||||||
from_trio: asyncio.Queue,
|
|
||||||
ev: asyncio.Event,
|
ev: asyncio.Event,
|
||||||
|
|
||||||
):
|
):
|
||||||
if to_trio:
|
if chan:
|
||||||
to_trio.send_nowait('start')
|
chan.started_nowait('start')
|
||||||
|
|
||||||
await ev.wait()
|
await ev.wait()
|
||||||
raise RuntimeError('asyncio-side')
|
raise RuntimeError('asyncio-side')
|
||||||
|
|
@ -174,7 +173,7 @@ def test_trio_prestarted_task_bubbles(
|
||||||
sync_and_err,
|
sync_and_err,
|
||||||
ev=aio_ev,
|
ev=aio_ev,
|
||||||
)
|
)
|
||||||
) as (first, chan),
|
) as (chan, first),
|
||||||
):
|
):
|
||||||
|
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
|
|
|
||||||
|
|
@ -94,15 +94,15 @@ def test_runtime_vars_unset(
|
||||||
after the root actor-runtime exits!
|
after the root actor-runtime exits!
|
||||||
|
|
||||||
'''
|
'''
|
||||||
assert not tractor._state._runtime_vars['_debug_mode']
|
assert not tractor.runtime._state._runtime_vars['_debug_mode']
|
||||||
async def main():
|
async def main():
|
||||||
assert not tractor._state._runtime_vars['_debug_mode']
|
assert not tractor.runtime._state._runtime_vars['_debug_mode']
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
debug_mode=True,
|
debug_mode=True,
|
||||||
):
|
):
|
||||||
assert tractor._state._runtime_vars['_debug_mode']
|
assert tractor.runtime._state._runtime_vars['_debug_mode']
|
||||||
|
|
||||||
# after runtime closure, should be reverted!
|
# after runtime closure, should be reverted!
|
||||||
assert not tractor._state._runtime_vars['_debug_mode']
|
assert not tractor.runtime._state._runtime_vars['_debug_mode']
|
||||||
|
|
||||||
trio.run(main)
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -110,7 +110,7 @@ def test_rpc_errors(
|
||||||
) as n:
|
) as n:
|
||||||
|
|
||||||
actor = tractor.current_actor()
|
actor = tractor.current_actor()
|
||||||
assert actor.is_arbiter
|
assert actor.is_registrar
|
||||||
await n.run_in_actor(
|
await n.run_in_actor(
|
||||||
sleep_back_actor,
|
sleep_back_actor,
|
||||||
actor_name=subactor_requests_to,
|
actor_name=subactor_requests_to,
|
||||||
|
|
|
||||||
|
|
@ -22,6 +22,10 @@ def unlink_file():
|
||||||
async def crash_and_clean_tmpdir(
|
async def crash_and_clean_tmpdir(
|
||||||
tmp_file_path: str,
|
tmp_file_path: str,
|
||||||
error: bool = True,
|
error: bool = True,
|
||||||
|
rent_cancel: bool = True,
|
||||||
|
|
||||||
|
# XXX unused, but do we really need to test these cases?
|
||||||
|
self_cancel: bool = False,
|
||||||
):
|
):
|
||||||
global _file_path
|
global _file_path
|
||||||
_file_path = tmp_file_path
|
_file_path = tmp_file_path
|
||||||
|
|
@ -32,43 +36,75 @@ async def crash_and_clean_tmpdir(
|
||||||
assert os.path.isfile(tmp_file_path)
|
assert os.path.isfile(tmp_file_path)
|
||||||
await trio.sleep(0.1)
|
await trio.sleep(0.1)
|
||||||
if error:
|
if error:
|
||||||
|
print('erroring in subactor!')
|
||||||
assert 0
|
assert 0
|
||||||
else:
|
|
||||||
|
elif self_cancel:
|
||||||
|
print('SELF-cancelling subactor!')
|
||||||
actor.cancel_soon()
|
actor.cancel_soon()
|
||||||
|
|
||||||
|
elif rent_cancel:
|
||||||
|
await trio.sleep_forever()
|
||||||
|
|
||||||
|
print('subactor exiting task!')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
'error_in_child',
|
'error_in_child',
|
||||||
[True, False],
|
[True, False],
|
||||||
|
ids='error_in_child={}'.format,
|
||||||
)
|
)
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_lifetime_stack_wipes_tmpfile(
|
async def test_lifetime_stack_wipes_tmpfile(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
error_in_child: bool,
|
error_in_child: bool,
|
||||||
|
loglevel: str,
|
||||||
|
# log: tractor.log.StackLevelAdapter,
|
||||||
|
# ^TODO, once landed via macos support!
|
||||||
):
|
):
|
||||||
child_tmp_file = tmp_path / "child.txt"
|
child_tmp_file = tmp_path / "child.txt"
|
||||||
child_tmp_file.touch()
|
child_tmp_file.touch()
|
||||||
assert child_tmp_file.exists()
|
assert child_tmp_file.exists()
|
||||||
path = str(child_tmp_file)
|
path = str(child_tmp_file)
|
||||||
|
|
||||||
|
# NOTE, this is expected to cancel the sub
|
||||||
|
# in the `error_in_child=False` case!
|
||||||
|
timeout: float = (
|
||||||
|
1.6 if error_in_child
|
||||||
|
else 1
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
with trio.move_on_after(0.5):
|
with trio.move_on_after(timeout) as cs:
|
||||||
async with tractor.open_nursery() as n:
|
async with tractor.open_nursery(
|
||||||
await ( # inlined portal
|
loglevel=loglevel,
|
||||||
await n.run_in_actor(
|
) as an:
|
||||||
|
await ( # inlined `tractor.Portal`
|
||||||
|
await an.run_in_actor(
|
||||||
crash_and_clean_tmpdir,
|
crash_and_clean_tmpdir,
|
||||||
tmp_file_path=path,
|
tmp_file_path=path,
|
||||||
error=error_in_child,
|
error=error_in_child,
|
||||||
)
|
)
|
||||||
).result()
|
).result()
|
||||||
|
|
||||||
except (
|
except (
|
||||||
tractor.RemoteActorError,
|
tractor.RemoteActorError,
|
||||||
# tractor.BaseExceptionGroup,
|
|
||||||
BaseExceptionGroup,
|
BaseExceptionGroup,
|
||||||
):
|
) as _exc:
|
||||||
pass
|
exc = _exc
|
||||||
|
from tractor.log import get_console_log
|
||||||
|
log = get_console_log(
|
||||||
|
level=loglevel,
|
||||||
|
name=__name__,
|
||||||
|
)
|
||||||
|
log.exception(
|
||||||
|
f'Subactor failed as expected with {type(exc)!r}\n'
|
||||||
|
)
|
||||||
|
|
||||||
# tmp file should have been wiped by
|
# tmp file should have been wiped by
|
||||||
# teardown stack.
|
# teardown stack.
|
||||||
assert not child_tmp_file.exists()
|
assert not child_tmp_file.exists()
|
||||||
|
|
||||||
|
if error_in_child:
|
||||||
|
assert not cs.cancel_called
|
||||||
|
else:
|
||||||
|
# expect timeout in some cases?
|
||||||
|
assert cs.cancel_called
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
Shared mem primitives and APIs.
|
Shared mem primitives and APIs.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
import platform
|
||||||
import uuid
|
import uuid
|
||||||
|
|
||||||
# import numpy
|
# import numpy
|
||||||
|
|
@ -13,6 +14,18 @@ from tractor.ipc._shm import (
|
||||||
attach_shm_list,
|
attach_shm_list,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Module-wide mark: skip every test in this module when the suite is
# run with either the 'subint' or 'subint_forkserver' spawn backend.
# NOTE(review): `skipon_spawn_backend` is not a builtin `pytest` mark;
# presumably it is registered by the project's `conftest.py` - confirm.
pytestmark = pytest.mark.skipon_spawn_backend(
    'subint',
    'subint_forkserver',
    reason=(
        'subint: GIL-contention hanging class.\n'
        'subint_forkserver: `multiprocessing.SharedMemory` '
        'has known issues with fork-without-exec (mp\'s '
        'resource_tracker and SharedMemory internals assume '
        'fresh-process state). RemoteActorError surfaces from '
        'the shm-attach path. TODO, put issue link!\n'
    )
)
|
||||||
|
|
||||||
@tractor.context
|
@tractor.context
|
||||||
async def child_attach_shml_alot(
|
async def child_attach_shml_alot(
|
||||||
|
|
@ -53,7 +66,18 @@ def test_child_attaches_alot():
|
||||||
shm_key=shml.key,
|
shm_key=shml.key,
|
||||||
) as (ctx, start_val),
|
) as (ctx, start_val),
|
||||||
):
|
):
|
||||||
assert start_val == key
|
assert (_key := shml.key) == start_val
|
||||||
|
|
||||||
|
if platform.system() != 'Darwin':
|
||||||
|
# XXX, macOS has a char limit..
|
||||||
|
# see `ipc._shm._shorten_key_for_macos`
|
||||||
|
assert (
|
||||||
|
start_val
|
||||||
|
==
|
||||||
|
key
|
||||||
|
==
|
||||||
|
_key
|
||||||
|
)
|
||||||
await ctx.result()
|
await ctx.result()
|
||||||
|
|
||||||
await portal.cancel_actor()
|
await portal.cancel_actor()
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,12 @@
|
||||||
"""
|
"""
|
||||||
Spawning basics
|
Spawning basics including audit of,
|
||||||
|
|
||||||
|
- subproc bootstrap, such as subactor runtime-data/config inheritance,
|
||||||
|
- basic (and mostly legacy) `ActorNursery` subactor starting and
|
||||||
|
cancel APIs.
|
||||||
|
|
||||||
|
Simple (and generally legacy) examples from the original
|
||||||
|
API design.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
@ -33,13 +40,13 @@ async def spawn(
|
||||||
assert actor is None # no runtime yet
|
assert actor is None # no runtime yet
|
||||||
async with (
|
async with (
|
||||||
tractor.open_root_actor(
|
tractor.open_root_actor(
|
||||||
arbiter_addr=reg_addr,
|
registry_addrs=[reg_addr],
|
||||||
),
|
),
|
||||||
tractor.open_nursery() as an,
|
tractor.open_nursery() as an,
|
||||||
):
|
):
|
||||||
# now runtime exists
|
# now runtime exists
|
||||||
actor: tractor.Actor = tractor.current_actor()
|
actor: tractor.Actor = tractor.current_actor()
|
||||||
assert actor.is_arbiter == should_be_root
|
assert actor.is_registrar == should_be_root
|
||||||
|
|
||||||
# spawns subproc here
|
# spawns subproc here
|
||||||
portal: tractor.Portal = await an.run_in_actor(
|
portal: tractor.Portal = await an.run_in_actor(
|
||||||
|
|
@ -68,7 +75,7 @@ async def spawn(
|
||||||
assert result == 10
|
assert result == 10
|
||||||
return result
|
return result
|
||||||
else:
|
else:
|
||||||
assert actor.is_arbiter == should_be_root
|
assert actor.is_registrar == should_be_root
|
||||||
return 10
|
return 10
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -98,7 +105,9 @@ async def movie_theatre_question():
|
||||||
|
|
||||||
|
|
||||||
@tractor_test
|
@tractor_test
|
||||||
async def test_movie_theatre_convo(start_method):
|
async def test_movie_theatre_convo(
|
||||||
|
start_method: str,
|
||||||
|
):
|
||||||
'''
|
'''
|
||||||
The main ``tractor`` routine.
|
The main ``tractor`` routine.
|
||||||
|
|
||||||
|
|
@ -151,13 +160,16 @@ async def test_most_beautiful_word(
|
||||||
name='some_linguist',
|
name='some_linguist',
|
||||||
)
|
)
|
||||||
|
|
||||||
print(await portal.result())
|
res: Any = await portal.wait_for_result()
|
||||||
|
assert res == return_value
|
||||||
# The ``async with`` will unblock here since the 'some_linguist'
|
# The ``async with`` will unblock here since the 'some_linguist'
|
||||||
# actor has completed its main task ``cellar_door``.
|
# actor has completed its main task ``cellar_door``.
|
||||||
|
|
||||||
# this should pull the cached final result already captured during
|
# this should pull the cached final result already captured during
|
||||||
# the nursery block exit.
|
# the nursery block exit.
|
||||||
print(await portal.result())
|
res: Any = await portal.wait_for_result()
|
||||||
|
assert res == return_value
|
||||||
|
print(res)
|
||||||
|
|
||||||
|
|
||||||
async def check_loglevel(level):
|
async def check_loglevel(level):
|
||||||
|
|
@ -168,22 +180,35 @@ async def check_loglevel(level):
|
||||||
log.critical('yoyoyo')
|
log.critical('yoyoyo')
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
'level', [
|
||||||
|
'debug',
|
||||||
|
'cancel',
|
||||||
|
'critical'
|
||||||
|
],
|
||||||
|
ids='loglevel={}'.format,
|
||||||
|
)
|
||||||
def test_loglevel_propagated_to_subactor(
|
def test_loglevel_propagated_to_subactor(
|
||||||
start_method,
|
capfd: pytest.CaptureFixture,
|
||||||
capfd,
|
start_method: str,
|
||||||
reg_addr,
|
reg_addr: tuple,
|
||||||
|
level: str,
|
||||||
):
|
):
|
||||||
if start_method == 'mp_forkserver':
|
if start_method in ('mp_forkserver', 'subint_forkserver'):
|
||||||
pytest.skip(
|
pytest.skip(
|
||||||
"a bug with `capfd` seems to make forkserver capture not work?")
|
"a bug with `capfd` seems to make forkserver capture not work? "
|
||||||
|
"(same class as the `mp_forkserver` pre-existing skip — fork-"
|
||||||
level = 'critical'
|
"based backends inherit pytest's capfd temp-file fds into the "
|
||||||
|
"subactor and the IPC handshake reads garbage (`unclean EOF "
|
||||||
|
"read only X/HUGE_NUMBER bytes`). Work around by using "
|
||||||
|
"`capsys` instead or skip entirely."
|
||||||
|
)
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
name='arbiter',
|
name='registrar',
|
||||||
start_method=start_method,
|
start_method=start_method,
|
||||||
arbiter_addr=reg_addr,
|
registry_addrs=[reg_addr],
|
||||||
|
|
||||||
) as tn:
|
) as tn:
|
||||||
await tn.run_in_actor(
|
await tn.run_in_actor(
|
||||||
|
|
@ -197,3 +222,121 @@ def test_loglevel_propagated_to_subactor(
|
||||||
# ensure subactor spits log message on stderr
|
# ensure subactor spits log message on stderr
|
||||||
captured = capfd.readouterr()
|
captured = capfd.readouterr()
|
||||||
assert 'yoyoyo' in captured.err
|
assert 'yoyoyo' in captured.err
|
||||||
|
|
||||||
|
|
||||||
|
async def check_parent_main_inheritance(
|
||||||
|
expect_inherited: bool,
|
||||||
|
) -> bool:
|
||||||
|
'''
|
||||||
|
Assert that the child actor's ``_parent_main_data`` matches the
|
||||||
|
``inherit_parent_main`` flag it was spawned with.
|
||||||
|
|
||||||
|
With the trio spawn backend the parent's ``__main__`` bootstrap
|
||||||
|
data is captured and forwarded to each child so it can replay
|
||||||
|
the parent's ``__main__`` as ``__mp_main__``, mirroring the
|
||||||
|
stdlib ``multiprocessing`` bootstrap:
|
||||||
|
https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods
|
||||||
|
|
||||||
|
When ``inherit_parent_main=False`` the data dict is empty
|
||||||
|
(``{}``) so no fixup ever runs and the child keeps its own
|
||||||
|
``__main__`` untouched.
|
||||||
|
|
||||||
|
NOTE: under `pytest` the parent ``__main__`` is
|
||||||
|
``pytest.__main__``, for which ``_fixup_main_from_name()`` is a no-op
|
||||||
|
(the name ends with ``.__main__``), so we cannot observe
|
||||||
|
a difference in ``sys.modules['__main__'].__name__`` between the
|
||||||
|
two modes. Checking ``_parent_main_data`` directly is the most
|
||||||
|
reliable verification that the flag is threaded through
|
||||||
|
correctly; a ``RemoteActorError[AssertionError]`` propagates on
|
||||||
|
mismatch.
|
||||||
|
|
||||||
|
'''
|
||||||
|
import tractor
|
||||||
|
actor: tractor.Actor = tractor.current_actor()
|
||||||
|
has_data: bool = bool(actor._parent_main_data)
|
||||||
|
assert has_data == expect_inherited, (
|
||||||
|
f'Expected _parent_main_data to be '
|
||||||
|
f'{"non-empty" if expect_inherited else "empty"}, '
|
||||||
|
f'got: {actor._parent_main_data!r}'
|
||||||
|
)
|
||||||
|
return has_data
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_in_actor_can_skip_parent_main_inheritance(
|
||||||
|
start_method: str, # <- only supported on the `trio` backend for now.
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Verify ``inherit_parent_main=False`` on ``run_in_actor()``
|
||||||
|
prevents parent ``__main__`` data from reaching the child.
|
||||||
|
|
||||||
|
'''
|
||||||
|
if start_method != 'trio':
|
||||||
|
pytest.skip(
|
||||||
|
'parent main-inheritance opt-out only affects the trio backend'
|
||||||
|
)
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with tractor.open_nursery(start_method='trio') as an:
|
||||||
|
|
||||||
|
# Default: child receives parent __main__ bootstrap data
|
||||||
|
replaying = await an.run_in_actor(
|
||||||
|
check_parent_main_inheritance,
|
||||||
|
name='replaying-parent-main',
|
||||||
|
expect_inherited=True,
|
||||||
|
)
|
||||||
|
await replaying.result()
|
||||||
|
|
||||||
|
# Opt-out: child gets no parent __main__ data
|
||||||
|
isolated = await an.run_in_actor(
|
||||||
|
check_parent_main_inheritance,
|
||||||
|
name='isolated-parent-main',
|
||||||
|
inherit_parent_main=False,
|
||||||
|
expect_inherited=False,
|
||||||
|
)
|
||||||
|
await isolated.result()
|
||||||
|
|
||||||
|
trio.run(main)
|
||||||
|
|
||||||
|
|
||||||
|
def test_start_actor_can_skip_parent_main_inheritance(
|
||||||
|
start_method: str, # <- only supported on the `trio` backend for now.
|
||||||
|
):
|
||||||
|
'''
|
||||||
|
Verify ``inherit_parent_main=False`` on ``start_actor()``
|
||||||
|
prevents parent ``__main__`` data from reaching the child.
|
||||||
|
|
||||||
|
'''
|
||||||
|
if start_method != 'trio':
|
||||||
|
pytest.skip(
|
||||||
|
'parent main-inheritance opt-out only affects the trio backend'
|
||||||
|
)
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with tractor.open_nursery(start_method='trio') as an:
|
||||||
|
|
||||||
|
# Default: child receives parent __main__ bootstrap data
|
||||||
|
replaying = await an.start_actor(
|
||||||
|
'replaying-parent-main',
|
||||||
|
enable_modules=[__name__],
|
||||||
|
)
|
||||||
|
result = await replaying.run(
|
||||||
|
check_parent_main_inheritance,
|
||||||
|
expect_inherited=True,
|
||||||
|
)
|
||||||
|
assert result is True
|
||||||
|
await replaying.cancel_actor()
|
||||||
|
|
||||||
|
# Opt-out: child gets no parent __main__ data
|
||||||
|
isolated = await an.start_actor(
|
||||||
|
'isolated-parent-main',
|
||||||
|
enable_modules=[__name__],
|
||||||
|
inherit_parent_main=False,
|
||||||
|
)
|
||||||
|
result = await isolated.run(
|
||||||
|
check_parent_main_inheritance,
|
||||||
|
expect_inherited=False,
|
||||||
|
)
|
||||||
|
assert result is False
|
||||||
|
await isolated.cancel_actor()
|
||||||
|
|
||||||
|
trio.run(main)
|
||||||
|
|
|
||||||
|
|
@ -75,7 +75,7 @@ async def open_sequence_streamer(
|
||||||
) -> tractor.MsgStream:
|
) -> tractor.MsgStream:
|
||||||
|
|
||||||
async with tractor.open_nursery(
|
async with tractor.open_nursery(
|
||||||
arbiter_addr=reg_addr,
|
registry_addrs=[reg_addr],
|
||||||
start_method=start_method,
|
start_method=start_method,
|
||||||
) as an:
|
) as an:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -30,21 +30,23 @@ from ._streaming import (
|
||||||
MsgStream as MsgStream,
|
MsgStream as MsgStream,
|
||||||
stream as stream,
|
stream as stream,
|
||||||
)
|
)
|
||||||
from ._discovery import (
|
from .discovery._api import (
|
||||||
get_registry as get_registry,
|
get_registry as get_registry,
|
||||||
find_actor as find_actor,
|
find_actor as find_actor,
|
||||||
wait_for_actor as wait_for_actor,
|
wait_for_actor as wait_for_actor,
|
||||||
query_actor as query_actor,
|
query_actor as query_actor,
|
||||||
)
|
)
|
||||||
from ._supervise import (
|
from .runtime._supervise import (
|
||||||
open_nursery as open_nursery,
|
open_nursery as open_nursery,
|
||||||
ActorNursery as ActorNursery,
|
ActorNursery as ActorNursery,
|
||||||
)
|
)
|
||||||
from ._state import (
|
from .runtime._state import (
|
||||||
|
RuntimeVars as RuntimeVars,
|
||||||
current_actor as current_actor,
|
current_actor as current_actor,
|
||||||
is_root_process as is_root_process,
|
|
||||||
current_ipc_ctx as current_ipc_ctx,
|
current_ipc_ctx as current_ipc_ctx,
|
||||||
debug_mode as debug_mode
|
debug_mode as debug_mode,
|
||||||
|
get_runtime_vars as get_runtime_vars,
|
||||||
|
is_root_process as is_root_process,
|
||||||
)
|
)
|
||||||
from ._exceptions import (
|
from ._exceptions import (
|
||||||
ContextCancelled as ContextCancelled,
|
ContextCancelled as ContextCancelled,
|
||||||
|
|
@ -65,6 +67,10 @@ from ._root import (
|
||||||
open_root_actor as open_root_actor,
|
open_root_actor as open_root_actor,
|
||||||
)
|
)
|
||||||
from .ipc import Channel as Channel
|
from .ipc import Channel as Channel
|
||||||
from ._portal import Portal as Portal
|
from .runtime._portal import Portal as Portal
|
||||||
from ._runtime import Actor as Actor
|
from .runtime._runtime import Actor as Actor
|
||||||
|
from .discovery._registry import (
|
||||||
|
Registrar as Registrar,
|
||||||
|
Arbiter as Arbiter,
|
||||||
|
)
|
||||||
# from . import hilevel as hilevel
|
# from . import hilevel as hilevel
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue